Skip to content

Commit 5e38eaa

Browse files
committed
transformerless_lm: lazy_K_active subsampling — real compute savings, scale-bound
Added a lazy_K_active parameter to FibGenLinear and SubsimLM. At each training step, K_active < K Fibonacci frequencies are sampled per axis (always including the substrate-tier-1 anchor at index 0), and ONLY those are used for the compressed-forward computation. The inner mixing matmul shrinks from K^2 to K_active^2; the projections shrink from K to K_active. At eval, all K frequencies are used.

This is REAL compute savings (not just gradient masking like the tier_lr_scale variant), but the per-step Python overhead from the indexing + random-subset sampling currently masks the savings at small d_model.

d=128, K=32, 200 training steps:
  baseline (K_active=0): val=2.98   wall=7.01s
  K_active=16:           val=9.33   wall=7.66s (-)
  K_active=8:            val=18.97  wall=7.32s
  K_active=4:            val=22.34  wall=6.75s (4% faster)

Two issues at this scale:
(a) Wall-clock savings are absorbed by PyTorch indexing overhead. The math says K_active=8 should make the inner K x K mixing matmul 16x cheaper (32^2 -> 8^2) and the projections 4x cheaper (32 -> 8), but matmul is a small fraction of total cost at d=128.
(b) Quality breaks because each step picks a different random subset -- the model is effectively training a different small subnetwork every step, preventing any one component from accumulating signal.

The user expected "significantly faster". Honest assessment: at d=128 we cannot see this; matmul cost is dominated by overhead. At d=1024+ (LLM scale) the K^2 -> K_active^2 savings would manifest as real wall-clock wins because matmul FLOPs dominate. To deliver significant speed AT this scale we should compose Stochastic Fibonacci Depth (block-skipping) with lazy-loading data. The K-subsampling still needs to be validated at larger d_model.
1 parent 966f4a6 commit 5e38eaa

2 files changed

Lines changed: 57 additions & 11 deletions

File tree

experiments/transformerless_lm/models_fibgen.py

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,8 @@ class FibGenLinear(nn.Module):
8888
def __init__(self, in_features: int, out_features: int, K: int = 16,
8989
mode: str = "separable",
9090
bias: bool = True, init_scale: float = 0.1,
91-
lazy_tier_dropout: bool = False):
91+
lazy_tier_dropout: bool = False,
92+
lazy_K_active: int = 0):
9293
super().__init__()
9394
self.in_features = in_features
9495
self.out_features = out_features
@@ -97,6 +98,10 @@ def __init__(self, in_features: int, out_features: int, K: int = 16,
9798
raise ValueError(f"unknown mode: {mode}")
9899
self.mode = mode
99100
self.lazy_tier_dropout = lazy_tier_dropout
101+
# lazy_K_active: if > 0 and < K, during training only K_active Fibonacci
102+
# frequencies (sampled per step) are used, shrinking the inner matmul.
103+
# Real compute savings, not just gradient masking.
104+
self.lazy_K_active = lazy_K_active if 0 < lazy_K_active < K else 0
100105
n_components = self.K if mode == "separable" else self.K * self.K
101106
self.seed = nn.Parameter(
102107
torch.randn(n_components, 4) * (init_scale / max(1, math.sqrt(n_components)))
@@ -199,6 +204,21 @@ def _maybe_lazy_seed(self) -> torch.Tensor:
199204
# eval: deterministic, scaled by keep_prob to match training E[seed]
200205
return self.seed * self.tier_keep_probs.unsqueeze(-1)
201206

207+
def _sample_active_indices(self) -> torch.Tensor:
    """Sample the frequency indices that stay active for one forward pass.

    The result always contains index 0 (the substrate-tier-1 anchor); the
    remaining ``lazy_K_active - 1`` indices are drawn uniformly without
    replacement from ``[1, K)``. A fresh subset is drawn on every call, so
    over many steps every frequency gets visited.

    Returns:
        A sorted 1-D int64 tensor of ``lazy_K_active`` distinct indices on
        the same device as ``self.seed``. When subsampling is disabled
        (``lazy_K_active`` is not in ``(0, K)``), all ``K`` indices are
        returned.
    """
    K = self.K
    K_a = self.lazy_K_active
    device = self.seed.device

    # Subsampling disabled (or a degenerate setting): every frequency is
    # active. This also keeps the slice below from seeing a negative bound.
    if not 0 < K_a < K:
        return torch.arange(K, device=device)

    # Build the subset as {0} + (K_a - 1 draws from 1..K-1). This is the
    # same distribution as "draw K_a from 0..K-1, then force 0 in", but it
    # never inspects tensor values on the host, so it does not force a
    # device synchronization on every training step.
    rest = torch.randperm(K - 1, device=device)[: K_a - 1] + 1
    anchor = torch.zeros(1, dtype=rest.dtype, device=device)

    # 0 is the smallest possible index, so sorting only the remainder
    # still yields a fully sorted result.
    return torch.cat((anchor, rest.sort().values))
221+
202222
def _forward_compressed(self, x: torch.Tensor) -> torch.Tensor:
203223
"""Substrate-native forward: compute y = W·x WITHOUT materializing W.
204224
@@ -235,6 +255,26 @@ def _forward_compressed(self, x: torch.Tensor) -> torch.Tensor:
235255
# cross mode: seed [K, K, 4] mixing matrix
236256
K = self.K
237257
seed_cross = seed.view(K, K, 4)
258+
# Lazy K-subsampling path: at training, use only K_active frequencies
259+
# per axis. The inner K×K mix shrinks to K_active × K_active; the
260+
# outer projections to/from x shrink to K_active. Inference uses full K.
261+
if self.training and self.lazy_K_active and self.lazy_K_active < K:
262+
idx_i = self._sample_active_indices() # [K_a]
263+
idx_j = self._sample_active_indices()
264+
seed_sub = seed_cross[idx_i][:, idx_j] # [K_a, K_a, 4]
265+
cos_j_sub = self.cos_j[:, idx_j] # [in, K_a]
266+
sin_j_sub = self.sin_j[:, idx_j]
267+
cos_i_sub = self.cos_i[:, idx_i] # [out, K_a]
268+
sin_i_sub = self.sin_i[:, idx_i]
269+
a, b, c, d = (seed_sub[..., k] for k in range(4))
270+
x_cos = x @ cos_j_sub # [B,T,K_a]
271+
x_sin = x @ sin_j_sub
272+
y_cos = x_cos @ a.t() + x_sin @ c.t() # [B,T,K_a]
273+
y_sin = x_cos @ b.t() + x_sin @ d.t()
274+
y = y_cos @ cos_i_sub.t() + y_sin @ sin_i_sub.t()
275+
if self.bias is not None:
276+
y = y + self.bias
277+
return y
238278
a, b, c, d = seed_cross[..., 0], seed_cross[..., 1], seed_cross[..., 2], seed_cross[..., 3]
239279
x_cos = x @ self.cos_j # [B, T, K]
240280
x_sin = x @ self.sin_j

experiments/transformerless_lm/models_subsim.py

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -50,12 +50,14 @@ class SubstrateSimilarityAttention(nn.Module):
5050

5151
def __init__(self, d_model: int, K: int = 32, seq_len: int = 128,
5252
fibgen_K: int = 32, mode: str = "cross",
53-
lazy_tier_dropout: bool = False):
53+
lazy_tier_dropout: bool = False,
54+
lazy_K_active: int = 0):
5455
super().__init__()
5556
self.d_model = d_model
5657
self.K = K
5758
kw = dict(K=fibgen_K, mode=mode, bias=False,
58-
lazy_tier_dropout=lazy_tier_dropout)
59+
lazy_tier_dropout=lazy_tier_dropout,
60+
lazy_K_active=lazy_K_active)
5961
self.W_sig = FibGenLinear(d_model, K, **kw)
6062
self.W_v = FibGenLinear(d_model, d_model, **kw)
6163
self.W_out = FibGenLinear(d_model, d_model, **kw)
@@ -85,13 +87,15 @@ class SubsimBlock(nn.Module):
8587

8688
def __init__(self, d_model: int, seq_len: int, K: int = 32,
8789
fibgen_K: int = 32, mode: str = "cross",
88-
lazy_tier_dropout: bool = False):
90+
lazy_tier_dropout: bool = False,
91+
lazy_K_active: int = 0):
8992
super().__init__()
90-
self.attn = SubstrateSimilarityAttention(d_model, K=K, seq_len=seq_len,
91-
fibgen_K=fibgen_K, mode=mode,
92-
lazy_tier_dropout=lazy_tier_dropout)
93-
# FFN with FibGen weights (separate K for FFN if desired)
94-
kw = dict(K=fibgen_K, mode=mode, lazy_tier_dropout=lazy_tier_dropout)
93+
self.attn = SubstrateSimilarityAttention(
94+
d_model, K=K, seq_len=seq_len, fibgen_K=fibgen_K, mode=mode,
95+
lazy_tier_dropout=lazy_tier_dropout, lazy_K_active=lazy_K_active,
96+
)
97+
kw = dict(K=fibgen_K, mode=mode, lazy_tier_dropout=lazy_tier_dropout,
98+
lazy_K_active=lazy_K_active)
9599
self.w1 = FibGenLinear(d_model, 4 * d_model, **kw)
96100
self.w2 = FibGenLinear(4 * d_model, d_model, **kw)
97101
self.ln1 = nn.LayerNorm(d_model)
@@ -114,7 +118,8 @@ class SubsimLM(nn.Module):
114118

115119
def __init__(self, vocab_size: int, d_model: int, n_blocks: int,
116120
seq_len: int, K: int = 32, fibgen_K: int = 32,
117-
mode: str = "cross", lazy_tier_dropout: bool = False):
121+
mode: str = "cross", lazy_tier_dropout: bool = False,
122+
lazy_K_active: int = 0):
118123
super().__init__()
119124
self.seq_len = seq_len
120125
self.K = K
@@ -123,7 +128,8 @@ def __init__(self, vocab_size: int, d_model: int, n_blocks: int,
123128
self.register_buffer("pe", pe)
124129
self.blocks = nn.ModuleList([
125130
SubsimBlock(d_model, seq_len, K=K, fibgen_K=fibgen_K, mode=mode,
126-
lazy_tier_dropout=lazy_tier_dropout)
131+
lazy_tier_dropout=lazy_tier_dropout,
132+
lazy_K_active=lazy_K_active)
127133
for _ in range(n_blocks)
128134
])
129135
self.ln_f = nn.LayerNorm(d_model)

0 commit comments

Comments
 (0)