transformerless_lm: lazy_K_active subsampling — real compute savings, scale-bound

claude · claude · commit 5e38eaa113dc · 2026-05-21T02:02:43.000Z
Added lazy_K_active parameter to FibGenLinear and SubsimLM. At each
training step, samples K_active < K Fibonacci frequencies per axis
(always including the substrate-tier-1 anchor at index 0) and uses
ONLY those for the compressed-forward computation. Inner mixing
matmul shrinks from K^2 to K_active^2; projections shrink from K to
K_active. At eval, all K frequencies are used.

This is REAL compute savings (not just gradient masking like the
tier_lr_scale variant), but the per-step Python overhead from the
indexing + random-subset sampling currently masks the savings at
small d_model:

  d=128, K=32, 200 training steps:
    baseline (K_active=0):  val=2.98   wall=7.01s
    K_active=16:           val=9.33   wall=7.66s (9% slower)
    K_active=8:            val=18.97  wall=7.32s (4% slower)
    K_active=4:            val=22.34  wall=6.75s (4% faster)

Two issues at this scale:
  (a) Wall-clock savings are absorbed by PyTorch indexing overhead.
      On paper, K_active=8 shrinks the inner mixing matmul 16x
      (32^2 -> 8^2) and the projections 4x (32 -> 8), but matmul is
      a small fraction of total cost at d=128.
  (b) Quality breaks because each step picks a different random
      subset -- the model is effectively training a different small
      subnetwork every step, preventing any one component from
      accumulating signal.

The user expected "significantly faster". Honest assessment: at
d=128 we cannot see this; the matmul savings are swamped by overhead.
At d=1024+ (LLM scale) the K^2 -> K_active^2 savings would manifest
as real wall-clock wins because matmul FLOPs dominate.

To deliver significant speed AT this scale we should compose
Stochastic Fibonacci Depth (block-skipping) with lazy-loading data.
K-subsampling itself still needs to be validated at larger d_model,
where its savings are expected to show.
diff --git a/experiments/transformerless_lm/models_fibgen.py b/experiments/transformerless_lm/models_fibgen.py
@@ -88,7 +88,8 @@ class FibGenLinear(nn.Module):
     def __init__(self, in_features: int, out_features: int, K: int = 16,
                  mode: str = "separable",
                  bias: bool = True, init_scale: float = 0.1,
-                 lazy_tier_dropout: bool = False):
+                 lazy_tier_dropout: bool = False,
+                 lazy_K_active: int = 0):
         super().__init__()
         self.in_features = in_features
         self.out_features = out_features
@@ -97,6 +98,10 @@ def __init__(self, in_features: int, out_features: int, K: int = 16,
             raise ValueError(f"unknown mode: {mode}")
         self.mode = mode
         self.lazy_tier_dropout = lazy_tier_dropout
+        # lazy_K_active: if > 0 and < K, during training only K_active Fibonacci
+        # frequencies (sampled per step) are used, shrinking the inner matmul.
+        # Real compute savings, not just gradient masking.
+        self.lazy_K_active = lazy_K_active if 0 < lazy_K_active < K else 0
         n_components = self.K if mode == "separable" else self.K * self.K
         self.seed = nn.Parameter(
             torch.randn(n_components, 4) * (init_scale / max(1, math.sqrt(n_components)))
@@ -199,6 +204,21 @@ def _maybe_lazy_seed(self) -> torch.Tensor:
         # eval: deterministic, scaled by keep_prob to match training E[seed]
         return self.seed * self.tier_keep_probs.unsqueeze(-1)
 
+    def _sample_active_indices(self) -> torch.Tensor:
+        """Return lazy_K_active distinct indices in [0, K), sorted ascending.
+
+        Index 0 is always part of the result; the remaining entries come
+        from a fresh torch.randperm draw on every call.
+        """
+        K_a = self.lazy_K_active
+        # First K_a entries of a random permutation of range(K): a random
+        # subset of size K_a, created on the seed parameter's device.
+        idx = torch.randperm(self.K, device=self.seed.device)[:K_a]
+        # Force index 0 in if the draw missed it; .tolist() copies idx to host.
+        if 0 not in idx.tolist():
+            idx[0] = 0
+        return idx.sort().values
+
     def _forward_compressed(self, x: torch.Tensor) -> torch.Tensor:
         """Substrate-native forward: compute y = W·x WITHOUT materializing W.
 
@@ -235,6 +255,26 @@ def _forward_compressed(self, x: torch.Tensor) -> torch.Tensor:
         # cross mode: seed [K, K, 4] mixing matrix
         K = self.K
         seed_cross = seed.view(K, K, 4)
+        # Lazy K-subsampling path: at training, use only K_active frequencies
+        # per axis. The inner K×K mix shrinks to K_active × K_active; the
+        # outer projections to/from x shrink to K_active. Inference uses full K.
+        if self.training and self.lazy_K_active and self.lazy_K_active < K:
+            idx_i = self._sample_active_indices()                # [K_a]
+            idx_j = self._sample_active_indices()
+            seed_sub = seed_cross[idx_i][:, idx_j]               # [K_a, K_a, 4]
+            cos_j_sub = self.cos_j[:, idx_j]                     # [in, K_a]
+            sin_j_sub = self.sin_j[:, idx_j]
+            cos_i_sub = self.cos_i[:, idx_i]                     # [out, K_a]
+            sin_i_sub = self.sin_i[:, idx_i]
+            a, b, c, d = (seed_sub[..., k] for k in range(4))
+            x_cos = x @ cos_j_sub                                 # [B,T,K_a]
+            x_sin = x @ sin_j_sub
+            y_cos = x_cos @ a.t() + x_sin @ c.t()                 # [B,T,K_a]
+            y_sin = x_cos @ b.t() + x_sin @ d.t()
+            y = y_cos @ cos_i_sub.t() + y_sin @ sin_i_sub.t()
+            if self.bias is not None:
+                y = y + self.bias
+            return y
         a, b, c, d = seed_cross[..., 0], seed_cross[..., 1], seed_cross[..., 2], seed_cross[..., 3]
         x_cos = x @ self.cos_j                            # [B, T, K]
         x_sin = x @ self.sin_j
diff --git a/experiments/transformerless_lm/models_subsim.py b/experiments/transformerless_lm/models_subsim.py
@@ -50,12 +50,14 @@ class SubstrateSimilarityAttention(nn.Module):
 
     def __init__(self, d_model: int, K: int = 32, seq_len: int = 128,
                  fibgen_K: int = 32, mode: str = "cross",
-                 lazy_tier_dropout: bool = False):
+                 lazy_tier_dropout: bool = False,
+                 lazy_K_active: int = 0):
         super().__init__()
         self.d_model = d_model
         self.K = K
         kw = dict(K=fibgen_K, mode=mode, bias=False,
-                   lazy_tier_dropout=lazy_tier_dropout)
+                   lazy_tier_dropout=lazy_tier_dropout,
+                   lazy_K_active=lazy_K_active)
         self.W_sig = FibGenLinear(d_model, K, **kw)
         self.W_v = FibGenLinear(d_model, d_model, **kw)
         self.W_out = FibGenLinear(d_model, d_model, **kw)
@@ -85,13 +87,15 @@ class SubsimBlock(nn.Module):
 
     def __init__(self, d_model: int, seq_len: int, K: int = 32,
                  fibgen_K: int = 32, mode: str = "cross",
-                 lazy_tier_dropout: bool = False):
+                 lazy_tier_dropout: bool = False,
+                 lazy_K_active: int = 0):
         super().__init__()
-        self.attn = SubstrateSimilarityAttention(d_model, K=K, seq_len=seq_len,
-                                                   fibgen_K=fibgen_K, mode=mode,
-                                                   lazy_tier_dropout=lazy_tier_dropout)
-        # FFN with FibGen weights (separate K for FFN if desired)
-        kw = dict(K=fibgen_K, mode=mode, lazy_tier_dropout=lazy_tier_dropout)
+        self.attn = SubstrateSimilarityAttention(
+            d_model, K=K, seq_len=seq_len, fibgen_K=fibgen_K, mode=mode,
+            lazy_tier_dropout=lazy_tier_dropout, lazy_K_active=lazy_K_active,
+        )
+        kw = dict(K=fibgen_K, mode=mode, lazy_tier_dropout=lazy_tier_dropout,
+                   lazy_K_active=lazy_K_active)
         self.w1 = FibGenLinear(d_model, 4 * d_model, **kw)
         self.w2 = FibGenLinear(4 * d_model, d_model, **kw)
         self.ln1 = nn.LayerNorm(d_model)
@@ -114,7 +118,8 @@ class SubsimLM(nn.Module):
 
     def __init__(self, vocab_size: int, d_model: int, n_blocks: int,
                  seq_len: int, K: int = 32, fibgen_K: int = 32,
-                 mode: str = "cross", lazy_tier_dropout: bool = False):
+                 mode: str = "cross", lazy_tier_dropout: bool = False,
+                 lazy_K_active: int = 0):
         super().__init__()
         self.seq_len = seq_len
         self.K = K
@@ -123,7 +128,8 @@ def __init__(self, vocab_size: int, d_model: int, n_blocks: int,
         self.register_buffer("pe", pe)
         self.blocks = nn.ModuleList([
             SubsimBlock(d_model, seq_len, K=K, fibgen_K=fibgen_K, mode=mode,
-                          lazy_tier_dropout=lazy_tier_dropout)
+                          lazy_tier_dropout=lazy_tier_dropout,
+                          lazy_K_active=lazy_K_active)
             for _ in range(n_blocks)
         ])
         self.ln_f = nn.LayerNorm(d_model)