mudler · mudler · Jun 6, 2026 · Jun 5, 2026 · Jun 5, 2026 · Jun 5, 2026
diff --git a/benchmarks/BENCHMARK.md b/benchmarks/BENCHMARK.md
@@ -498,6 +498,34 @@ Transcripts from the **diverse** clip set (no ground truth for most). NeMo vs pa
 | NeMo (PyTorch CPU) | hello this is a test of the voxtrol speech to text system<EOU> |
 | parakeet.cpp f32 | hello this is a test of the voxtrol speech to text system<EOU> |
 
+## Long-audio attention (banded local)
+
+The FastConformer encoder uses **global** relative-position self-attention, which
+is O(T²) in time *and memory*. On a long clip this explodes: a ~16.6-min file
+subsamples to T≈12k encoder frames, and the score/mask tensors alone reach tens
+of GB — enough to OOM a node. parakeet.cpp ports NeMo's `rel_pos_local_attn`
+(Longformer-style **banded** attention, `change_attention_model('rel_pos_local_attn',[W,W])`):
+each query attends only to keys within a ±W window, making attention **O(T·W)**.
+It auto-enables for long audio (encoder frames > 8192); `PARAKEET_ATT_CONTEXT=W`
+forces a window of W, capped at 128 (`0` or a negative value = full attention).
+The band is built with a **chunk-matmul**
+construction (overlapping K/V chunks + one batched GEMM + a diagonal skew-view),
+so the graph node count is **independent of the window** — the window goes to
+NeMo's full `[128,128]` at no extra graph cost.
+
+**16.6-min clip** (`tdt-0.6b-v3`, f32, NVIDIA GB10 DGX Spark, CPU / 16 threads):
+
+| Attention | Window | Wall | RTFx | Peak RSS |
+|---|---|---|---|---|
+| full (global, O(T²)) | — | 148.3 s | 6.7× | 54.0 GB |
+| banded | W=32 | 39.5 s | 25.2× | 8.9 GB |
+| **banded** | **W=128** (NeMo full) | **36.9 s** | **27.0×** | **9.4 GB** |
+
+Banded attention at NeMo's full W=128 is **~4× faster and uses ~5.7× less peak memory**
+than the global path, with a coherent transcript — and the chunk-matmul keeps the
+wide window as cheap as the narrow one. Short clips (the LibriSpeech set above)
+stay on the global path and are byte-identical to before; banding only engages
+past the long-audio threshold.
+
 ## Findings
 
 ### Accuracy

diff --git a/scripts/gen_band_ref.py b/scripts/gen_band_ref.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python3
+"""Generate the DETERMINISTIC brute-force band-attention reference fixture used by
+``test_relpos_attention_local``.
+
+NeMo's ``RelPositionMultiHeadAttentionLongformer.forward`` is non-deterministic
+on short sequences (``sliding_chunks_matmul_pv`` reads uninitialized memory at
+sequence boundaries via ``F.pad(value=-1)`` + ``as_strided`` — two identical
+forward() calls differ by >1e3). So a hook-captured ``l0_attn_out`` baseline is
+unusable for bit-parity on a short clip. Instead we recompute ``l0_attn_out`` as
+plain band attention (the well-defined math the longformer approximates), which
+the C++ ``forward_local`` matches to ~1e-3. End-to-end NeMo quality is anchored
+separately by the long-audio WER capstone, where the boundary noise is moot.
+
+Reads ``l0_attn_in`` and ``pos_emb`` from an existing local baseline (produced by
+``gen_nemo_baseline.py --att-context-size W``) so the inputs are the real NeMo
+ones; only the reference output is recomputed deterministically.
+
+Usage:
+    python scripts/gen_band_ref.py --model nvidia/parakeet-tdt_ctc-110m \
+        --in-baseline baseline_110m_local8.gguf --att-context 8 \
+        --output baseline_110m_local8_ref.gguf
+"""
+import argparse
+import math
+
+import gguf
+import numpy as np
+import torch
+
+import nemo.collections.asr as nemo_asr
+
+
+def read_tensor(path, name):
+    r = gguf.GGUFReader(path)
+    t = {x.name: x for x in r.tensors}[name]
+    return np.array(t.data, dtype=np.float32).reshape(
+        tuple(int(d) for d in reversed(t.shape)))
+
+
+def main():
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--model", default="nvidia/parakeet-tdt_ctc-110m")
+    ap.add_argument("--in-baseline", required=True,
+                    help="local baseline gguf with l0_attn_in + pos_emb")
+    ap.add_argument("--att-context", type=int, required=True, help="window W")
+    ap.add_argument("--output", required=True)
+    args = ap.parse_args()
+
+    li_np = read_tensor(args.in_baseline, "l0_attn_in")     # (T, D)
+    pos_np = read_tensor(args.in_baseline, "pos_emb")        # (2W+1, D)
+    li = torch.tensor(li_np)[None]
+    pos = torch.tensor(pos_np)[None]
+    T, D = li.shape[1], li.shape[2]
+    w = args.att_context
+    vlen = T - 1  # last frame is a center-pad/padding frame for the fixture clip
+
+    m = nemo_asr.models.ASRModel.from_pretrained(args.model, map_location="cpu")
+    m.eval()
+    m.change_attention_model("rel_pos_local_attn", [w, w])
+    a0 = m.encoder.layers[0].self_attn
+    h, dk = a0.h, a0.d_k
+    s = math.sqrt(dk)
+    P = 2 * w + 1
+
+    with torch.no_grad():
+        q = a0.linear_q(li).view(1, T, h, dk).transpose(1, 2)
+        k = a0.linear_k(li).view(1, T, h, dk).transpose(1, 2)
+        v = a0.linear_v(li).view(1, T, h, dk).transpose(1, 2)
+        p = a0.linear_pos(pos).view(1, -1, h, dk).transpose(1, 2)
+        qu = q + a0.pos_bias_u.unsqueeze(1)
+        qv = q + a0.pos_bias_v.unsqueeze(1)
+        sc = torch.full((1, h, T, P), -1e30)
+        for t in range(T):
+            for c in range(P):
+                key = t - w + c
+                if 0 <= key < vlen:
+                    sc[0, :, t, c] = ((qu[0, :, t] * k[0, :, key]).sum(-1)
+                                      + (qv[0, :, t] * p[0, :, c]).sum(-1)) / s
+        at = torch.softmax(sc, dim=-1)
+        ctx = torch.zeros(1, h, T, dk)
+        for t in range(T):
+            for c in range(P):
+                key = t - w + c
+                if 0 <= key < vlen:
+                    ctx[0, :, t] += at[0, :, t, c:c + 1] * v[0, :, key]
+        om = a0.linear_out(ctx.transpose(1, 2).reshape(1, T, h * dk))[0].clone()
+        om[vlen:] = a0.linear_out(torch.zeros(1, h * dk))[0]  # padded query rows -> bias
+
+    ref = om.numpy().astype(np.float32)
+    W = gguf.GGUFWriter(args.output, "pk-band-ref")
+    W.add_tensor("l0_attn_in", np.ascontiguousarray(li_np))
+    W.add_tensor("pos_emb", np.ascontiguousarray(pos_np))
+    W.add_tensor("l0_attn_out", np.ascontiguousarray(ref))
+    W.write_header_to_file()
+    W.write_kv_data_to_file()
+    W.write_tensors_to_file()
+    W.close()
+    print(f"wrote {args.output}: T={T} D={D} W={w}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/gen_nemo_baseline.py b/scripts/gen_nemo_baseline.py
@@ -436,6 +436,16 @@ def main():
         help="dump per-token/word timestamps + max_prob confidence for both "
         "heads (TDT/RNNT + CTC) instead of the encoder-stage baseline.",
     )
+    ap.add_argument(
+        "--att-context-size",
+        type=int,
+        default=None,
+        help="if set to W, switch the encoder to NeMo local (Longformer) "
+        "attention via change_attention_model('rel_pos_local_attn', [W, W]) "
+        "before the forward, so the dumped pos_emb (2W+1), l0_attn_in/out and "
+        "transcripts reflect banded local attention. Anchors the C++ "
+        "banded-attention parity tests at NeMo quality.",
+    )
     args = ap.parse_args()
 
     is_local = pathlib.Path(args.model).exists()
@@ -452,6 +462,22 @@ def main():
     # Determinism: zero the spectrogram dither so the mel is reproducible.
     m.preprocessor.featurizer.dither = 0.0
 
+    # Optional: switch to NeMo local (Longformer) attention so the dumped
+    # baseline anchors the C++ banded-attention path. Must run BEFORE the hooks
+    # below, since change_attention_model swaps the pos_enc and self_attn modules
+    # (a hook registered on the old module would never fire). Mirrors the v3
+    # model card's long-audio recipe.
+    if args.att_context_size is not None:
+        w = args.att_context_size
+        m.change_attention_model("rel_pos_local_attn", [w, w])
+        try:
+            m.change_subsampling_conv_chunking_factor(1)
+        except Exception as e:  # pragma: no cover - older NeMo without the API
+            print(f"note: change_subsampling_conv_chunking_factor skipped: {e}",
+                  file=sys.stderr)
+        print(f"local attention: rel_pos_local_attn att_context_size=[{w},{w}]",
+              file=sys.stderr)
+
     # --timestamps: dump per-token/word timestamps + max_prob confidence for both
     # heads, then return. Kept as a separate early path so the encoder-stage
     # baseline behaviour below is completely untouched.

diff --git a/src/backend.cpp b/src/backend.cpp
@@ -16,10 +16,19 @@
 
 namespace pk {
 
+// Size in bytes of the gallocr buffer as of the most recent single-backend
+// (CPU) compute. Exposed so tests can assert that attention memory scales as
+// O(T*window) rather than O(T^2).
+static size_t g_last_graph_alloc_bytes = 0;
+
+size_t last_graph_alloc_bytes() {
+    return g_last_graph_alloc_bytes;
+}
+
 namespace {
 // Number of graph nodes the metadata context must hold. The biggest single
-// graph today is a streaming conformer layer (~150 nodes); leave generous head
-// room for Task 2's fused encoder (~85 layers worth of ops in one graph).
+// graph today is the fused encoder. Banded local attention is built with the
+// chunk-matmul construction, whose node count does not grow with the window
+// (see local_attn_window in encoder.cpp), so it stays within this budget even
+// at NeMo's full [128,128]. The older pad-and-shift construction added
+// O(window) ops per layer (~6*(2W+1) nodes) and needed a capped window. Do not
+// bump this globally: it regresses small models (~+22% on tdt_ctc-110m)
+// because the per-compute context + graph hash-set scale with kGraphSize.
 constexpr size_t kGraphSize = 16384;
 
 struct PendingInput {
@@ -242,6 +251,7 @@ bool Backend::compute(const std::function<ggml_tensor*(ggml_context*)>& build,
         }
         alloc_ok = ggml_gallocr_alloc_graph(impl_->galloc, gf);
         if (!alloc_ok) PK_LOG("Backend::compute: ggml_gallocr_alloc_graph failed");
+        else g_last_graph_alloc_bytes = ggml_gallocr_get_buffer_size(impl_->galloc, 0);
     }
     if (!alloc_ok) {
         impl_->pending.clear();

diff --git a/src/conformer.cpp b/src/conformer.cpp
@@ -286,11 +286,14 @@ ggml_tensor* ConformerLayer::build_graph_batched(ggml_context* ctx,
                                                  ggml_tensor* xt, int T, int B,
                                                  ggml_tensor* pe, int pos_len,
                                                  const std::vector<int>& valid_len,
-                                                 GraphInputPool& pool) const {
+                                                 GraphInputPool& pool,
+                                                 int att_left, int att_right) const {
     const int D  = d_model_;
     const int K  = conv_kernel_;
     const float ln_eps = 1e-5f;            // LayerNorm eps (NeMo nn.LayerNorm default)
-    assert(pos_len == 2 * T - 1);
+    const bool local_attn = att_left >= 0;
+    assert(local_attn ? (pos_len == att_left + att_right + 1)
+                      : (pos_len == 2 * T - 1));
 
     const std::string pre = "encoder.layers." + std::to_string(layer_idx_) + ".";
     const ModelLoader& ml = ml_;
@@ -332,8 +335,10 @@ ggml_tensor* ConformerLayer::build_graph_batched(ggml_context* ctx,
     // === Stage B: r = r + self_attn(norm_self_att(r)). ===
     ggml_tensor* attn_in = layer_norm(r, "norm_self_att");
     RelPosAttention attn(ml_, layer_idx_);
-    ggml_tensor* attn_out = attn.build_graph_batched(ctx, attn_in, T, B, pe,
-                                             pos_len, valid_len, pool); // [D, T, B]
+    ggml_tensor* attn_out = local_attn
+        ? attn.build_graph_batched_local_chunked(ctx, attn_in, T, B, pe, pos_len, valid_len,
+                                                 att_left, att_right, pool)    // [D, T, B]
+        : attn.build_graph_batched(ctx, attn_in, T, B, pe, pos_len, valid_len, pool);
     r = ggml_add(ctx, r, attn_out);
 
     // === Stage C: r = r + conv(norm_conv(r)). ===
@@ -359,11 +364,14 @@ ggml_tensor* ConformerLayer::build_graph_batched(ggml_context* ctx,
 ggml_tensor* ConformerLayer::build_graph(ggml_context* ctx, ggml_tensor* xt,
                                          int T, ggml_tensor* pe, int pos_len,
                                          int valid_len,
-                                         GraphInputPool& pool) const {
+                                         GraphInputPool& pool,
+                                         int att_left, int att_right) const {
     const int D  = d_model_;
     const int K  = conv_kernel_;
     const float ln_eps = 1e-5f;            // LayerNorm eps (NeMo nn.LayerNorm default)
-    assert(pos_len == 2 * T - 1);
+    const bool local_attn = att_left >= 0;
+    assert(local_attn ? (pos_len == att_left + att_right + 1)
+                      : (pos_len == 2 * T - 1));
 
     const std::string pre = "encoder.layers." + std::to_string(layer_idx_) + ".";
     const ModelLoader& ml = ml_;
@@ -404,8 +412,10 @@ ggml_tensor* ConformerLayer::build_graph(ggml_context* ctx, ggml_tensor* xt,
     // === Stage B: r = r + self_attn(norm_self_att(r)). ===
     ggml_tensor* attn_in = layer_norm(r, "norm_self_att");
     RelPosAttention attn(ml_, layer_idx_);
-    ggml_tensor* attn_out = attn.build_graph(ctx, attn_in, T, pe, pos_len,
-                                             valid_len, pool); // [D, T]
+    ggml_tensor* attn_out = local_attn
+        ? attn.build_graph_local_chunked(ctx, attn_in, T, pe, pos_len, valid_len,
+                                         att_left, att_right, pool)    // [D, T]
+        : attn.build_graph(ctx, attn_in, T, pe, pos_len, valid_len, pool); // [D, T]
     r = ggml_add(ctx, r, attn_out);
 
     // === Stage C: r = r + conv(norm_conv(r)). ===

diff --git a/src/conformer.hpp b/src/conformer.hpp
@@ -55,16 +55,24 @@ class ConformerLayer {
     // This is the unit reused by the fused encoder AND the unit test; computing
     // the entire layer as ONE sub-graph (vs the old 5 sub-graphs) is what lets
     // the fused encoder be a single graph.
+    // When att_left/att_right >= 0, the self-attention uses NeMo
+    // rel_pos_local_attn (banded, O(T*window)): `pe` must then be the LOCAL
+    // positional encoding [d_model, att_left+att_right+1]. Defaults (-1, -1) keep
+    // full attention with `pe` = [d_model, 2T-1].
     ggml_tensor* build_graph(ggml_context* ctx, ggml_tensor* xt, int T,
                              ggml_tensor* pe, int pos_len, int valid_len,
-                             GraphInputPool& pool) const;
+                             GraphInputPool& pool,
+                             int att_left = -1, int att_right = -1) const;
 
     // Batched GRAPH-BUILDER. `xt` is [D, T, B]; `pe` is [D, pos_len] (shared
     // across the batch). `valid_len` is per item (size B). Returns [D, T, B].
+    // att_left/att_right >= 0 routes self-attention to banded local attention
+    // (pe = LOCAL [d_model, att_left+att_right+1]); defaults (-1,-1) = full.
     ggml_tensor* build_graph_batched(ggml_context* ctx, ggml_tensor* xt, int T,
                                      int B, ggml_tensor* pe, int pos_len,
                                      const std::vector<int>& valid_len,
-                                     GraphInputPool& pool) const;
+                                     GraphInputPool& pool,
+                                     int att_left = -1, int att_right = -1) const;
 
     // x: [T, d_model]; pos_emb: [pos_len=2T-1, d_model]; out: [T, d_model].
     void forward(const std::vector<float>& x, int T,

diff --git a/src/encoder.cpp b/src/encoder.cpp
@@ -8,10 +8,32 @@
 #include "ggml.h"
 #include <cassert>
 #include <cmath>
+#include <cstdlib>
 #include <vector>
 
 namespace pk {
 
+// Decide the self-attention window for an encoder of Tp frames. Returns W>0 to
+// use NeMo rel_pos_local_attn [W,W] (banded, O(T*window)); -1 for full attention.
+//
+// PARAKEET_ATT_CONTEXT overrides the automatic choice: a value <= 0 forces full
+// attention, a positive value forces that window (clamped to kMaxLocalWindow).
+// A value with no leading integer (empty or non-numeric) is ignored and the
+// automatic choice applies, so a typo cannot silently switch long audio back
+// to O(T^2) attention.
+//
+// The attention uses the chunk-matmul banded path (build_graph_local_chunked),
+// which emits O(1) graph nodes regardless of window, so W can go to NeMo's full
+// [128,128] without overflowing the metadata-context budget (backend.cpp
+// kGraphSize). (The older pad-and-shift path emitted ~6*(2W+1) nodes/layer,
+// which is why this was capped at 32.)
+static int local_attn_window(int Tp) {
+    constexpr int kMaxLocalWindow = 128;
+    if (const char* e = std::getenv("PARAKEET_ATT_CONTEXT")) {
+        // strtol rather than atoi: `end` tells us whether any digits were read,
+        // and out-of-range input saturates instead of being undefined.
+        char* end = nullptr;
+        const long w = std::strtol(e, &end, 10);
+        if (end != e) {
+            if (w <= 0) return -1;             // 0 / negative -> force full attention
+            return w > kMaxLocalWindow ? kMaxLocalWindow : static_cast<int>(w);
+        }
+        // No integer parsed: fall through to the automatic choice below.
+    }
+    // Auto: long audio (~>11 min at 8x subsampling) switches to local attention
+    // so full O(T^2) attention can't OOM the device.
+    constexpr int kLocalThreshold = 8192;
+    return Tp > kLocalThreshold ? kMaxLocalWindow : -1;
+}
+
 Encoder::Encoder(const ModelLoader& ml)
     : ml_(ml) {
     d_model_  = (int)ml.config().d_model;
@@ -60,18 +82,24 @@ void Encoder::forward_capture(const std::vector<float>& mel, int n_mels, int T,
                 x = ggml_scale(ctx, x, std::sqrt((float)d_model_));
             }
 
-            // ---- 3. Relative positional encoding pos_emb [d_model, 2T'-1]. ----
-            const int pos_len = 2 * Tp - 1;
+            // ---- 3. Positional encoding. Long audio uses NeMo
+            //         rel_pos_local_attn (banded, O(T*window)) so attention can't
+            //         OOM; short audio keeps full attention (NeMo-exact). ----
+            const int att_w   = local_attn_window(Tp);
+            const bool local  = att_w > 0;
+            const int pos_len = local ? (2 * att_w + 1) : (2 * Tp - 1);
             std::vector<float>& pe_host = pool.alloc_f32();
-            rel_pos_encoding(Tp, d_model_, pe_host); // row-major [pos_len, d_model]
+            if (local) local_rel_pos_encoding(att_w, att_w, d_model_, pe_host);
+            else       rel_pos_encoding(Tp, d_model_, pe_host); // [pos_len, d_model]
             int64_t pe_ne[2] = {d_model_, pos_len};
             ggml_tensor* pe = pk::graph_input_tensor(ctx, GGML_TYPE_F32, 2, pe_ne,
                                   pe_host.data(), pe_host.size() * sizeof(float));
 
             // ---- 4. Conformer layer stack (all in-graph). ----
             for (int i = 0; i < n_layers_; ++i) {
                 ConformerLayer layer(ml_, i);
-                x = layer.build_graph(ctx, x, Tp, pe, pos_len, valid_len, pool);
+                x = layer.build_graph(ctx, x, Tp, pe, pos_len, valid_len, pool,
+                                      local ? att_w : -1, local ? att_w : -1);
                 // Capture requested layer outputs from the SAME graph (row-major
                 // [T', d_model], matching the layer output orientation).
                 for (size_t c = 0; c < capture_layers.size(); ++c) {
@@ -122,18 +150,22 @@ void Encoder::forward_batch(const MelBatch& mels,
             // ---- 2. xscaling (gated; off for this model). ----
             if (xscaling_) x = ggml_scale(ctx, x, std::sqrt((float)d_model_));
 
-            // ---- 3. Relative positional encoding pos_emb [d_model, 2T'-1]. ----
-            const int pos_len = 2 * Tp - 1;
+            // ---- 3. Positional encoding (local for long audio; see B=1 path). ----
+            const int att_w   = local_attn_window(Tp);
+            const bool local  = att_w > 0;
+            const int pos_len = local ? (2 * att_w + 1) : (2 * Tp - 1);
             std::vector<float>& pe_host = pool.alloc_f32();
-            rel_pos_encoding(Tp, d_model_, pe_host); // row-major [pos_len, d_model]
+            if (local) local_rel_pos_encoding(att_w, att_w, d_model_, pe_host);
+            else       rel_pos_encoding(Tp, d_model_, pe_host); // [pos_len, d_model]
             int64_t pe_ne[2] = {d_model_, pos_len};
             ggml_tensor* pe = pk::graph_input_tensor(ctx, GGML_TYPE_F32, 2, pe_ne,
                                   pe_host.data(), pe_host.size() * sizeof(float));
 
             // ---- 4. Conformer layer stack (all in-graph, shared pe). ----
             for (int i = 0; i < n_layers_; ++i) {
                 ConformerLayer layer(ml_, i);
-                x = layer.build_graph_batched(ctx, x, Tp, mels.B, pe, pos_len, vout, pool);
+                x = layer.build_graph_batched(ctx, x, Tp, mels.B, pe, pos_len, vout, pool,
+                                              local ? att_w : -1, local ? att_w : -1);
             }
             return x; // [d_model, Tp, B]
         }, flat);