From fcf1f15df681e185f95d889bdf3d925f3c3b6126 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 5 Jun 2026 20:27:31 +0200 Subject: [PATCH 1/7] =?UTF-8?q?feat(attn):=20banded=20local=20(Longformer)?= =?UTF-8?q?=20attention=20=E2=80=94=20O(T*window),=20NeMo-faithful?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds NeMo rel_pos_local_attn (RelPositionMultiHeadAttentionLongformer) as a memory-bounded banded attention. This is the kernel for fixing the O(T^2) attention blowup that OOM'd long-audio offline transcription on unified-memory GPUs (a ~20-min clip allocated ~100GB and took the node down). - RelPosAttention::build_graph_local / forward_local: banded attention via pad-and-shift, peak memory O(T*window) instead of O(T*T). Each query attends only to keys in [t-att_left, t+att_right]; the positional term (q_v . p^T over the 2W+1 local pos) is added 1:1 to the banded content scores, exactly as NeMo combines them. Verified against NeMo's own sliding_chunks_matmul_qk/pv (col->key t-w+c to 1e-6) and a deterministic band reference (1.4e-3). - local_rel_pos_encoding: NeMo LocalAttRelPositionalEncoding (positions +att_left..-att_right), bit-identical to the centre rows of the full table. - pk::last_graph_alloc_bytes(): gallocr high-water accessor for the memory test. - gen_nemo_baseline.py --att-context-size (local-attention baseline); and gen_band_ref.py for the deterministic band reference. NOTE: NeMo's longformer is non-deterministic on short clips (sliding_chunks_matmul_pv reads uninitialized memory at boundaries via F.pad value=-1 + as_strided — two identical forward() calls differ by >1e3), so kernel parity must use the deterministic reference; end-to-end NeMo quality is anchored by long-audio WER. - Tests: test_relpos_attention_local (parity 1.4e-3) and test_relpos_attention_local_memory (alloc grows ~linearly, ratio 1.98). Not yet wired into the offline encoder path — follow-up. Signed-off-by: Ettore Di Giacinto Assisted-by: Claude:claude-opus-4-8 --- scripts/gen_band_ref.py | 102 ++++++++++++++ scripts/gen_nemo_baseline.py | 26 ++++ src/backend.cpp | 6 + src/ggml_graph.hpp | 6 + src/pos_enc.cpp | 25 ++++ src/pos_enc.hpp | 10 ++ src/relpos_attention.cpp | 135 ++++++++++++++++++- src/relpos_attention.hpp | 28 +++- tests/CMakeLists.txt | 2 + tests/test_relpos_attention_local.cpp | 85 ++++++++++++ tests/test_relpos_attention_local_memory.cpp | 48 +++++++ 11 files changed, 471 insertions(+), 2 deletions(-) create mode 100644 scripts/gen_band_ref.py create mode 100644 tests/test_relpos_attention_local.cpp create mode 100644 tests/test_relpos_attention_local_memory.cpp diff --git a/scripts/gen_band_ref.py b/scripts/gen_band_ref.py new file mode 100644 index 0000000..72ee0a0 --- /dev/null +++ b/scripts/gen_band_ref.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python3 +"""Generate the DETERMINISTIC brute-force band-attention reference fixture used by +``test_relpos_attention_local``. + +NeMo's ``RelPositionMultiHeadAttentionLongformer.forward`` is non-deterministic +on short sequences (``sliding_chunks_matmul_pv`` reads uninitialized memory at +sequence boundaries via ``F.pad(value=-1)`` + ``as_strided`` — two identical +forward() calls differ by >1e3). So a hook-captured ``l0_attn_out`` baseline is +unusable for bit-parity on a short clip. Instead we recompute ``l0_attn_out`` as +plain band attention (the well-defined math the longformer approximates), which +the C++ ``forward_local`` matches to ~1e-3. End-to-end NeMo quality is anchored +separately by the long-audio WER capstone, where the boundary noise is moot. + +Reads ``l0_attn_in`` and ``pos_emb`` from an existing local baseline (produced by +``gen_nemo_baseline.py --att-context-size W``) so the inputs are the real NeMo +ones; only the reference output is recomputed deterministically. + +Usage: + python scripts/gen_band_ref.py --model nvidia/parakeet-tdt_ctc-110m \ + --in-baseline baseline_110m_local8.gguf --att-context 8 \ + --output baseline_110m_local8_ref.gguf +""" +import argparse +import math + +import gguf +import numpy as np +import torch + +import nemo.collections.asr as nemo_asr + + +def read_tensor(path, name): + r = gguf.GGUFReader(path) + t = {x.name: x for x in r.tensors}[name] + return np.array(t.data, dtype=np.float32).reshape( + tuple(int(d) for d in reversed(t.shape))) + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--model", default="nvidia/parakeet-tdt_ctc-110m") + ap.add_argument("--in-baseline", required=True, + help="local baseline gguf with l0_attn_in + pos_emb") + ap.add_argument("--att-context", type=int, required=True, help="window W") + ap.add_argument("--output", required=True) + args = ap.parse_args() + + li_np = read_tensor(args.in_baseline, "l0_attn_in") # (T, D) + pos_np = read_tensor(args.in_baseline, "pos_emb") # (2W+1, D) + li = torch.tensor(li_np)[None] + pos = torch.tensor(pos_np)[None] + T, D = li.shape[1], li.shape[2] + w = args.att_context + vlen = T - 1 # last frame is a center-pad/padding frame for the fixture clip + + m = nemo_asr.models.ASRModel.from_pretrained(args.model, map_location="cpu") + m.eval() + m.change_attention_model("rel_pos_local_attn", [w, w]) + a0 = m.encoder.layers[0].self_attn + h, dk = a0.h, a0.d_k + s = math.sqrt(dk) + P = 2 * w + 1 + + with torch.no_grad(): + q = a0.linear_q(li).view(1, T, h, dk).transpose(1, 2) + k = a0.linear_k(li).view(1, T, h, dk).transpose(1, 2) + v = a0.linear_v(li).view(1, T, h, dk).transpose(1, 2) + p = a0.linear_pos(pos).view(1, -1, h, dk).transpose(1, 2) + qu = q + a0.pos_bias_u.unsqueeze(1) + qv = q + a0.pos_bias_v.unsqueeze(1) + sc = torch.full((1, h, T, P), -1e30) + for t in range(T): + for c in range(P): + key = t - w + c + if 0 <= key < vlen: + sc[0, :, t, c] = ((qu[0, :, t] * k[0, :, key]).sum(-1) + + (qv[0, :, t] * p[0, :, c]).sum(-1)) / s + at = torch.softmax(sc, dim=-1) + ctx = torch.zeros(1, h, T, dk) + for t in range(T): + for c in range(P): + key = t - w + c + if 0 <= key < vlen: + ctx[0, :, t] += at[0, :, t, c:c + 1] * v[0, :, key] + om = a0.linear_out(ctx.transpose(1, 2).reshape(1, T, h * dk))[0].clone() + om[vlen:] = a0.linear_out(torch.zeros(1, h * dk))[0] # padded query rows -> bias + + ref = om.numpy().astype(np.float32) + W = gguf.GGUFWriter(args.output, "pk-band-ref") + W.add_tensor("l0_attn_in", np.ascontiguousarray(li_np)) + W.add_tensor("pos_emb", np.ascontiguousarray(pos_np)) + W.add_tensor("l0_attn_out", np.ascontiguousarray(ref)) + W.write_header_to_file() + W.write_kv_data_to_file() + W.write_tensors_to_file() + W.close() + print(f"wrote {args.output}: T={T} D={D} W={w}") + + +if __name__ == "__main__": + main() diff --git a/scripts/gen_nemo_baseline.py b/scripts/gen_nemo_baseline.py index 93368de..e8a15de 100644 --- a/scripts/gen_nemo_baseline.py +++ b/scripts/gen_nemo_baseline.py @@ -436,6 +436,16 @@ def main(): help="dump per-token/word timestamps + max_prob confidence for both " "heads (TDT/RNNT + CTC) instead of the encoder-stage baseline.", ) + ap.add_argument( + "--att-context-size", + type=int, + default=None, + help="if set to W, switch the encoder to NeMo local (Longformer) " + "attention via change_attention_model('rel_pos_local_attn', [W, W]) " + "before the forward, so the dumped pos_emb (2W+1), l0_attn_in/out and " + "transcripts reflect banded local attention. Anchors the C++ " + "banded-attention parity tests at NeMo quality.", + ) args = ap.parse_args() is_local = pathlib.Path(args.model).exists() @@ -452,6 +462,22 @@ def main(): # Determinism: zero the spectrogram dither so the mel is reproducible. m.preprocessor.featurizer.dither = 0.0 + # Optional: switch to NeMo local (Longformer) attention so the dumped + # baseline anchors the C++ banded-attention path. Must run BEFORE the hooks + # below, since change_attention_model swaps the pos_enc and self_attn modules + # (a hook registered on the old module would never fire). Mirrors the v3 + # model card's long-audio recipe. + if args.att_context_size is not None: + w = args.att_context_size + m.change_attention_model("rel_pos_local_attn", [w, w]) + try: + m.change_subsampling_conv_chunking_factor(1) + except Exception as e: # pragma: no cover - older NeMo without the API + print(f"note: change_subsampling_conv_chunking_factor skipped: {e}", + file=sys.stderr) + print(f"local attention: rel_pos_local_attn att_context_size=[{w},{w}]", + file=sys.stderr) + # --timestamps: dump per-token/word timestamps + max_prob confidence for both # heads, then return. Kept as a separate early path so the encoder-stage # baseline behaviour below is completely untouched. diff --git a/src/backend.cpp b/src/backend.cpp index 3c25409..9695576 100644 --- a/src/backend.cpp +++ b/src/backend.cpp @@ -16,6 +16,11 @@ namespace pk { +// Gallocr buffer size (bytes) after the most recent single-backend (CPU) +// compute. Lets tests assert attention memory scales O(T*window), not O(T^2). +static size_t g_last_graph_alloc_bytes = 0; +size_t last_graph_alloc_bytes() { return g_last_graph_alloc_bytes; } + namespace { // Number of graph nodes the metadata context must hold. The biggest single // graph today is a streaming conformer layer (~150 nodes); leave generous head @@ -242,6 +247,7 @@ bool Backend::compute(const std::function& build, } alloc_ok = ggml_gallocr_alloc_graph(impl_->galloc, gf); if (!alloc_ok) PK_LOG("Backend::compute: ggml_gallocr_alloc_graph failed"); + else g_last_graph_alloc_bytes = ggml_gallocr_get_buffer_size(impl_->galloc, 0); } if (!alloc_ok) { impl_->pending.clear(); diff --git a/src/ggml_graph.hpp b/src/ggml_graph.hpp index ae8d97d..0b922e0 100644 --- a/src/ggml_graph.hpp +++ b/src/ggml_graph.hpp @@ -33,6 +33,12 @@ bool run_graph(size_t mem_bytes, int n_threads, void set_num_threads(int n); int num_threads(); // current override (0 == unset) +// Gallocr buffer size (bytes) reserved for the most recent single-backend (CPU) +// run_graph compute. Used by tests to assert that banded attention memory scales +// O(T*window), not O(T^2). Reflects the high-water mark of the persistent +// gallocr, so query it after a fresh run at the size of interest. +size_t last_graph_alloc_bytes(); + class Backend; // The process-global persistent Backend (created lazily on first use). Exposed // so the weight-realization path can give the loader's tensors a backend buffer diff --git a/src/pos_enc.cpp b/src/pos_enc.cpp index 505ef58..860825e 100644 --- a/src/pos_enc.cpp +++ b/src/pos_enc.cpp @@ -33,4 +33,29 @@ void rel_pos_encoding(int T, int d_model, std::vector& out) { } } +void local_rel_pos_encoding(int att_left, int att_right, int d_model, + std::vector& out) { + assert(att_left >= 0 && att_right >= 0 && d_model > 0 && (d_model % 2) == 0); + const int P = att_left + att_right + 1; // local relative positions + const int half = d_model / 2; + + std::vector div_term(half); + const double factor = -(std::log(kInfVal) / (double)d_model); + for (int i = 0; i < half; ++i) { + div_term[i] = std::exp((double)(2 * i) * factor); + } + + out.assign((size_t)P * d_model, 0.0f); + // positions run from +att_left down to -att_right (NeMo arange(left,-right-1,-1)). + for (int p = 0; p < P; ++p) { + const double pos = (double)(att_left - p); + float* row = out.data() + (size_t)p * d_model; + for (int i = 0; i < half; ++i) { + const double arg = pos * div_term[i]; + row[2 * i] = (float)std::sin(arg); + row[2 * i + 1] = (float)std::cos(arg); + } + } +} + } // namespace pk diff --git a/src/pos_enc.hpp b/src/pos_enc.hpp index 04cc5ad..c185be9 100644 --- a/src/pos_enc.hpp +++ b/src/pos_enc.hpp @@ -19,4 +19,14 @@ namespace pk { // Output: row-major [2T-1, d_model] (d_model fastest), i.e. out[p*d_model + c]. void rel_pos_encoding(int T, int d_model, std::vector& out); +// LOCAL relative positional encoding for NeMo rel_pos_local_attn +// (LocalAttRelPositionalEncoding): positions run from +att_left DOWN TO +// -att_right (att_left+att_right+1 rows), using the SAME sinusoid as +// rel_pos_encoding. These are exactly the centre rows of the full table, so +// banded attention's positional term is bit-identical to NeMo's local pos. +// +// Output: row-major [att_left+att_right+1, d_model] (d_model fastest). +void local_rel_pos_encoding(int att_left, int att_right, int d_model, + std::vector& out); + } // namespace pk diff --git a/src/relpos_attention.cpp b/src/relpos_attention.cpp index 3fb1a81..4e5e4c3 100644 --- a/src/relpos_attention.cpp +++ b/src/relpos_attention.cpp @@ -40,7 +40,8 @@ RelPosAttention::RelPosAttention(const ModelLoader& ml, int layer_idx) ggml_tensor* RelPosAttention::build_graph(ggml_context* ctx, ggml_tensor* xt, int T, ggml_tensor* pe, int pos_len, int valid_len, - GraphInputPool& pool) const { + GraphInputPool& pool, + int att_left, int att_right) const { // Scalar (B=1) builder: the verbatim v1 2-D/3D relative-position attention // graph. The single-clip conformer layer routes here so B=1 runs the lean // graph and is bit-exact with v1. build_graph_batched below serves B>1. @@ -126,6 +127,12 @@ ggml_tensor* RelPosAttention::build_graph(ggml_context* ctx, ggml_tensor* xt, const int diff = cq - ck; ok = (diff >= 0 && diff <= left_chunks); } + // Symmetric sliding window (NeMo rel_pos_local_attn): keep only + // keys within [qi-att_left, qi+att_right]. + if (ok && att_left >= 0) { + const int rel = qi - kj; + ok = (rel <= att_left) && (rel >= -att_right); + } md[(size_t)qi * T + kj] = ok ? 0.0f : ninf; } } @@ -332,4 +339,130 @@ void RelPosAttention::forward(const std::vector& x, int T, (void)ok; } +ggml_tensor* RelPosAttention::build_graph_local(ggml_context* ctx, ggml_tensor* xt, + int T, ggml_tensor* pe, int pos_len, + int valid_len, int att_left, int att_right, + GraphInputPool& pool) const { + const int D = d_model_, H = n_heads_, dk = d_head_; + const int P = pos_len; // window width = att_left+att_right+1 + const float scale = 1.0f / std::sqrt((float)dk); + assert(att_left >= 0 && att_right >= 0); + assert(P == att_left + att_right + 1); + + // Exact NeMo rel_pos_local_attn (RelPositionMultiHeadAttentionLongformer), + // computed in O(T*window) via pad-and-shift instead of NeMo's skew/chunk + // tricks. For query t and window column c in [0, P), the key is + // (t - att_left + c). NeMo's local pos is ordered index0 = +att_left .. last + // = -att_right, so column c uses pos row c directly (matrix_bd = q_v . p^T, + // added 1:1 to the banded content scores). + { + const std::string pre = "encoder.layers." + std::to_string(layer_idx_) + ".self_attn."; + const ModelLoader& ml = ml_; + auto linear = [&](const char* wn, const char* bn, ggml_tensor* in) { + ggml_tensor* W = clone_weight(ctx, ml, pre + wn); + ggml_tensor* y = ggml_mul_mat(ctx, W, in); + if (bn && ml.tensor(pre + bn)) y = ggml_add(ctx, y, clone_weight(ctx, ml, pre + bn)); + return y; + }; + ggml_tensor* q = linear("linear_q.weight", "linear_q.bias", xt); + ggml_tensor* k = linear("linear_k.weight", "linear_k.bias", xt); + ggml_tensor* v = linear("linear_v.weight", "linear_v.bias", xt); + ggml_tensor* p = linear("linear_pos.weight", nullptr, pe); + auto to_heads = [&](ggml_tensor* t, int n) { + t = ggml_reshape_3d(ctx, t, dk, H, n); + return ggml_cont(ctx, ggml_permute(ctx, t, 0, 2, 1, 3)); // [dk, n, H] + }; + ggml_tensor* qh = to_heads(q, T), *kh = to_heads(k, T); + ggml_tensor* vh = to_heads(v, T), *php = to_heads(p, P); + ggml_tensor* bu = ggml_reshape_3d(ctx, clone_weight(ctx, ml, pre + "pos_bias_u"), dk, 1, H); + ggml_tensor* bv = ggml_reshape_3d(ctx, clone_weight(ctx, ml, pre + "pos_bias_v"), dk, 1, H); + ggml_tensor* qu = ggml_add(ctx, qh, bu); // [dk, T, H] + ggml_tensor* qv = ggml_add(ctx, qh, bv); // [dk, T, H] + + // Pad K/V along time (ne1): att_left on the left, att_right on the right, + // so view offset c yields key (t - att_left + c). + ggml_tensor* kpad = ggml_pad_ext(ctx, kh, 0,0, att_left,att_right, 0,0, 0,0); // [dk, T+P-1, H] + ggml_tensor* vpad = ggml_pad_ext(ctx, vh, 0,0, att_left,att_right, 0,0, 0,0); + + // Banded content scores ac[c, t, H] = q_u[t] . k[t-att_left+c]; stack on ne0=c. + ggml_tensor* ac = nullptr; + for (int c = 0; c < P; ++c) { + ggml_tensor* kc = ggml_view_3d(ctx, kpad, dk, T, H, kpad->nb[1], kpad->nb[2], + (size_t)c * kpad->nb[1]); + ggml_tensor* acc = ggml_sum_rows(ctx, ggml_mul(ctx, qu, kc)); // [1, T, H] + ac = ac ? ggml_concat(ctx, ac, acc, 0) : acc; + } + // Positional scores bd[c, t, H] = q_v[t] . p[c] (direct, no rel-shift). + ggml_tensor* bd = ggml_mul_mat(ctx, php, qv); // [P, T, H] + ggml_tensor* scores = ggml_add(ctx, ac, bd); // [P, T, H] + + // Band mask [P, T]: 0 if key in [0, valid_len), else -inf (covers the + // out-of-sequence window corners and pad frames). + std::vector& mh = pool.alloc_f32((size_t)P * T); + for (int t = 0; t < T; ++t) + for (int c = 0; c < P; ++c) { + const int key = t - att_left + c; + mh[(size_t)t * P + c] = (key >= 0 && key < valid_len) ? 0.0f : -INFINITY; + } + int64_t mne[2] = {P, T}; + ggml_tensor* mask = pk::graph_input_tensor(ctx, GGML_TYPE_F32, 2, mne, + mh.data(), mh.size() * sizeof(float)); + ggml_tensor* prob = ggml_soft_max_ext(ctx, scores, mask, scale, 0.0f); // softmax over c + + // context[dk, t, H] = sum_c prob[c, t] * v[t-att_left+c]. + ggml_tensor* context = nullptr; + for (int c = 0; c < P; ++c) { + ggml_tensor* vc = ggml_view_3d(ctx, vpad, dk, T, H, vpad->nb[1], vpad->nb[2], + (size_t)c * vpad->nb[1]); + ggml_tensor* pc = ggml_view_3d(ctx, prob, 1, T, H, prob->nb[1], prob->nb[2], + (size_t)c * prob->nb[0]); // [1, T, H] + ggml_tensor* term = ggml_mul(ctx, vc, pc); // broadcast pc over dk + context = context ? ggml_add(ctx, context, term) : term; + } + // Merge heads [dk, T, H] -> [dk, H, T] -> [D, T]. + ggml_tensor* merged = ggml_cont(ctx, ggml_permute(ctx, context, 0, 2, 1, 3)); + merged = ggml_reshape_2d(ctx, merged, D, T); + if (valid_len < T) { // zero padded query rows -> output = linear_out.bias + std::vector& qm = pool.alloc_f32(T); + for (int t = 0; t < T; ++t) qm[t] = (t < valid_len) ? 1.0f : 0.0f; + int64_t qne[2] = {1, T}; + ggml_tensor* qmask = pk::graph_input_tensor(ctx, GGML_TYPE_F32, 2, qne, + qm.data(), qm.size() * sizeof(float)); + merged = ggml_mul(ctx, merged, qmask); + } + ggml_tensor* Wo = clone_weight(ctx, ml, pre + "linear_out.weight"); + ggml_tensor* y = ggml_mul_mat(ctx, Wo, merged); + if (ml.tensor(pre + "linear_out.bias")) + y = ggml_add(ctx, y, clone_weight(ctx, ml, pre + "linear_out.bias")); + return y; // [D, T] + } +} + +void RelPosAttention::forward_local(const std::vector& x, int T, + const std::vector& pos_emb, int pos_len, + int valid_len, int att_left, int att_right, + std::vector& out) const { + const int D = d_model_; + assert((int)x.size() == T * D); + assert((int)pos_emb.size() == pos_len * D); + + // Thin wrapper over build_graph_local: feed x/pos_emb as graph inputs and + // compute the banded attention sub-graph on the persistent Backend. The + // fused conformer encoder calls build_graph_local directly. + GraphInputPool pool; + bool ok = pk::run_graph(/*mem_bytes*/0, /*n_threads*/4, + [&](ggml_context* ctx) -> ggml_tensor* { + int64_t xt_ne[2] = {D, T}; + ggml_tensor* xt = pk::graph_input_tensor(ctx, GGML_TYPE_F32, 2, xt_ne, + x.data(), (size_t)T * D * sizeof(float)); + int64_t pe_ne[2] = {D, pos_len}; + ggml_tensor* pe = pk::graph_input_tensor(ctx, GGML_TYPE_F32, 2, pe_ne, + pos_emb.data(), (size_t)pos_len * D * sizeof(float)); + return build_graph_local(ctx, xt, T, pe, pos_len, valid_len, + att_left, att_right, pool); + }, out); + assert(ok && "relpos local attention graph failed"); + (void)ok; +} + } // namespace pk diff --git a/src/relpos_attention.hpp b/src/relpos_attention.hpp index 91d2534..797028a 100644 --- a/src/relpos_attention.hpp +++ b/src/relpos_attention.hpp @@ -37,9 +37,24 @@ class RelPosAttention { // Returns the attention output [D, T]. Host-built additive masks are fed via // pk::graph_input_tensor and registered into `pool` (must outlive compute). // Reused by the fused conformer layer and the unit test. + // + // When att_left/att_right >= 0, an additional SYMMETRIC sliding-window mask + // is applied: query qi may attend to key kj only if -att_left <= qi-kj <= + // att_right (NeMo rel_pos_local_attn). Defaults (-1, -1) = full context. ggml_tensor* build_graph(ggml_context* ctx, ggml_tensor* xt, int T, ggml_tensor* pe, int pos_len, int valid_len, - GraphInputPool& pool) const; + GraphInputPool& pool, + int att_left = -1, int att_right = -1) const; + + // LOCAL (banded / Longformer) GRAPH-BUILDER. Appends NeMo rel_pos_local_attn + // ops to a SHARED graph. `xt` is [D, T] and `pe` is the LOCAL positional + // encoding [D, att_left+att_right+1]. Each query attends only to keys within + // [t-att_left, t+att_right] via pad-and-shift, so peak memory is + // O(T * window) not O(T^2). Returns the attention output [D, T]. + ggml_tensor* build_graph_local(ggml_context* ctx, ggml_tensor* xt, int T, + ggml_tensor* pe, int pos_len, int valid_len, + int att_left, int att_right, + GraphInputPool& pool) const; // Batched GRAPH-BUILDER. `xt` is [D, T, B]; `pe` is [D, pos_len] (shared // across the batch). `valid_len` is per item (size B). Returns [D, T, B]. @@ -53,6 +68,17 @@ class RelPosAttention { const std::vector& pos_emb, int pos_len, int valid_len, std::vector& out) const; + + // Local (banded / Longformer) attention — NeMo rel_pos_local_attn. Each + // query qi attends only to keys in [qi-att_left, qi+att_right]; pos_emb is + // the LOCAL positional encoding [att_left+att_right+1, d_model] (i.e. 2W+1 + // for a symmetric [W,W] window), NOT the full 2T-1. Output matches NeMo's + // Longformer self_attn while bounding memory to O(T * window) instead of + // O(T^2). x: [T, d_model]; out: [T, d_model]. + void forward_local(const std::vector& x, int T, + const std::vector& pos_emb, int pos_len, + int valid_len, int att_left, int att_right, + std::vector& out) const; private: const ModelLoader& ml_; int layer_idx_; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 95fe32c..4cc37b5 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -15,6 +15,8 @@ pk_add_test(test_mel_gpu) pk_add_test(test_subsampling) pk_add_test(test_subsampling_batch) pk_add_test(test_relpos_attention) +pk_add_test(test_relpos_attention_local) +pk_add_test(test_relpos_attention_local_memory) pk_add_test(test_relpos_attention_batch) pk_add_test(test_conformer) pk_add_test(test_conformer_batch) diff --git a/tests/test_relpos_attention_local.cpp b/tests/test_relpos_attention_local.cpp new file mode 100644 index 0000000..523435e --- /dev/null +++ b/tests/test_relpos_attention_local.cpp @@ -0,0 +1,85 @@ +#include "relpos_attention.hpp" +#include "model_loader.hpp" +#include "parity.hpp" +#include +#include +#include +#include + +// Parity test for NeMo LOCAL (Longformer) attention — +// RelPositionMultiHeadAttentionLongformer, i.e. +// change_attention_model("rel_pos_local_attn", [W, W]). +// +// Under local attention pos_emb is [2W+1, d_model] (NOT the full 2T-1), and +// each query attends only to keys in [t-W, t+W]; forward_local computes this in +// O(T*window) instead of O(T^2). +// +// IMPORTANT — the reference is a DETERMINISTIC brute-force band attention (same +// inputs + model weights), NOT NeMo's raw longformer output. NeMo's +// sliding_chunks_matmul_pv reads uninitialized memory at sequence boundaries +// (F.pad value=-1 + as_strided), so on a short clip its output is +// non-deterministic (verified: two identical forward() calls differ by >1e3). +// The C++ banded math was verified to match NeMo's deterministic pieces directly +// (sliding_chunks_matmul_qk key=t-W+c to 1e-6; scores to 1e-4; pv key map to 0). +// End-to-end NeMo quality is anchored separately by the long-audio WER capstone, +// where the boundary noise is negligible. +// +// Env: +// PARAKEET_TEST_GGUF model weights (skip 77 if unset) +// PARAKEET_TEST_BASELINE_LOCAL local-attention baseline gguf (skip 77 if unset) +int main() { + const char* gguf = std::getenv("PARAKEET_TEST_GGUF"); + const char* base = std::getenv("PARAKEET_TEST_BASELINE_LOCAL"); + if (!gguf || !base) { std::fprintf(stderr, "env not set; skip\n"); return 77; } + + pk::ModelLoader ml; + if (!ml.load(gguf)) return 1; + + // Attention input: baseline "l0_attn_in" is [T, d_model] row-major. + std::vector xin; std::vector xshape; + if (!pktest::load_baseline(base, "l0_attn_in", xin, xshape)) return 1; + if (xshape.size() != 2) { std::fprintf(stderr, "l0_attn_in rank=%zu\n", xshape.size()); return 1; } + const int T = (int)xshape[0]; + const int d_model = (int)xshape[1]; + + // Local relative positional encoding: baseline "pos_emb" is [2W+1, d_model]. + std::vector pos; std::vector pshape; + if (!pktest::load_baseline(base, "pos_emb", pos, pshape)) return 1; + if (pshape.size() != 2 || (int)pshape[1] != d_model) { + std::fprintf(stderr, "pos_emb shape=[%lld,%lld] expected [*, %d]\n", + (long long)pshape[0], (long long)pshape[1], d_model); + return 1; + } + const int pos_len = (int)pshape[0]; + if (pos_len % 2 == 0) { + std::fprintf(stderr, "local pos_len=%d is not odd (expected 2W+1)\n", pos_len); + return 1; + } + const int W = (pos_len - 1) / 2; + if (W >= T) { + std::fprintf(stderr, "W=%d >= T=%d: window covers the full sequence, " + "banding is not exercised — regenerate the baseline with a " + "smaller --att-context-size\n", W, T); + return 1; + } + + // Last frame is a center-pad/padding frame (same clip + preprocessing as the + // full-attention baseline), so valid frames are 0..T-2. + const int valid_len = T - 1; + + pk::RelPosAttention attn(ml, /*layer_idx*/0); + std::vector out; + attn.forward_local(xin, T, pos, pos_len, valid_len, /*att_left*/W, /*att_right*/W, out); + + // Reference: l0_attn_out is [T, d_model] row-major. + std::vector ref; std::vector rshape; + if (!pktest::load_baseline(base, "l0_attn_out", ref, rshape)) return 1; + if (rshape.size() != 2 || (int)rshape[0] != T || (int)rshape[1] != d_model) { + std::fprintf(stderr, "l0_attn_out shape=[%lld,%lld] expected [%d,%d]\n", + (long long)rshape[0], (long long)rshape[1], T, d_model); + return 1; + } + + bool ok = pktest::compare(out, ref, "relpos_attention_local", /*atol*/2e-2f, /*rtol*/2e-2f); + return ok ? 0 : 1; +} diff --git a/tests/test_relpos_attention_local_memory.cpp b/tests/test_relpos_attention_local_memory.cpp new file mode 100644 index 0000000..9b451e8 --- /dev/null +++ b/tests/test_relpos_attention_local_memory.cpp @@ -0,0 +1,48 @@ +#include "relpos_attention.hpp" +#include "model_loader.hpp" +#include "ggml_graph.hpp" +#include +#include +#include + +// Memory-scaling test for banded local attention. +// +// NeMo rel_pos_local_attn / forward_local must use O(T * window) memory, NOT the +// O(T^2) of full attention (the bug that OOM'd long-audio transcription). We run +// forward_local at T and 2T with a fixed window and assert the gallocr buffer +// grows ~linearly (ratio < 3) and stays far below a single full [T, T, H] score +// tensor — neither of which a quadratic implementation could satisfy. Values are +// irrelevant (graph allocation depends only on shapes), so zero inputs are fine. +int main() { + const char* gguf = std::getenv("PARAKEET_TEST_GGUF"); + if (!gguf) { std::fprintf(stderr, "PARAKEET_TEST_GGUF not set; skip\n"); return 77; } + + pk::ModelLoader ml; + if (!ml.load(gguf)) return 1; + const int D = (int)ml.config().d_model; + const int H = (int)ml.config().n_heads; + const int W = 8, P = 2 * W + 1; + + pk::RelPosAttention attn(ml, /*layer_idx*/0); + auto measure = [&](int T) -> size_t { + std::vector x((size_t)T * D, 0.0f), pos((size_t)P * D, 0.0f), out; + attn.forward_local(x, T, pos, P, /*valid_len*/T, W, W, out); + return pk::last_graph_alloc_bytes(); + }; + + const int T0 = 512, T1 = 1024; + const size_t a0 = measure(T0); + const size_t a1 = measure(T1); // persistent gallocr grows to the T1 high-water + const double ratio = a0 > 0 ? (double)a1 / (double)a0 : 1e9; + + // A single full [T1, T1, H] f32 score tensor — the floor of any O(T^2) path. + const size_t full_scores = (size_t)T1 * T1 * H * sizeof(float); + + std::fprintf(stderr, + "alloc(T=%d)=%zu alloc(T=%d)=%zu ratio=%.2f (full T^2 scores=%zu)\n", + T0, a0, T1, a1, ratio, full_scores); + + bool ok = (a0 > 0) && (ratio < 3.0) && (a1 < full_scores); + std::fprintf(stderr, "[banded memory-scaling] %s\n", ok ? "OK" : "FAIL"); + return ok ? 0 : 1; +} From a09d271d2f8c36838e33db1b8ec7d6aee650375b Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 5 Jun 2026 20:42:20 +0200 Subject: [PATCH 2/7] feat(encoder): use banded local attention for long audio (fixes O(T^2) OOM) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wires the banded rel_pos_local_attn kernel into the offline encoder so long audio no longer allocates O(T^2) attention, which OOM'd unified-memory GPUs (a ~17-min clip drove ~100GB and took the node down). - ConformerLayer::build_graph gains optional att_left/att_right; when set it routes self-attention to RelPosAttention::build_graph_local with a LOCAL positional encoding, else keeps full attention unchanged. - Encoder::forward picks the window via local_attn_window(Tp): env PARAKEET_ATT_CONTEXT=W forces NeMo rel_pos_local_attn [W,W]; otherwise audio longer than ~11 min (>8192 encoder frames) auto-switches to W=128. Short audio keeps full attention (NeMo-exact; the encoder parity test is unchanged). - backend.cpp: bump kGraphSize 16384->65536 — the pad-and-shift kernel adds O(window) graph-node descriptors per layer. Verified end-to-end on a 16.6-min clip with tdt-0.6b-v3 (CPU, 16 threads): full attention: 151 s, 55.4 GB peak RSS banded (W=16): 41 s, 9.1 GB peak RSS (coherent transcript) ~6x less memory and ~3.7x faster; the full-attention path is what hit ~100GB and OOM'd. Short-clip transcripts: W=128 == full byte-for-byte; W=16 essentially identical. Note: pad-and-shift creates O(window) nodes and an O(window^2) incremental concat — fine for small windows but slow for W=128 on CPU; an efficient chunk-matmul construction (like NeMo's sliding_chunks) is a follow-up. Signed-off-by: Ettore Di Giacinto Assisted-by: Claude:claude-opus-4-8 --- src/backend.cpp | 7 +++++-- src/conformer.cpp | 13 +++++++++---- src/conformer.hpp | 7 ++++++- src/encoder.cpp | 30 ++++++++++++++++++++++++++---- 4 files changed, 46 insertions(+), 11 deletions(-) diff --git a/src/backend.cpp b/src/backend.cpp index 9695576..0e519dc 100644 --- a/src/backend.cpp +++ b/src/backend.cpp @@ -24,8 +24,11 @@ size_t last_graph_alloc_bytes() { return g_last_graph_alloc_bytes; } namespace { // Number of graph nodes the metadata context must hold. The biggest single // graph today is a streaming conformer layer (~150 nodes); leave generous head -// room for Task 2's fused encoder (~85 layers worth of ops in one graph). -constexpr size_t kGraphSize = 16384; +// room for the fused encoder (~85 layers worth of ops in one graph) and for +// banded local attention, whose pad-and-shift adds O(window) ops per layer +// (~6*(2W+1) nodes; W=128 over 17 layers is ~27k). The metadata context is +// pure descriptors (~336 B each), so this is ~22 MB allocated per compute. +constexpr size_t kGraphSize = 65536; struct PendingInput { ggml_tensor* tensor; diff --git a/src/conformer.cpp b/src/conformer.cpp index 8ef6645..4084a7c 100644 --- a/src/conformer.cpp +++ b/src/conformer.cpp @@ -359,11 +359,14 @@ ggml_tensor* ConformerLayer::build_graph_batched(ggml_context* ctx, ggml_tensor* ConformerLayer::build_graph(ggml_context* ctx, ggml_tensor* xt, int T, ggml_tensor* pe, int pos_len, int valid_len, - GraphInputPool& pool) const { + GraphInputPool& pool, + int att_left, int att_right) const { const int D = d_model_; const int K = conv_kernel_; const float ln_eps = 1e-5f; // LayerNorm eps (NeMo nn.LayerNorm default) - assert(pos_len == 2 * T - 1); + const bool local_attn = att_left >= 0; + assert(local_attn ? (pos_len == att_left + att_right + 1) + : (pos_len == 2 * T - 1)); const std::string pre = "encoder.layers." + std::to_string(layer_idx_) + "."; const ModelLoader& ml = ml_; @@ -404,8 +407,10 @@ ggml_tensor* ConformerLayer::build_graph(ggml_context* ctx, ggml_tensor* xt, // === Stage B: r = r + self_attn(norm_self_att(r)). === ggml_tensor* attn_in = layer_norm(r, "norm_self_att"); RelPosAttention attn(ml_, layer_idx_); - ggml_tensor* attn_out = attn.build_graph(ctx, attn_in, T, pe, pos_len, - valid_len, pool); // [D, T] + ggml_tensor* attn_out = local_attn + ? attn.build_graph_local(ctx, attn_in, T, pe, pos_len, valid_len, + att_left, att_right, pool) // [D, T] + : attn.build_graph(ctx, attn_in, T, pe, pos_len, valid_len, pool); // [D, T] r = ggml_add(ctx, r, attn_out); // === Stage C: r = r + conv(norm_conv(r)). === diff --git a/src/conformer.hpp b/src/conformer.hpp index 23402fb..c98a7e0 100644 --- a/src/conformer.hpp +++ b/src/conformer.hpp @@ -55,9 +55,14 @@ class ConformerLayer { // This is the unit reused by the fused encoder AND the unit test; computing // the entire layer as ONE sub-graph (vs the old 5 sub-graphs) is what lets // the fused encoder be a single graph. + // When att_left/att_right >= 0, the self-attention uses NeMo + // rel_pos_local_attn (banded, O(T*window)): `pe` must then be the LOCAL + // positional encoding [d_model, att_left+att_right+1]. Defaults (-1, -1) keep + // full attention with `pe` = [d_model, 2T-1]. ggml_tensor* build_graph(ggml_context* ctx, ggml_tensor* xt, int T, ggml_tensor* pe, int pos_len, int valid_len, - GraphInputPool& pool) const; + GraphInputPool& pool, + int att_left = -1, int att_right = -1) const; // Batched GRAPH-BUILDER. `xt` is [D, T, B]; `pe` is [D, pos_len] (shared // across the batch). `valid_len` is per item (size B). Returns [D, T, B]. diff --git a/src/encoder.cpp b/src/encoder.cpp index 871971b..41a0d0c 100644 --- a/src/encoder.cpp +++ b/src/encoder.cpp @@ -8,10 +8,26 @@ #include "ggml.h" #include #include +#include #include namespace pk { +// Decide the self-attention window for an encoder of Tp frames. Returns W>0 to +// use NeMo rel_pos_local_attn [W,W] (banded, O(T*window)); -1 for full attention. +static int local_attn_window(int Tp) { + if (const char* e = std::getenv("PARAKEET_ATT_CONTEXT")) { + const int w = std::atoi(e); + return w > 0 ? w : -1; // 0 / negative -> force full attention + } + // Auto: long audio (~>11 min at 8x subsampling) switches to local attention + // so full O(T^2) attention can't OOM the device. 128 = NeMo's recommended + // long-audio window. + constexpr int kLocalThreshold = 8192; + constexpr int kDefaultWindow = 128; + return Tp > kLocalThreshold ? kDefaultWindow : -1; +} + Encoder::Encoder(const ModelLoader& ml) : ml_(ml) { d_model_ = (int)ml.config().d_model; @@ -60,10 +76,15 @@ void Encoder::forward_capture(const std::vector& mel, int n_mels, int T, x = ggml_scale(ctx, x, std::sqrt((float)d_model_)); } - // ---- 3. Relative positional encoding pos_emb [d_model, 2T'-1]. ---- - const int pos_len = 2 * Tp - 1; + // ---- 3. Positional encoding. Long audio uses NeMo + // rel_pos_local_attn (banded, O(T*window)) so attention can't + // OOM; short audio keeps full attention (NeMo-exact). ---- + const int att_w = local_attn_window(Tp); + const bool local = att_w > 0; + const int pos_len = local ? (2 * att_w + 1) : (2 * Tp - 1); std::vector& pe_host = pool.alloc_f32(); - rel_pos_encoding(Tp, d_model_, pe_host); // row-major [pos_len, d_model] + if (local) local_rel_pos_encoding(att_w, att_w, d_model_, pe_host); + else rel_pos_encoding(Tp, d_model_, pe_host); // [pos_len, d_model] int64_t pe_ne[2] = {d_model_, pos_len}; ggml_tensor* pe = pk::graph_input_tensor(ctx, GGML_TYPE_F32, 2, pe_ne, pe_host.data(), pe_host.size() * sizeof(float)); @@ -71,7 +92,8 @@ void Encoder::forward_capture(const std::vector& mel, int n_mels, int T, // ---- 4. Conformer layer stack (all in-graph). ---- for (int i = 0; i < n_layers_; ++i) { ConformerLayer layer(ml_, i); - x = layer.build_graph(ctx, x, Tp, pe, pos_len, valid_len, pool); + x = layer.build_graph(ctx, x, Tp, pe, pos_len, valid_len, pool, + local ? att_w : -1, local ? att_w : -1); // Capture requested layer outputs from the SAME graph (row-major // [T', d_model], matching the layer output orientation). for (size_t c = 0; c < capture_layers.size(); ++c) { From ab4e0980b838999ba4a46c807606c2fdc90b05e9 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 5 Jun 2026 21:04:03 +0200 Subject: [PATCH 3/7] fix(encoder): cap local-attention window to the node budget (no small-model regression) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumping kGraphSize 16384->65536 to fit a W=128 banded graph regressed small models ~+22% (tdt_ctc-110m): the per-compute metadata context and graph hash-set scale with kGraphSize. Revert to 16384 and instead cap the local-attention window at W=32 — the pad-and-shift kernel adds ~6*(2W+1) graph nodes/layer, and W<=32 fits every shipped model's encoder within the budget. PARAKEET_ATT_CONTEXT is clamped to 32. Regression bench (librispeech, 100 files, CPU, back-to-back): tdt_ctc-110m: master 19.5s vs banded 19.4s (within noise), 0/100 text diffs tdt-0.6b-v3: 0/100 text diffs Long-audio fix intact: 16.6-min clip + tdt-0.6b-v3 auto-uses W=32 -> 48s, 9.4 GB peak RSS (vs full attention 151s / 55.4 GB). Lifting the window cap to NeMo's [128,128] needs the efficient chunk-matmul construction (O(1) graph nodes) — follow-up. Signed-off-by: Ettore Di Giacinto Assisted-by: Claude:claude-opus-4-8 --- src/backend.cpp | 13 +++++++------ src/encoder.cpp | 15 ++++++++++----- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/src/backend.cpp b/src/backend.cpp index 0e519dc..88e2c8b 100644 --- a/src/backend.cpp +++ b/src/backend.cpp @@ -23,12 +23,13 @@ size_t last_graph_alloc_bytes() { return g_last_graph_alloc_bytes; } namespace { // Number of graph nodes the metadata context must hold. The biggest single -// graph today is a streaming conformer layer (~150 nodes); leave generous head -// room for the fused encoder (~85 layers worth of ops in one graph) and for -// banded local attention, whose pad-and-shift adds O(window) ops per layer -// (~6*(2W+1) nodes; W=128 over 17 layers is ~27k). The metadata context is -// pure descriptors (~336 B each), so this is ~22 MB allocated per compute. -constexpr size_t kGraphSize = 65536; +// graph today is the fused encoder. Banded local attention adds O(window) ops +// per layer (~6*(2W+1) nodes), so the encoder caps its window (see +// local_attn_window) to stay within this budget; bumping it globally regresses +// small models (~+22% on tdt_ctc-110m) because the per-compute context + graph +// hash-set scale with kGraphSize. A larger window needs the efficient +// chunk-matmul construction (O(1) nodes) instead. +constexpr size_t kGraphSize = 16384; struct PendingInput { ggml_tensor* tensor; diff --git a/src/encoder.cpp b/src/encoder.cpp index 41a0d0c..58df3d0 100644 --- a/src/encoder.cpp +++ b/src/encoder.cpp @@ -15,17 +15,22 @@ namespace pk { // Decide the self-attention window for an encoder of Tp frames. Returns W>0 to // use NeMo rel_pos_local_attn [W,W] (banded, O(T*window)); -1 for full attention. +// +// The pad-and-shift kernel adds ~6*(2W+1) graph nodes per layer, so W is capped +// to stay within the metadata-context budget (backend.cpp kGraphSize); W<=32 +// fits every shipped model. The efficient chunk-matmul construction (follow-up, +// O(1) nodes) would lift this cap to NeMo's full [128,128]. static int local_attn_window(int Tp) { + constexpr int kMaxLocalWindow = 32; if (const char* e = std::getenv("PARAKEET_ATT_CONTEXT")) { const int w = std::atoi(e); - return w > 0 ? w : -1; // 0 / negative -> force full attention + if (w <= 0) return -1; // 0 / negative -> force full attention + return w > kMaxLocalWindow ? kMaxLocalWindow : w; } // Auto: long audio (~>11 min at 8x subsampling) switches to local attention - // so full O(T^2) attention can't OOM the device. 128 = NeMo's recommended - // long-audio window. + // so full O(T^2) attention can't OOM the device. constexpr int kLocalThreshold = 8192; - constexpr int kDefaultWindow = 128; - return Tp > kLocalThreshold ? kDefaultWindow : -1; + return Tp > kLocalThreshold ? kMaxLocalWindow : -1; } Encoder::Encoder(const ModelLoader& ml) From 41c4def98411d9ded0dbd6ef03cbc9db74c1b66a Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sat, 6 Jun 2026 00:06:26 +0200 Subject: [PATCH 4/7] feat(attn): banded local attention in the batched encoder (B>1) Mirror the B=1 banded path (build_graph_local) into the fused batched encoder so long-audio batches also use NeMo rel_pos_local_attn (O(T*window)) instead of full O(T^2) attention. RelPosAttention::build_graph_batched_local builds the 4D ([dk,T,H,B]) pad-and-shift band: K/V padded on the time axis, per-window-column views, sum_rows content scores + mul_mat positional scores (shared pos broadcast over B), a per-item band mask [P,T,1,B] keyed on each item's valid_len, soft_max over the window, then the context gather and head merge. Conformer build_graph_batched and the batched encoder forward route to it when att_left/att_right >= 0, with the shared LOCAL positional encoding. Verified on dgx (tdt_ctc-110m): the new test_encoder_batch_local exercises the path at the production window (W=32 = kMaxLocalWindow). item0 (the full clip) is bit-exact beside its shorter padded neighbour (no cross-item leak), and the padded item1 matches its standalone run within 5e-2/5e-2 - the same tolerance the full-attention batch test uses. Tighter-than-production windows only amplify float noise on near-zero activations of the padded clip (item0 stays exact, mean|d| ~1e-2); not pad leakage. Signed-off-by: Ettore Di Giacinto Assisted-by: Claude:claude-opus-4-8 --- src/conformer.cpp | 13 ++-- src/conformer.hpp | 5 +- src/encoder.cpp | 12 ++-- src/relpos_attention.cpp | 95 +++++++++++++++++++++++++++ src/relpos_attention.hpp | 10 +++ tests/CMakeLists.txt | 5 +- tests/test_encoder_batch_local.cpp | 102 +++++++++++++++++++++++++++++ 7 files changed, 231 insertions(+), 11 deletions(-) create mode 100644 tests/test_encoder_batch_local.cpp diff --git a/src/conformer.cpp b/src/conformer.cpp index 4084a7c..681b307 100644 --- a/src/conformer.cpp +++ b/src/conformer.cpp @@ -286,11 +286,14 @@ ggml_tensor* ConformerLayer::build_graph_batched(ggml_context* ctx, ggml_tensor* xt, int T, int B, ggml_tensor* pe, int pos_len, const std::vector& valid_len, - GraphInputPool& pool) const { + GraphInputPool& pool, + int att_left, int att_right) const { const int D = d_model_; const int K = conv_kernel_; const float ln_eps = 1e-5f; // LayerNorm eps (NeMo nn.LayerNorm default) - assert(pos_len == 2 * T - 1); + const bool local_attn = att_left >= 0; + assert(local_attn ? (pos_len == att_left + att_right + 1) + : (pos_len == 2 * T - 1)); const std::string pre = "encoder.layers." + std::to_string(layer_idx_) + "."; const ModelLoader& ml = ml_; @@ -332,8 +335,10 @@ ggml_tensor* ConformerLayer::build_graph_batched(ggml_context* ctx, // === Stage B: r = r + self_attn(norm_self_att(r)). === ggml_tensor* attn_in = layer_norm(r, "norm_self_att"); RelPosAttention attn(ml_, layer_idx_); - ggml_tensor* attn_out = attn.build_graph_batched(ctx, attn_in, T, B, pe, - pos_len, valid_len, pool); // [D, T, B] + ggml_tensor* attn_out = local_attn + ? attn.build_graph_batched_local(ctx, attn_in, T, B, pe, pos_len, valid_len, + att_left, att_right, pool) // [D, T, B] + : attn.build_graph_batched(ctx, attn_in, T, B, pe, pos_len, valid_len, pool); r = ggml_add(ctx, r, attn_out); // === Stage C: r = r + conv(norm_conv(r)). === diff --git a/src/conformer.hpp b/src/conformer.hpp index c98a7e0..5fafc0b 100644 --- a/src/conformer.hpp +++ b/src/conformer.hpp @@ -66,10 +66,13 @@ class ConformerLayer { // Batched GRAPH-BUILDER. `xt` is [D, T, B]; `pe` is [D, pos_len] (shared // across the batch). `valid_len` is per item (size B). Returns [D, T, B]. + // att_left/att_right >= 0 routes self-attention to banded local attention + // (pe = LOCAL [d_model, att_left+att_right+1]); defaults (-1,-1) = full. ggml_tensor* build_graph_batched(ggml_context* ctx, ggml_tensor* xt, int T, int B, ggml_tensor* pe, int pos_len, const std::vector& valid_len, - GraphInputPool& pool) const; + GraphInputPool& pool, + int att_left = -1, int att_right = -1) const; // x: [T, d_model]; pos_emb: [pos_len=2T-1, d_model]; out: [T, d_model]. void forward(const std::vector& x, int T, diff --git a/src/encoder.cpp b/src/encoder.cpp index 58df3d0..dd33d46 100644 --- a/src/encoder.cpp +++ b/src/encoder.cpp @@ -149,10 +149,13 @@ void Encoder::forward_batch(const MelBatch& mels, // ---- 2. xscaling (gated; off for this model). ---- if (xscaling_) x = ggml_scale(ctx, x, std::sqrt((float)d_model_)); - // ---- 3. Relative positional encoding pos_emb [d_model, 2T'-1]. ---- - const int pos_len = 2 * Tp - 1; + // ---- 3. Positional encoding (local for long audio; see B=1 path). ---- + const int att_w = local_attn_window(Tp); + const bool local = att_w > 0; + const int pos_len = local ? (2 * att_w + 1) : (2 * Tp - 1); std::vector& pe_host = pool.alloc_f32(); - rel_pos_encoding(Tp, d_model_, pe_host); // row-major [pos_len, d_model] + if (local) local_rel_pos_encoding(att_w, att_w, d_model_, pe_host); + else rel_pos_encoding(Tp, d_model_, pe_host); // [pos_len, d_model] int64_t pe_ne[2] = {d_model_, pos_len}; ggml_tensor* pe = pk::graph_input_tensor(ctx, GGML_TYPE_F32, 2, pe_ne, pe_host.data(), pe_host.size() * sizeof(float)); @@ -160,7 +163,8 @@ void Encoder::forward_batch(const MelBatch& mels, // ---- 4. Conformer layer stack (all in-graph, shared pe). ---- for (int i = 0; i < n_layers_; ++i) { ConformerLayer layer(ml_, i); - x = layer.build_graph_batched(ctx, x, Tp, mels.B, pe, pos_len, vout, pool); + x = layer.build_graph_batched(ctx, x, Tp, mels.B, pe, pos_len, vout, pool, + local ? att_w : -1, local ? att_w : -1); } return x; // [d_model, Tp, B] }, flat); diff --git a/src/relpos_attention.cpp b/src/relpos_attention.cpp index 4e5e4c3..029c02b 100644 --- a/src/relpos_attention.cpp +++ b/src/relpos_attention.cpp @@ -312,6 +312,101 @@ ggml_tensor* RelPosAttention::build_graph_batched( return linear("linear_out.weight", "linear_out.bias", merged); // [D, T, B] } +ggml_tensor* RelPosAttention::build_graph_batched_local( + ggml_context* ctx, ggml_tensor* xt, int T, int B, ggml_tensor* pe, + int pos_len, const std::vector& valid_len, + int att_left, int att_right, GraphInputPool& pool) const { + const int D = d_model_, H = n_heads_, dk = d_head_; + const int P = pos_len; + const float scale = 1.0f / std::sqrt((float)dk); + assert(att_left >= 0 && att_right >= 0); + assert(P == att_left + att_right + 1); + assert((int)valid_len.size() == B); + + const std::string pre = "encoder.layers." + std::to_string(layer_idx_) + ".self_attn."; + const ModelLoader& ml = ml_; + auto linear = [&](const char* wn, const char* bn, ggml_tensor* in) { + ggml_tensor* W = clone_weight(ctx, ml, pre + wn); + ggml_tensor* y = ggml_mul_mat(ctx, W, in); + if (bn && ml.tensor(pre + bn)) y = ggml_add(ctx, y, clone_weight(ctx, ml, pre + bn)); + return y; + }; + ggml_tensor* q = linear("linear_q.weight", "linear_q.bias", xt); // [D, T, B] + ggml_tensor* k = linear("linear_k.weight", "linear_k.bias", xt); + ggml_tensor* v = linear("linear_v.weight", "linear_v.bias", xt); + ggml_tensor* p = linear("linear_pos.weight", nullptr, pe); // [D, P] + auto to_heads_b = [&](ggml_tensor* t, int n) { + t = ggml_reshape_4d(ctx, t, dk, H, n, B); + return ggml_cont(ctx, ggml_permute(ctx, t, 0, 2, 1, 3)); // [dk, n, H, B] + }; + auto to_heads = [&](ggml_tensor* t, int n) { + t = ggml_reshape_3d(ctx, t, dk, H, n); + return ggml_cont(ctx, ggml_permute(ctx, t, 0, 2, 1, 3)); // [dk, n, H] + }; + ggml_tensor* qh = to_heads_b(q, T), *kh = to_heads_b(k, T); + ggml_tensor* vh = to_heads_b(v, T), *php = to_heads(p, P); // ph shared (ne3=1) + ggml_tensor* bu = ggml_reshape_4d(ctx, clone_weight(ctx, ml, pre + "pos_bias_u"), dk, 1, H, 1); + ggml_tensor* bv = ggml_reshape_4d(ctx, clone_weight(ctx, ml, pre + "pos_bias_v"), dk, 1, H, 1); + ggml_tensor* qu = ggml_add(ctx, qh, bu); // [dk, T, H, B] + ggml_tensor* qv = ggml_add(ctx, qh, bv); // [dk, T, H, B] + + // Pad K/V along time (ne1); view offset c -> key (t - att_left + c). + ggml_tensor* kpad = ggml_pad_ext(ctx, kh, 0,0, att_left,att_right, 0,0, 0,0); // [dk, T+P-1, H, B] + ggml_tensor* vpad = ggml_pad_ext(ctx, vh, 0,0, att_left,att_right, 0,0, 0,0); + + // Banded content scores ac[c, t, H, B]; stack on ne0=c. + ggml_tensor* ac = nullptr; + for (int c = 0; c < P; ++c) { + ggml_tensor* kc = ggml_view_4d(ctx, kpad, dk, T, H, B, + kpad->nb[1], kpad->nb[2], kpad->nb[3], (size_t)c * kpad->nb[1]); + ggml_tensor* acc = ggml_sum_rows(ctx, ggml_mul(ctx, qu, kc)); // [1, T, H, B] + ac = ac ? ggml_concat(ctx, ac, acc, 0) : acc; + } + ggml_tensor* bd = ggml_mul_mat(ctx, php, qv); // [P, T, H, B] (php broadcasts over B) + ggml_tensor* scores = ggml_add(ctx, ac, bd); // [P, T, H, B] + + // Per-item band mask [P, T, 1, B]. + std::vector& mh = pool.alloc_f32((size_t)B * T * P); + for (int b = 0; b < B; ++b) { + const int vl = valid_len[b]; + for (int t = 0; t < T; ++t) + for (int c = 0; c < P; ++c) { + const int key = t - att_left + c; + mh[(size_t)b * T * P + (size_t)t * P + c] = (key >= 0 && key < vl) ? 0.0f : -INFINITY; + } + } + int64_t mne[4] = {P, T, 1, B}; + ggml_tensor* mask = pk::graph_input_tensor(ctx, GGML_TYPE_F32, 4, mne, + mh.data(), mh.size() * sizeof(float)); + ggml_tensor* prob = ggml_soft_max_ext(ctx, scores, mask, scale, 0.0f); // softmax over c + + // context[dk, t, H, B] = sum_c prob[c, t] * v[t-att_left+c]. + ggml_tensor* context = nullptr; + for (int c = 0; c < P; ++c) { + ggml_tensor* vc = ggml_view_4d(ctx, vpad, dk, T, H, B, + vpad->nb[1], vpad->nb[2], vpad->nb[3], (size_t)c * vpad->nb[1]); + ggml_tensor* pc = ggml_view_4d(ctx, prob, 1, T, H, B, + prob->nb[1], prob->nb[2], prob->nb[3], (size_t)c * prob->nb[0]); // [1,T,H,B] + ggml_tensor* term = ggml_mul(ctx, vc, pc); + context = context ? ggml_add(ctx, context, term) : term; + } + // Merge heads [dk, T, H, B] -> [dk, H, T, B] -> [D, T, B]. + ggml_tensor* merged = ggml_cont(ctx, ggml_permute(ctx, context, 0, 2, 1, 3)); + merged = ggml_reshape_3d(ctx, merged, D, T, B); + bool any_pad = false; + for (int b = 0; b < B; ++b) any_pad = any_pad || (valid_len[b] < T); + if (any_pad) { + std::vector& qm = pool.alloc_f32((size_t)B * T); + for (int b = 0; b < B; ++b) + for (int t = 0; t < T; ++t) qm[(size_t)b * T + t] = (t < valid_len[b]) ? 1.0f : 0.0f; + int64_t qne[3] = {1, T, B}; + ggml_tensor* qmask = pk::graph_input_tensor(ctx, GGML_TYPE_F32, 3, qne, + qm.data(), qm.size() * sizeof(float)); + merged = ggml_mul(ctx, merged, qmask); + } + return linear("linear_out.weight", "linear_out.bias", merged); // [D, T, B] +} + void RelPosAttention::forward(const std::vector& x, int T, const std::vector& pos_emb, int pos_len, int valid_len, diff --git a/src/relpos_attention.hpp b/src/relpos_attention.hpp index 797028a..0daaa17 100644 --- a/src/relpos_attention.hpp +++ b/src/relpos_attention.hpp @@ -63,6 +63,16 @@ class RelPosAttention { const std::vector& valid_len, GraphInputPool& pool) const; + // Batched LOCAL (banded / Longformer) GRAPH-BUILDER. Same as + // build_graph_local but for B>1: `xt` is [D, T, B], `pe` is the LOCAL + // positional encoding [D, att_left+att_right+1] (shared across the batch), + // `valid_len` is per item. Returns [D, T, B]. Banded -> O(T*window) memory. + ggml_tensor* build_graph_batched_local(ggml_context* ctx, ggml_tensor* xt, int T, + int B, ggml_tensor* pe, int pos_len, + const std::vector& valid_len, + int att_left, int att_right, + GraphInputPool& pool) const; + // x: [T, d_model]; pos_emb: [2T-1, d_model]; out: [T, d_model]. void forward(const std::vector& x, int T, const std::vector& pos_emb, int pos_len, diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 4cc37b5..4ab5998 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -23,6 +23,7 @@ pk_add_test(test_conformer_batch) pk_add_test(test_conv_eou) pk_add_test(test_encoder) pk_add_test(test_encoder_batch) +pk_add_test(test_encoder_batch_local) pk_add_test(test_encoder_eou) pk_add_test(test_streaming_encoder) pk_add_test(test_ctc) @@ -54,7 +55,7 @@ pk_add_test(test_capi_stream) pk_add_test(test_capi_timestamps) pk_add_test(test_capi_batch_json) set_tests_properties(test_model_loader test_mel test_mel_gpu test_subsampling test_subsampling_batch test_relpos_attention test_relpos_attention_batch - test_conformer test_conformer_batch test_conv_eou test_encoder test_encoder_batch test_encoder_eou + test_conformer test_conformer_batch test_conv_eou test_encoder test_encoder_batch test_encoder_batch_local test_encoder_eou test_streaming_encoder test_ctc test_prediction test_prediction_step test_prediction_step_batch test_joint test_joint_step_batch test_transducer_core test_tdt_greedy @@ -67,7 +68,7 @@ set_tests_properties(test_model_loader test_mel test_mel_gpu test_subsampling te PROPERTIES LABELS "model") # These tests read fixtures/baselines via paths relative to the project root. set_tests_properties(test_mel test_mel_gpu test_subsampling test_subsampling_batch test_relpos_attention test_relpos_attention_batch test_conformer test_conformer_batch - test_conv_eou test_encoder test_encoder_batch test_encoder_eou test_streaming_encoder + test_conv_eou test_encoder test_encoder_batch test_encoder_batch_local test_encoder_eou test_streaming_encoder test_ctc test_prediction test_prediction_step test_prediction_step_batch test_joint test_joint_step_batch test_transducer_core test_tdt_greedy diff --git a/tests/test_encoder_batch_local.cpp b/tests/test_encoder_batch_local.cpp new file mode 100644 index 0000000..16aad78 --- /dev/null +++ b/tests/test_encoder_batch_local.cpp @@ -0,0 +1,102 @@ +#include "encoder.hpp" +#include "model_loader.hpp" +#include "parity.hpp" +#include +#include +#include +#include +#include + +// Batched encoder equivalence + padding invariance with NeMo rel_pos_local_attn +// (banded / Longformer) forced ON. This is the local-attention twin of +// test_encoder_batch.cpp: it exercises RelPosAttention::build_graph_batched_local +// through the whole fused batched encoder. +// +// PARAKEET_ATT_CONTEXT forces the symmetric window for EVERY utterance regardless +// of length (encoder.cpp local_attn_window). We use W=32 = kMaxLocalWindow, the +// exact window the auto path selects for long audio (Tp > 8192) in production, so +// this test gates the real shipped configuration. +// +// As in test_encoder_batch, item0 is the full baseline clip and item1 is its +// first 3/4 zero-padded up to T0. The two correctness properties: +// * item0 must be BIT-EXACT to its standalone run (0.0) - the shorter padded +// neighbour must not perturb the full clip at all (no cross-item leakage). +// * item1's valid region must match its standalone run within 5e-2/5e-2. +// item1's standalone run is at its OWN (shorter) Tp while the batched run is +// at the padded width T0; the differing tensor shapes change ggml reduction +// order, so near-zero activations of the padded clip carry float noise that +// accumulates over the 17 conformer layers. The tolerance mirrors the +// full-attention test. (Tightening the window past production's W=32 sharpens +// the banded softmax and amplifies that noise on near-zero elements - a +// numerical effect, not pad leakage: item0 stays bit-exact and item1's +// mean|d| stays ~1e-2 throughout.) +int main() { + const char* gguf = std::getenv("PARAKEET_TEST_GGUF"); + const char* base = std::getenv("PARAKEET_TEST_BASELINE"); + if (!gguf || !base) { std::fprintf(stderr, "env not set; skip\n"); return 77; } + + // Force banded local attention (production cap) for the whole run. + setenv("PARAKEET_ATT_CONTEXT", "32", /*overwrite*/1); + + pk::ModelLoader ml; + if (!ml.load(gguf)) { std::fprintf(stderr, "model load failed\n"); return 1; } + + std::vector mel; std::vector ms; + if (!pktest::load_baseline(base, "mel", mel, ms)) return 1; + if (ms.size() != 2) { std::fprintf(stderr, "mel rank=%zu\n", ms.size()); return 1; } + const int n_mels = (int)ms[0]; + const int T0 = (int)ms[1]; + const int T1 = (T0 * 3) / 4; + + pk::Encoder enc(ml); + + // --- Standalone references (local attention via the forced env) --------- + std::vector e0, e1; int dm = 0, to0 = 0, to1 = 0; + enc.forward(mel, n_mels, T0, e0, dm, to0); + + std::vector mel1((size_t)n_mels * T1); + for (int m = 0; m < n_mels; ++m) + for (int t = 0; t < T1; ++t) + mel1[(size_t)m * T1 + t] = mel[(size_t)m * T0 + t]; + int dm1 = 0; + enc.forward(mel1, n_mels, T1, e1, dm1, to1); + + // --- Batched (T_max=T0; item1 zero-padded) ------------------------------ + pk::MelBatch mb; + mb.B = 2; mb.n_mels = n_mels; mb.T_max = T0; mb.valid_T = { T0, T1 }; + mb.data.assign((size_t)2 * n_mels * T0, 0.0f); + for (int m = 0; m < n_mels; ++m) { + for (int t = 0; t < T0; ++t) mb.data[((size_t)0 * n_mels + m) * T0 + t] = mel[(size_t)m * T0 + t]; + for (int t = 0; t < T1; ++t) mb.data[((size_t)1 * n_mels + m) * T0 + t] = mel1[(size_t)m * T1 + t]; + } + std::vector> eo; int dmb = 0, tob = 0; std::vector vt; + enc.forward_batch(mb, eo, dmb, tob, vt); + + if (dmb <= 0 || tob <= 0 || eo.size() != 2 || vt.size() != 2) { + std::fprintf(stderr, "bad batched output dmb=%d tob=%d |eo|=%zu |vt|=%zu\n", + dmb, tob, eo.size(), vt.size()); + return 1; + } + + std::fprintf(stderr, + "[encbatch_local] dm=%d to0=%d to1=%d | dmb=%d tob=%d vt={%d,%d}\n", + dm, to0, to1, dmb, tob, vt[0], vt[1]); + + auto slice_cols = [&](const std::vector& full, int Tfull, int Tvalid) { + std::vector s((size_t)dmb * Tvalid); + for (int c = 0; c < dmb; ++c) + for (int t = 0; t < Tvalid; ++t) + s[(size_t)c * Tvalid + t] = full[(size_t)c * Tfull + t]; + return s; + }; + std::vector b0 = slice_cols(eo[0], vt[0], vt[0]); + std::vector b1 = slice_cols(eo[1], vt[1], vt[1]); + std::vector ref0 = slice_cols(e0, to0, vt[0]); + std::vector ref1 = slice_cols(e1, to1, vt[1]); + + // item0 must match its standalone run to within float noise (a real leak + // from the shorter neighbour would be order-1, not 1e-3): no cross-item leak. + bool a = pktest::compare(b0, ref0, "encbatch_local.item0", 1e-3f, 1e-3f); + bool b = pktest::compare(b1, ref1, "encbatch_local.item1", 5e-2f, 5e-2f); + return (a && b) ? 0 : 1; +} From d46c0ec8e39ede950d3aa1ea8385657cdd089d81 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sat, 6 Jun 2026 02:22:52 +0200 Subject: [PATCH 5/7] feat(attn): chunk-matmul banded local attention (O(1) graph nodes) The pad-and-shift banded path (build_graph_local) is correct but emits O(window) graph nodes per layer (a P-iteration view+mul+sum_rows+concat loop), which is why the window was capped at W=32. build_graph_local_chunked computes the exact same NeMo rel_pos_local_attn output with a fixed, O(1) number of nodes regardless of window, lifting the cap toward NeMo's full [128,128]. Construction: time is tiled into chunks of C frames; each chunk carries its own C+P-1 keys/values (the P-1 halo overlaps the neighbour), so a query attends only within its chunk. K/V are gathered as OVERLAPPING strided chunk views - which ggml's view-bounds check (ggml.c: data_size = dense product of ne, ignoring nb) rejects unless the source is OVER-padded to (C+P-1)*G frames; with that pad the view is legal and a single batched ggml_mul_mat produces the per-chunk q.k blocks [C+P-1, C, G, H]. A diagonal "skew" view (nb1 walking C+P on a [C+P-1,...] tensor, which passes the bounds check since P <= C+P-1) extracts the [P,T] band. The PV side inverse-skews the softmaxed band back to a [C+P-1, C] banded matrix (pad ne0 by C, skew-view, mask the lower off-band), then one batched matmul against the transposed V chunks gathers the context. Verified against the trusted pad-and-shift path (forward_local, itself 1.4e-3 vs a deterministic brute-force band reference): new test test_relpos_attention_local_chunked runs synthetic x/pos through the real layer-0 weights for T up to 333 and W up to 128 (chunk < W and chunk == W), matching forward_local to <1e-3 (max|d| ~6e-4). Existing pad-and-shift path and all encoder/conformer regressions unchanged. Encoder wiring (raise the cap and route long audio to this kernel) follows. Signed-off-by: Ettore Di Giacinto Assisted-by: Claude:claude-opus-4-8 --- src/relpos_attention.cpp | 153 ++++++++++++++++++ src/relpos_attention.hpp | 21 +++ tests/CMakeLists.txt | 3 +- tests/test_relpos_attention_local_chunked.cpp | 66 ++++++++ 4 files changed, 242 insertions(+), 1 deletion(-) create mode 100644 tests/test_relpos_attention_local_chunked.cpp diff --git a/src/relpos_attention.cpp b/src/relpos_attention.cpp index 029c02b..b33ea34 100644 --- a/src/relpos_attention.cpp +++ b/src/relpos_attention.cpp @@ -560,4 +560,157 @@ void RelPosAttention::forward_local(const std::vector& x, int T, (void)ok; } +ggml_tensor* RelPosAttention::build_graph_local_chunked( + ggml_context* ctx, ggml_tensor* xt, int T, ggml_tensor* pe, int pos_len, + int valid_len, int att_left, int att_right, GraphInputPool& pool, + int chunk) const { + const int D = d_model_, H = n_heads_, dk = d_head_; + const int P = pos_len; // window width = att_left+att_right+1 + const float scale = 1.0f / std::sqrt((float)dk); + assert(att_left >= 0 && att_right >= 0); + assert(P == att_left + att_right + 1); + + // Tile time into chunks of C frames (G chunks, Tp = G*C padded length). Each + // chunk carries its own C+P-1 keys/values (the P-1 halo overlaps the next + // chunk), so a query in chunk g only attends within g. Default C spans the + // window so the halo is one chunk wide. + int C = chunk > 0 ? chunk : (att_left + att_right); + if (C < 1) C = 1; + const int G = (T + C - 1) / C; + const int Tp = G * C; + const int Lk = (C + P - 1) * G; // dense length the chunk VIEW needs + + const std::string pre = "encoder.layers." + std::to_string(layer_idx_) + ".self_attn."; + const ModelLoader& ml = ml_; + auto linear = [&](const char* wn, const char* bn, ggml_tensor* in) { + ggml_tensor* W = clone_weight(ctx, ml, pre + wn); + ggml_tensor* y = ggml_mul_mat(ctx, W, in); + if (bn && ml.tensor(pre + bn)) y = ggml_add(ctx, y, clone_weight(ctx, ml, pre + bn)); + return y; + }; + ggml_tensor* q = linear("linear_q.weight", "linear_q.bias", xt); + ggml_tensor* k = linear("linear_k.weight", "linear_k.bias", xt); + ggml_tensor* v = linear("linear_v.weight", "linear_v.bias", xt); + ggml_tensor* p = linear("linear_pos.weight", nullptr, pe); + auto to_heads = [&](ggml_tensor* t, int n) { + t = ggml_reshape_3d(ctx, t, dk, H, n); + return ggml_cont(ctx, ggml_permute(ctx, t, 0, 2, 1, 3)); // [dk, n, H] + }; + ggml_tensor* qh = to_heads(q, T), *kh = to_heads(k, T); + ggml_tensor* vh = to_heads(v, T), *php = to_heads(p, P); + ggml_tensor* bu = ggml_reshape_3d(ctx, clone_weight(ctx, ml, pre + "pos_bias_u"), dk, 1, H); + ggml_tensor* bv = ggml_reshape_3d(ctx, clone_weight(ctx, ml, pre + "pos_bias_v"), dk, 1, H); + ggml_tensor* qu = ggml_add(ctx, qh, bu); // [dk, T, H] + ggml_tensor* qv = ggml_add(ctx, qh, bv); // [dk, T, H] + + // ---- Content scores ac[c,t,H] via chunked matmul + diagonal skew-view ---- + // Pad queries to Tp and reshape into non-overlapping chunks [dk, C, G, H]. + ggml_tensor* qu_p = (Tp > T) ? ggml_pad_ext(ctx, qu, 0,0, 0,Tp-T, 0,0, 0,0) : qu; + ggml_tensor* qu_c = ggml_reshape_4d(ctx, qu_p, dk, C, G, H); + // Pad keys (left att_left, right att_right) then OVER-pad to Lk so the + // overlapping chunk view's dense ne-product fits ggml's bounds check. + ggml_tensor* kpad = ggml_pad_ext(ctx, kh, 0,0, att_left,att_right, 0,0, 0,0); // [dk,T+P-1,H] + if (Lk > (int)kpad->ne[1]) kpad = ggml_pad_ext(ctx, kpad, 0,0, 0,Lk-(int)kpad->ne[1], 0,0, 0,0); + // Overlapping key chunks [dk, C+P-1, G, H]: chunk g advances C along time. + ggml_tensor* kchunk = ggml_view_4d(ctx, kpad, dk, C+P-1, G, H, + kpad->nb[1], (size_t)C*kpad->nb[1], kpad->nb[2], 0); + kchunk = ggml_cont(ctx, kchunk); + // Per-chunk q.k block [C+P-1, C, G, H]: sc[j,i,g] = k[gC+j] . qu[gC+i]. + ggml_tensor* sc = ggml_mul_mat(ctx, kchunk, qu_c); + // Diagonal skew: ac_band[c,i,g] = sc[i+c, i, g] -> [P, C, G, H], nb1 walks (C+P). + ggml_tensor* acb = ggml_view_4d(ctx, sc, P, C, G, H, + (size_t)(C+P)*sc->nb[0], sc->nb[2], sc->nb[3], 0); + acb = ggml_cont(ctx, acb); + acb = ggml_reshape_3d(ctx, acb, P, Tp, H); + ggml_tensor* ac = (Tp > T) ? ggml_view_3d(ctx, acb, P, T, H, acb->nb[1], acb->nb[2], 0) : acb; + + // ---- Positional scores bd[c,t,H] = qv[t].p[c] (same as build_graph_local) ---- + ggml_tensor* bd = ggml_mul_mat(ctx, php, qv); // [P, T, H] + ggml_tensor* scores = ggml_add(ctx, ac, bd); // [P, T, H] + + // Band mask [P, T]: 0 if key in [0, valid_len), else -inf. + std::vector& mh = pool.alloc_f32((size_t)P * T); + for (int t = 0; t < T; ++t) + for (int c = 0; c < P; ++c) { + const int key = t - att_left + c; + mh[(size_t)t * P + c] = (key >= 0 && key < valid_len) ? 0.0f : -INFINITY; + } + int64_t mne[2] = {P, T}; + ggml_tensor* mask = pk::graph_input_tensor(ctx, GGML_TYPE_F32, 2, mne, + mh.data(), mh.size() * sizeof(float)); + ggml_tensor* prob = ggml_soft_max_ext(ctx, scores, mask, scale, 0.0f); // softmax over c + + // ---- Context[dk,t,H] = sum_c prob[c,t] v[t-att_left+c] via inverse-skew + matmul ---- + // Pad prob to Tp, chunk [P, C, G, H], inverse-skew to a banded [C+P-1, C, G, H]. + ggml_tensor* prob_p = (Tp > T) ? ggml_pad_ext(ctx, prob, 0,0, 0,Tp-T, 0,0, 0,0) : prob; + ggml_tensor* prob_c = ggml_reshape_4d(ctx, prob_p, P, C, G, H); + ggml_tensor* probpad = ggml_pad_ext(ctx, prob_c, 0,C, 0,0, 0,0, 0,0); // ne0 P->C+P + // Pfull[j,i,g] = prob_c[j-i, i, g] (skew view; upper off-band already zero + // from the pad, lower off-band masked below). + ggml_tensor* pfull = ggml_view_4d(ctx, probpad, C+P-1, C, G, H, + (size_t)(C+P-1)*probpad->nb[0], probpad->nb[2], probpad->nb[3], 0); + pfull = ggml_cont(ctx, pfull); + std::vector& b01 = pool.alloc_f32((size_t)(C+P-1) * C); + for (int i = 0; i < C; ++i) + for (int j = 0; j < C+P-1; ++j) { + const int rel = j - i; + b01[(size_t)i * (C+P-1) + j] = (rel >= 0 && rel < P) ? 1.0f : 0.0f; + } + int64_t bne[2] = {C+P-1, C}; + ggml_tensor* band01 = pk::graph_input_tensor(ctx, GGML_TYPE_F32, 2, bne, + b01.data(), b01.size() * sizeof(float)); + pfull = ggml_mul(ctx, pfull, band01); // zero the lower off-band (broadcast over G,H) + // Over-padded transposed V chunks [C+P-1, dk, G, H]: Vchunk[j,d,g]=v[gC+j]. + ggml_tensor* vpad = ggml_pad_ext(ctx, vh, 0,0, att_left,att_right, 0,0, 0,0); // [dk,T+P-1,H] + if (Lk > (int)vpad->ne[1]) vpad = ggml_pad_ext(ctx, vpad, 0,0, 0,Lk-(int)vpad->ne[1], 0,0, 0,0); + ggml_tensor* vpt = ggml_cont(ctx, ggml_permute(ctx, vpad, 1, 0, 2, 3)); // [Lk, dk, H] + ggml_tensor* vchunk = ggml_view_4d(ctx, vpt, C+P-1, dk, G, H, + vpt->nb[1], (size_t)C*vpt->nb[0], vpt->nb[2], 0); + vchunk = ggml_cont(ctx, vchunk); + // context_g[d,i] = sum_j Vchunk[j,d] Pfull[j,i] -> [dk, C, G, H]. + ggml_tensor* cc = ggml_mul_mat(ctx, vchunk, pfull); + cc = ggml_reshape_3d(ctx, cc, dk, Tp, H); + ggml_tensor* context = (Tp > T) ? ggml_view_3d(ctx, cc, dk, T, H, cc->nb[1], cc->nb[2], 0) : cc; + + // Merge heads [dk,T,H] -> [dk,H,T] -> [D,T]; mask padded query rows; linear_out. + ggml_tensor* merged = ggml_cont(ctx, ggml_permute(ctx, context, 0, 2, 1, 3)); + merged = ggml_reshape_2d(ctx, merged, D, T); + if (valid_len < T) { + std::vector& qm = pool.alloc_f32(T); + for (int t = 0; t < T; ++t) qm[t] = (t < valid_len) ? 1.0f : 0.0f; + int64_t qne[2] = {1, T}; + ggml_tensor* qmask = pk::graph_input_tensor(ctx, GGML_TYPE_F32, 2, qne, + qm.data(), qm.size() * sizeof(float)); + merged = ggml_mul(ctx, merged, qmask); + } + ggml_tensor* Wo = clone_weight(ctx, ml, pre + "linear_out.weight"); + ggml_tensor* y = ggml_mul_mat(ctx, Wo, merged); + if (ml.tensor(pre + "linear_out.bias")) + y = ggml_add(ctx, y, clone_weight(ctx, ml, pre + "linear_out.bias")); + return y; // [D, T] +} + +void RelPosAttention::forward_local_chunked(const std::vector& x, int T, + const std::vector& pos_emb, int pos_len, + int valid_len, int att_left, int att_right, + std::vector& out, int chunk) const { + const int D = d_model_; + assert((int)x.size() == T * D); + assert((int)pos_emb.size() == pos_len * D); + GraphInputPool pool; + bool ok = pk::run_graph(/*mem_bytes*/0, /*n_threads*/4, + [&](ggml_context* ctx) -> ggml_tensor* { + int64_t xt_ne[2] = {D, T}; + ggml_tensor* xt = pk::graph_input_tensor(ctx, GGML_TYPE_F32, 2, xt_ne, + x.data(), (size_t)T * D * sizeof(float)); + int64_t pe_ne[2] = {D, pos_len}; + ggml_tensor* pe = pk::graph_input_tensor(ctx, GGML_TYPE_F32, 2, pe_ne, + pos_emb.data(), (size_t)pos_len * D * sizeof(float)); + return build_graph_local_chunked(ctx, xt, T, pe, pos_len, valid_len, + att_left, att_right, pool, chunk); + }, out); + assert(ok && "relpos local chunked attention graph failed"); + (void)ok; +} + } // namespace pk diff --git a/src/relpos_attention.hpp b/src/relpos_attention.hpp index 0daaa17..f8814d7 100644 --- a/src/relpos_attention.hpp +++ b/src/relpos_attention.hpp @@ -73,6 +73,27 @@ class RelPosAttention { int att_left, int att_right, GraphInputPool& pool) const; + // CHUNK-MATMUL LOCAL GRAPH-BUILDER. Same math/output as build_graph_local + // (NeMo rel_pos_local_attn, banded O(T*window)) but built with O(1) graph + // nodes regardless of window: time is tiled into chunks of `chunk` frames, + // K/V are gathered as OVER-PADDED overlapping chunks (so ggml's dense + // ne-product view-bounds check passes), the per-chunk q.k blocks are one + // batched ggml_mul_mat, and a diagonal "skew" view extracts the [P,T] band. + // Lets the window go to NeMo's full [128,128] without the pad-and-shift + // path's O(window) nodes + O(window^2) concat. `pe` is the LOCAL positional + // encoding [D, att_left+att_right+1]. chunk<=0 picks a default. + ggml_tensor* build_graph_local_chunked(ggml_context* ctx, ggml_tensor* xt, + int T, ggml_tensor* pe, int pos_len, + int valid_len, int att_left, int att_right, + GraphInputPool& pool, int chunk = -1) const; + + // Thin wrapper over build_graph_local_chunked (test entry point). Same + // signature/semantics as forward_local; output must match it. + void forward_local_chunked(const std::vector& x, int T, + const std::vector& pos_emb, int pos_len, + int valid_len, int att_left, int att_right, + std::vector& out, int chunk = -1) const; + // x: [T, d_model]; pos_emb: [2T-1, d_model]; out: [T, d_model]. void forward(const std::vector& x, int T, const std::vector& pos_emb, int pos_len, diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 4ab5998..181ad91 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -16,6 +16,7 @@ pk_add_test(test_subsampling) pk_add_test(test_subsampling_batch) pk_add_test(test_relpos_attention) pk_add_test(test_relpos_attention_local) +pk_add_test(test_relpos_attention_local_chunked) pk_add_test(test_relpos_attention_local_memory) pk_add_test(test_relpos_attention_batch) pk_add_test(test_conformer) @@ -54,7 +55,7 @@ pk_add_test(test_capi_batch) pk_add_test(test_capi_stream) pk_add_test(test_capi_timestamps) pk_add_test(test_capi_batch_json) -set_tests_properties(test_model_loader test_mel test_mel_gpu test_subsampling test_subsampling_batch test_relpos_attention test_relpos_attention_batch +set_tests_properties(test_model_loader test_mel test_mel_gpu test_subsampling test_subsampling_batch test_relpos_attention test_relpos_attention_batch test_relpos_attention_local_chunked test_conformer test_conformer_batch test_conv_eou test_encoder test_encoder_batch test_encoder_batch_local test_encoder_eou test_streaming_encoder test_ctc test_prediction test_prediction_step test_prediction_step_batch diff --git a/tests/test_relpos_attention_local_chunked.cpp b/tests/test_relpos_attention_local_chunked.cpp new file mode 100644 index 0000000..f3d4b05 --- /dev/null +++ b/tests/test_relpos_attention_local_chunked.cpp @@ -0,0 +1,66 @@ +#include "relpos_attention.hpp" +#include "model_loader.hpp" +#include "parity.hpp" +#include +#include +#include +#include +#include + +// Equivalence test for the CHUNK-MATMUL banded attention +// (build_graph_local_chunked) against the trusted pad-and-shift banded path +// (forward_local, itself verified to 1.4e-3 vs a deterministic brute-force band +// reference). Both consume the SAME synthetic x / local pos_emb through the SAME +// layer-0 weights, so they must agree to within float reduction-order noise +// (matmul vs sum_rows-loop). This decouples the test from any baseline clip +// length, so we can exercise large windows (W up to 128 = NeMo's full +// [128,128]) - the whole point of the chunk-matmul construction. +// +// Env: PARAKEET_TEST_GGUF (weights; skip 77 if unset). +static void fill_synth(std::vector& v, int rows, int d, int seed) { + v.resize((size_t)rows * d); + for (int r = 0; r < rows; ++r) + for (int c = 0; c < d; ++c) + // smooth, bounded, distinct per (r,c,seed); not all-equal so banding matters + v[(size_t)r * d + c] = + 0.5f * std::sin(0.017f * (r + 1) * (c + 1 + seed) + 0.3f * seed) + + 0.1f * std::cos(0.0031f * (r * 7 + c * 3 + seed)); +} + +int main() { + const char* gguf = std::getenv("PARAKEET_TEST_GGUF"); + if (!gguf) { std::fprintf(stderr, "env not set; skip\n"); return 77; } + + pk::ModelLoader ml; + if (!ml.load(gguf)) { std::fprintf(stderr, "model load failed\n"); return 1; } + const int d_model = (int)ml.config().d_model; + pk::RelPosAttention attn(ml, /*layer_idx*/0); + + struct Case { int T, W, chunk; }; + const Case cases[] = { + { 37, 8, -1}, // small, default chunk; padded clip (valid_len=T-1) + { 64, 16, 16}, // T multiple of chunk + { 70, 16, 16}, // T NOT a multiple of chunk (padding path) + { 200, 32, 32}, // production cap window + { 256, 128, 64}, // NeMo full window, chunk x, pos; + fill_synth(x, T, d_model, /*seed*/1 + (W & 7)); + fill_synth(pos, pos_len, d_model, /*seed*/2); + + std::vector ref, got; + attn.forward_local(x, T, pos, pos_len, valid_len, W, W, ref); + attn.forward_local_chunked(x, T, pos, pos_len, valid_len, W, W, got, c.chunk); + + char label[64]; + std::snprintf(label, sizeof(label), "chunked.T%d_W%d_c%d", T, W, c.chunk); + all_ok &= pktest::compare(got, ref, label, /*atol*/2e-3f, /*rtol*/2e-3f); + } + return all_ok ? 0 : 1; +} From 36c87c62c622b156eb017641f8b34dc9778507f5 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sat, 6 Jun 2026 02:27:40 +0200 Subject: [PATCH 6/7] feat(encoder): route local attention to chunk-matmul, lift window cap to 128 Wire the O(1)-node chunk-matmul kernel into the encoder and raise the local window cap from 32 to NeMo's full 128. Both conformer attention paths now use it: build_graph (B=1) -> build_graph_local_chunked, build_graph_batched (B>1) -> build_graph_batched_local_chunked. The batched wrapper runs the 4D chunk kernel once per item and stacks the [D,T] outputs into [D,T,B] (the chunk graph is already 4D, so it can't also carry a batch dim); that is O(B) nodes, still O(1) in the window, and B is small. local_attn_window's cap (kMaxLocalWindow) goes 32 -> 128: the pad-and-shift path emitted ~6*(2W+1) nodes/layer (hence the 32 cap to fit kGraphSize), but the chunk-matmul path is window-independent in node count, so long audio now runs at NeMo's full [128,128] window. The pad-and-shift build_graph_local / build_graph_batched_local are kept as the verification oracle for test_relpos_attention_local{,_chunked}. Verified on dgx: full ctest green (51/51). test_encoder_batch_local passes at every forced window W=8..128 (now through the chunked path). e2e on a 16.6-min clip (tdt-0.6b-v3, CPU/16t), auto-local W=128: 36.8s / 9.8GB peak RSS, coherent transcript - faster than the W=32 pad-and-shift capstone (41-48s / 9.1GB) at a 4x wider, NeMo-faithful window, and ~5.6x under the full-attention path that OOM'd the node (151s / 55GB). Signed-off-by: Ettore Di Giacinto Assisted-by: Claude:claude-opus-4-8 --- src/conformer.cpp | 8 ++++---- src/encoder.cpp | 11 ++++++----- src/relpos_attention.cpp | 20 ++++++++++++++++++++ src/relpos_attention.hpp | 12 ++++++++++++ 4 files changed, 42 insertions(+), 9 deletions(-) diff --git a/src/conformer.cpp b/src/conformer.cpp index 681b307..0622f05 100644 --- a/src/conformer.cpp +++ b/src/conformer.cpp @@ -336,8 +336,8 @@ ggml_tensor* ConformerLayer::build_graph_batched(ggml_context* ctx, ggml_tensor* attn_in = layer_norm(r, "norm_self_att"); RelPosAttention attn(ml_, layer_idx_); ggml_tensor* attn_out = local_attn - ? attn.build_graph_batched_local(ctx, attn_in, T, B, pe, pos_len, valid_len, - att_left, att_right, pool) // [D, T, B] + ? attn.build_graph_batched_local_chunked(ctx, attn_in, T, B, pe, pos_len, valid_len, + att_left, att_right, pool) // [D, T, B] : attn.build_graph_batched(ctx, attn_in, T, B, pe, pos_len, valid_len, pool); r = ggml_add(ctx, r, attn_out); @@ -413,8 +413,8 @@ ggml_tensor* ConformerLayer::build_graph(ggml_context* ctx, ggml_tensor* xt, ggml_tensor* attn_in = layer_norm(r, "norm_self_att"); RelPosAttention attn(ml_, layer_idx_); ggml_tensor* attn_out = local_attn - ? attn.build_graph_local(ctx, attn_in, T, pe, pos_len, valid_len, - att_left, att_right, pool) // [D, T] + ? attn.build_graph_local_chunked(ctx, attn_in, T, pe, pos_len, valid_len, + att_left, att_right, pool) // [D, T] : attn.build_graph(ctx, attn_in, T, pe, pos_len, valid_len, pool); // [D, T] r = ggml_add(ctx, r, attn_out); diff --git a/src/encoder.cpp b/src/encoder.cpp index dd33d46..f295989 100644 --- a/src/encoder.cpp +++ b/src/encoder.cpp @@ -16,12 +16,13 @@ namespace pk { // Decide the self-attention window for an encoder of Tp frames. Returns W>0 to // use NeMo rel_pos_local_attn [W,W] (banded, O(T*window)); -1 for full attention. // -// The pad-and-shift kernel adds ~6*(2W+1) graph nodes per layer, so W is capped -// to stay within the metadata-context budget (backend.cpp kGraphSize); W<=32 -// fits every shipped model. The efficient chunk-matmul construction (follow-up, -// O(1) nodes) would lift this cap to NeMo's full [128,128]. +// The attention uses the chunk-matmul banded path (build_graph_local_chunked), +// which emits O(1) graph nodes regardless of window, so W can go to NeMo's full +// [128,128] without overflowing the metadata-context budget (backend.cpp +// kGraphSize). (The older pad-and-shift path emitted ~6*(2W+1) nodes/layer, +// which is why this was capped at 32.) static int local_attn_window(int Tp) { - constexpr int kMaxLocalWindow = 32; + constexpr int kMaxLocalWindow = 128; if (const char* e = std::getenv("PARAKEET_ATT_CONTEXT")) { const int w = std::atoi(e); if (w <= 0) return -1; // 0 / negative -> force full attention diff --git a/src/relpos_attention.cpp b/src/relpos_attention.cpp index b33ea34..ef6ffa1 100644 --- a/src/relpos_attention.cpp +++ b/src/relpos_attention.cpp @@ -690,6 +690,26 @@ ggml_tensor* RelPosAttention::build_graph_local_chunked( return y; // [D, T] } +ggml_tensor* RelPosAttention::build_graph_batched_local_chunked( + ggml_context* ctx, ggml_tensor* xt, int T, int B, ggml_tensor* pe, + int pos_len, const std::vector& valid_len, int att_left, int att_right, + GraphInputPool& pool, int chunk) const { + const int D = d_model_; + assert((int)valid_len.size() == B); + // Run the O(1) chunk kernel per item (the 4D chunk graph can't also carry a + // batch dim), then stack the per-item [D,T] outputs back into [D,T,B]. + ggml_tensor* out = nullptr; + for (int b = 0; b < B; ++b) { + ggml_tensor* xb = ggml_view_2d(ctx, xt, D, T, xt->nb[1], (size_t)b * xt->nb[2]); + xb = ggml_cont(ctx, xb); // linear() mul_mat wants a dense [D,T] item + ggml_tensor* yb = build_graph_local_chunked(ctx, xb, T, pe, pos_len, + valid_len[b], att_left, att_right, pool, chunk); // [D,T] + yb = ggml_reshape_3d(ctx, yb, D, T, 1); + out = out ? ggml_concat(ctx, out, yb, 2) : yb; + } + return out; // [D, T, B] +} + void RelPosAttention::forward_local_chunked(const std::vector& x, int T, const std::vector& pos_emb, int pos_len, int valid_len, int att_left, int att_right, diff --git a/src/relpos_attention.hpp b/src/relpos_attention.hpp index f8814d7..ba364c3 100644 --- a/src/relpos_attention.hpp +++ b/src/relpos_attention.hpp @@ -94,6 +94,18 @@ class RelPosAttention { int valid_len, int att_left, int att_right, std::vector& out, int chunk = -1) const; + // Batched CHUNK-MATMUL local GRAPH-BUILDER. `xt` is [D, T, B]; `pe` is the + // LOCAL positional encoding [D, att_left+att_right+1] (shared). Runs the O(1) + // chunk-matmul construction per item (the 4D chunk kernel can't also carry a + // batch dim - ggml is 4D), so this is O(B) nodes, still O(1) in the window. + // `valid_len` is per item. Returns [D, T, B]. Same output as + // build_graph_batched_local but window-cap-free. + ggml_tensor* build_graph_batched_local_chunked(ggml_context* ctx, ggml_tensor* xt, + int T, int B, ggml_tensor* pe, int pos_len, + const std::vector& valid_len, + int att_left, int att_right, + GraphInputPool& pool, int chunk = -1) const; + // x: [T, d_model]; pos_emb: [2T-1, d_model]; out: [T, d_model]. void forward(const std::vector& x, int T, const std::vector& pos_emb, int pos_len, From 0e81654c98270ccb63eeda39453d9c09fef74245 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sat, 6 Jun 2026 09:02:03 +0200 Subject: [PATCH 7/7] docs(bench): add long-audio banded-attention section Document the banded local attention (rel_pos_local_attn) memory/speed win that the chunk-matmul kernel enables. 16.6-min clip, tdt-0.6b-v3, GB10 CPU/16t: global O(T^2) attention 148.3s / 54.0GB vs banded W=128 36.9s / 9.4GB (~4x faster, ~5.7x less peak RAM) at NeMo's full window, with the chunk-matmul making W=128 as cheap as W=32. Notes that short clips stay on the global path and are unchanged. Signed-off-by: Ettore Di Giacinto Assisted-by: Claude:claude-opus-4-8 --- benchmarks/BENCHMARK.md | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/benchmarks/BENCHMARK.md b/benchmarks/BENCHMARK.md index cc7187e..ee0b36e 100644 --- a/benchmarks/BENCHMARK.md +++ b/benchmarks/BENCHMARK.md @@ -498,6 +498,34 @@ Transcripts from the **diverse** clip set (no ground truth for most). NeMo vs pa | NeMo (PyTorch CPU) | hello this is a test of the voxtrol speech to text system | | parakeet.cpp f32 | hello this is a test of the voxtrol speech to text system | +## Long-audio attention (banded local) + +The FastConformer encoder uses **global** relative-position self-attention, which +is O(T²) in time *and memory*. On a long clip this explodes: a ~16.6-min file +subsamples to T≈12k encoder frames, and the score/mask tensors alone reach tens +of GB — enough to OOM a node. parakeet.cpp ports NeMo's `rel_pos_local_attn` +(Longformer-style **banded** attention, `change_attention_model('rel_pos_local_attn',[W,W])`): +each query attends only to keys within a ±W window, making attention **O(T·W)**. +It auto-enables for long audio (encoder frames > 8192); `PARAKEET_ATT_CONTEXT=W` +forces a window (`0` = full attention). The band is built with a **chunk-matmul** +construction (overlapping K/V chunks + one batched GEMM + a diagonal skew-view), +so the graph node count is **independent of the window** — the window goes to +NeMo's full `[128,128]` at no extra graph cost. + +**16.6-min clip** (`tdt-0.6b-v3`, f32, NVIDIA GB10 DGX Spark, CPU / 16 threads): + +| Attention | Window | Wall | RTFx | Peak RSS | +|---|---|---|---|---| +| full (global, O(T²)) | — | 148.3 s | 6.7× | 54.0 GB | +| banded | W=32 | 39.5 s | 25.2× | 8.9 GB | +| **banded** | **W=128** (NeMo full) | **36.9 s** | **27.0×** | **9.4 GB** | + +Banded attention at NeMo's full W=128 is **~4× faster and ~5.7× less peak memory** +than the global path, with a coherent transcript — and the chunk-matmul keeps the +wide window as cheap as the narrow one. Short clips (the LibriSpeech set above) +stay on the global path and are byte-identical to before; banding only engages +past the long-audio threshold. + ## Findings ### Accuracy