Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions benchmarks/BENCHMARK.md
Original file line number Diff line number Diff line change
Expand Up @@ -498,6 +498,34 @@ Transcripts from the **diverse** clip set (no ground truth for most). NeMo vs pa
| NeMo (PyTorch CPU) | hello this is a test of the voxtrol speech to text system<EOU> |
| parakeet.cpp f32 | hello this is a test of the voxtrol speech to text system<EOU> |

## Long-audio attention (banded local)

The FastConformer encoder uses **global** relative-position self-attention, which
is O(T²) in time *and memory*. On a long clip this explodes: a ~16.6-min file
subsamples to T≈12k encoder frames, and the score/mask tensors alone reach tens
of GB — enough to OOM a node. parakeet.cpp ports NeMo's `rel_pos_local_attn`
(Longformer-style **banded** attention, `change_attention_model('rel_pos_local_attn',[W,W])`):
each query attends only to keys within a ±W window, making attention **O(T·W)**.
It auto-enables with W=128 for long audio (encoder frames > 8192); `PARAKEET_ATT_CONTEXT=W`
forces a window (values above 128 are clamped to 128; `0` or a negative value
forces full attention). The band is built with a **chunk-matmul**
construction (overlapping K/V chunks + one batched GEMM + a diagonal skew-view),
so the graph node count is **independent of the window** — the window goes to
NeMo's full `[128,128]` at no extra graph cost.

**16.6-min clip** (`tdt-0.6b-v3`, f32, NVIDIA GB10 DGX Spark, CPU / 16 threads):

| Attention | Window | Wall | RTFx | Peak RSS |
|---|---|---|---|---|
| full (global, O(T²)) | — | 148.3 s | 6.7× | 54.0 GB |
| banded | W=32 | 39.5 s | 25.2× | 8.9 GB |
| **banded** | **W=128** (NeMo full) | **36.9 s** | **27.0×** | **9.4 GB** |

Banded attention at NeMo's full W=128 is **~4× faster and uses ~5.7× less peak
memory** than the global path, with a coherent transcript — and the chunk-matmul
construction keeps the wide window as cheap as the narrow one. Short clips (the
LibriSpeech set above) stay on the global path and are byte-identical to before;
banding only engages past the long-audio threshold.

## Findings

### Accuracy
Expand Down
102 changes: 102 additions & 0 deletions scripts/gen_band_ref.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
#!/usr/bin/env python3
"""Generate the DETERMINISTIC brute-force band-attention reference fixture used by
``test_relpos_attention_local``.

NeMo's ``RelPositionMultiHeadAttentionLongformer.forward`` is non-deterministic
on short sequences (``sliding_chunks_matmul_pv`` reads uninitialized memory at
sequence boundaries via ``F.pad(value=-1)`` + ``as_strided`` — two identical
forward() calls differ by >1e3). So a hook-captured ``l0_attn_out`` baseline is
unusable for bit-parity on a short clip. Instead we recompute ``l0_attn_out`` as
plain band attention (the well-defined math the longformer approximates), which
the C++ ``forward_local`` matches to ~1e-3. End-to-end NeMo quality is anchored
separately by the long-audio WER capstone, where the boundary noise is moot.

Reads ``l0_attn_in`` and ``pos_emb`` from an existing local baseline (produced by
``gen_nemo_baseline.py --att-context-size W``) so the inputs are the real NeMo
ones; only the reference output is recomputed deterministically.

Usage:
python scripts/gen_band_ref.py --model nvidia/parakeet-tdt_ctc-110m \
--in-baseline baseline_110m_local8.gguf --att-context 8 \
--output baseline_110m_local8_ref.gguf
"""
import argparse
import math

import gguf
import numpy as np
import torch

import nemo.collections.asr as nemo_asr


def read_tensor(path, name):
    """Load one named tensor from a GGUF file as a float32 numpy array.

    Args:
        path: path of the GGUF file to open.
        name: name of the tensor to fetch.

    Returns:
        A float32 ``np.ndarray`` whose shape is the tensor's stored shape with
        the dimension order reversed.

    Raises:
        KeyError: if no tensor called ``name`` exists in the file.
    """
    reader = gguf.GGUFReader(path)
    # Index by name; if a name occurs more than once the last entry wins.
    by_name = {}
    for entry in reader.tensors:
        by_name[entry.name] = entry
    tensor = by_name[name]
    values = np.array(tensor.data, dtype=np.float32)
    # The stored dims are reversed before reshaping -- presumably because GGUF
    # lists the fastest-varying dim first; the call sites expect (rows, D).
    dims = tuple(int(d) for d in reversed(tensor.shape))
    return values.reshape(dims)


def main():
    """Recompute ``l0_attn_out`` as brute-force band attention and write a fixture.

    Reads ``l0_attn_in`` and ``pos_emb`` from ``--in-baseline``, runs layer 0's
    own projection weights (taken from the NeMo model switched to
    ``rel_pos_local_attn`` with window ``[W, W]``) through an explicit
    per-(query, band-column) loop, and writes the inputs plus the recomputed
    output to ``--output`` as a GGUF file.

    The positional-embedding length is validated against ``2W+1`` *before* the
    model is loaded: a baseline generated with a different window would
    otherwise either fail late with an index error or silently pair band
    columns with the wrong relative positions.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--model", default="nvidia/parakeet-tdt_ctc-110m")
    ap.add_argument("--in-baseline", required=True,
                    help="local baseline gguf with l0_attn_in + pos_emb")
    ap.add_argument("--att-context", type=int, required=True, help="window W")
    ap.add_argument("--output", required=True)
    args = ap.parse_args()

    li_np = read_tensor(args.in_baseline, "l0_attn_in")  # (T, D)
    pos_np = read_tensor(args.in_baseline, "pos_emb")    # (2W+1, D)
    li = torch.tensor(li_np)[None]
    pos = torch.tensor(pos_np)[None]
    T, D = li.shape[1], li.shape[2]
    w = args.att_context
    P = 2 * w + 1  # band width: one column per relative offset in [-W, +W]
    vlen = T - 1  # last frame is a center-pad/padding frame for the fixture clip

    # Fail fast (before the expensive model load) if the baseline was produced
    # with a different window than the one requested here. A negative W makes
    # P <= -1, so it is rejected by the same comparison.
    n_pos = pos_np.reshape(-1, pos_np.shape[-1]).shape[0]
    if n_pos != P:
        ap.error(f"pos_emb in {args.in_baseline} has {n_pos} rows but "
                 f"--att-context {w} requires 2W+1 = {P}; regenerate the "
                 f"baseline with a matching --att-context-size")

    m = nemo_asr.models.ASRModel.from_pretrained(args.model, map_location="cpu")
    m.eval()
    m.change_attention_model("rel_pos_local_attn", [w, w])
    a0 = m.encoder.layers[0].self_attn
    h, dk = a0.h, a0.d_k
    s = math.sqrt(dk)

    # Every in-band (query t, band column c, key index) triple whose key is a
    # valid (non-padding) frame. Built once and walked twice, in the same
    # t-outer / c-inner order for both the score and the context pass.
    band = [(t, c, t - w + c)
            for t in range(T)
            for c in range(P)
            if 0 <= t - w + c < vlen]

    with torch.no_grad():
        q = a0.linear_q(li).view(1, T, h, dk).transpose(1, 2)
        k = a0.linear_k(li).view(1, T, h, dk).transpose(1, 2)
        v = a0.linear_v(li).view(1, T, h, dk).transpose(1, 2)
        p = a0.linear_pos(pos).view(1, -1, h, dk).transpose(1, 2)
        qu = q + a0.pos_bias_u.unsqueeze(1)
        qv = q + a0.pos_bias_v.unsqueeze(1)

        # Scores: content term (q+u)·k plus position term (q+v)·p, scaled by
        # sqrt(d_k). Out-of-band / padded keys keep the -1e30 fill so softmax
        # gives them ~0 weight.
        sc = torch.full((1, h, T, P), -1e30)
        for t, c, key in band:
            sc[0, :, t, c] = ((qu[0, :, t] * k[0, :, key]).sum(-1)
                              + (qv[0, :, t] * p[0, :, c]).sum(-1)) / s
        at = torch.softmax(sc, dim=-1)

        # Context: attention-weighted sum of the in-band values.
        ctx = torch.zeros(1, h, T, dk)
        for t, c, key in band:
            ctx[0, :, t] += at[0, :, t, c:c + 1] * v[0, :, key]

        om = a0.linear_out(ctx.transpose(1, 2).reshape(1, T, h * dk))[0].clone()
        om[vlen:] = a0.linear_out(torch.zeros(1, h * dk))[0]  # padded query rows -> bias

    ref = om.numpy().astype(np.float32)
    W = gguf.GGUFWriter(args.output, "pk-band-ref")
    W.add_tensor("l0_attn_in", np.ascontiguousarray(li_np))
    W.add_tensor("pos_emb", np.ascontiguousarray(pos_np))
    W.add_tensor("l0_attn_out", np.ascontiguousarray(ref))
    W.write_header_to_file()
    W.write_kv_data_to_file()
    W.write_tensors_to_file()
    W.close()
    print(f"wrote {args.output}: T={T} D={D} W={w}")


if __name__ == "__main__":
main()
26 changes: 26 additions & 0 deletions scripts/gen_nemo_baseline.py
Original file line number Diff line number Diff line change
Expand Up @@ -436,6 +436,16 @@ def main():
help="dump per-token/word timestamps + max_prob confidence for both "
"heads (TDT/RNNT + CTC) instead of the encoder-stage baseline.",
)
ap.add_argument(
"--att-context-size",
type=int,
default=None,
help="if set to W, switch the encoder to NeMo local (Longformer) "
"attention via change_attention_model('rel_pos_local_attn', [W, W]) "
"before the forward, so the dumped pos_emb (2W+1), l0_attn_in/out and "
"transcripts reflect banded local attention. Anchors the C++ "
"banded-attention parity tests at NeMo quality.",
)
args = ap.parse_args()

is_local = pathlib.Path(args.model).exists()
Expand All @@ -452,6 +462,22 @@ def main():
# Determinism: zero the spectrogram dither so the mel is reproducible.
m.preprocessor.featurizer.dither = 0.0

# Optional: switch to NeMo local (Longformer) attention so the dumped
# baseline anchors the C++ banded-attention path. Must run BEFORE the hooks
# below, since change_attention_model swaps the pos_enc and self_attn modules
# (a hook registered on the old module would never fire). Mirrors the v3
# model card's long-audio recipe.
if args.att_context_size is not None:
w = args.att_context_size
m.change_attention_model("rel_pos_local_attn", [w, w])
try:
m.change_subsampling_conv_chunking_factor(1)
except Exception as e: # pragma: no cover - older NeMo without the API
print(f"note: change_subsampling_conv_chunking_factor skipped: {e}",
file=sys.stderr)
print(f"local attention: rel_pos_local_attn att_context_size=[{w},{w}]",
file=sys.stderr)

# --timestamps: dump per-token/word timestamps + max_prob confidence for both
# heads, then return. Kept as a separate early path so the encoder-stage
# baseline behaviour below is completely untouched.
Expand Down
14 changes: 12 additions & 2 deletions src/backend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,19 @@

namespace pk {

// Size in bytes of the gallocr buffer, as recorded after the most recent
// successful single-backend (CPU) graph allocation. Stays 0 until one happens.
static size_t g_last_graph_alloc_bytes = 0;

// Test hook: exposes the recorded size so tests can assert that attention
// memory scales O(T*window) rather than O(T^2).
size_t last_graph_alloc_bytes() {
    return g_last_graph_alloc_bytes;
}

namespace {
// Number of graph nodes the metadata context must hold. The biggest single
// graph today is a streaming conformer layer (~150 nodes); leave generous head
// room for Task 2's fused encoder (~85 layers worth of ops in one graph).
// graph today is the fused encoder. The older pad-and-shift banded attention
// added O(window) ops per layer (~6*(2W+1) nodes), which forced a small window
// cap to stay within this budget. The encoder now uses the chunk-matmul
// construction (O(1) nodes per layer regardless of window; see
// local_attn_window), so the window is no longer limited by kGraphSize.
// Do not bump this globally: it regresses small models (~+22% on
// tdt_ctc-110m) because the per-compute context + graph hash-set scale with
// kGraphSize.
constexpr size_t kGraphSize = 16384;

struct PendingInput {
Expand Down Expand Up @@ -242,6 +251,7 @@ bool Backend::compute(const std::function<ggml_tensor*(ggml_context*)>& build,
}
alloc_ok = ggml_gallocr_alloc_graph(impl_->galloc, gf);
if (!alloc_ok) PK_LOG("Backend::compute: ggml_gallocr_alloc_graph failed");
else g_last_graph_alloc_bytes = ggml_gallocr_get_buffer_size(impl_->galloc, 0);
}
if (!alloc_ok) {
impl_->pending.clear();
Expand Down
26 changes: 18 additions & 8 deletions src/conformer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -286,11 +286,14 @@ ggml_tensor* ConformerLayer::build_graph_batched(ggml_context* ctx,
ggml_tensor* xt, int T, int B,
ggml_tensor* pe, int pos_len,
const std::vector<int>& valid_len,
GraphInputPool& pool) const {
GraphInputPool& pool,
int att_left, int att_right) const {
const int D = d_model_;
const int K = conv_kernel_;
const float ln_eps = 1e-5f; // LayerNorm eps (NeMo nn.LayerNorm default)
assert(pos_len == 2 * T - 1);
const bool local_attn = att_left >= 0;
assert(local_attn ? (pos_len == att_left + att_right + 1)
: (pos_len == 2 * T - 1));

const std::string pre = "encoder.layers." + std::to_string(layer_idx_) + ".";
const ModelLoader& ml = ml_;
Expand Down Expand Up @@ -332,8 +335,10 @@ ggml_tensor* ConformerLayer::build_graph_batched(ggml_context* ctx,
// === Stage B: r = r + self_attn(norm_self_att(r)). ===
ggml_tensor* attn_in = layer_norm(r, "norm_self_att");
RelPosAttention attn(ml_, layer_idx_);
ggml_tensor* attn_out = attn.build_graph_batched(ctx, attn_in, T, B, pe,
pos_len, valid_len, pool); // [D, T, B]
ggml_tensor* attn_out = local_attn
? attn.build_graph_batched_local_chunked(ctx, attn_in, T, B, pe, pos_len, valid_len,
att_left, att_right, pool) // [D, T, B]
: attn.build_graph_batched(ctx, attn_in, T, B, pe, pos_len, valid_len, pool);
r = ggml_add(ctx, r, attn_out);

// === Stage C: r = r + conv(norm_conv(r)). ===
Expand All @@ -359,11 +364,14 @@ ggml_tensor* ConformerLayer::build_graph_batched(ggml_context* ctx,
ggml_tensor* ConformerLayer::build_graph(ggml_context* ctx, ggml_tensor* xt,
int T, ggml_tensor* pe, int pos_len,
int valid_len,
GraphInputPool& pool) const {
GraphInputPool& pool,
int att_left, int att_right) const {
const int D = d_model_;
const int K = conv_kernel_;
const float ln_eps = 1e-5f; // LayerNorm eps (NeMo nn.LayerNorm default)
assert(pos_len == 2 * T - 1);
const bool local_attn = att_left >= 0;
assert(local_attn ? (pos_len == att_left + att_right + 1)
: (pos_len == 2 * T - 1));

const std::string pre = "encoder.layers." + std::to_string(layer_idx_) + ".";
const ModelLoader& ml = ml_;
Expand Down Expand Up @@ -404,8 +412,10 @@ ggml_tensor* ConformerLayer::build_graph(ggml_context* ctx, ggml_tensor* xt,
// === Stage B: r = r + self_attn(norm_self_att(r)). ===
ggml_tensor* attn_in = layer_norm(r, "norm_self_att");
RelPosAttention attn(ml_, layer_idx_);
ggml_tensor* attn_out = attn.build_graph(ctx, attn_in, T, pe, pos_len,
valid_len, pool); // [D, T]
ggml_tensor* attn_out = local_attn
? attn.build_graph_local_chunked(ctx, attn_in, T, pe, pos_len, valid_len,
att_left, att_right, pool) // [D, T]
: attn.build_graph(ctx, attn_in, T, pe, pos_len, valid_len, pool); // [D, T]
r = ggml_add(ctx, r, attn_out);

// === Stage C: r = r + conv(norm_conv(r)). ===
Expand Down
12 changes: 10 additions & 2 deletions src/conformer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,16 +55,24 @@ class ConformerLayer {
// This is the unit reused by the fused encoder AND the unit test; computing
// the entire layer as ONE sub-graph (vs the old 5 sub-graphs) is what lets
// the fused encoder be a single graph.
// When att_left/att_right >= 0, the self-attention uses NeMo
// rel_pos_local_attn (banded, O(T*window)): `pe` must then be the LOCAL
// positional encoding [d_model, att_left+att_right+1]. Defaults (-1, -1) keep
// full attention with `pe` = [d_model, 2T-1].
ggml_tensor* build_graph(ggml_context* ctx, ggml_tensor* xt, int T,
ggml_tensor* pe, int pos_len, int valid_len,
GraphInputPool& pool) const;
GraphInputPool& pool,
int att_left = -1, int att_right = -1) const;

// Batched GRAPH-BUILDER. `xt` is [D, T, B]; `pe` is [D, pos_len] (shared
// across the batch). `valid_len` is per item (size B). Returns [D, T, B].
// att_left/att_right >= 0 routes self-attention to banded local attention
// (pe = LOCAL [d_model, att_left+att_right+1]); defaults (-1,-1) = full.
ggml_tensor* build_graph_batched(ggml_context* ctx, ggml_tensor* xt, int T,
int B, ggml_tensor* pe, int pos_len,
const std::vector<int>& valid_len,
GraphInputPool& pool) const;
GraphInputPool& pool,
int att_left = -1, int att_right = -1) const;

// x: [T, d_model]; pos_emb: [pos_len=2T-1, d_model]; out: [T, d_model].
void forward(const std::vector<float>& x, int T,
Expand Down
48 changes: 40 additions & 8 deletions src/encoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,32 @@
#include "ggml.h"
#include <cassert>
#include <cmath>
#include <cstdlib>
#include <vector>

namespace pk {

// Decide the self-attention window for an encoder of Tp frames. Returns W>0 to
// use NeMo rel_pos_local_attn [W,W] (banded, O(T*window)); -1 for full attention.
//
// The attention uses the chunk-matmul banded path (build_graph_local_chunked),
// which emits O(1) graph nodes regardless of window, so W can go to NeMo's full
// [128,128] without overflowing the metadata-context budget (backend.cpp
// kGraphSize). (The older pad-and-shift path emitted ~6*(2W+1) nodes/layer,
// which is why this was capped at 32.)
//
// PARAKEET_ATT_CONTEXT overrides the automatic choice:
//   * a positive integer W  -> window W, clamped to kMaxLocalWindow;
//   * 0 or a negative value -> force full attention;
//   * empty / non-numeric   -> ignored (automatic policy applies). Treating
//     such a value as 0 would silently force O(T^2) attention on long audio,
//     which is exactly the OOM this function exists to prevent.
// Parsing uses strtol rather than atoi: atoi has undefined behaviour when the
// value is out of range, whereas strtol saturates and the clamp then applies.
static int local_attn_window(int Tp) {
    constexpr int kMaxLocalWindow = 128;
    // Auto: long audio (~>11 min at 8x subsampling) switches to local attention
    // so full O(T^2) attention can't OOM the device.
    constexpr int kLocalThreshold = 8192;

    if (const char* e = std::getenv("PARAKEET_ATT_CONTEXT")) {
        char* end = nullptr;
        const long w = std::strtol(e, &end, 10);
        if (end != e) {  // at least one digit was consumed
            if (w <= 0) return -1;  // 0 / negative -> force full attention
            return w > kMaxLocalWindow ? kMaxLocalWindow : static_cast<int>(w);
        }
        // No digits parsed: fall through to the automatic policy.
    }
    return Tp > kLocalThreshold ? kMaxLocalWindow : -1;
}

Encoder::Encoder(const ModelLoader& ml)
: ml_(ml) {
d_model_ = (int)ml.config().d_model;
Expand Down Expand Up @@ -60,18 +82,24 @@ void Encoder::forward_capture(const std::vector<float>& mel, int n_mels, int T,
x = ggml_scale(ctx, x, std::sqrt((float)d_model_));
}

// ---- 3. Relative positional encoding pos_emb [d_model, 2T'-1]. ----
const int pos_len = 2 * Tp - 1;
// ---- 3. Positional encoding. Long audio uses NeMo
// rel_pos_local_attn (banded, O(T*window)) so attention can't
// OOM; short audio keeps full attention (NeMo-exact). ----
const int att_w = local_attn_window(Tp);
const bool local = att_w > 0;
const int pos_len = local ? (2 * att_w + 1) : (2 * Tp - 1);
std::vector<float>& pe_host = pool.alloc_f32();
rel_pos_encoding(Tp, d_model_, pe_host); // row-major [pos_len, d_model]
if (local) local_rel_pos_encoding(att_w, att_w, d_model_, pe_host);
else rel_pos_encoding(Tp, d_model_, pe_host); // [pos_len, d_model]
int64_t pe_ne[2] = {d_model_, pos_len};
ggml_tensor* pe = pk::graph_input_tensor(ctx, GGML_TYPE_F32, 2, pe_ne,
pe_host.data(), pe_host.size() * sizeof(float));

// ---- 4. Conformer layer stack (all in-graph). ----
for (int i = 0; i < n_layers_; ++i) {
ConformerLayer layer(ml_, i);
x = layer.build_graph(ctx, x, Tp, pe, pos_len, valid_len, pool);
x = layer.build_graph(ctx, x, Tp, pe, pos_len, valid_len, pool,
local ? att_w : -1, local ? att_w : -1);
// Capture requested layer outputs from the SAME graph (row-major
// [T', d_model], matching the layer output orientation).
for (size_t c = 0; c < capture_layers.size(); ++c) {
Expand Down Expand Up @@ -122,18 +150,22 @@ void Encoder::forward_batch(const MelBatch& mels,
// ---- 2. xscaling (gated; off for this model). ----
if (xscaling_) x = ggml_scale(ctx, x, std::sqrt((float)d_model_));

// ---- 3. Relative positional encoding pos_emb [d_model, 2T'-1]. ----
const int pos_len = 2 * Tp - 1;
// ---- 3. Positional encoding (local for long audio; see B=1 path). ----
const int att_w = local_attn_window(Tp);
const bool local = att_w > 0;
const int pos_len = local ? (2 * att_w + 1) : (2 * Tp - 1);
std::vector<float>& pe_host = pool.alloc_f32();
rel_pos_encoding(Tp, d_model_, pe_host); // row-major [pos_len, d_model]
if (local) local_rel_pos_encoding(att_w, att_w, d_model_, pe_host);
else rel_pos_encoding(Tp, d_model_, pe_host); // [pos_len, d_model]
int64_t pe_ne[2] = {d_model_, pos_len};
ggml_tensor* pe = pk::graph_input_tensor(ctx, GGML_TYPE_F32, 2, pe_ne,
pe_host.data(), pe_host.size() * sizeof(float));

// ---- 4. Conformer layer stack (all in-graph, shared pe). ----
for (int i = 0; i < n_layers_; ++i) {
ConformerLayer layer(ml_, i);
x = layer.build_graph_batched(ctx, x, Tp, mels.B, pe, pos_len, vout, pool);
x = layer.build_graph_batched(ctx, x, Tp, mels.B, pe, pos_len, vout, pool,
local ? att_w : -1, local ? att_w : -1);
}
return x; // [d_model, Tp, B]
}, flat);
Expand Down
Loading
Loading