Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
d017602
feat(nemo-conformer-tdt): port Nemo Conformer TDT model and ASR pipeline
ysdede Feb 28, 2026
964bc8f
fix(nemo-conformer-tdt): handle empty token decode output
ysdede Feb 28, 2026
fa9bc25
chore(nemo-conformer-tdt): keep typegen compatibility for transcribe/…
ysdede Mar 1, 2026
63aeee8
refactor(nemo-conformer-tdt): redesign transcribe output shape and API
ysdede Mar 1, 2026
f6835ad
fix(nemo-conformer-tdt): round timestamps and confidences, simplify p…
ysdede Mar 1, 2026
2dd36a1
fix: dispose tensors on error path, decouple frame confidences from t…
ysdede Mar 1, 2026
10977df
merge: sync with upstream (ModelRegistry API refactor, commit 4811a61)
ysdede Mar 1, 2026
39d9be4
feat: integrate NemoConformerTDT with ModelRegistry API
ysdede Mar 1, 2026
3d984e5
style: replace console.warn with logger.warn in feature extractor
ysdede Mar 1, 2026
9f3a220
fix(nemo-conformer-tdt): harden edge cases, restore pipeline design
ysdede Mar 1, 2026
3bac1dc
test(nemo-conformer-tdt): rewrite tests to match current API
ysdede Mar 1, 2026
c75ebd2
fix(nemo-conformer-tdt): harden decoding and feature utilities
ysdede Mar 2, 2026
493a588
fix(nemo-conformer-tdt): address critical review issues
ysdede Mar 3, 2026
5b4cdab
fix(nemo-conformer-tdt): clamp timestamps and validate cache limits
ysdede Mar 3, 2026
7690227
fix(nemo-conformer-tdt): close remaining tensor disposal leaks
ysdede Mar 3, 2026
1f065c3
fix(nemo-conformer-tdt): dispose auxiliary decoder outputs
ysdede Mar 3, 2026
ec09a09
perf(nemo-conformer-tdt): avoid tolist in length hot path
ysdede Mar 3, 2026
8a90a7c
fix(nemo-conformer-tdt): harden duration and audio validation
ysdede Mar 3, 2026
ce0a3eb
fix: address prioritized review findings
ysdede Mar 3, 2026
5d91d39
fix(nemo-conformer-tdt): apply low-risk hardening follow-ups
ysdede Mar 3, 2026
dfc2c13
fix(nemo-conformer-tdt): enforce named outputs and frame-level confid…
ysdede Mar 3, 2026
a5bd2cf
docs(nemo-conformer-tdt): clarify cached tensor sharing semantics
ysdede Mar 3, 2026
abada62
fix(nemo-conformer-tdt): harden decoder I/O validation and feed cleanup
ysdede Mar 3, 2026
62d8bc0
fix(nemo-conformer-tdt): address bot review findings
ysdede Mar 5, 2026
03fb8bd
style(nemo-conformer-tdt): simplify duration frame expression
ysdede Mar 5, 2026
426061e
fix(nemo-tdt): address PR10 follow-up review comments
ysdede Mar 5, 2026
d7476a6
fix(transformers): resolve Nemo TDT typegen regressions
ysdede Mar 5, 2026
0989f7a
fix(nemo-tdt): address PR11 cache and vocab review feedback
ysdede Mar 5, 2026
49a4af8
merge: sync upstream/main through #1559 and keep Nemo TDT ASR wiring
ysdede Mar 5, 2026
ee819a1
fix(nemo-tdt): add supports() for ASR model class selection
ysdede Mar 5, 2026
b44f7f3
fix(model-registry): include processor files for text-to-audio pipelines
ysdede Mar 5, 2026
bfa97e6
Revert "fix(model-registry): include processor files for text-to-audi…
ysdede Mar 5, 2026
a85dff2
fix(nemo-tdt): address PR #12 reviewer feedback
ysdede Mar 5, 2026
8dfccdd
feat(nemo-tdt): align asr pipeline outputs and long-audio handling
ysdede Mar 7, 2026
816f581
chore(tests): drop unrelated parakeet feature extractor coverage
ysdede Mar 7, 2026
f59ba06
feat(nemo-conformer-tdt): add sentence-based ASR pipeline chunking
ysdede Mar 8, 2026
00b3d93
fix(nemo): scope ASR tests and address review fixes
ysdede Mar 8, 2026
07118c3
fix(nemo-tdt): address follow-up review threads
ysdede Mar 8, 2026
341df3d
chore(asr): restore upstream cast spacing
ysdede Mar 8, 2026
29f2baa
fix(nemo-tdt): handle sparse vocab and merge dedupe
ysdede Mar 8, 2026
39e5cb1
fix(nemo-tdt): merge split domain suffix words
ysdede Mar 8, 2026
495bab5
Revert "fix(nemo-tdt): merge split domain suffix words"
ysdede Mar 8, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions packages/transformers/src/models/feature_extractors.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ export * from './clap/feature_extraction_clap.js';
export * from './dac/feature_extraction_dac.js';
export * from './gemma3n/feature_extraction_gemma3n.js';
export * from './moonshine/feature_extraction_moonshine.js';
export * from './nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js';
export * from './parakeet/feature_extraction_parakeet.js';
export * from './pyannote/feature_extraction_pyannote.js';
export * from './seamless_m4t/feature_extraction_seamless_m4t.js';
Expand Down
1 change: 1 addition & 0 deletions packages/transformers/src/models/modeling_utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ export const MODEL_TYPES = {
ImageAudioTextToText: 13,
Supertonic: 14,
Chatterbox: 15,
NemoConformerTDT: 16,
};

const MODEL_TYPE_CONFIG = {
Expand Down
1 change: 1 addition & 0 deletions packages/transformers/src/models/models.js
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ export * from './mt5/modeling_mt5.js';
export * from './multi_modality/modeling_multi_modality.js';
export * from './musicgen/modeling_musicgen.js';
export * from './nanochat/modeling_nanochat.js';
export * from './nemo_conformer_tdt/modeling_nemo_conformer_tdt.js';
export * from './neobert/modeling_neobert.js';
export * from './nomic_bert/modeling_nomic_bert.js';
export * from './olmo/modeling_olmo.js';
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,266 @@
import { FeatureExtractor, validate_audio_inputs } from '../../feature_extraction_utils.js';
import { Tensor } from '../../utils/tensor.js';
import { mel_filter_bank, spectrogram, window_function } from '../../utils/audio.js';
import { logger } from '../../utils/logger.js';
import { FeatureLRUCache, createAudioCacheKey } from './transducer_cache.js';
import { computeTemporalDeltas } from './transducer_deltas.js';

// Small constant added to the per-feature standard deviation during normalization.
const EPSILON = 1e-5;

// Symbol keys used to attach ownership metadata to feature-extractor outputs
// without exposing it through normal (string-keyed, enumerable) properties.
export const NEMO_FEATURE_OUTPUT_OWNERSHIP = Symbol('NemoConformerTDTFeatureOutputOwnership');
export const NEMO_FEATURE_OUTPUT_RELEASE = Symbol('NemoConformerTDTFeatureOutputRelease');

/**
 * Attaches hidden ownership metadata to a feature output object.
 *
 * The ownership flag is always defined under `NEMO_FEATURE_OUTPUT_OWNERSHIP`.
 * A release callback is defined under `NEMO_FEATURE_OUTPUT_RELEASE` only when
 * a truthy `release` is supplied. Both properties are non-enumerable,
 * non-writable and configurable.
 *
 * @param {object} value The output object to tag (mutated in place).
 * @param {boolean} cacheOwnsTensors Whether the cache owns the tensors held by `value`.
 * @param {Function|null} [release=null] Optional callback stored alongside the flag.
 * @returns {object} The same `value` instance, for chaining.
 */
function tagNemoFeatureOutputOwnership(value, cacheOwnsTensors, release = null) {
    const hiddenDescriptor = (payload) => ({
        value: payload,
        enumerable: false,
        configurable: true,
    });

    Object.defineProperty(value, NEMO_FEATURE_OUTPUT_OWNERSHIP, hiddenDescriptor(cacheOwnsTensors));

    if (!release) {
        return value;
    }

    Object.defineProperty(value, NEMO_FEATURE_OUTPUT_RELEASE, hiddenDescriptor(release));
    return value;
}

/**
 * Feature extractor for Nemo Conformer TDT models.
 *
 * Mirrors NeMo-style log-mel extraction used by Parakeet with configurable
 * `feature_size` (e.g. 80 or 128 mel bins via `preprocessor_config.json`).
 *
 * Optional extras, all driven by the config:
 * - `use_feature_cache`: keep extracted features in a `FeatureLRUCache`.
 * - `delta_order` / `delta_window` / `delta_concatenate`: temporal delta post-processing.
 */
export class NemoConformerTDTFeatureExtractor extends FeatureExtractor {
    /**
     * @param {Object} config Preprocessor configuration. Reads `n_fft`, `win_length`,
     * `hop_length`, `feature_size`, `sampling_rate`, and optionally `mel_filters`,
     * `preemphasis`, `use_feature_cache`, `feature_cache_max_entries`,
     * `feature_cache_max_size_mb`, `delta_order`, `delta_window`, `delta_concatenate`.
     * @throws {Error} If `n_fft`, `win_length`, `hop_length`, `delta_order` or
     * `delta_window` are invalid.
     */
    constructor(config) {
        super(config);

        if (!Number.isInteger(this.config.n_fft) || this.config.n_fft <= 0) {
            throw new Error(
                `NemoConformerTDTFeatureExtractor expected \`n_fft\` as a positive integer, got ${this.config.n_fft}.`,
            );
        }
        if (
            !Number.isInteger(this.config.win_length) ||
            this.config.win_length <= 0 ||
            this.config.win_length > this.config.n_fft
        ) {
            throw new Error(
                `NemoConformerTDTFeatureExtractor expected \`win_length\` in [1, n_fft], got win_length=${this.config.win_length}, n_fft=${this.config.n_fft}.`,
            );
        }
        // `hop_length` is used as a divisor when computing the valid frame count in `_extract`.
        // A missing or non-numeric value would turn that count into NaN and silently zero out
        // every feature and the whole attention mask, so reject it up front.
        if (!Number.isInteger(this.config.hop_length) || this.config.hop_length <= 0) {
            throw new Error(
                `NemoConformerTDTFeatureExtractor expected \`hop_length\` as a positive integer, got ${this.config.hop_length}.`,
            );
        }

        // Prefer given `mel_filters` from preprocessor_config.json, or calculate them if they don't exist.
        this.config.mel_filters ??= mel_filter_bank(
            Math.floor(1 + this.config.n_fft / 2), // num_frequency_bins
            this.config.feature_size, // num_mel_filters
            0.0, // min_frequency
            this.config.sampling_rate / 2, // max_frequency
            this.config.sampling_rate, // sampling_rate
            'slaney', // norm
            'slaney', // mel_scale
        );

        const window = window_function(this.config.win_length, 'hann', {
            periodic: false,
        });

        // Center the `win_length` window inside a zero-filled buffer of `n_fft` samples.
        this.window = new Float64Array(this.config.n_fft);
        const offset = Math.floor((this.config.n_fft - this.config.win_length) / 2);
        this.window.set(window, offset);

        // Optional feature-level cache and delta/delta-delta post-processing.
        this.use_feature_cache = this.config.use_feature_cache ?? false;
        this.delta_order = this.config.delta_order ?? 0;
        this.delta_window = this.config.delta_window ?? 2;
        this.delta_concatenate = this.config.delta_concatenate ?? true;

        if (![0, 1, 2].includes(this.delta_order)) {
            throw new Error(
                `NemoConformerTDTFeatureExtractor expected delta_order in {0,1,2}, got ${this.delta_order}.`,
            );
        }
        if (!Number.isInteger(this.delta_window) || this.delta_window < 1) {
            throw new Error(
                `NemoConformerTDTFeatureExtractor expected \`delta_window\` as a positive integer, got ${this.delta_window}.`,
            );
        }
        if (this.delta_order > 0 && !this.delta_concatenate) {
            logger.warn(
                'NemoConformerTDTFeatureExtractor: `delta_concatenate=false` is set. ' +
                    '`input_features` will remain base features and deltas are returned in extra fields.',
            );
        }

        this.feature_cache = this.use_feature_cache
            ? new FeatureLRUCache({
                  max_entries: this.config.feature_cache_max_entries ?? 128,
                  max_size_mb: this.config.feature_cache_max_size_mb ?? 64,
              })
            : null;
    }

    /**
     * Computes the log-Mel spectrogram of the provided audio waveform.
     * The input is copied before preemphasis, so the caller's buffer is not modified.
     * @param {Float32Array|Float64Array} waveform The audio waveform to process.
     * @returns {Promise<Tensor>} The tensor produced by `spectrogram` (requested with
     * `transpose: true`); `_extract` reads its dims as `[num_frames, num_features]`.
     * @throws {Error} If `config.preemphasis` is not a finite number in [0, 1).
     */
    async _extract_fbank_features(waveform) {
        const preemphasis = this.config.preemphasis ?? 0;
        if (!Number.isFinite(preemphasis) || preemphasis < 0 || preemphasis >= 1) {
            throw new Error(
                `NemoConformerTDTFeatureExtractor expected \`preemphasis\` in [0, 1), got ${this.config.preemphasis}.`,
            );
        }
        waveform = new Float64Array(waveform); // Clone to avoid destructive changes
        if (preemphasis !== 0) {
            // Apply preemphasis to the entire waveform at once. Iterating backwards means
            // each sample is combined with the still-unfiltered previous sample.
            for (let j = waveform.length - 1; j >= 1; --j) {
                waveform[j] -= preemphasis * waveform[j - 1];
            }
        }

        const features = await spectrogram(
            waveform,
            this.window, // window
            this.window.length, // frame_length
            this.config.hop_length, // hop_length
            {
                fft_length: this.config.n_fft,
                power: 2.0,
                mel_filters: this.config.mel_filters,
                log_mel: 'log',
                mel_floor: -Infinity,
                pad_mode: 'constant',
                center: true,

                // Custom
                transpose: true,
                mel_offset: 2 ** -24,
            },
        );

        return features;
    }

    /**
     * Asynchronously extracts features from a given audio using the provided configuration.
     *
     * The returned object is tagged (via non-enumerable symbol properties) with whether
     * the cache owns its tensors and, for cache-borrowed results, a release callback.
     * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array.
     * @returns {Promise<{
     *   input_features: Tensor;
     *   attention_mask: Tensor;
     *   delta_features?: Tensor;
     *   delta_delta_features?: Tensor;
     * }>} A Promise resolving to an object containing extracted model inputs.
     * When cache is enabled, tensor instances are shared and owned by the cache.
     * Do not mutate or dispose returned tensors unless cache is disabled/cleared.
     */
    async _call(audio) {
        validate_audio_inputs(audio, 'NemoConformerTDTFeatureExtractor');

        if (this.feature_cache) {
            // The key combines the audio key with every delta option that changes the output.
            const key = `${createAudioCacheKey(audio, this.config.sampling_rate)}:${this.delta_order}:${this.delta_window}:${this.delta_concatenate}`;
            const cached = this.feature_cache.acquire(key);
            if (cached) {
                return tagNemoFeatureOutputOwnership({ ...cached.value }, true, cached.release);
            }

            const extracted = await this._extract(audio);
            const cacheOwnsTensors = this.feature_cache.set(key, extracted);
            if (!cacheOwnsTensors) {
                // The cache declined the entry: hand the tensors to the caller.
                return tagNemoFeatureOutputOwnership({ ...extracted }, false);
            }

            const borrowed = this.feature_cache.acquire(key);
            if (!borrowed) {
                return tagNemoFeatureOutputOwnership({ ...extracted }, false);
            }
            return tagNemoFeatureOutputOwnership({ ...borrowed.value }, true, borrowed.release);
        }

        return tagNemoFeatureOutputOwnership(await this._extract(audio), false);
    }

    /**
     * Runs the uncached extraction: log-mel features, zeroing of padded frames,
     * per-feature mean/std normalization over the valid frames, attention mask
     * construction, and optional temporal deltas.
     * @param {Float32Array|Float64Array} audio The audio data.
     * @returns {Promise<{
     *   input_features: Tensor;
     *   attention_mask: Tensor;
     *   delta_features?: Tensor;
     *   delta_delta_features?: Tensor;
     * }>} The extracted model inputs. `delta_features` / `delta_delta_features` are set
     * only when `computeTemporalDeltas` returns a non-Tensor result.
     */
    async _extract(audio) {
        const features = await this._extract_fbank_features(audio);

        const [num_frames, num_features] = features.dims;
        const raw_features_length = Math.floor(
            (audio.length + Math.floor(this.config.n_fft / 2) * 2 - this.config.n_fft) / this.config.hop_length,
        );
        // Clamp to [0, num_frames] to avoid a negative fill offset for very short clips.
        const features_length = Math.max(0, Math.min(num_frames, raw_features_length));

        // Zero out every frame past the valid length (padding).
        const features_data = /** @type {Float32Array} */ (features.data);
        features_data.fill(0, features_length * num_features);

        // normalize mel features, ignoring padding
        const sum = new Float64Array(num_features);
        const sum_sq = new Float64Array(num_features);

        for (let i = 0; i < features_length; ++i) {
            const offset = i * num_features;
            for (let j = 0; j < num_features; ++j) {
                const val = features_data[offset + j];
                sum[j] += val;
                sum_sq[j] += val * val;
            }
        }

        // Skip normalization for empty/very short audio to avoid NaN from divide-by-zero.
        if (features_length > 0) {
            // Calculate mean and standard deviation, then normalize.
            // Uses an (n - 1) divisor, falling back to 1 when only a single frame is valid.
            const divisor = features_length > 1 ? features_length - 1 : 1;
            for (let j = 0; j < num_features; ++j) {
                const mean = sum[j] / features_length;
                const variance = (sum_sq[j] - features_length * mean * mean) / divisor;
                // Clamp at 0 so rounding error cannot produce a negative variance.
                const std = Math.sqrt(Math.max(variance, 0)) + EPSILON;
                const inv_std = 1 / std;

                for (let i = 0; i < features_length; ++i) {
                    const index = i * num_features + j;
                    features_data[index] = (features_data[index] - mean) * inv_std;
                }
            }
        }

        // 1 for valid frames, 0 for padding.
        const mask_data = new BigInt64Array(num_frames);
        mask_data.fill(1n, 0, features_length);

        let input_features = features.unsqueeze_(0);
        const attention_mask = new Tensor('int64', mask_data, [1, num_frames]);

        const result = {
            input_features,
            attention_mask,
        };

        if (this.delta_order > 0) {
            const delta_result = computeTemporalDeltas(input_features, {
                order: this.delta_order,
                window: this.delta_window,
                concatenate: this.delta_concatenate,
            });
            if (delta_result instanceof Tensor) {
                // A single tensor replaces the base features, which are no longer needed.
                input_features.dispose();
                input_features = delta_result;
                result.input_features = input_features;
            } else {
                result.delta_features = delta_result.delta;
                if (delta_result.delta_delta) {
                    result.delta_delta_features = delta_result.delta_delta;
                }
            }
        }

        return result;
    }

    /**
     * Clears the feature cache, if one is enabled.
     */
    clear_cache() {
        this.feature_cache?.clear();
    }

    /**
     * @returns {any} The cache's `stats()` result, or `null` when caching is
     * disabled (or `stats()` returns a nullish value).
     */
    get_cache_stats() {
        return this.feature_cache?.stats() ?? null;
    }
}
Loading