From d0176023a8ea4a48343f62131c5d4f2ff447da3e Mon Sep 17 00:00:00 2001 From: ysdede Date: Sat, 28 Feb 2026 22:52:41 +0300 Subject: [PATCH 01/33] feat(nemo-conformer-tdt): port Nemo Conformer TDT model and ASR pipeline --- .../src/models/feature_extractors.js | 1 + packages/transformers/src/models/models.js | 1 + .../feature_extraction_nemo_conformer_tdt.js | 204 ++++++ .../modeling_nemo_conformer_tdt.js | 597 ++++++++++++++++++ .../processing_nemo_conformer_tdt.js | 19 + .../nemo_conformer_tdt/transducer_cache.js | 119 ++++ .../nemo_conformer_tdt/transducer_deltas.js | 69 ++ .../nemo_conformer_tdt/transducer_text.js | 73 +++ .../utils_nemo_conformer_tdt.js | 4 + .../transformers/src/models/processors.js | 1 + packages/transformers/src/models/registry.js | 1 + packages/transformers/src/pipelines.js | 3 +- .../pipelines/automatic-speech-recognition.js | 139 ++++ ...t_feature_extraction_nemo_conformer_tdt.js | 78 +++ .../test_modeling_nemo_conformer_tdt.js | 188 ++++++ .../test_feature_extraction_parakeet.js | 45 ++ ..._pipelines_automatic_speech_recognition.js | 133 ++++ 17 files changed, 1674 insertions(+), 1 deletion(-) create mode 100644 packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js create mode 100644 packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js create mode 100644 packages/transformers/src/models/nemo_conformer_tdt/processing_nemo_conformer_tdt.js create mode 100644 packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js create mode 100644 packages/transformers/src/models/nemo_conformer_tdt/transducer_deltas.js create mode 100644 packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js create mode 100644 packages/transformers/src/models/nemo_conformer_tdt/utils_nemo_conformer_tdt.js create mode 100644 packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js create mode 100644 
packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js create mode 100644 packages/transformers/tests/models/parakeet/test_feature_extraction_parakeet.js diff --git a/packages/transformers/src/models/feature_extractors.js b/packages/transformers/src/models/feature_extractors.js index 589b96cd7..48ecc6ff6 100644 --- a/packages/transformers/src/models/feature_extractors.js +++ b/packages/transformers/src/models/feature_extractors.js @@ -5,6 +5,7 @@ export * from './clap/feature_extraction_clap.js'; export * from './dac/feature_extraction_dac.js'; export * from './gemma3n/feature_extraction_gemma3n.js'; export * from './moonshine/feature_extraction_moonshine.js'; +export * from './nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js'; export * from './parakeet/feature_extraction_parakeet.js'; export * from './pyannote/feature_extraction_pyannote.js'; export * from './seamless_m4t/feature_extraction_seamless_m4t.js'; diff --git a/packages/transformers/src/models/models.js b/packages/transformers/src/models/models.js index 9c6d6f2dd..2fe9055a0 100644 --- a/packages/transformers/src/models/models.js +++ b/packages/transformers/src/models/models.js @@ -102,6 +102,7 @@ export * from './mpt/modeling_mpt.js'; export * from './mt5/modeling_mt5.js'; export * from './multi_modality/modeling_multi_modality.js'; export * from './musicgen/modeling_musicgen.js'; +export * from './nemo_conformer_tdt/modeling_nemo_conformer_tdt.js'; export * from './nanochat/modeling_nanochat.js'; export * from './neobert/modeling_neobert.js'; export * from './nomic_bert/modeling_nomic_bert.js'; diff --git a/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js new file mode 100644 index 000000000..f1bfe6b76 --- /dev/null +++ b/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js @@ -0,0 
+1,204 @@ +import { FeatureExtractor, validate_audio_inputs } from '../../feature_extraction_utils.js'; +import { Tensor } from '../../utils/tensor.js'; +import { mel_filter_bank, spectrogram, window_function } from '../../utils/audio.js'; +import { FeatureLRUCache, createAudioCacheKey } from './transducer_cache.js'; +import { computeTemporalDeltas } from './transducer_deltas.js'; + +const EPSILON = 1e-5; + +/** + * Feature extractor for Nemo Conformer TDT models. + * + * Mirrors NeMo-style log-mel extraction used by Parakeet with configurable + * `feature_size` (e.g. 80 or 128 mel bins via `preprocessor_config.json`). + */ +export class NemoConformerTDTFeatureExtractor extends FeatureExtractor { + constructor(config) { + super(config); + + // Prefer given `mel_filters` from preprocessor_config.json, or calculate them if they don't exist. + this.config.mel_filters ??= mel_filter_bank( + Math.floor(1 + this.config.n_fft / 2), // num_frequency_bins + this.config.feature_size, // num_mel_filters + 0.0, // min_frequency + this.config.sampling_rate / 2, // max_frequency + this.config.sampling_rate, // sampling_rate + 'slaney', // norm + 'slaney', // mel_scale + ); + + const window = window_function(this.config.win_length, 'hann', { + periodic: false, + }); + + this.window = new Float64Array(this.config.n_fft); + const offset = Math.floor((this.config.n_fft - this.config.win_length) / 2); + this.window.set(window, offset); + + // Optional feature-level cache and delta/delta-delta post-processing. + this.use_feature_cache = this.config.use_feature_cache ?? false; + this.delta_order = this.config.delta_order ?? 0; + this.delta_window = this.config.delta_window ?? 2; + this.delta_concatenate = this.config.delta_concatenate ?? 
true; + + if (![0, 1, 2].includes(this.delta_order)) { + throw new Error( + `NemoConformerTDTFeatureExtractor expected delta_order in {0,1,2}, got ${this.delta_order}.`, + ); + } + if (this.delta_order > 0 && !this.delta_concatenate) { + console.warn( + 'NemoConformerTDTFeatureExtractor: `delta_concatenate=false` is set. ' + + '`input_features` will remain base features and deltas are returned in extra fields.', + ); + } + + this.feature_cache = this.use_feature_cache + ? new FeatureLRUCache({ + max_entries: this.config.feature_cache_max_entries ?? 128, + max_size_mb: this.config.feature_cache_max_size_mb ?? 64, + }) + : null; + } + + /** + * Computes the log-Mel spectrogram of the provided audio waveform. + * @param {Float32Array|Float64Array} waveform The audio waveform to process. + * @returns {Promise} An object containing the log-Mel spectrogram data as a Float32Array and its dimensions as an array of numbers. + */ + async _extract_fbank_features(waveform) { + // Parakeet uses a custom preemphasis strategy: Apply preemphasis to entire waveform at once + const preemphasis = this.config.preemphasis; + waveform = new Float64Array(waveform); // Clone to avoid destructive changes + for (let j = waveform.length - 1; j >= 1; --j) { + waveform[j] -= preemphasis * waveform[j - 1]; + } + + const features = await spectrogram( + waveform, + this.window, // window + this.window.length, // frame_length + this.config.hop_length, // hop_length + { + fft_length: this.config.n_fft, + power: 2.0, + mel_filters: this.config.mel_filters, + log_mel: 'log', + mel_floor: -Infinity, + pad_mode: 'constant', + center: true, + + // Custom + transpose: true, + mel_offset: 2 ** -24, + }, + ); + + return features; + } + + /** + * Asynchronously extracts features from a given audio using the provided configuration. + * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array. 
+ * @returns {Promise<{ + * input_features: Tensor; + * attention_mask: Tensor; + * delta_features?: Tensor; + * delta_delta_features?: Tensor; + * }>} A Promise resolving to an object containing extracted model inputs. + */ + async _call(audio) { + validate_audio_inputs(audio, 'NemoConformerTDTFeatureExtractor'); + + if (this.feature_cache) { + const key = `${createAudioCacheKey(audio, this.config.sampling_rate)}:${this.delta_order}:${this.delta_window}:${this.delta_concatenate}`; + const cached = this.feature_cache.get(key); + if (cached) { + return cached; + } + + const extracted = await this._extract(audio); + this.feature_cache.set(key, extracted); + return extracted; + } + + return await this._extract(audio); + } + + async _extract(audio) { + const features = await this._extract_fbank_features(audio); + + const features_length = Math.floor( + (audio.length + Math.floor(this.config.n_fft / 2) * 2 - this.config.n_fft) / this.config.hop_length, + ); + + const features_data = /** @type {Float32Array} */ (features.data); + features_data.fill(0, features_length * features.dims[1]); + + // normalize mel features, ignoring padding + const [num_frames, num_features] = features.dims; + const sum = new Float64Array(num_features); + const sum_sq = new Float64Array(num_features); + + for (let i = 0; i < features_length; ++i) { + const offset = i * num_features; + for (let j = 0; j < num_features; ++j) { + const val = features_data[offset + j]; + sum[j] += val; + sum_sq[j] += val * val; + } + } + + // Calculate mean and standard deviation, then normalize + const divisor = features_length > 1 ? 
features_length - 1 : 1; + for (let j = 0; j < num_features; ++j) { + const mean = sum[j] / features_length; + const variance = (sum_sq[j] - features_length * mean * mean) / divisor; + const std = Math.sqrt(variance) + EPSILON; + const inv_std = 1 / std; + + for (let i = 0; i < features_length; ++i) { + const index = i * num_features + j; + features_data[index] = (features_data[index] - mean) * inv_std; + } + } + + const mask_data = new BigInt64Array(num_frames); + mask_data.fill(1n, 0, features_length); + + let input_features = features.unsqueeze_(0); + const attention_mask = new Tensor('int64', mask_data, [1, num_frames]); + + const result = { + input_features, + attention_mask, + }; + + if (this.delta_order > 0) { + const delta_result = computeTemporalDeltas(input_features, { + order: this.delta_order, + window: this.delta_window, + concatenate: this.delta_concatenate, + }); + if (delta_result instanceof Tensor) { + input_features = delta_result; + result.input_features = input_features; + } else { + result.delta_features = delta_result.delta; + if (delta_result.delta_delta) { + result.delta_delta_features = delta_result.delta_delta; + } + } + } + + return result; + } + + clear_cache() { + this.feature_cache?.clear(); + } + + get_cache_stats() { + return this.feature_cache?.stats() ?? 
null; + } +} diff --git a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js new file mode 100644 index 000000000..c0874b21b --- /dev/null +++ b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js @@ -0,0 +1,597 @@ +import { AutoConfig } from '../../configs.js'; +import { Tensor } from '../../utils/tensor.js'; +import { PreTrainedModel } from '../modeling_utils.js'; +import { constructSessions, sessionRun } from '../session.js'; +import { buildTransducerWordTimestamps, decodeTransducerText } from './transducer_text.js'; + +const NEMO_CONFORMER_TDT_MODEL_TYPE = 'nemo-conformer-tdt'; + +const DEFAULT_TRANSDUCER_IO = Object.freeze({ + encoder_output: 'outputs', + decoder_encoder: 'encoder_outputs', + decoder_token: 'targets', + decoder_token_length: 'target_length', + decoder_state_1: 'input_states_1', + decoder_state_2: 'input_states_2', + decoder_output: 'outputs', + decoder_output_state_1: 'output_states_1', + decoder_output_state_2: 'output_states_2', +}); + +function argmax(values, offset = 0, length = values.length - offset) { + let maxIndex = offset; + let maxValue = Number.NEGATIVE_INFINITY; + const end = offset + length; + for (let i = offset; i < end; ++i) { + const v = values[i]; + if (v > maxValue) { + maxValue = v; + maxIndex = i; + } + } + return maxIndex; +} + +function toInt(value) { + return typeof value === 'bigint' ? Number(value) : value; +} + +function inferEncoderOutputLayout(outputTensor) { + if (outputTensor.dims.length !== 3 || outputTensor.dims[0] !== 1) { + throw new Error( + `Nemo Conformer TDT expected encoder output dims [1, D, T] or [1, T, D], got [${outputTensor.dims.join(', ')}].`, + ); + } + + // Heuristic fallback: in most Nemo exports D > T. + return outputTensor.dims[1] >= outputTensor.dims[2] ? 
'BDT' : 'BTD'; +} + +function resolveTransducerConfig(config, sessions) { + const transducerConfig = config['transformers.js_config']?.transducer; + if (!transducerConfig) { + throw new Error( + 'Missing `transformers.js_config.transducer` in config.json for nemo-conformer-tdt. See external model repo contract.', + ); + } + + const decoderConfig = transducerConfig.decoder ?? {}; + const numLayers = decoderConfig.num_layers; + const hiddenSize = decoderConfig.hidden_size; + + if (!Number.isInteger(numLayers) || numLayers <= 0) { + throw new Error('Invalid `transformers.js_config.transducer.decoder.num_layers`: expected a positive integer.'); + } + if (!Number.isInteger(hiddenSize) || hiddenSize <= 0) { + throw new Error( + 'Invalid `transformers.js_config.transducer.decoder.hidden_size`: expected a positive integer.', + ); + } + + const io = { + ...DEFAULT_TRANSDUCER_IO, + ...(transducerConfig.io ?? {}), + }; + + const decoderSession = sessions?.decoder_model_merged; + if (!decoderSession) { + throw new Error('Missing required session `decoder_model_merged` for Nemo Conformer TDT.'); + } + + const decoderInputNames = decoderSession.inputNames ?? []; + const decoderOutputNames = decoderSession.outputNames ?? []; + const missingDecoderInputs = [ + io.decoder_encoder, + io.decoder_token, + io.decoder_token_length, + io.decoder_state_1, + io.decoder_state_2, + ].filter((name) => !decoderInputNames.includes(name)); + + if (missingDecoderInputs.length > 0) { + throw new Error( + `Nemo Conformer TDT decoder session is missing expected inputs: ${missingDecoderInputs.join(', ')}. 
` + + 'Override I/O names via `transformers.js_config.transducer.io` if your export uses different names.', + ); + } + const missingDecoderOutputs = [io.decoder_output, io.decoder_output_state_1, io.decoder_output_state_2].filter( + (name) => !decoderOutputNames.includes(name), + ); + if (missingDecoderOutputs.length > 0) { + throw new Error( + `Nemo Conformer TDT decoder session is missing expected outputs: ${missingDecoderOutputs.join(', ')}. ` + + 'Override I/O names via `transformers.js_config.transducer.io` if your export uses different names.', + ); + } + + const encoderSession = sessions?.encoder_model; + if (!encoderSession) { + throw new Error('Missing required session `encoder_model` for Nemo Conformer TDT.'); + } + if (!(encoderSession.outputNames ?? []).includes(io.encoder_output)) { + throw new Error( + `Nemo Conformer TDT encoder session is missing expected output: ${io.encoder_output}. ` + + 'Override `transformers.js_config.transducer.io.encoder_output` if your export uses a different name.', + ); + } + + const maxSymbolsPerStep = transducerConfig.max_symbols_per_step ?? 10; + const subsamplingFactor = transducerConfig.subsampling_factor ?? 8; + const frameShiftS = transducerConfig.frame_shift_s ?? 0.01; + const blankTokenId = transducerConfig.blank_token_id ?? 0; + const decoderTokenDType = transducerConfig.decoder_token_dtype ?? 'int32'; + const decoderTokenLengthDType = transducerConfig.decoder_token_length_dtype ?? 
'int32'; + + if (!Number.isInteger(blankTokenId) || blankTokenId < 0) { + throw new Error('Invalid `transformers.js_config.transducer.blank_token_id`: expected a non-negative integer.'); + } + if (!Number.isInteger(maxSymbolsPerStep) || maxSymbolsPerStep <= 0) { + throw new Error( + 'Invalid `transformers.js_config.transducer.max_symbols_per_step`: expected a positive integer.', + ); + } + if (!Number.isFinite(subsamplingFactor) || subsamplingFactor <= 0) { + throw new Error('Invalid `transformers.js_config.transducer.subsampling_factor`: expected a positive number.'); + } + if (!Number.isFinite(frameShiftS) || frameShiftS <= 0) { + throw new Error('Invalid `transformers.js_config.transducer.frame_shift_s`: expected a positive number.'); + } + if (!['int32', 'int64'].includes(decoderTokenDType)) { + throw new Error( + 'Invalid `transformers.js_config.transducer.decoder_token_dtype`: expected "int32" or "int64".', + ); + } + if (!['int32', 'int64'].includes(decoderTokenLengthDType)) { + throw new Error( + 'Invalid `transformers.js_config.transducer.decoder_token_length_dtype`: expected "int32" or "int64".', + ); + } + + return { + blank_token_id: blankTokenId, + max_symbols_per_step: maxSymbolsPerStep, + subsampling_factor: subsamplingFactor, + frame_shift_s: frameShiftS, + vocab_size: transducerConfig.vocab_size ?? config.vocab_size ?? null, + duration_start_index: transducerConfig.duration_start_index ?? null, + encoder_input_layout: transducerConfig.encoder_input_layout ?? 'BTF', + encoder_output_layout: transducerConfig.encoder_output_layout ?? null, + encoder_frame_layout: transducerConfig.encoder_frame_layout ?? 
'BD1', + decoder_token_dtype: decoderTokenDType, + decoder_token_length_dtype: decoderTokenLengthDType, + decoder: { + num_layers: numLayers, + hidden_size: hiddenSize, + }, + io, + }; +} + +export class NemoConformerTDTPreTrainedModel extends PreTrainedModel { + main_input_name = 'input_features'; + forward_params = ['input_features', 'attention_mask']; + + constructor(config, sessions, configs) { + super(config, sessions, configs); + this.transducer = resolveTransducerConfig(config, sessions); + } + + /** + * Load Nemo Conformer TDT sessions using v4 canonical ONNX filenames. + * @type {typeof PreTrainedModel.from_pretrained} + */ + static async from_pretrained( + pretrained_model_name_or_path, + { + progress_callback = null, + config = null, + cache_dir = null, + local_files_only = false, + revision = 'main', + model_file_name = null, + subfolder = 'onnx', + device = null, + dtype = null, + use_external_data_format = null, + session_options = {}, + } = {}, + ) { + const options = { + progress_callback, + config, + cache_dir, + local_files_only, + revision, + model_file_name, + subfolder, + device, + dtype, + use_external_data_format, + session_options, + }; + + config = options.config = await AutoConfig.from_pretrained(pretrained_model_name_or_path, options); + if (config.model_type !== NEMO_CONFORMER_TDT_MODEL_TYPE) { + throw new Error(`Unsupported model type: ${config.model_type}`); + } + + if (options.model_file_name && options.model_file_name !== 'encoder_model') { + throw new Error( + 'NemoConformerForTDT does not support `model_file_name` override. ' + + 'Expected canonical files: `encoder_model{suffix}.onnx` and `decoder_model_merged{suffix}.onnx`.', + ); + } + + let sessions; + try { + sessions = await constructSessions( + pretrained_model_name_or_path, + { + encoder_model: 'encoder_model', + decoder_model_merged: 'decoder_model_merged', + }, + options, + 'decoder_model_merged', + ); + } catch (error) { + const reason = error?.message ?? 
String(error); + throw new Error( + 'Failed to load Nemo Conformer TDT sessions. Expected canonical v4 files under `onnx/`: ' + + '`encoder_model{suffix}.onnx` and `decoder_model_merged{suffix}.onnx`. ' + + `Original error: ${reason}`, + ); + } + + return new this(config, sessions, {}); + } +} + +export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { + async _runEncoder(feeds) { + return await sessionRun(this.sessions.encoder_model, feeds); + } + + async _runDecoder(feeds) { + return await sessionRun(this.sessions.decoder_model_merged, feeds); + } + + _disposeDecoderState(state, keepState = null) { + if (!state) return; + if (state.state1 && state.state1 !== keepState?.state1) { + state.state1.dispose(); + } + if (state.state2 && state.state2 !== keepState?.state2) { + state.state2.dispose(); + } + } + + _getEncoderOutput(outputs) { + const name = this.transducer.io.encoder_output; + return outputs[name] ?? Object.values(outputs)[0]; + } + + _encoderOutputToFrames(encoderOutput) { + const layout = this.transducer.encoder_output_layout ?? inferEncoderOutputLayout(encoderOutput); + const dims = encoderOutput.dims; + const data = encoderOutput.data; + const frames = []; + + if (layout === 'BDT') { + const D = dims[1]; + const T = dims[2]; + for (let t = 0; t < T; ++t) { + const frame = new Float32Array(D); + for (let d = 0; d < D; ++d) { + frame[d] = data[d * T + t]; + } + frames.push(frame); + } + return frames; + } + + if (layout === 'BTD') { + const T = dims[1]; + const D = dims[2]; + for (let t = 0; t < T; ++t) { + const offset = t * D; + frames.push(new Float32Array(data.subarray(offset, offset + D))); + } + return frames; + } + + throw new Error( + `Unsupported encoder output layout "${layout}". 
Use 'BDT' or 'BTD' in transformers.js_config.transducer.`, + ); + } + + _createFrameTensor(frameData) { + const layout = this.transducer.encoder_frame_layout; + if (layout === 'BD1') { + return new Tensor('float32', frameData, [1, frameData.length, 1]); + } else if (layout === 'B1D') { + return new Tensor('float32', frameData, [1, 1, frameData.length]); + } + throw new Error( + `Unsupported encoder frame layout "${layout}". Use 'BD1' or 'B1D' in transformers.js_config.transducer.`, + ); + } + + _buildEncoderFeeds(model_inputs) { + const encoderSession = this.sessions.encoder_model; + const feeds = {}; + const disposables = []; + const inputFeatures = model_inputs.input_features; + + if (!(inputFeatures instanceof Tensor)) { + throw new Error( + 'NemoConformerForTDT.transcribe expected `model_inputs.input_features` as a Tensor from the processor.', + ); + } + + const missingInputs = []; + for (const name of encoderSession.inputNames) { + if (model_inputs[name] instanceof Tensor) { + feeds[name] = model_inputs[name]; + continue; + } + + if (name === 'input_features') { + feeds[name] = inputFeatures; + continue; + } + + if (name === 'audio_signal') { + const layout = this.transducer.encoder_input_layout; + if (layout === 'BTF') { + feeds[name] = inputFeatures; + } else if (layout === 'BFT') { + const transposed = inputFeatures.transpose(0, 2, 1); + disposables.push(transposed); + feeds[name] = transposed; + } else { + throw new Error( + `Unsupported encoder input layout "${layout}". 
Use 'BTF' or 'BFT' in transformers.js_config.transducer.`, + ); + } + continue; + } + + if (name === 'length') { + let length = null; + const attentionMask = model_inputs.attention_mask; + if (attentionMask instanceof Tensor) { + const mask = attentionMask.tolist(); + length = mask[0].reduce((acc, x) => acc + toInt(x), 0); + } else { + length = inputFeatures.dims[1]; + } + const lengthTensor = new Tensor('int64', BigInt64Array.from([BigInt(length)]), [1]); + disposables.push(lengthTensor); + feeds[name] = lengthTensor; + continue; + } + + missingInputs.push(name); + } + + if (missingInputs.length > 0) { + throw new Error( + `Nemo Conformer TDT encoder session expects additional inputs that are not available: ${missingInputs.join(', ')}.`, + ); + } + + return { feeds, disposables }; + } + + _resolveVocabSize(tokenizer) { + if (Number.isInteger(this.transducer.vocab_size) && this.transducer.vocab_size > 0) { + return this.transducer.vocab_size; + } + + if (tokenizer?.get_vocab) { + const size = Object.keys(tokenizer.get_vocab()).length; + if (size > 0) { + return size; + } + } + + throw new Error( + 'Unable to resolve vocabulary size for Nemo Conformer TDT. Set `vocab_size` in config.json or provide tokenizer with a vocab.', + ); + } + + _validateRuntimeConfig(vocabSize) { + if (this.transducer.blank_token_id >= vocabSize) { + throw new Error( + `Invalid Nemo Conformer TDT config: blank_token_id=${this.transducer.blank_token_id} must be < vocab_size=${vocabSize}.`, + ); + } + const durationStart = this.transducer.duration_start_index ?? vocabSize; + if (!Number.isInteger(durationStart) || durationStart < vocabSize) { + throw new Error( + `Invalid Nemo Conformer TDT config: duration_start_index=${durationStart} must be an integer >= vocab_size=${vocabSize}.`, + ); + } + } + + /** + * Transcribe model-ready features using TDT decoding. + * @param {Object} model_inputs Processor outputs (must include `input_features`). 
+ * @param {Object} [decode_options] + * @param {any} [decode_options.tokenizer] Tokenizer used for text reconstruction and word timestamps. + * @param {boolean} [decode_options.return_token_timestamps=true] + * @param {boolean} [decode_options.return_word_timestamps=true] + * @param {boolean} [decode_options.return_utterance_timestamp=true] + * @returns {Promise<{ + * text: string, + * token_ids: number[], + * token_timestamps?: [number, number][], + * word_timestamps?: { text: string, timestamp: [number, number]}[], + * utterance_timestamp?: [number, number], + * }>} + */ + async transcribe( + model_inputs, + { + tokenizer = null, + return_token_timestamps = true, + return_word_timestamps = true, + return_utterance_timestamp = true, + } = {}, + ) { + const io = this.transducer.io; + const vocabSize = this._resolveVocabSize(tokenizer); + this._validateRuntimeConfig(vocabSize); + + const { feeds: encoderFeeds, disposables } = this._buildEncoderFeeds(model_inputs); + let encoderOutputs; + try { + encoderOutputs = await this._runEncoder(encoderFeeds); + } finally { + for (const tensor of disposables) { + tensor.dispose(); + } + } + + const encoderOutput = this._getEncoderOutput(encoderOutputs); + let frames; + try { + frames = this._encoderOutputToFrames(encoderOutput); + } finally { + const seen = new Set(); + for (const value of Object.values(encoderOutputs)) { + if (value instanceof Tensor && !seen.has(value)) { + value.dispose(); + seen.add(value); + } + } + } + const frameTime = this.transducer.subsampling_factor * this.transducer.frame_shift_s; + + const numLayers = this.transducer.decoder.num_layers; + const hiddenSize = this.transducer.decoder.hidden_size; + const blankId = this.transducer.blank_token_id; + const maxSymbolsPerStep = this.transducer.max_symbols_per_step; + + /** @type {number[]} */ + const tokenIds = []; + /** @type {[number, number][]} */ + const tokenTimestamps = []; + + let decoderState = { + state1: new Tensor('float32', new 
Float32Array(numLayers * hiddenSize), [numLayers, 1, hiddenSize]), + state2: new Tensor('float32', new Float32Array(numLayers * hiddenSize), [numLayers, 1, hiddenSize]), + }; + + const targetLengthTensor = + this.transducer.decoder_token_length_dtype === 'int64' + ? new Tensor('int64', BigInt64Array.from([1n]), [1]) + : new Tensor('int32', new Int32Array([1]), [1]); + let emittedOnFrame = 0; + + try { + for (let frameIndex = 0; frameIndex < frames.length; ) { + const frameTensor = this._createFrameTensor(frames[frameIndex]); + const prevTokenId = tokenIds.length > 0 ? tokenIds[tokenIds.length - 1] : blankId; + const tokenTensor = + this.transducer.decoder_token_dtype === 'int64' + ? new Tensor('int64', BigInt64Array.from([BigInt(prevTokenId)]), [1, 1]) + : new Tensor('int32', new Int32Array([prevTokenId]), [1, 1]); + + const decoderFeeds = { + [io.decoder_encoder]: frameTensor, + [io.decoder_token]: tokenTensor, + [io.decoder_token_length]: targetLengthTensor, + [io.decoder_state_1]: decoderState.state1, + [io.decoder_state_2]: decoderState.state2, + }; + + let decoderOutput; + try { + decoderOutput = await this._runDecoder(decoderFeeds); + } finally { + tokenTensor.dispose(); + frameTensor.dispose(); + } + + const logits = decoderOutput[io.decoder_output] ?? Object.values(decoderOutput)[0]; + const logitsData = logits.data; + if (logitsData.length < vocabSize) { + throw new Error( + `Nemo Conformer TDT decoder output is too small (${logitsData.length}) for vocab_size=${vocabSize}.`, + ); + } + const tokenId = argmax(logitsData, 0, vocabSize); + + const durationStart = this.transducer.duration_start_index ?? vocabSize; + const hasDurationLogits = logitsData.length > durationStart; + const step = hasDurationLogits + ? argmax(logitsData, durationStart, logitsData.length - durationStart) - durationStart + : 0; + + const newState = { + state1: decoderOutput[io.decoder_output_state_1] ?? decoderState.state1, + state2: decoderOutput[io.decoder_output_state_2] ?? 
decoderState.state2, + }; + + if (tokenId !== blankId) { + this._disposeDecoderState(decoderState, newState); + decoderState = newState; + + tokenIds.push(tokenId); + const durationFrames = step > 0 ? step : 1; + tokenTimestamps.push([frameIndex * frameTime, (frameIndex + durationFrames) * frameTime]); + emittedOnFrame += 1; + } else { + this._disposeDecoderState(newState, decoderState); + } + + logits.dispose(); + + if (step > 0) { + frameIndex += step; + emittedOnFrame = 0; + } else if (tokenId === blankId || emittedOnFrame >= maxSymbolsPerStep) { + frameIndex += 1; + emittedOnFrame = 0; + } + } + } finally { + targetLengthTensor.dispose(); + this._disposeDecoderState(decoderState); + } + + const text = decodeTransducerText(tokenizer, tokenIds); + + const result = { + text, + token_ids: tokenIds, + }; + + if (return_token_timestamps) { + result.token_timestamps = tokenTimestamps; + } + + if (return_word_timestamps) { + result.word_timestamps = buildTransducerWordTimestamps(tokenizer, tokenIds, tokenTimestamps); + } + + if (return_utterance_timestamp) { + if (tokenTimestamps.length > 0) { + result.utterance_timestamp = [tokenTimestamps[0][0], tokenTimestamps[tokenTimestamps.length - 1][1]]; + } else { + result.utterance_timestamp = [0, frames.length * frameTime]; + } + } + + return result; + } + + /** + * Runs TDT transcription when called directly. 
+ * @param {Object} model_inputs + */ + async _call(model_inputs) { + return await this.transcribe(model_inputs); + } +} diff --git a/packages/transformers/src/models/nemo_conformer_tdt/processing_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/processing_nemo_conformer_tdt.js new file mode 100644 index 000000000..4c2d0a7eb --- /dev/null +++ b/packages/transformers/src/models/nemo_conformer_tdt/processing_nemo_conformer_tdt.js @@ -0,0 +1,19 @@ +import { AutoFeatureExtractor } from '../auto/feature_extraction_auto.js'; +import { AutoTokenizer } from '../auto/tokenization_auto.js'; +import { Processor } from '../../processing_utils.js'; + +/** + * Processor for Nemo Conformer TDT models. + */ +export class NemoConformerTDTProcessor extends Processor { + static tokenizer_class = AutoTokenizer; + static feature_extractor_class = AutoFeatureExtractor; + + /** + * Preprocess raw audio for Nemo Conformer TDT models. + * @param {Float32Array|Float64Array} audio + */ + async _call(audio) { + return await this.feature_extractor(audio); + } +} diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js new file mode 100644 index 000000000..7f46eeb6d --- /dev/null +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js @@ -0,0 +1,119 @@ +import { Tensor } from '../../utils/tensor.js'; + +/** + * Create a stable hash key for audio samples, used by feature caches. + * @param {Float32Array|Float64Array} audio + * @param {number} [sampling_rate=16000] + * @returns {string} + */ +export function createAudioCacheKey(audio, sampling_rate = 16000) { + // FNV-1a 32-bit over quantized values for deterministic cross-runtime keys. + let hash = 2166136261; + hash ^= audio.length; + hash = Math.imul(hash, 16777619); + hash ^= sampling_rate; + hash = Math.imul(hash, 16777619); + + // Sample stride hash to keep keying cheap for long audio. 
+ const stride = Math.max(1, Math.floor(audio.length / 4096)); + for (let i = 0; i < audio.length; i += stride) { + const q = (audio[i] * 32768) | 0; + hash ^= q; + hash = Math.imul(hash, 16777619); + } + return `${sampling_rate}:${audio.length}:${(hash >>> 0).toString(16)}`; +} + +/** + * Lightweight LRU cache for extracted features. + * Stores values as-is and tracks approximate memory usage. + */ +export class FeatureLRUCache { + /** + * @param {{max_entries?: number, max_size_mb?: number}} [options] + */ + constructor({ max_entries = 128, max_size_mb = 64 } = {}) { + this.max_entries = max_entries; + this.max_size_mb = max_size_mb; + this.cache = new Map(); + this.current_size_bytes = 0; + } + + /** + * @param {string} key + * @returns {any|null} + */ + get(key) { + const entry = this.cache.get(key); + if (!entry) return null; + this.cache.delete(key); + this.cache.set(key, entry); + return entry.value; + } + + /** + * @param {string} key + * @param {any} value + * @returns {void} + */ + set(key, value) { + const existing = this.cache.get(key); + if (existing) { + this.current_size_bytes -= existing.size_bytes; + this.cache.delete(key); + } + + const size_bytes = estimateSizeBytes(value); + this.cache.set(key, { value, size_bytes }); + this.current_size_bytes += size_bytes; + this._evict(); + } + + clear() { + this.cache.clear(); + this.current_size_bytes = 0; + } + + stats() { + return { + entries: this.cache.size, + size_mb: this.current_size_bytes / (1024 * 1024), + max_entries: this.max_entries, + max_size_mb: this.max_size_mb, + }; + } + + _evict() { + const max_bytes = this.max_size_mb * 1024 * 1024; + while (this.cache.size > this.max_entries || this.current_size_bytes > max_bytes) { + const oldest_key = this.cache.keys().next().value; + if (oldest_key === undefined) break; + const oldest = this.cache.get(oldest_key); + this.cache.delete(oldest_key); + this.current_size_bytes -= oldest?.size_bytes ?? 
0; + } + } +} + +function estimateSizeBytes(value) { + if (value instanceof Tensor) { + return value.data?.byteLength ?? 0; + } + if (value?.input_features instanceof Tensor) { + let bytes = value.input_features.data?.byteLength ?? 0; + if (value.attention_mask instanceof Tensor) { + bytes += value.attention_mask.data?.byteLength ?? 0; + } + if (value.delta_features instanceof Tensor) { + bytes += value.delta_features.data?.byteLength ?? 0; + } + if (value.delta_delta_features instanceof Tensor) { + bytes += value.delta_delta_features.data?.byteLength ?? 0; + } + return bytes; + } + if (value?.byteLength) { + return value.byteLength; + } + return 0; +} diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_deltas.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_deltas.js new file mode 100644 index 000000000..80a85f8be --- /dev/null +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_deltas.js @@ -0,0 +1,69 @@ +import { Tensor } from '../../utils/tensor.js'; + +/** + * Compute temporal deltas (and optionally delta-deltas) for [1, T, F] features. 
+ * @param {Tensor} input_features + * @param {{order?: 1|2, window?: number, concatenate?: boolean}} [options] + * @returns {Tensor|{delta: Tensor, delta_delta?: Tensor}} + */ +export function computeTemporalDeltas(input_features, { order = 1, window = 2, concatenate = false } = {}) { + if (!(input_features instanceof Tensor)) { + throw new Error('computeTemporalDeltas expects `input_features` as a Tensor.'); + } + if (input_features.dims.length !== 3 || input_features.dims[0] !== 1) { + throw new Error(`computeTemporalDeltas expects dims [1, T, F], got [${input_features.dims.join(', ')}].`); + } + if (!Number.isInteger(window) || window <= 0) { + throw new Error('computeTemporalDeltas expects `window` to be a positive integer.'); + } + + const [batch, T, F] = input_features.dims; + const base = /** @type {Float32Array} */ (input_features.data); + const delta = new Float32Array(base.length); + const denom = 2 * Array.from({ length: window }, (_, i) => (i + 1) ** 2).reduce((a, b) => a + b, 0); + + const at = (t, f) => base[t * F + f]; + for (let t = 0; t < T; ++t) { + for (let f = 0; f < F; ++f) { + let num = 0; + for (let n = 1; n <= window; ++n) { + const tp = Math.min(T - 1, t + n); + const tm = Math.max(0, t - n); + num += n * (at(tp, f) - at(tm, f)); + } + delta[t * F + f] = num / denom; + } + } + + const delta_tensor = new Tensor('float32', delta, [batch, T, F]); + if (order === 1) { + if (!concatenate) { + return { delta: delta_tensor }; + } + return new Tensor('float32', concatFloat32([base, delta]), [batch, T, F * 2]); + } + + const delta_delta = /** @type {{delta: Tensor}} */ ( + computeTemporalDeltas(delta_tensor, { order: 1, window, concatenate: false }) + ).delta.data; + const delta_delta_tensor = new Tensor('float32', delta_delta, [batch, T, F]); + if (!concatenate) { + return { + delta: delta_tensor, + delta_delta: delta_delta_tensor, + }; + } + + return new Tensor('float32', concatFloat32([base, delta, delta_delta]), [batch, T, F * 3]); +} + 
+function concatFloat32(items) { + const total = items.reduce((sum, arr) => sum + arr.length, 0); + const output = new Float32Array(total); + let offset = 0; + for (const arr of items) { + output.set(arr, offset); + offset += arr.length; + } + return output; +} diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js new file mode 100644 index 000000000..1234e7d82 --- /dev/null +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js @@ -0,0 +1,73 @@ +/** + * Decode token ids into final transcription text. + * @param {any} tokenizer + * @param {number[]} token_ids + * @returns {string} + */ +export function decodeTransducerText(tokenizer, token_ids) { + if (!tokenizer) return token_ids.join(' '); + return tokenizer.decode(token_ids, { skip_special_tokens: true }).trim(); +} + +/** + * Build word-level timestamps from token ids and token-level timestamps. + * @param {any} tokenizer + * @param {number[]} token_ids + * @param {[number, number][]} token_timestamps + * @returns {{ text: string, timestamp: [number, number] }[]} + */ +export function buildTransducerWordTimestamps(tokenizer, token_ids, token_timestamps) { + if (!tokenizer || token_ids.length === 0 || token_timestamps.length === 0) { + return []; + } + + const words = []; + let current = null; + + for (let i = 0; i < token_ids.length; ++i) { + const id = token_ids[i]; + const ts = token_timestamps[i]; + const piece = tokenizer.decode([id], { + skip_special_tokens: true, + clean_up_tokenization_spaces: false, + }); + + if (!piece) continue; + + const startsNewWord = /^\s+/.test(piece) || piece.startsWith('▁'); + const normalizedPiece = piece.replace(/^\s+/, '').replace(/^▁+/, ''); + if (!normalizedPiece) continue; + + if (!current || startsNewWord) { + if (current) { + const text = current.text.trim(); + if (text) { + words.push({ + text, + timestamp: [current.start, current.end], + }); + } 
+ } + current = { + text: normalizedPiece, + start: ts[0], + end: ts[1], + }; + } else { + current.text += normalizedPiece; + current.end = ts[1]; + } + } + + if (current) { + const text = current.text.trim(); + if (text) { + words.push({ + text, + timestamp: [current.start, current.end], + }); + } + } + + return words; +} diff --git a/packages/transformers/src/models/nemo_conformer_tdt/utils_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/utils_nemo_conformer_tdt.js new file mode 100644 index 000000000..24859bc16 --- /dev/null +++ b/packages/transformers/src/models/nemo_conformer_tdt/utils_nemo_conformer_tdt.js @@ -0,0 +1,4 @@ +// Backwards-compatible barrel for older internal imports. +export { createAudioCacheKey, FeatureLRUCache } from './transducer_cache.js'; +export { computeTemporalDeltas } from './transducer_deltas.js'; +export { decodeTransducerText, buildTransducerWordTimestamps } from './transducer_text.js'; diff --git a/packages/transformers/src/models/processors.js b/packages/transformers/src/models/processors.js index e00b1c71a..28b0efca0 100644 --- a/packages/transformers/src/models/processors.js +++ b/packages/transformers/src/models/processors.js @@ -8,6 +8,7 @@ export * from './jina_clip/processing_jina_clip.js'; export * from './llava/processing_llava.js'; export * from './mgp_str/processing_mgp_str.js'; export * from './moonshine/processing_moonshine.js'; +export * from './nemo_conformer_tdt/processing_nemo_conformer_tdt.js'; export * from './owlvit/processing_owlvit.js'; export * from './paligemma/processing_paligemma.js'; export * from './phi3_v/processing_phi3_v.js'; diff --git a/packages/transformers/src/models/registry.js b/packages/transformers/src/models/registry.js index 4f0dad6de..5c08bdded 100644 --- a/packages/transformers/src/models/registry.js +++ b/packages/transformers/src/models/registry.js @@ -41,6 +41,7 @@ const MODEL_MAPPING_NAMES_ENCODER_ONLY = new Map([ ['unispeech-sat', 'UniSpeechSatModel'], 
['hubert', 'HubertModel'], ['wavlm', 'WavLMModel'], + ['nemo-conformer-tdt', 'NemoConformerForTDT'], ['audio-spectrogram-transformer', 'ASTModel'], ['vits', 'VitsModel'], ['pyannote', 'PyAnnoteModel'], diff --git a/packages/transformers/src/pipelines.js b/packages/transformers/src/pipelines.js index 60dae8a86..7c2cca700 100644 --- a/packages/transformers/src/pipelines.js +++ b/packages/transformers/src/pipelines.js @@ -40,6 +40,7 @@ import { AutoModelForDepthEstimation, AutoModelForImageFeatureExtraction, } from './models/auto/modeling_auto.js'; +import { NemoConformerForTDT } from './models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js'; import { dispatchCallback } from './utils/core.js'; import { logger } from './utils/logger.js'; @@ -195,7 +196,7 @@ const SUPPORTED_TASKS = Object.freeze({ 'automatic-speech-recognition': { tokenizer: AutoTokenizer, pipeline: AutomaticSpeechRecognitionPipeline, - model: [AutoModelForSpeechSeq2Seq, AutoModelForCTC], + model: [AutoModelForSpeechSeq2Seq, AutoModelForCTC, NemoConformerForTDT], processor: AutoProcessor, default: { // TODO: replace with original diff --git a/packages/transformers/src/pipelines/automatic-speech-recognition.js b/packages/transformers/src/pipelines/automatic-speech-recognition.js index d4ab074a2..06d795af6 100644 --- a/packages/transformers/src/pipelines/automatic-speech-recognition.js +++ b/packages/transformers/src/pipelines/automatic-speech-recognition.js @@ -16,14 +16,21 @@ import { logger } from '../utils/logger.js'; * @property {string} text The recognized text. */ +/** + * @typedef {'utterance' | 'word' | 'token' | 'all'} TimestampGranularity + */ + /** * @typedef {Object} AutomaticSpeechRecognitionOutput * @property {string} text The recognized text. * @property {Chunk[]} [chunks] When using `return_timestamps`, the `chunks` will become a list * containing all the various text chunks identified by the model. 
+ * @property {Chunk[]} [tokens] Optional token-level timestamp chunks for models that support them. + * @property {[number, number]} [utterance] Optional utterance-level timestamp span. * * @typedef {Object} AutomaticSpeechRecognitionSpecificParams Parameters specific to automatic-speech-recognition pipelines. * @property {boolean|'word'} [return_timestamps] Whether to return timestamps or not. Default is `false`. + * @property {TimestampGranularity} [timestamp_granularity] Granularity used when `return_timestamps` is enabled for Parakeet TDT models. Default is `'word'`. * @property {number} [chunk_length_s] The length of audio chunks to process in seconds. Default is 0 (no chunking). * @property {number} [stride_length_s] The length of overlap between consecutive audio chunks in seconds. If not provided, defaults to `chunk_length_s / 6`. * @property {boolean} [force_full_sequences] Whether to force outputting full sequences or not. Default is `false`. @@ -152,6 +159,8 @@ export class AutomaticSpeechRecognitionPipeline case 'hubert': case 'parakeet_ctc': return this._call_wav2vec2(audio, kwargs); + case 'nemo-conformer-tdt': + return this._call_nemo_conformer_tdt(audio, kwargs); case 'moonshine': return this._call_moonshine(audio, kwargs); default: @@ -300,6 +309,136 @@ export class AutomaticSpeechRecognitionPipeline return single ? toReturn[0] : toReturn; } + /** + * @param {any} return_timestamps + * @param {any} timestamp_granularity + * @returns {TimestampGranularity|null} + */ + _normalizeNemoConformerTimestampGranularity(return_timestamps, timestamp_granularity) { + if (!return_timestamps) { + return null; + } + + const granularity = timestamp_granularity ?? 'word'; + const allowed = ['utterance', 'word', 'token', 'all']; + if (!allowed.includes(granularity)) { + throw new Error( + `Invalid \`timestamp_granularity\`: "${granularity}". 
Expected one of: ${allowed.join(', ')}.`, + ); + } + return /** @type {TimestampGranularity} */ (granularity); + } + + /** + * @param {any} result + * @param {TimestampGranularity|null} granularity + * @returns {AutomaticSpeechRecognitionOutput} + */ + _formatNemoConformerTDTResult(result, granularity) { + const text = result.text ?? ''; + if (!granularity) { + return { text }; + } + + const wordChunks = (result.word_timestamps ?? []).map((item) => ({ + text: item.text, + timestamp: item.timestamp, + })); + const tokenChunks = (result.token_timestamps ?? []).map((timestamp, index) => { + const tokenId = result.token_ids?.[index]; + const decodedToken = + tokenId == null + ? '' + : (this.tokenizer?.decode([tokenId], { + skip_special_tokens: true, + clean_up_tokenization_spaces: false, + }) ?? ''); + return { + text: decodedToken || (tokenId == null ? '' : `${tokenId}`), + timestamp, + }; + }); + const utterance = result.utterance_timestamp; + + if (granularity === 'utterance') { + if (!utterance) { + return { text, chunks: [] }; + } + return { + text, + chunks: [{ text, timestamp: utterance }], + }; + } + + if (granularity === 'word') { + return { text, chunks: wordChunks }; + } + + if (granularity === 'token') { + return { text, chunks: tokenChunks }; + } + + return { + text, + chunks: wordChunks, + tokens: tokenChunks, + ...(utterance ? 
{ utterance } : {}), + }; + } + + /** + * Nemo Conformer TDT ASR output rules: + * - `return_timestamps=false`: `{ text }` + * - `timestamp_granularity='utterance'`: `chunks` contains a single utterance span + * - `timestamp_granularity='word'`: `chunks` contains word-level spans + * - `timestamp_granularity='token'`: `chunks` contains token-level spans + * - `timestamp_granularity='all'`: returns `chunks` (word), `tokens`, and `utterance` + */ + async _call_nemo_conformer_tdt(audio, kwargs) { + if (typeof this.model.transcribe !== 'function') { + throw new Error('Nemo Conformer TDT model does not expose a `transcribe` method.'); + } + if (!this.processor) { + throw new Error('Nemo Conformer TDT pipeline requires a processor.'); + } + if (!this.tokenizer) { + throw new Error('Nemo Conformer TDT pipeline requires a tokenizer.'); + } + if (!this.processor.feature_extractor?.config?.sampling_rate) { + throw new Error( + 'Nemo Conformer TDT pipeline requires `processor.feature_extractor.config.sampling_rate` to prepare audio.', + ); + } + + const return_timestamps = kwargs.return_timestamps ?? false; + const withTimestamps = return_timestamps !== false; + const granularity = this._normalizeNemoConformerTimestampGranularity( + withTimestamps, + kwargs.timestamp_granularity, + ); + + const decodeOptions = { + tokenizer: this.tokenizer, + return_token_timestamps: granularity === 'token' || granularity === 'all', + return_word_timestamps: granularity === 'word' || granularity === 'all', + return_utterance_timestamp: granularity === 'utterance' || granularity === 'all', + }; + + const single = !Array.isArray(audio); + const batchedAudio = single ? 
[audio] : audio; + const sampling_rate = this.processor.feature_extractor.config.sampling_rate; + const preparedAudios = await prepareAudios(batchedAudio, sampling_rate); + + const toReturn = []; + for (const aud of preparedAudios) { + const inputs = await this.processor(aud); + const output = await this.model.transcribe(inputs, decodeOptions); + toReturn.push(this._formatNemoConformerTDTResult(output, granularity)); + } + + return single ? toReturn[0] : toReturn; + } + async _call_moonshine(audio, kwargs) { const single = !Array.isArray(audio); const batchedAudio = single ? [audio] : audio; diff --git a/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js b/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js new file mode 100644 index 000000000..cee9568d0 --- /dev/null +++ b/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js @@ -0,0 +1,78 @@ +import { NemoConformerTDTFeatureExtractor } from "../../../src/transformers.js"; + +import { MAX_TEST_EXECUTION_TIME } from "../../init.js"; + +export default () => { + describe("NemoConformerTDTFeatureExtractor", () => { + const base = { + sampling_rate: 16000, + n_fft: 512, + win_length: 400, + hop_length: 160, + preemphasis: 0.97, + }; + + const audio = Float32Array.from({ length: 16000 }, (_, i) => Math.sin((2 * Math.PI * 220 * i) / 16000)); + + it( + "supports 80 mel bins", + async () => { + const extractor = new NemoConformerTDTFeatureExtractor({ ...base, feature_size: 80 }); + const { input_features, attention_mask } = await extractor(audio); + expect(input_features.dims[0]).toBe(1); + expect(input_features.dims[2]).toBe(80); + expect(attention_mask.dims).toEqual([1, input_features.dims[1]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "supports 128 mel bins", + async () => { + const extractor = new NemoConformerTDTFeatureExtractor({ ...base, feature_size: 128 }); + const { 
input_features, attention_mask } = await extractor(audio); + expect(input_features.dims[0]).toBe(1); + expect(input_features.dims[2]).toBe(128); + expect(attention_mask.dims).toEqual([1, input_features.dims[1]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "supports concatenated delta and delta-delta features", + async () => { + const extractor = new NemoConformerTDTFeatureExtractor({ + ...base, + feature_size: 128, + delta_order: 2, + delta_window: 2, + delta_concatenate: true, + }); + const { input_features } = await extractor(audio); + expect(input_features.dims[2]).toBe(128 * 3); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "uses feature cache when enabled", + async () => { + const extractor = new NemoConformerTDTFeatureExtractor({ + ...base, + feature_size: 80, + use_feature_cache: true, + feature_cache_max_entries: 8, + feature_cache_max_size_mb: 8, + }); + const first = await extractor(audio); + const second = await extractor(audio); + + expect(first).toBe(second); + expect(extractor.get_cache_stats().entries).toBe(1); + extractor.clear_cache(); + expect(extractor.get_cache_stats().entries).toBe(0); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); +}; diff --git a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js new file mode 100644 index 000000000..83a1523e2 --- /dev/null +++ b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js @@ -0,0 +1,188 @@ +import { NemoConformerForTDT, Tensor } from "../../../src/transformers.js"; +import { createAudioCacheKey, FeatureLRUCache } from "../../../src/models/nemo_conformer_tdt/transducer_cache.js"; +import { computeTemporalDeltas } from "../../../src/models/nemo_conformer_tdt/transducer_deltas.js"; + +import { MAX_TEST_EXECUTION_TIME } from "../../init.js"; + +class MockNemoConformerForTDT extends NemoConformerForTDT { + constructor(config, sessions, 
decoderScript) { + super(config, sessions, {}); + this.decoderScript = decoderScript; + this.decoderCalls = 0; + } + + async _runEncoder() { + return { + outputs: new Tensor( + "float32", + new Float32Array([ + // D=2, T=3 (BDT) + 0.1, + 0.2, + 0.3, // d0 over t + 0.4, + 0.5, + 0.6, // d1 over t + ]), + [1, 2, 3], + ), + }; + } + + async _runDecoder() { + const step = this.decoderScript[this.decoderCalls++]; + const stateShape = [1, 1, 2]; + return { + outputs: new Tensor("float32", new Float32Array(step.logits), [1, 1, step.logits.length]), + output_states_1: new Tensor("float32", new Float32Array([this.decoderCalls, 0]), stateShape), + output_states_2: new Tensor("float32", new Float32Array([0, this.decoderCalls]), stateShape), + }; + } +} + +const BASE_SESSIONS = { + encoder_model: { + inputNames: ["input_features"], + outputNames: ["outputs"], + }, + decoder_model_merged: { + inputNames: ["encoder_outputs", "targets", "target_length", "input_states_1", "input_states_2"], + outputNames: ["outputs", "output_states_1", "output_states_2"], + }, +}; + +const BASE_CONFIG = { + model_type: "nemo-conformer-tdt", + "transformers.js_config": { + transducer: { + blank_token_id: 0, + max_symbols_per_step: 2, + subsampling_factor: 4, + frame_shift_s: 0.01, + vocab_size: 3, + duration_start_index: 3, + encoder_output_layout: "BDT", + encoder_frame_layout: "BD1", + decoder: { + num_layers: 1, + hidden_size: 2, + }, + }, + }, +}; + +export default () => { + describe("NemoConformerForTDT", () => { + it( + "greedily decodes scripted token and duration logits", + async () => { + const tokenizer = { + decode(ids) { + const idArray = Array.isArray(ids) ? 
ids : [ids]; + return idArray + .map((id) => { + if (id === 1 || id === 1n) return " hello"; + if (id === 2 || id === 2n) return " world"; + return ""; + }) + .join(""); + }, + }; + + const model = new MockNemoConformerForTDT(BASE_CONFIG, BASE_SESSIONS, [ + // step 1: emit token=1, duration=0 + { logits: [0.1, 10.0, 0.0, 8.0, 1.0, 0.5] }, + // step 2: emit blank, duration=1 -> move to next frame + { logits: [9.0, 0.0, 0.0, 0.0, 8.0, 0.0] }, + // step 3: emit token=2, duration=2 -> jump to end + { logits: [0.0, 0.0, 10.0, 0.0, 0.0, 9.0] }, + ]); + + const inputs = { + input_features: new Tensor("float32", new Float32Array([0, 0, 0, 0, 0, 0]), [1, 3, 2]), + }; + + const output = await model.transcribe(inputs, { + tokenizer, + return_token_timestamps: true, + return_word_timestamps: true, + return_utterance_timestamp: true, + }); + + expect(output.text).toBe("hello world"); + expect(output.token_ids).toEqual([1, 2]); + expect(output.token_timestamps).toEqual([ + [0, 0.04], + [0.04, 0.12], + ]); + expect(output.word_timestamps).toEqual([ + { text: "hello", timestamp: [0, 0.04] }, + { text: "world", timestamp: [0.04, 0.12] }, + ]); + expect(output.utterance_timestamp).toEqual([0, 0.12]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it("fails fast when transducer config is missing", () => { + const invalidConfig = { model_type: "nemo-conformer-tdt" }; + expect(() => new NemoConformerForTDT(invalidConfig, BASE_SESSIONS, {})).toThrow("Missing `transformers.js_config.transducer`"); + }); + }); + + describe("Nemo Conformer TDT utilities", () => { + it( + "computes delta and delta-delta features", + async () => { + const input = new Tensor( + "float32", + Float32Array.from([ + // T=4, F=2 + 1, 2, 2, 4, 3, 6, 4, 8, + ]), + [1, 4, 2], + ); + + const split = computeTemporalDeltas(input, { order: 2, window: 1, concatenate: false }); + expect(split.delta.dims).toEqual([1, 4, 2]); + expect(split.delta_delta.dims).toEqual([1, 4, 2]); + + const concat = computeTemporalDeltas(input, { 
order: 2, window: 1, concatenate: true }); + expect(concat.dims).toEqual([1, 4, 6]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "creates stable audio cache keys", + async () => { + const a = Float32Array.from([0, 0.1, 0.2, 0.3]); + const b = Float32Array.from([0, 0.1, 0.2, 0.4]); + const ka1 = createAudioCacheKey(a, 16000); + const ka2 = createAudioCacheKey(a, 16000); + const kb = createAudioCacheKey(b, 16000); + + expect(ka1).toEqual(ka2); + expect(ka1).not.toEqual(kb); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "evicts least-recently-used entries when full", + async () => { + const cache = new FeatureLRUCache({ max_entries: 2, max_size_mb: 4 }); + cache.set("a", new Tensor("float32", new Float32Array([1, 2, 3]), [1, 3])); + cache.set("b", new Tensor("float32", new Float32Array([4, 5, 6]), [1, 3])); + expect(cache.get("a")).not.toBeNull(); + + cache.set("c", new Tensor("float32", new Float32Array([7, 8, 9]), [1, 3])); + // `b` should be evicted because `a` was recently accessed. 
+ expect(cache.get("b")).toBeNull(); + expect(cache.get("a")).not.toBeNull(); + expect(cache.get("c")).not.toBeNull(); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); +}; diff --git a/packages/transformers/tests/models/parakeet/test_feature_extraction_parakeet.js b/packages/transformers/tests/models/parakeet/test_feature_extraction_parakeet.js new file mode 100644 index 000000000..fab1861d9 --- /dev/null +++ b/packages/transformers/tests/models/parakeet/test_feature_extraction_parakeet.js @@ -0,0 +1,45 @@ +import { ParakeetFeatureExtractor } from "../../../src/transformers.js"; + +import { MAX_TEST_EXECUTION_TIME } from "../../init.js"; + +export default () => { + describe("ParakeetFeatureExtractor", () => { + const config = { + feature_size: 80, + sampling_rate: 16000, + n_fft: 512, + win_length: 400, + hop_length: 160, + preemphasis: 0.97, + }; + + /** @type {ParakeetFeatureExtractor} */ + let feature_extractor; + beforeAll(() => { + feature_extractor = new ParakeetFeatureExtractor(config); + }); + + it( + "extracts normalized features and mask from synthetic audio", + async () => { + const duration_s = 1.0; + const total = Math.floor(config.sampling_rate * duration_s); + const audio = Float32Array.from({ length: total }, (_, i) => Math.sin((2 * Math.PI * 220 * i) / config.sampling_rate)); + + const { input_features, attention_mask } = await feature_extractor(audio); + + expect(input_features.dims[0]).toBe(1); + expect(input_features.dims[2]).toBe(config.feature_size); + expect(attention_mask.dims).toEqual([1, input_features.dims[1]]); + + const validFrames = attention_mask.tolist()[0].reduce((acc, x) => acc + Number(x), 0); + expect(validFrames).toBeGreaterThan(0); + expect(validFrames).toBeLessThanOrEqual(input_features.dims[1]); + + const preview = Array.from(input_features.data.slice(0, 256)); + expect(preview.every(Number.isFinite)).toBe(true); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); +}; diff --git 
a/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js b/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js index 963306f7f..831a885ac 100644 --- a/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js +++ b/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js @@ -125,5 +125,138 @@ export default () => { await pipe?.dispose(); }, MAX_MODEL_DISPOSE_TIME); }); + + describe("nemo-conformer-tdt (unit)", () => { + const makeUnitPipe = (modelType = "nemo-conformer-tdt") => { + const calls = []; + const model = { + config: { model_type: modelType }, + async transcribe(_inputs, options) { + calls.push(options); + return { + text: "hello world", + token_ids: [1, 2], + token_timestamps: [ + [0, 0.04], + [0.04, 0.08], + ], + word_timestamps: [ + { text: "hello", timestamp: [0, 0.04] }, + { text: "world", timestamp: [0.04, 0.08] }, + ], + utterance_timestamp: [0, 0.08], + }; + }, + async dispose() {}, + }; + + const processor = Object.assign(async () => ({ input_features: {} }), { + feature_extractor: { config: { sampling_rate: 16000 } }, + }); + const tokenizer = { + decode(ids) { + const idArray = Array.isArray(ids) ? 
ids : [ids]; + return idArray + .map((id) => { + if (id === 1 || id === 1n) return " hello"; + if (id === 2 || id === 2n) return " world"; + return ""; + }) + .join(""); + }, + }; + + return { + pipe: new AutomaticSpeechRecognitionPipeline({ + task: PIPELINE_ID, + model, + tokenizer, + processor, + }), + calls, + }; + }; + + it("dispatches to nemo-conformer-tdt path", async () => { + const { pipe, calls } = makeUnitPipe(); + const output = await pipe(new Float32Array(16000), { return_timestamps: false }); + expect(output).toEqual({ text: "hello world" }); + expect(calls).toHaveLength(1); + }); + + it("default timestamps use word granularity", async () => { + const { pipe, calls } = makeUnitPipe(); + const output = await pipe(new Float32Array(16000), { return_timestamps: true }); + expect(output).toEqual({ + text: "hello world", + chunks: [ + { text: "hello", timestamp: [0, 0.04] }, + { text: "world", timestamp: [0.04, 0.08] }, + ], + }); + expect(calls[0]).toMatchObject({ + return_word_timestamps: true, + return_token_timestamps: false, + return_utterance_timestamp: false, + }); + }); + + it("supports utterance granularity", async () => { + const { pipe } = makeUnitPipe(); + const output = await pipe(new Float32Array(16000), { + return_timestamps: true, + timestamp_granularity: "utterance", + }); + expect(output).toEqual({ + text: "hello world", + chunks: [{ text: "hello world", timestamp: [0, 0.08] }], + }); + }); + + it("supports token granularity", async () => { + const { pipe } = makeUnitPipe(); + const output = await pipe(new Float32Array(16000), { + return_timestamps: true, + timestamp_granularity: "token", + }); + expect(output).toEqual({ + text: "hello world", + chunks: [ + { text: " hello", timestamp: [0, 0.04] }, + { text: " world", timestamp: [0.04, 0.08] }, + ], + }); + }); + + it("supports all granularities at once", async () => { + const { pipe } = makeUnitPipe(); + const output = await pipe(new Float32Array(16000), { + return_timestamps: true, + 
timestamp_granularity: "all", + }); + expect(output).toEqual({ + text: "hello world", + chunks: [ + { text: "hello", timestamp: [0, 0.04] }, + { text: "world", timestamp: [0.04, 0.08] }, + ], + tokens: [ + { text: " hello", timestamp: [0, 0.04] }, + { text: " world", timestamp: [0.04, 0.08] }, + ], + utterance: [0, 0.08], + }); + }); + + it("throws for invalid timestamp granularity", async () => { + const { pipe } = makeUnitPipe(); + await expect( + pipe(new Float32Array(16000), { + return_timestamps: true, + timestamp_granularity: "frame", + }), + ).rejects.toThrow("Invalid `timestamp_granularity`"); + }); + }); }); }; From 964bc8fa79baf04d0f6db17fefde041f0e4e9928 Mon Sep 17 00:00:00 2001 From: ysdede Date: Sat, 28 Feb 2026 23:20:46 +0300 Subject: [PATCH 02/33] fix(nemo-conformer-tdt): handle empty token decode output --- .../src/models/nemo_conformer_tdt/transducer_text.js | 1 + 1 file changed, 1 insertion(+) diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js index 1234e7d82..c3b15c630 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js @@ -5,6 +5,7 @@ * @returns {string} */ export function decodeTransducerText(tokenizer, token_ids) { + if (!Array.isArray(token_ids) || token_ids.length === 0) return ''; if (!tokenizer) return token_ids.join(' '); return tokenizer.decode(token_ids, { skip_special_tokens: true }).trim(); } From fa9bc25465557deffb915e86dbd573ed4e14680b Mon Sep 17 00:00:00 2001 From: ysdede Date: Sun, 1 Mar 2026 16:57:38 +0300 Subject: [PATCH 03/33] chore(nemo-conformer-tdt): keep typegen compatibility for transcribe/cache helpers Carry over non-runtime typing fixes from the prior branch while intentionally excluding the WebGPU disable_prepacking workaround in session.js.\n\n- Cast dynamic model.transcribe access for Nemo TDT pipeline 
method checks/calls.\n- Cast Tensor data byteLength access in transducer cache utilities.\n- Add explicit tuple/object JSDoc annotations in transducer timestamp builder.\n\nThis keeps main-based v4 work clean with latest ORT-Web on origin/main and avoids retaining the temporary encoder prepacking workaround. --- .../src/models/nemo_conformer_tdt/transducer_cache.js | 10 +++++----- .../src/models/nemo_conformer_tdt/transducer_text.js | 4 ++++ .../src/pipelines/automatic-speech-recognition.js | 4 ++-- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js index 7f46eeb6d..3042e9e8f 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js @@ -97,18 +97,18 @@ export class FeatureLRUCache { function estimateSizeBytes(value) { if (value instanceof Tensor) { - return value.data?.byteLength ?? 0; + return /** @type {any} */ (value.data)?.byteLength ?? 0; } if (value?.input_features instanceof Tensor) { - let bytes = value.input_features.data?.byteLength ?? 0; + let bytes = /** @type {any} */ (value.input_features.data)?.byteLength ?? 0; if (value.attention_mask instanceof Tensor) { - bytes += value.attention_mask.data?.byteLength ?? 0; + bytes += /** @type {any} */ (value.attention_mask.data)?.byteLength ?? 0; } if (value.delta_features instanceof Tensor) { - bytes += value.delta_features.data?.byteLength ?? 0; + bytes += /** @type {any} */ (value.delta_features.data)?.byteLength ?? 0; } if (value.delta_delta_features instanceof Tensor) { - bytes += value.delta_delta_features.data?.byteLength ?? 0; + bytes += /** @type {any} */ (value.delta_delta_features.data)?.byteLength ?? 
0; } return bytes; } diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js index c3b15c630..34c31e2a3 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js @@ -22,7 +22,9 @@ export function buildTransducerWordTimestamps(tokenizer, token_ids, token_timest return []; } + /** @type {{ text: string, timestamp: [number, number] }[]} */ const words = []; + /** @type {{ text: string, start: number, end: number } | null} */ let current = null; for (let i = 0; i < token_ids.length; ++i) { @@ -45,6 +47,7 @@ export function buildTransducerWordTimestamps(tokenizer, token_ids, token_timest if (text) { words.push({ text, + // Keep tuple shape for TS consumers. timestamp: [current.start, current.end], }); } @@ -65,6 +68,7 @@ export function buildTransducerWordTimestamps(tokenizer, token_ids, token_timest if (text) { words.push({ text, + // Keep tuple shape for TS consumers. 
timestamp: [current.start, current.end], }); } diff --git a/packages/transformers/src/pipelines/automatic-speech-recognition.js b/packages/transformers/src/pipelines/automatic-speech-recognition.js index 06d795af6..3333063e4 100644 --- a/packages/transformers/src/pipelines/automatic-speech-recognition.js +++ b/packages/transformers/src/pipelines/automatic-speech-recognition.js @@ -395,7 +395,7 @@ export class AutomaticSpeechRecognitionPipeline * - `timestamp_granularity='all'`: returns `chunks` (word), `tokens`, and `utterance` */ async _call_nemo_conformer_tdt(audio, kwargs) { - if (typeof this.model.transcribe !== 'function') { + if (typeof /** @type {any} */ (this.model).transcribe !== 'function') { throw new Error('Nemo Conformer TDT model does not expose a `transcribe` method.'); } if (!this.processor) { @@ -432,7 +432,7 @@ export class AutomaticSpeechRecognitionPipeline const toReturn = []; for (const aud of preparedAudios) { const inputs = await this.processor(aud); - const output = await this.model.transcribe(inputs, decodeOptions); + const output = await /** @type {any} */ (this.model).transcribe(inputs, decodeOptions); toReturn.push(this._formatNemoConformerTDTResult(output, granularity)); } From 63aeee8347f73df0d8c75c51771f81967cb3c180 Mon Sep 17 00:00:00 2001 From: ysdede Date: Sun, 1 Mar 2026 19:34:00 +0300 Subject: [PATCH 04/33] refactor(nemo-conformer-tdt): redesign transcribe output shape and API - Replace legacy per-feature flags (return_token_timestamps, return_word_timestamps, return_utterance_timestamp) with a layered API: return_timestamps (utterance-level), return_words, return_tokens - Merge duplicate outputs: words absorbs word_timestamps, tokens absorbs token_timestamps and token_ids - Add per-token confidence, word-level confidence aggregation, utterance_confidence, and confidence_scores summary - Gate frame confidences behind returnFrameConfidences flag - Add return_metrics with encode/decode/total timing and RTF - Add debug flags: 
returnFrameIndices, returnLogProbs, returnTdtSteps - Fix vocab Map handling in getIdToTokenMap and _resolveVocabSize (tokenizer.get_vocab() returns Map in WASM binding) - Update ASR pipeline to wire timestamp_granularity to new model flags - Format all changed files with Prettier per CONTRIBUTING.md --- .../modeling_nemo_conformer_tdt.js | 208 +++++++++++++++--- .../nemo_conformer_tdt/transducer_text.js | 147 ++++++++++--- .../utils_nemo_conformer_tdt.js | 2 +- .../pipelines/automatic-speech-recognition.js | 41 ++-- 4 files changed, 315 insertions(+), 83 deletions(-) diff --git a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js index c0874b21b..766c189d8 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js @@ -2,7 +2,7 @@ import { AutoConfig } from '../../configs.js'; import { Tensor } from '../../utils/tensor.js'; import { PreTrainedModel } from '../modeling_utils.js'; import { constructSessions, sessionRun } from '../session.js'; -import { buildTransducerWordTimestamps, decodeTransducerText } from './transducer_text.js'; +import { buildTransducerDetailedOutputs, decodeTransducerText } from './transducer_text.js'; const NEMO_CONFORMER_TDT_MODEL_TYPE = 'nemo-conformer-tdt'; @@ -36,6 +36,42 @@ function toInt(value) { return typeof value === 'bigint' ? Number(value) : value; } +function nowMs() { + return typeof performance !== 'undefined' && typeof performance.now === 'function' ? 
performance.now() : Date.now(); +} + +function roundMetric(value, digits = 2) { + if (!Number.isFinite(value)) return 0; + const factor = 10 ** digits; + return Math.round(value * factor) / factor; +} + +/** + * @param {Float32Array|number[]} logits + * @param {number} tokenId + * @param {number} vocabSize + * @returns {{ confidence: number, logProb: number }} + */ +function confidenceFromLogits(logits, tokenId, vocabSize) { + let maxLogit = Number.NEGATIVE_INFINITY; + for (let i = 0; i < vocabSize; ++i) { + if (logits[i] > maxLogit) { + maxLogit = logits[i]; + } + } + + let expSum = 0; + for (let i = 0; i < vocabSize; ++i) { + expSum += Math.exp(logits[i] - maxLogit); + } + const logSumExp = maxLogit + Math.log(expSum); + const logProb = logits[tokenId] - logSumExp; + return { + confidence: Math.exp(logProb), + logProb, + }; +} + function inferEncoderOutputLayout(outputTensor) { if (outputTensor.dims.length !== 3 || outputTensor.dims[0] !== 1) { throw new Error( @@ -389,7 +425,8 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { } if (tokenizer?.get_vocab) { - const size = Object.keys(tokenizer.get_vocab()).length; + const vocab = tokenizer.get_vocab(); + const size = vocab instanceof Map ? vocab.size : Object.keys(vocab).length; if (size > 0) { return size; } @@ -416,35 +453,63 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { /** * Transcribe model-ready features using TDT decoding. + * + * - `return_timestamps: false` → `{ text, is_final }` (+ metrics if `return_metrics`) + * - `return_timestamps: true` → adds `utterance_confidence`, `utterance_timestamp`, `confidence_scores` + * - `return_words: true` (requires `return_timestamps`) → adds `words` list + * - `return_tokens: true` (requires `return_timestamps`) → adds `tokens` list + * - `return_metrics` is independent and can be combined with either level. 
+ * - Debug flags (`returnFrameConfidences`, `returnFrameIndices`, `returnLogProbs`, `returnTdtSteps`) are independent. + * * @param {Object} model_inputs Processor outputs (must include `input_features`). * @param {Object} [decode_options] - * @param {any} [decode_options.tokenizer] Tokenizer used for text reconstruction and word timestamps. - * @param {boolean} [decode_options.return_token_timestamps=true] - * @param {boolean} [decode_options.return_word_timestamps=true] - * @param {boolean} [decode_options.return_utterance_timestamp=true] + * @param {any} [decode_options.tokenizer] Tokenizer for text reconstruction and word boundaries. + * @param {boolean} [decode_options.return_timestamps=true] Include utterance-level timestamps and confidence averages. + * @param {boolean} [decode_options.return_words=false] Include word-level list (requires return_timestamps). + * @param {boolean} [decode_options.return_tokens=false] Include token-level list (requires return_timestamps). + * @param {boolean} [decode_options.return_metrics=false] Include encoding/decoding timing metrics. + * @param {boolean} [decode_options.returnFrameConfidences=false] Include per-frame confidence scores in confidence_scores. + * @param {boolean} [decode_options.returnFrameIndices=false] Include per-token encoder frame indices. + * @param {boolean} [decode_options.returnLogProbs=false] Include per-token log probabilities. + * @param {boolean} [decode_options.returnTdtSteps=false] Include raw TDT duration steps. + * @param {number} [decode_options.timeOffset=0] Offset added to all timestamps (seconds). 
* @returns {Promise<{ * text: string, - * token_ids: number[], - * token_timestamps?: [number, number][], - * word_timestamps?: { text: string, timestamp: [number, number]}[], + * is_final: boolean, + * utterance_confidence?: number, * utterance_timestamp?: [number, number], + * words?: Array<{ text: string, start_time: number, end_time: number, confidence?: number }>, + * tokens?: Array<{ id: number, token: string, raw_token: string, is_word_start: boolean, start_time: number, end_time: number, confidence?: number }>, + * confidence_scores?: { token_avg: number|null, word_avg: number|null, frame: number[]|null, frame_avg: number|null, overall_log_prob: number|null }, + * metrics?: { preprocess_ms: number, encode_ms: number, decode_ms: number, tokenize_ms: number, total_ms: number, rtf: number, rtf_x: number }, + * frameIndices?: number[] | null, + * logProbs?: number[] | null, + * tdtSteps?: number[] | null, * }>} */ async transcribe( model_inputs, { tokenizer = null, - return_token_timestamps = true, - return_word_timestamps = true, - return_utterance_timestamp = true, + return_timestamps = true, + return_words = false, + return_tokens = false, + return_metrics = false, + returnFrameConfidences = false, + returnFrameIndices = false, + returnLogProbs = false, + returnTdtSteps = false, + timeOffset = 0, } = {}, ) { + const totalStart = nowMs(); const io = this.transducer.io; const vocabSize = this._resolveVocabSize(tokenizer); this._validateRuntimeConfig(vocabSize); const { feeds: encoderFeeds, disposables } = this._buildEncoderFeeds(model_inputs); let encoderOutputs; + const encodeStart = nowMs(); try { encoderOutputs = await this._runEncoder(encoderFeeds); } finally { @@ -452,6 +517,7 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { tensor.dispose(); } } + const encodeMs = nowMs() - encodeStart; const encoderOutput = this._getEncoderOutput(encoderOutputs); let frames; @@ -473,10 +539,22 @@ export class NemoConformerForTDT extends 
NemoConformerTDTPreTrainedModel { const blankId = this.transducer.blank_token_id; const maxSymbolsPerStep = this.transducer.max_symbols_per_step; + const needConfidences = !!return_timestamps; + /** @type {number[]} */ const tokenIds = []; /** @type {[number, number][]} */ const tokenTimestamps = []; + /** @type {number[] | null} */ + const tokenConfidences = needConfidences ? [] : null; + /** @type {number[] | null} */ + const frameConfidences = returnFrameConfidences ? [] : null; + /** @type {number[] | null} */ + const frameIndices = returnFrameIndices ? [] : null; + /** @type {number[] | null} */ + const logProbs = returnLogProbs || needConfidences ? [] : null; + /** @type {number[] | null} */ + const tdtSteps = returnTdtSteps ? [] : null; let decoderState = { state1: new Tensor('float32', new Float32Array(numLayers * hiddenSize), [numLayers, 1, hiddenSize]), @@ -488,6 +566,7 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { ? new Tensor('int64', BigInt64Array.from([1n]), [1]) : new Tensor('int32', new Int32Array([1]), [1]); let emittedOnFrame = 0; + const decodeStart = nowMs(); try { for (let frameIndex = 0; frameIndex < frames.length; ) { @@ -522,12 +601,22 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { ); } const tokenId = argmax(logitsData, 0, vocabSize); - const durationStart = this.transducer.duration_start_index ?? vocabSize; const hasDurationLogits = logitsData.length > durationStart; const step = hasDurationLogits ? argmax(logitsData, durationStart, logitsData.length - durationStart) - durationStart : 0; + if (tdtSteps) { + tdtSteps.push(step); + } + + const maybeConfidence = + needConfidences || returnLogProbs || returnFrameConfidences + ? confidenceFromLogits(logitsData, tokenId, vocabSize) + : null; + if (frameConfidences && maybeConfidence) { + frameConfidences.push(maybeConfidence.confidence); + } const newState = { state1: decoderOutput[io.decoder_output_state_1] ?? 
decoderState.state1, @@ -540,7 +629,19 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { tokenIds.push(tokenId); const durationFrames = step > 0 ? step : 1; - tokenTimestamps.push([frameIndex * frameTime, (frameIndex + durationFrames) * frameTime]); + tokenTimestamps.push([ + frameIndex * frameTime + timeOffset, + (frameIndex + durationFrames) * frameTime + timeOffset, + ]); + if (tokenConfidences && maybeConfidence) { + tokenConfidences.push(maybeConfidence.confidence); + } + if (frameIndices) { + frameIndices.push(frameIndex); + } + if (logProbs && maybeConfidence) { + logProbs.push(maybeConfidence.logProb); + } emittedOnFrame += 1; } else { this._disposeDecoderState(newState, decoderState); @@ -560,28 +661,77 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { targetLengthTensor.dispose(); this._disposeDecoderState(decoderState); } + const decodeMs = nowMs() - decodeStart; + const tokenizeStart = nowMs(); const text = decodeTransducerText(tokenizer, tokenIds); + const needDetailed = return_timestamps && (return_words || return_tokens); + const detailed = needDetailed + ? buildTransducerDetailedOutputs(tokenizer, tokenIds, tokenTimestamps, tokenConfidences) + : null; + const tokenizeMs = nowMs() - tokenizeStart; + + /** @type {any} */ + const result = { text, is_final: true }; + + if (return_timestamps) { + result.utterance_confidence = + tokenConfidences && tokenConfidences.length > 0 + ? tokenConfidences.reduce((a, b) => a + b, 0) / tokenConfidences.length + : null; + + result.utterance_timestamp = + tokenTimestamps.length > 0 + ? 
/** @type {[number, number]} */ ([ + tokenTimestamps[0][0], + tokenTimestamps[tokenTimestamps.length - 1][1], + ]) + : /** @type {[number, number]} */ ([timeOffset, frames.length * frameTime + timeOffset]); + + if (detailed) { + if (return_words) result.words = detailed.words; + if (return_tokens) result.tokens = detailed.tokens; + } - const result = { - text, - token_ids: tokenIds, - }; - - if (return_token_timestamps) { - result.token_timestamps = tokenTimestamps; + result.confidence_scores = { + token_avg: result.utterance_confidence, + word_avg: detailed?.word_avg ?? null, + overall_log_prob: + logProbs && logProbs.length > 0 ? logProbs.reduce((a, b) => a + b, 0) / logProbs.length : null, + }; + + if (frameConfidences && frameConfidences.length > 0) { + result.confidence_scores.frame = frameConfidences; + result.confidence_scores.frame_avg = + frameConfidences.reduce((a, b) => a + b, 0) / frameConfidences.length; + } } - if (return_word_timestamps) { - result.word_timestamps = buildTransducerWordTimestamps(tokenizer, tokenIds, tokenTimestamps); + if (returnFrameIndices) { + result.frameIndices = frameIndices; + } + if (returnLogProbs) { + result.logProbs = logProbs; + } + if (returnTdtSteps) { + result.tdtSteps = tdtSteps; } - if (return_utterance_timestamp) { - if (tokenTimestamps.length > 0) { - result.utterance_timestamp = [tokenTimestamps[0][0], tokenTimestamps[tokenTimestamps.length - 1][1]]; - } else { - result.utterance_timestamp = [0, frames.length * frameTime]; - } + if (return_metrics) { + const totalMs = nowMs() - totalStart; + const utteranceDuration = result.utterance_timestamp + ? 
Math.max(result.utterance_timestamp[1] - result.utterance_timestamp[0], 1e-8) + : Math.max(frames.length * frameTime, 1e-8); + const rtf = totalMs / 1000 / utteranceDuration; + result.metrics = { + preprocess_ms: 0.0, + encode_ms: roundMetric(encodeMs, 2), + decode_ms: roundMetric(decodeMs, 2), + tokenize_ms: roundMetric(tokenizeMs, 2), + total_ms: roundMetric(totalMs, 2), + rtf: roundMetric(rtf, 4), + rtf_x: roundMetric(1 / Math.max(rtf, 1e-8), 2), + }; } return result; diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js index 34c31e2a3..0550f323f 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js @@ -1,3 +1,60 @@ +/** + * Cache tokenizer id->token maps for stable and fast boundary detection. + * @type {WeakMap>} + */ +const TOKEN_ID_TO_TEXT_CACHE = new WeakMap(); + +/** + * @param {any} tokenizer + * @returns {Map} + */ +function getIdToTokenMap(tokenizer) { + let cached = TOKEN_ID_TO_TEXT_CACHE.get(tokenizer); + if (cached) return cached; + + cached = new Map(); + if (tokenizer?.get_vocab) { + const vocab = tokenizer.get_vocab(); + // get_vocab() may return a Map or a plain Object depending on the tokenizer backend. + const entries = vocab instanceof Map ? vocab.entries() : Object.entries(vocab); + for (const [token, id] of entries) { + if (Number.isInteger(id)) { + cached.set(id, token); + } + } + } + TOKEN_ID_TO_TEXT_CACHE.set(tokenizer, cached); + return cached; +} + +/** + * Resolve per-token text and word boundary metadata in a tokenizer-agnostic way. + * Uses raw vocab token (if available) for boundary markers, and decoded token text for display. 
+ * @param {any} tokenizer + * @param {number} id + * @returns {{ raw: string, clean: string, startsNewWord: boolean }} + */ +function resolveTokenPiece(tokenizer, id) { + const rawToken = getIdToTokenMap(tokenizer).get(id) ?? ''; + const decoded = tokenizer.decode([id], { + skip_special_tokens: true, + clean_up_tokenization_spaces: false, + }); + + // SentencePiece/BPE boundary markers used by common tokenizers. + const startsWithBoundaryMarker = /^(?:▁|Ġ)+/.test(rawToken); + const startsWithWhitespace = /^\s+/.test(decoded); + const startsNewWord = startsWithBoundaryMarker || startsWithWhitespace; + + // Human readable token text. + let clean = decoded.replace(/^\s+/, ''); + if (!clean) { + clean = rawToken.replace(/^(?:▁|Ġ|Ċ)+/, '').replace(/^ +/, ''); + } + + return { raw: rawToken || decoded, clean, startsNewWord }; +} + /** * Decode token ids into final transcription text. * @param {any} tokenizer @@ -11,68 +68,106 @@ export function decodeTransducerText(tokenizer, token_ids) { } /** - * Build word-level timestamps from token ids and token-level timestamps. + * Build detailed word/token outputs with optional confidence aggregation. 
* @param {any} tokenizer * @param {number[]} token_ids * @param {[number, number][]} token_timestamps - * @returns {{ text: string, timestamp: [number, number] }[]} + * @param {number[] | null} token_confidences + * @returns {{ + * words: Array<{ text: string, start_time: number, end_time: number, confidence?: number }>, + * tokens: Array<{ token: string, raw_token: string, is_word_start: boolean, start_time: number, end_time: number, confidence?: number }>, + * word_confidences: number[] | null, + * word_avg: number | null, + * }} */ -export function buildTransducerWordTimestamps(tokenizer, token_ids, token_timestamps) { +export function buildTransducerDetailedOutputs(tokenizer, token_ids, token_timestamps, token_confidences = null) { if (!tokenizer || token_ids.length === 0 || token_timestamps.length === 0) { - return []; + return { words: [], tokens: [], word_confidences: null, word_avg: null }; } - /** @type {{ text: string, timestamp: [number, number] }[]} */ + /** @type {Array<{ id: number, token: string, raw_token: string, is_word_start: boolean, start_time: number, end_time: number, confidence?: number }>} */ + const tokens = []; + /** @type {Array<{ text: string, start_time: number, end_time: number, confidence?: number }>} */ const words = []; - /** @type {{ text: string, start: number, end: number } | null} */ + + /** @type {{ text: string, start: number, end: number, confs: number[] } | null} */ let current = null; for (let i = 0; i < token_ids.length; ++i) { const id = token_ids[i]; const ts = token_timestamps[i]; - const piece = tokenizer.decode([id], { - skip_special_tokens: true, - clean_up_tokenization_spaces: false, - }); - - if (!piece) continue; + const piece = resolveTokenPiece(tokenizer, id); + const raw = piece.raw; + const startsNewWord = piece.startsNewWord; + const clean = piece.clean; + if (!clean) continue; - const startsNewWord = /^\s+/.test(piece) || piece.startsWith('▁'); - const normalizedPiece = piece.replace(/^\s+/, 
'').replace(/^▁+/, ''); - if (!normalizedPiece) continue; + const tok = { + id, + token: clean, + raw_token: raw, + is_word_start: startsNewWord, + start_time: ts[0], + end_time: ts[1], + }; + const conf = token_confidences?.[i]; + if (conf != null && Number.isFinite(conf)) { + tok.confidence = conf; + } + tokens.push(tok); if (!current || startsNewWord) { if (current) { const text = current.text.trim(); if (text) { - words.push({ + /** @type {{ text: string, start_time: number, end_time: number, confidence?: number }} */ + const word = { text, - // Keep tuple shape for TS consumers. - timestamp: [current.start, current.end], - }); + start_time: current.start, + end_time: current.end, + }; + if (current.confs.length > 0) { + word.confidence = current.confs.reduce((a, b) => a + b, 0) / current.confs.length; + } + words.push(word); } } current = { - text: normalizedPiece, + text: clean, start: ts[0], end: ts[1], + confs: conf != null && Number.isFinite(conf) ? [conf] : [], }; } else { - current.text += normalizedPiece; + current.text += clean; current.end = ts[1]; + if (conf != null && Number.isFinite(conf)) { + current.confs.push(conf); + } } } if (current) { const text = current.text.trim(); if (text) { - words.push({ + /** @type {{ text: string, start_time: number, end_time: number, confidence?: number }} */ + const word = { text, - // Keep tuple shape for TS consumers. - timestamp: [current.start, current.end], - }); + start_time: current.start, + end_time: current.end, + }; + if (current.confs.length > 0) { + word.confidence = current.confs.reduce((a, b) => a + b, 0) / current.confs.length; + } + words.push(word); } } - return words; + const word_confidences = words.some((x) => x.confidence != null) ? words.map((x) => x.confidence ?? 0) : null; + const word_avg = + word_confidences && word_confidences.length > 0 + ? 
word_confidences.reduce((a, b) => a + b, 0) / word_confidences.length + : null; + + return { words, tokens, word_confidences, word_avg }; } diff --git a/packages/transformers/src/models/nemo_conformer_tdt/utils_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/utils_nemo_conformer_tdt.js index 24859bc16..935336b1e 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/utils_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/utils_nemo_conformer_tdt.js @@ -1,4 +1,4 @@ // Backwards-compatible barrel for older internal imports. export { createAudioCacheKey, FeatureLRUCache } from './transducer_cache.js'; export { computeTemporalDeltas } from './transducer_deltas.js'; -export { decodeTransducerText, buildTransducerWordTimestamps } from './transducer_text.js'; +export { decodeTransducerText, buildTransducerDetailedOutputs } from './transducer_text.js'; diff --git a/packages/transformers/src/pipelines/automatic-speech-recognition.js b/packages/transformers/src/pipelines/automatic-speech-recognition.js index 3333063e4..5a2d088b3 100644 --- a/packages/transformers/src/pipelines/automatic-speech-recognition.js +++ b/packages/transformers/src/pipelines/automatic-speech-recognition.js @@ -340,24 +340,14 @@ export class AutomaticSpeechRecognitionPipeline return { text }; } - const wordChunks = (result.word_timestamps ?? []).map((item) => ({ - text: item.text, - timestamp: item.timestamp, + const wordChunks = (result.words ?? []).map((w) => ({ + text: w.text, + timestamp: [w.start_time, w.end_time], + })); + const tokenChunks = (result.tokens ?? []).map((t) => ({ + text: t.token ?? t.text ?? '', + timestamp: [t.start_time, t.end_time], })); - const tokenChunks = (result.token_timestamps ?? []).map((timestamp, index) => { - const tokenId = result.token_ids?.[index]; - const decodedToken = - tokenId == null - ? 
'' - : (this.tokenizer?.decode([tokenId], { - skip_special_tokens: true, - clean_up_tokenization_spaces: false, - }) ?? ''); - return { - text: decodedToken || (tokenId == null ? '' : `${tokenId}`), - timestamp, - }; - }); const utterance = result.utterance_timestamp; if (granularity === 'utterance') { @@ -389,13 +379,10 @@ export class AutomaticSpeechRecognitionPipeline /** * Nemo Conformer TDT ASR output rules: * - `return_timestamps=false`: `{ text }` - * - `timestamp_granularity='utterance'`: `chunks` contains a single utterance span - * - `timestamp_granularity='word'`: `chunks` contains word-level spans - * - `timestamp_granularity='token'`: `chunks` contains token-level spans - * - `timestamp_granularity='all'`: returns `chunks` (word), `tokens`, and `utterance` + * - `return_timestamps=true`: return full raw model transcription payload. */ async _call_nemo_conformer_tdt(audio, kwargs) { - if (typeof /** @type {any} */ (this.model).transcribe !== 'function') { + if (typeof (/** @type {any} */ (this.model).transcribe) !== 'function') { throw new Error('Nemo Conformer TDT model does not expose a `transcribe` method.'); } if (!this.processor) { @@ -411,17 +398,17 @@ export class AutomaticSpeechRecognitionPipeline } const return_timestamps = kwargs.return_timestamps ?? 
false; - const withTimestamps = return_timestamps !== false; const granularity = this._normalizeNemoConformerTimestampGranularity( - withTimestamps, + return_timestamps, kwargs.timestamp_granularity, ); + const withTimestamps = granularity !== null; const decodeOptions = { tokenizer: this.tokenizer, - return_token_timestamps: granularity === 'token' || granularity === 'all', - return_word_timestamps: granularity === 'word' || granularity === 'all', - return_utterance_timestamp: granularity === 'utterance' || granularity === 'all', + return_timestamps: withTimestamps, + return_words: granularity === 'word' || granularity === 'all', + return_tokens: granularity === 'token' || granularity === 'all', }; const single = !Array.isArray(audio); From f6835ad84a98036204028fc9ce02f55e9ec2047e Mon Sep 17 00:00:00 2001 From: ysdede Date: Sun, 1 Mar 2026 21:01:10 +0300 Subject: [PATCH 05/33] fix(nemo-conformer-tdt): round timestamps and confidences, simplify pipeline - Add roundTs() for millisecond-precision timestamp rounding at source - Round all confidence averages to 6 decimal places - Round per-token and per-word confidence values - Remove timestamp_granularity and formatting helpers from pipeline - Pipeline returns model.transcribe() output directly - Auto-enable return_words and return_metrics when return_timestamps is true --- .../modeling_nemo_conformer_tdt.js | 27 +++-- .../nemo_conformer_tdt/transducer_text.js | 10 +- .../pipelines/automatic-speech-recognition.js | 98 +++---------------- 3 files changed, 36 insertions(+), 99 deletions(-) diff --git a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js index 766c189d8..2c48ef8e4 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js @@ -46,6 +46,10 @@ function 
roundMetric(value, digits = 2) { return Math.round(value * factor) / factor; } +function roundTs(value) { + return Math.round(value * 1000) / 1000; +} + /** * @param {Float32Array|number[]} logits * @param {number} tokenId @@ -630,8 +634,8 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { tokenIds.push(tokenId); const durationFrames = step > 0 ? step : 1; tokenTimestamps.push([ - frameIndex * frameTime + timeOffset, - (frameIndex + durationFrames) * frameTime + timeOffset, + roundTs(frameIndex * frameTime + timeOffset), + roundTs((frameIndex + durationFrames) * frameTime + timeOffset), ]); if (tokenConfidences && maybeConfidence) { tokenConfidences.push(maybeConfidence.confidence); @@ -677,7 +681,7 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { if (return_timestamps) { result.utterance_confidence = tokenConfidences && tokenConfidences.length > 0 - ? tokenConfidences.reduce((a, b) => a + b, 0) / tokenConfidences.length + ? roundMetric(tokenConfidences.reduce((a, b) => a + b, 0) / tokenConfidences.length, 6) : null; result.utterance_timestamp = @@ -686,7 +690,10 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { tokenTimestamps[0][0], tokenTimestamps[tokenTimestamps.length - 1][1], ]) - : /** @type {[number, number]} */ ([timeOffset, frames.length * frameTime + timeOffset]); + : /** @type {[number, number]} */ ([ + roundTs(timeOffset), + roundTs(frames.length * frameTime + timeOffset), + ]); if (detailed) { if (return_words) result.words = detailed.words; @@ -695,15 +702,19 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { result.confidence_scores = { token_avg: result.utterance_confidence, - word_avg: detailed?.word_avg ?? null, + word_avg: detailed?.word_avg != null ? roundMetric(detailed.word_avg, 6) : null, overall_log_prob: - logProbs && logProbs.length > 0 ? 
logProbs.reduce((a, b) => a + b, 0) / logProbs.length : null, + logProbs && logProbs.length > 0 + ? roundMetric(logProbs.reduce((a, b) => a + b, 0) / logProbs.length, 6) + : null, }; if (frameConfidences && frameConfidences.length > 0) { result.confidence_scores.frame = frameConfidences; - result.confidence_scores.frame_avg = - frameConfidences.reduce((a, b) => a + b, 0) / frameConfidences.length; + result.confidence_scores.frame_avg = roundMetric( + frameConfidences.reduce((a, b) => a + b, 0) / frameConfidences.length, + 6, + ); } } diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js index 0550f323f..e98938224 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js @@ -112,7 +112,7 @@ export function buildTransducerDetailedOutputs(tokenizer, token_ids, token_times }; const conf = token_confidences?.[i]; if (conf != null && Number.isFinite(conf)) { - tok.confidence = conf; + tok.confidence = Math.round(conf * 1e6) / 1e6; } tokens.push(tok); @@ -127,7 +127,8 @@ export function buildTransducerDetailedOutputs(tokenizer, token_ids, token_times end_time: current.end, }; if (current.confs.length > 0) { - word.confidence = current.confs.reduce((a, b) => a + b, 0) / current.confs.length; + word.confidence = + Math.round((current.confs.reduce((a, b) => a + b, 0) / current.confs.length) * 1e6) / 1e6; } words.push(word); } @@ -157,7 +158,8 @@ export function buildTransducerDetailedOutputs(tokenizer, token_ids, token_times end_time: current.end, }; if (current.confs.length > 0) { - word.confidence = current.confs.reduce((a, b) => a + b, 0) / current.confs.length; + word.confidence = + Math.round((current.confs.reduce((a, b) => a + b, 0) / current.confs.length) * 1e6) / 1e6; } words.push(word); } @@ -166,7 +168,7 @@ export function 
buildTransducerDetailedOutputs(tokenizer, token_ids, token_times const word_confidences = words.some((x) => x.confidence != null) ? words.map((x) => x.confidence ?? 0) : null; const word_avg = word_confidences && word_confidences.length > 0 - ? word_confidences.reduce((a, b) => a + b, 0) / word_confidences.length + ? Math.round((word_confidences.reduce((a, b) => a + b, 0) / word_confidences.length) * 1e6) / 1e6 : null; return { words, tokens, word_confidences, word_avg }; diff --git a/packages/transformers/src/pipelines/automatic-speech-recognition.js b/packages/transformers/src/pipelines/automatic-speech-recognition.js index 5a2d088b3..4194bdb9a 100644 --- a/packages/transformers/src/pipelines/automatic-speech-recognition.js +++ b/packages/transformers/src/pipelines/automatic-speech-recognition.js @@ -16,21 +16,14 @@ import { logger } from '../utils/logger.js'; * @property {string} text The recognized text. */ -/** - * @typedef {'utterance' | 'word' | 'token' | 'all'} TimestampGranularity - */ - /** * @typedef {Object} AutomaticSpeechRecognitionOutput * @property {string} text The recognized text. * @property {Chunk[]} [chunks] When using `return_timestamps`, the `chunks` will become a list * containing all the various text chunks identified by the model. - * @property {Chunk[]} [tokens] Optional token-level timestamp chunks for models that support them. - * @property {[number, number]} [utterance] Optional utterance-level timestamp span. * * @typedef {Object} AutomaticSpeechRecognitionSpecificParams Parameters specific to automatic-speech-recognition pipelines. * @property {boolean|'word'} [return_timestamps] Whether to return timestamps or not. Default is `false`. - * @property {TimestampGranularity} [timestamp_granularity] Granularity used when `return_timestamps` is enabled for Parakeet TDT models. Default is `'word'`. * @property {number} [chunk_length_s] The length of audio chunks to process in seconds. Default is 0 (no chunking). 
* @property {number} [stride_length_s] The length of overlap between consecutive audio chunks in seconds. If not provided, defaults to `chunk_length_s / 6`. * @property {boolean} [force_full_sequences] Whether to force outputting full sequences or not. Default is `false`. @@ -310,76 +303,12 @@ export class AutomaticSpeechRecognitionPipeline } /** - * @param {any} return_timestamps - * @param {any} timestamp_granularity - * @returns {TimestampGranularity|null} - */ - _normalizeNemoConformerTimestampGranularity(return_timestamps, timestamp_granularity) { - if (!return_timestamps) { - return null; - } - - const granularity = timestamp_granularity ?? 'word'; - const allowed = ['utterance', 'word', 'token', 'all']; - if (!allowed.includes(granularity)) { - throw new Error( - `Invalid \`timestamp_granularity\`: "${granularity}". Expected one of: ${allowed.join(', ')}.`, - ); - } - return /** @type {TimestampGranularity} */ (granularity); - } - - /** - * @param {any} result - * @param {TimestampGranularity|null} granularity - * @returns {AutomaticSpeechRecognitionOutput} - */ - _formatNemoConformerTDTResult(result, granularity) { - const text = result.text ?? ''; - if (!granularity) { - return { text }; - } - - const wordChunks = (result.words ?? []).map((w) => ({ - text: w.text, - timestamp: [w.start_time, w.end_time], - })); - const tokenChunks = (result.tokens ?? []).map((t) => ({ - text: t.token ?? t.text ?? '', - timestamp: [t.start_time, t.end_time], - })); - const utterance = result.utterance_timestamp; - - if (granularity === 'utterance') { - if (!utterance) { - return { text, chunks: [] }; - } - return { - text, - chunks: [{ text, timestamp: utterance }], - }; - } - - if (granularity === 'word') { - return { text, chunks: wordChunks }; - } - - if (granularity === 'token') { - return { text, chunks: tokenChunks }; - } - - return { - text, - chunks: wordChunks, - tokens: tokenChunks, - ...(utterance ? 
{ utterance } : {}), - }; - } - - /** - * Nemo Conformer TDT ASR output rules: - * - `return_timestamps=false`: `{ text }` - * - `return_timestamps=true`: return full raw model transcription payload. + * Nemo Conformer TDT ASR pipeline. + * + * Delegates to model.transcribe() and returns its output directly. + * Use `return_timestamps: true` on the pipeline call to get utterance-level data. + * For words/tokens/metrics/debug, call model.transcribe() directly with the + * extended options (return_words, return_tokens, return_metrics, etc.). */ async _call_nemo_conformer_tdt(audio, kwargs) { if (typeof (/** @type {any} */ (this.model).transcribe) !== 'function') { @@ -397,18 +326,13 @@ export class AutomaticSpeechRecognitionPipeline ); } - const return_timestamps = kwargs.return_timestamps ?? false; - const granularity = this._normalizeNemoConformerTimestampGranularity( - return_timestamps, - kwargs.timestamp_granularity, - ); - const withTimestamps = granularity !== null; + const return_timestamps = !!(kwargs.return_timestamps ?? false); const decodeOptions = { tokenizer: this.tokenizer, - return_timestamps: withTimestamps, - return_words: granularity === 'word' || granularity === 'all', - return_tokens: granularity === 'token' || granularity === 'all', + return_timestamps, + return_words: return_timestamps, + return_metrics: true, }; const single = !Array.isArray(audio); @@ -420,7 +344,7 @@ export class AutomaticSpeechRecognitionPipeline for (const aud of preparedAudios) { const inputs = await this.processor(aud); const output = await /** @type {any} */ (this.model).transcribe(inputs, decodeOptions); - toReturn.push(this._formatNemoConformerTDTResult(output, granularity)); + toReturn.push(output); } return single ? 
toReturn[0] : toReturn; From 2dd36a117147269b93ffea945b0f4076959c211f Mon Sep 17 00:00:00 2001 From: ysdede Date: Sun, 1 Mar 2026 21:13:20 +0300 Subject: [PATCH 06/33] fix: dispose tensors on error path, decouple frame confidences from timestamps, honor return_metrics kwarg - modeling_nemo_conformer_tdt: dispose logits and new decoder state tensors before throwing when logitsData.length < vocabSize to prevent resource leak - modeling_nemo_conformer_tdt: move returnFrameConfidences output block outside the return_timestamps guard so frame/frame_avg are emitted independently - automatic-speech-recognition: change return_metrics from hardcoded true to kwargs.return_metrics ?? false to respect user intent and avoid overhead --- .../modeling_nemo_conformer_tdt.js | 47 +++++++++++-------- .../pipelines/automatic-speech-recognition.js | 4 +- 2 files changed, 30 insertions(+), 21 deletions(-) diff --git a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js index 2c48ef8e4..8a46bb014 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js @@ -131,7 +131,7 @@ function resolveTransducerConfig(config, sessions) { if (missingDecoderInputs.length > 0) { throw new Error( `Nemo Conformer TDT decoder session is missing expected inputs: ${missingDecoderInputs.join(', ')}. 
` + - 'Override I/O names via `transformers.js_config.transducer.io` if your export uses different names.', + 'Override I/O names via `transformers.js_config.transducer.io` if your export uses different names.', ); } const missingDecoderOutputs = [io.decoder_output, io.decoder_output_state_1, io.decoder_output_state_2].filter( @@ -140,7 +140,7 @@ function resolveTransducerConfig(config, sessions) { if (missingDecoderOutputs.length > 0) { throw new Error( `Nemo Conformer TDT decoder session is missing expected outputs: ${missingDecoderOutputs.join(', ')}. ` + - 'Override I/O names via `transformers.js_config.transducer.io` if your export uses different names.', + 'Override I/O names via `transformers.js_config.transducer.io` if your export uses different names.', ); } @@ -151,7 +151,7 @@ function resolveTransducerConfig(config, sessions) { if (!(encoderSession.outputNames ?? []).includes(io.encoder_output)) { throw new Error( `Nemo Conformer TDT encoder session is missing expected output: ${io.encoder_output}. ` + - 'Override `transformers.js_config.transducer.io.encoder_output` if your export uses a different name.', + 'Override `transformers.js_config.transducer.io.encoder_output` if your export uses a different name.', ); } @@ -258,7 +258,7 @@ export class NemoConformerTDTPreTrainedModel extends PreTrainedModel { if (options.model_file_name && options.model_file_name !== 'encoder_model') { throw new Error( 'NemoConformerForTDT does not support `model_file_name` override. ' + - 'Expected canonical files: `encoder_model{suffix}.onnx` and `decoder_model_merged{suffix}.onnx`.', + 'Expected canonical files: `encoder_model{suffix}.onnx` and `decoder_model_merged{suffix}.onnx`.', ); } @@ -277,8 +277,8 @@ export class NemoConformerTDTPreTrainedModel extends PreTrainedModel { const reason = error?.message ?? String(error); throw new Error( 'Failed to load Nemo Conformer TDT sessions. 
Expected canonical v4 files under `onnx/`: ' + - '`encoder_model{suffix}.onnx` and `decoder_model_merged{suffix}.onnx`. ' + - `Original error: ${reason}`, + '`encoder_model{suffix}.onnx` and `decoder_model_merged{suffix}.onnx`. ' + + `Original error: ${reason}`, ); } @@ -573,7 +573,7 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { const decodeStart = nowMs(); try { - for (let frameIndex = 0; frameIndex < frames.length; ) { + for (let frameIndex = 0; frameIndex < frames.length;) { const frameTensor = this._createFrameTensor(frames[frameIndex]); const prevTokenId = tokenIds.length > 0 ? tokenIds[tokenIds.length - 1] : blankId; const tokenTensor = @@ -600,6 +600,11 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { const logits = decoderOutput[io.decoder_output] ?? Object.values(decoderOutput)[0]; const logitsData = logits.data; if (logitsData.length < vocabSize) { + logits.dispose(); + this._disposeDecoderState({ + state1: decoderOutput[io.decoder_output_state_1], + state2: decoderOutput[io.decoder_output_state_2], + }); throw new Error( `Nemo Conformer TDT decoder output is too small (${logitsData.length}) for vocab_size=${vocabSize}.`, ); @@ -687,13 +692,13 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { result.utterance_timestamp = tokenTimestamps.length > 0 ? /** @type {[number, number]} */ ([ - tokenTimestamps[0][0], - tokenTimestamps[tokenTimestamps.length - 1][1], - ]) + tokenTimestamps[0][0], + tokenTimestamps[tokenTimestamps.length - 1][1], + ]) : /** @type {[number, number]} */ ([ - roundTs(timeOffset), - roundTs(frames.length * frameTime + timeOffset), - ]); + roundTs(timeOffset), + roundTs(frames.length * frameTime + timeOffset), + ]); if (detailed) { if (return_words) result.words = detailed.words; @@ -708,14 +713,18 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { ? 
roundMetric(logProbs.reduce((a, b) => a + b, 0) / logProbs.length, 6) : null, }; + } - if (frameConfidences && frameConfidences.length > 0) { - result.confidence_scores.frame = frameConfidences; - result.confidence_scores.frame_avg = roundMetric( - frameConfidences.reduce((a, b) => a + b, 0) / frameConfidences.length, - 6, - ); + // Frame confidences are independent of return_timestamps — emit whenever requested. + if (returnFrameConfidences && frameConfidences && frameConfidences.length > 0) { + if (!result.confidence_scores) { + result.confidence_scores = {}; } + result.confidence_scores.frame = frameConfidences; + result.confidence_scores.frame_avg = roundMetric( + frameConfidences.reduce((a, b) => a + b, 0) / frameConfidences.length, + 6, + ); } if (returnFrameIndices) { diff --git a/packages/transformers/src/pipelines/automatic-speech-recognition.js b/packages/transformers/src/pipelines/automatic-speech-recognition.js index 4194bdb9a..77592f5a0 100644 --- a/packages/transformers/src/pipelines/automatic-speech-recognition.js +++ b/packages/transformers/src/pipelines/automatic-speech-recognition.js @@ -332,7 +332,7 @@ export class AutomaticSpeechRecognitionPipeline tokenizer: this.tokenizer, return_timestamps, return_words: return_timestamps, - return_metrics: true, + return_metrics: kwargs.return_metrics ?? false, }; const single = !Array.isArray(audio); @@ -365,7 +365,7 @@ export class AutomaticSpeechRecognitionPipeline const max_new_tokens = Math.floor(aud.length / sampling_rate) * 6; const outputs = await this.model.generate({ max_new_tokens, ...kwargs, ...inputs }); - const text = this.processor.batch_decode(/** @type {Tensor} */ (outputs), { skip_special_tokens: true })[0]; + const text = this.processor.batch_decode(/** @type {Tensor} */(outputs), { skip_special_tokens: true })[0]; toReturn.push({ text }); } return single ? 
toReturn[0] : toReturn; From 39d9be4f47f087d56c3f1c06db638aebf520e143 Mon Sep 17 00:00:00 2001 From: ysdede Date: Sun, 1 Mar 2026 21:28:15 +0300 Subject: [PATCH 07/33] feat: integrate NemoConformerTDT with ModelRegistry API - Add MODEL_TYPES.NemoConformerTDT (id=16) to modeling_utils - Register NemoConformerForTDT in MODEL_TYPE_MAPPING, MODEL_NAME_TO_CLASS_MAPPING, and MODEL_CLASS_TO_NAME_MAPPING so the base class from_pretrained, ModelRegistry, and is_pipeline_cached all recognise the model correctly - Add NemoConformerTDT case to get_model_files so progress_callback receives accurate file size totals for encoder_model.onnx + decoder_model_merged.onnx --- .../transformers/src/models/modeling_utils.js | 3 ++- .../modeling_nemo_conformer_tdt.js | 16 +++++++++++++++- .../src/utils/model_registry/get_model_files.js | 7 +++++-- 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/packages/transformers/src/models/modeling_utils.js b/packages/transformers/src/models/modeling_utils.js index d4c5d6d32..ef6e5a99a 100644 --- a/packages/transformers/src/models/modeling_utils.js +++ b/packages/transformers/src/models/modeling_utils.js @@ -98,6 +98,7 @@ export const MODEL_TYPES = { ImageAudioTextToText: 13, Supertonic: 14, Chatterbox: 15, + NemoConformerTDT: 16, }; const MODEL_TYPE_CONFIG = { @@ -857,7 +858,7 @@ export class PreTrainedModel extends Callable { if (inputs) { throw new Error( '`inputs`: {inputs}` were passed alongside {input_name} which is not allowed. 
' + - 'Make sure to either pass {inputs} or {input_name}=...', + 'Make sure to either pass {inputs} or {input_name}=...', ); } } else { diff --git a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js index 8a46bb014..1dceb2947 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js @@ -1,6 +1,12 @@ import { AutoConfig } from '../../configs.js'; import { Tensor } from '../../utils/tensor.js'; -import { PreTrainedModel } from '../modeling_utils.js'; +import { + PreTrainedModel, + MODEL_TYPES, + MODEL_TYPE_MAPPING, + MODEL_NAME_TO_CLASS_MAPPING, + MODEL_CLASS_TO_NAME_MAPPING, +} from '../modeling_utils.js'; import { constructSessions, sessionRun } from '../session.js'; import { buildTransducerDetailedOutputs, decodeTransducerText } from './transducer_text.js'; @@ -765,3 +771,11 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { return await this.transcribe(model_inputs); } } + +// Register with ModelRegistry so get_model_files / progress_callback enumerate +// the correct ONNX files: encoder_model + decoder_model_merged. 
+MODEL_TYPE_MAPPING.set('nemo-conformer-tdt', MODEL_TYPES.NemoConformerTDT); +MODEL_NAME_TO_CLASS_MAPPING.set('NemoConformerTDTPreTrainedModel', NemoConformerTDTPreTrainedModel); +MODEL_NAME_TO_CLASS_MAPPING.set('NemoConformerForTDT', NemoConformerForTDT); +MODEL_CLASS_TO_NAME_MAPPING.set(NemoConformerTDTPreTrainedModel, 'NemoConformerTDTPreTrainedModel'); +MODEL_CLASS_TO_NAME_MAPPING.set(NemoConformerForTDT, 'NemoConformerForTDT'); diff --git a/packages/transformers/src/utils/model_registry/get_model_files.js b/packages/transformers/src/utils/model_registry/get_model_files.js index d60dbfd0a..b0cf029e1 100644 --- a/packages/transformers/src/utils/model_registry/get_model_files.js +++ b/packages/transformers/src/utils/model_registry/get_model_files.js @@ -71,8 +71,8 @@ export async function get_model_files( const archList = architectures.length > 0 ? architectures.join(', ') : '(none)'; logger.warn( `[get_model_files] Architecture(s) not found in MODEL_TYPE_MAPPING: [${archList}] ` + - `for model type '${config.model_type}'. Falling back to EncoderOnly (single model.onnx file). ` + - `If you encounter issues, please report at: ${GITHUB_ISSUE_URL}`, + `for model type '${config.model_type}'. Falling back to EncoderOnly (single model.onnx file). 
` + + `If you encounter issues, please report at: ${GITHUB_ISSUE_URL}`, ); // Always fallback to EncoderOnly (single model.onnx file) @@ -166,6 +166,9 @@ export async function get_model_files( add_model_file('model', 'language_model'); add_model_file('conditional_decoder'); files.push('generation_config.json'); + } else if (modelType === MODEL_TYPES.NemoConformerTDT) { + add_model_file('model', 'encoder_model'); + add_model_file('decoder_model_merged'); } else if (modelType === MODEL_TYPES.AutoEncoder) { add_model_file('encoder_model'); add_model_file('decoder_model'); From 3d984e527c733de5aa5a5d0785acd33317baf84c Mon Sep 17 00:00:00 2001 From: ysdede Date: Sun, 1 Mar 2026 21:58:16 +0300 Subject: [PATCH 08/33] style: replace console.warn with logger.warn in feature extractor Standardizes internal logging to follow the upstream convention introduced in ModelRegistry refactor. --- .../feature_extraction_nemo_conformer_tdt.js | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js index f1bfe6b76..0bde33d54 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js @@ -1,6 +1,7 @@ import { FeatureExtractor, validate_audio_inputs } from '../../feature_extraction_utils.js'; import { Tensor } from '../../utils/tensor.js'; import { mel_filter_bank, spectrogram, window_function } from '../../utils/audio.js'; +import { logger } from '../../utils/logger.js'; import { FeatureLRUCache, createAudioCacheKey } from './transducer_cache.js'; import { computeTemporalDeltas } from './transducer_deltas.js'; @@ -47,17 +48,17 @@ export class NemoConformerTDTFeatureExtractor extends FeatureExtractor { ); } if (this.delta_order > 0 && 
!this.delta_concatenate) { - console.warn( + logger.warn( 'NemoConformerTDTFeatureExtractor: `delta_concatenate=false` is set. ' + - '`input_features` will remain base features and deltas are returned in extra fields.', + '`input_features` will remain base features and deltas are returned in extra fields.', ); } this.feature_cache = this.use_feature_cache ? new FeatureLRUCache({ - max_entries: this.config.feature_cache_max_entries ?? 128, - max_size_mb: this.config.feature_cache_max_size_mb ?? 64, - }) + max_entries: this.config.feature_cache_max_entries ?? 128, + max_size_mb: this.config.feature_cache_max_size_mb ?? 64, + }) : null; } From 9f3a220284d43bf9a859f3e9d4190bd2636832d0 Mon Sep 17 00:00:00 2001 From: ysdede Date: Sun, 1 Mar 2026 22:22:28 +0300 Subject: [PATCH 09/33] fix(nemo-conformer-tdt): harden edge cases, restore pipeline design - Guard feature extractor against empty/short audio (NaN prevention) - Move decoder tensor init inside try block for safe disposal - Add architecture key to MODEL_TYPE_MAPPING - Add input validation in buildTransducerDetailedOutputs - Harden audio cache hash against NaN samples - Add order validation in computeTemporalDeltas - Restore pipeline: return_timestamps truthy => words + metrics always on --- packages/transformers/src/models/models.js | 2 +- .../feature_extraction_nemo_conformer_tdt.js | 33 +++++++++++-------- .../modeling_nemo_conformer_tdt.js | 29 +++++++++------- .../nemo_conformer_tdt/transducer_cache.js | 3 +- .../nemo_conformer_tdt/transducer_deltas.js | 10 ++++-- .../nemo_conformer_tdt/transducer_text.js | 10 ++++++ .../pipelines/automatic-speech-recognition.js | 4 +-- 7 files changed, 59 insertions(+), 32 deletions(-) diff --git a/packages/transformers/src/models/models.js b/packages/transformers/src/models/models.js index 2fe9055a0..0cac26e2a 100644 --- a/packages/transformers/src/models/models.js +++ b/packages/transformers/src/models/models.js @@ -102,8 +102,8 @@ export * from './mpt/modeling_mpt.js'; 
export * from './mt5/modeling_mt5.js'; export * from './multi_modality/modeling_multi_modality.js'; export * from './musicgen/modeling_musicgen.js'; -export * from './nemo_conformer_tdt/modeling_nemo_conformer_tdt.js'; export * from './nanochat/modeling_nanochat.js'; +export * from './nemo_conformer_tdt/modeling_nemo_conformer_tdt.js'; export * from './neobert/modeling_neobert.js'; export * from './nomic_bert/modeling_nomic_bert.js'; export * from './olmo/modeling_olmo.js'; diff --git a/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js index 0bde33d54..e230ff50c 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js @@ -129,15 +129,17 @@ export class NemoConformerTDTFeatureExtractor extends FeatureExtractor { async _extract(audio) { const features = await this._extract_fbank_features(audio); - const features_length = Math.floor( + const [num_frames, num_features] = features.dims; + const raw_features_length = Math.floor( (audio.length + Math.floor(this.config.n_fft / 2) * 2 - this.config.n_fft) / this.config.hop_length, ); + // Clamp to [0, num_frames] to avoid a negative fill offset for very short clips. 
+ const features_length = Math.max(0, Math.min(num_frames, raw_features_length)); const features_data = /** @type {Float32Array} */ (features.data); - features_data.fill(0, features_length * features.dims[1]); + features_data.fill(0, features_length * num_features); // normalize mel features, ignoring padding - const [num_frames, num_features] = features.dims; const sum = new Float64Array(num_features); const sum_sq = new Float64Array(num_features); @@ -150,17 +152,20 @@ export class NemoConformerTDTFeatureExtractor extends FeatureExtractor { } } - // Calculate mean and standard deviation, then normalize - const divisor = features_length > 1 ? features_length - 1 : 1; - for (let j = 0; j < num_features; ++j) { - const mean = sum[j] / features_length; - const variance = (sum_sq[j] - features_length * mean * mean) / divisor; - const std = Math.sqrt(variance) + EPSILON; - const inv_std = 1 / std; - - for (let i = 0; i < features_length; ++i) { - const index = i * num_features + j; - features_data[index] = (features_data[index] - mean) * inv_std; + // Skip normalization for empty/very short audio to avoid NaN from divide-by-zero. + if (features_length > 0) { + // Calculate mean and standard deviation, then normalize + const divisor = features_length > 1 ? 
features_length - 1 : 1; + for (let j = 0; j < num_features; ++j) { + const mean = sum[j] / features_length; + const variance = (sum_sq[j] - features_length * mean * mean) / divisor; + const std = Math.sqrt(variance) + EPSILON; + const inv_std = 1 / std; + + for (let i = 0; i < features_length; ++i) { + const index = i * num_features + j; + features_data[index] = (features_data[index] - mean) * inv_std; + } } } diff --git a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js index 1dceb2947..be25afe42 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js @@ -566,19 +566,23 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { /** @type {number[] | null} */ const tdtSteps = returnTdtSteps ? [] : null; - let decoderState = { - state1: new Tensor('float32', new Float32Array(numLayers * hiddenSize), [numLayers, 1, hiddenSize]), - state2: new Tensor('float32', new Float32Array(numLayers * hiddenSize), [numLayers, 1, hiddenSize]), - }; + let decoderState; + let targetLengthTensor; - const targetLengthTensor = - this.transducer.decoder_token_length_dtype === 'int64' - ? new Tensor('int64', BigInt64Array.from([1n]), [1]) - : new Tensor('int32', new Int32Array([1]), [1]); let emittedOnFrame = 0; const decodeStart = nowMs(); try { + decoderState = { + state1: new Tensor('float32', new Float32Array(numLayers * hiddenSize), [numLayers, 1, hiddenSize]), + state2: new Tensor('float32', new Float32Array(numLayers * hiddenSize), [numLayers, 1, hiddenSize]), + }; + + targetLengthTensor = + this.transducer.decoder_token_length_dtype === 'int64' + ? 
new Tensor('int64', BigInt64Array.from([1n]), [1]) + : new Tensor('int32', new Int32Array([1]), [1]); + for (let frameIndex = 0; frameIndex < frames.length;) { const frameTensor = this._createFrameTensor(frames[frameIndex]); const prevTokenId = tokenIds.length > 0 ? tokenIds[tokenIds.length - 1] : blankId; @@ -643,6 +647,8 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { decoderState = newState; tokenIds.push(tokenId); + // TDT duration convention: step=0 means "stay on current frame" (duration index 0 = no advance). + // We still associate the token with this frame, so durationFrames is at least 1. const durationFrames = step > 0 ? step : 1; tokenTimestamps.push([ roundTs(frameIndex * frameTime + timeOffset), @@ -673,8 +679,8 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { } } } finally { - targetLengthTensor.dispose(); - this._disposeDecoderState(decoderState); + if (targetLengthTensor) targetLengthTensor.dispose(); + if (decoderState) this._disposeDecoderState(decoderState); } const decodeMs = nowMs() - decodeStart; @@ -774,7 +780,8 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { // Register with ModelRegistry so get_model_files / progress_callback enumerate // the correct ONNX files: encoder_model + decoder_model_merged. 
-MODEL_TYPE_MAPPING.set('nemo-conformer-tdt', MODEL_TYPES.NemoConformerTDT); +MODEL_TYPE_MAPPING.set('nemo-conformer-tdt', MODEL_TYPES.NemoConformerTDT); // model_type key +MODEL_TYPE_MAPPING.set('NemoConformerForTDT', MODEL_TYPES.NemoConformerTDT); // architecture key MODEL_NAME_TO_CLASS_MAPPING.set('NemoConformerTDTPreTrainedModel', NemoConformerTDTPreTrainedModel); MODEL_NAME_TO_CLASS_MAPPING.set('NemoConformerForTDT', NemoConformerForTDT); MODEL_CLASS_TO_NAME_MAPPING.set(NemoConformerTDTPreTrainedModel, 'NemoConformerTDTPreTrainedModel'); diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js index 3042e9e8f..5a3af06ef 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js @@ -17,7 +17,8 @@ export function createAudioCacheKey(audio, sampling_rate = 16000) { // Sample stride hash to keep keying cheap for long audio. const stride = Math.max(1, Math.floor(audio.length / 4096)); for (let i = 0; i < audio.length; i += stride) { - const q = (audio[i] * 32768) | 0; + const sample = Number.isFinite(audio[i]) ? 
audio[i] : 0; + const q = Math.max(-32768, Math.min(32767, Math.round(sample * 32768))); hash ^= q; hash = Math.imul(hash, 16777619); } diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_deltas.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_deltas.js index 80a85f8be..1abe48139 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/transducer_deltas.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_deltas.js @@ -16,6 +16,9 @@ export function computeTemporalDeltas(input_features, { order = 1, window = 2, c if (!Number.isInteger(window) || window <= 0) { throw new Error('computeTemporalDeltas expects `window` to be a positive integer.'); } + if (order !== 1 && order !== 2) { + throw new Error('computeTemporalDeltas expects `order` to be 1 or 2.'); + } const [batch, T, F] = input_features.dims; const base = /** @type {Float32Array} */ (input_features.data); @@ -43,10 +46,10 @@ export function computeTemporalDeltas(input_features, { order = 1, window = 2, c return new Tensor('float32', concatFloat32([base, delta]), [batch, T, F * 2]); } - const delta_delta = /** @type {{delta: Tensor}} */ ( + const recursive_result = /** @type {{delta: Tensor}} */ ( computeTemporalDeltas(delta_tensor, { order: 1, window, concatenate: false }) - ).delta.data; - const delta_delta_tensor = new Tensor('float32', delta_delta, [batch, T, F]); + ); + const delta_delta_tensor = recursive_result.delta; if (!concatenate) { return { delta: delta_tensor, @@ -54,6 +57,7 @@ export function computeTemporalDeltas(input_features, { order = 1, window = 2, c }; } + const delta_delta = /** @type {Float32Array} */ (delta_delta_tensor.data); return new Tensor('float32', concatFloat32([base, delta, delta_delta]), [batch, T, F * 3]); } diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js index e98938224..33729d394 100644 --- 
a/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js @@ -84,6 +84,16 @@ export function buildTransducerDetailedOutputs(tokenizer, token_ids, token_times if (!tokenizer || token_ids.length === 0 || token_timestamps.length === 0) { return { words: [], tokens: [], word_confidences: null, word_avg: null }; } + if (token_ids.length !== token_timestamps.length) { + throw new Error( + `buildTransducerDetailedOutputs expects equal lengths for token_ids (${token_ids.length}) and token_timestamps (${token_timestamps.length}).`, + ); + } + if (token_confidences && token_confidences.length !== token_ids.length) { + throw new Error( + `buildTransducerDetailedOutputs expects token_confidences length (${token_confidences.length}) to match token_ids length (${token_ids.length}).`, + ); + } /** @type {Array<{ id: number, token: string, raw_token: string, is_word_start: boolean, start_time: number, end_time: number, confidence?: number }>} */ const tokens = []; diff --git a/packages/transformers/src/pipelines/automatic-speech-recognition.js b/packages/transformers/src/pipelines/automatic-speech-recognition.js index 77592f5a0..882dfb68f 100644 --- a/packages/transformers/src/pipelines/automatic-speech-recognition.js +++ b/packages/transformers/src/pipelines/automatic-speech-recognition.js @@ -326,13 +326,13 @@ export class AutomaticSpeechRecognitionPipeline ); } - const return_timestamps = !!(kwargs.return_timestamps ?? false); + const return_timestamps = !!(kwargs.return_timestamps); const decodeOptions = { tokenizer: this.tokenizer, return_timestamps, return_words: return_timestamps, - return_metrics: kwargs.return_metrics ?? 
false, + return_metrics: true, }; const single = !Array.isArray(audio); From 3bac1dc0e0b4f843e17e831617d67dc67b7772b3 Mon Sep 17 00:00:00 2001 From: ysdede Date: Sun, 1 Mar 2026 22:43:09 +0300 Subject: [PATCH 10/33] test(nemo-conformer-tdt): rewrite tests to match current API - Remove all timestamp_granularity tests (feature was removed) - Fix option names: return_tokens, return_words, return_timestamps - Fix output fields: tokens/words arrays, not token_ids/word_timestamps - Verify pipeline passes return_words + return_metrics when timestamps on - Add test: return_timestamps 'word' treated as truthy --- .../test_modeling_nemo_conformer_tdt.js | 21 ++- ..._pipelines_automatic_speech_recognition.js | 130 ++++++------------ 2 files changed, 55 insertions(+), 96 deletions(-) diff --git a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js index 83a1523e2..1106a9332 100644 --- a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js +++ b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js @@ -104,22 +104,21 @@ export default () => { const output = await model.transcribe(inputs, { tokenizer, - return_token_timestamps: true, - return_word_timestamps: true, - return_utterance_timestamp: true, + return_timestamps: true, + return_words: true, + return_tokens: true, }); expect(output.text).toBe("hello world"); - expect(output.token_ids).toEqual([1, 2]); - expect(output.token_timestamps).toEqual([ - [0, 0.04], - [0.04, 0.12], + expect(output.utterance_timestamp).toEqual([0, 0.12]); + expect(output.words).toEqual([ + expect.objectContaining({ text: "hello", start_time: 0, end_time: 0.04 }), + expect.objectContaining({ text: "world", start_time: 0.04, end_time: 0.12 }), ]); - expect(output.word_timestamps).toEqual([ - { text: "hello", timestamp: [0, 0.04] }, - { text: "world", 
timestamp: [0.04, 0.12] }, + expect(output.tokens).toEqual([ + expect.objectContaining({ id: 1, start_time: 0, end_time: 0.04 }), + expect.objectContaining({ id: 2, start_time: 0.04, end_time: 0.12 }), ]); - expect(output.utterance_timestamp).toEqual([0, 0.12]); }, MAX_TEST_EXECUTION_TIME, ); diff --git a/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js b/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js index 831a885ac..0763795d6 100644 --- a/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js +++ b/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js @@ -133,19 +133,22 @@ export default () => { config: { model_type: modelType }, async transcribe(_inputs, options) { calls.push(options); - return { - text: "hello world", - token_ids: [1, 2], - token_timestamps: [ - [0, 0.04], - [0.04, 0.08], - ], - word_timestamps: [ - { text: "hello", timestamp: [0, 0.04] }, - { text: "world", timestamp: [0.04, 0.08] }, - ], - utterance_timestamp: [0, 0.08], - }; + const result = { text: "hello world" }; + if (options.return_timestamps) { + result.utterance_timestamp = [0, 0.08]; + result.utterance_confidence = 0.95; + result.confidence_scores = { token_avg: 0.95, word_avg: 0.94, overall_log_prob: -0.05 }; + if (options.return_words) { + result.words = [ + { text: "hello", start_time: 0, end_time: 0.04, confidence: 0.96 }, + { text: "world", start_time: 0.04, end_time: 0.08, confidence: 0.93 }, + ]; + } + } + if (options.return_metrics) { + result.metrics = { total_ms: 42, rtf: 0.01 }; + } + return result; }, async dispose() {}, }; @@ -153,18 +156,7 @@ export default () => { const processor = Object.assign(async () => ({ input_features: {} }), { feature_extractor: { config: { sampling_rate: 16000 } }, }); - const tokenizer = { - decode(ids) { - const idArray = Array.isArray(ids) ? 
ids : [ids]; - return idArray - .map((id) => { - if (id === 1 || id === 1n) return " hello"; - if (id === 2 || id === 2n) return " world"; - return ""; - }) - .join(""); - }, - }; + const tokenizer = {}; return { pipe: new AutomaticSpeechRecognitionPipeline({ @@ -177,85 +169,53 @@ export default () => { }; }; - it("dispatches to nemo-conformer-tdt path", async () => { + it("returns text and metrics when timestamps disabled", async () => { const { pipe, calls } = makeUnitPipe(); const output = await pipe(new Float32Array(16000), { return_timestamps: false }); - expect(output).toEqual({ text: "hello world" }); + expect(output).toEqual({ text: "hello world", metrics: { total_ms: 42, rtf: 0.01 } }); expect(calls).toHaveLength(1); + expect(calls[0]).toMatchObject({ + return_timestamps: false, + return_words: false, + return_metrics: true, + }); }); - it("default timestamps use word granularity", async () => { + it("returns full output with words when return_timestamps is true", async () => { const { pipe, calls } = makeUnitPipe(); const output = await pipe(new Float32Array(16000), { return_timestamps: true }); - expect(output).toEqual({ + expect(output).toMatchObject({ text: "hello world", - chunks: [ - { text: "hello", timestamp: [0, 0.04] }, - { text: "world", timestamp: [0.04, 0.08] }, + utterance_timestamp: [0, 0.08], + utterance_confidence: 0.95, + words: [ + { text: "hello", start_time: 0, end_time: 0.04 }, + { text: "world", start_time: 0.04, end_time: 0.08 }, ], + confidence_scores: { token_avg: 0.95, word_avg: 0.94 }, + metrics: { total_ms: 42, rtf: 0.01 }, }); expect(calls[0]).toMatchObject({ - return_word_timestamps: true, - return_token_timestamps: false, - return_utterance_timestamp: false, - }); - }); - - it("supports utterance granularity", async () => { - const { pipe } = makeUnitPipe(); - const output = await pipe(new Float32Array(16000), { return_timestamps: true, - timestamp_granularity: "utterance", - }); - expect(output).toEqual({ - text: "hello 
world", - chunks: [{ text: "hello world", timestamp: [0, 0.08] }], + return_words: true, + return_metrics: true, }); }); - it("supports token granularity", async () => { - const { pipe } = makeUnitPipe(); - const output = await pipe(new Float32Array(16000), { - return_timestamps: true, - timestamp_granularity: "token", - }); - expect(output).toEqual({ + it("treats return_timestamps 'word' as truthy (same as true)", async () => { + const { pipe, calls } = makeUnitPipe(); + const output = await pipe(new Float32Array(16000), { return_timestamps: "word" }); + expect(output).toMatchObject({ text: "hello world", - chunks: [ - { text: " hello", timestamp: [0, 0.04] }, - { text: " world", timestamp: [0.04, 0.08] }, - ], + utterance_timestamp: [0, 0.08], + words: expect.any(Array), + metrics: expect.any(Object), }); - }); - - it("supports all granularities at once", async () => { - const { pipe } = makeUnitPipe(); - const output = await pipe(new Float32Array(16000), { + expect(calls[0]).toMatchObject({ return_timestamps: true, - timestamp_granularity: "all", + return_words: true, + return_metrics: true, }); - expect(output).toEqual({ - text: "hello world", - chunks: [ - { text: "hello", timestamp: [0, 0.04] }, - { text: "world", timestamp: [0.04, 0.08] }, - ], - tokens: [ - { text: " hello", timestamp: [0, 0.04] }, - { text: " world", timestamp: [0.04, 0.08] }, - ], - utterance: [0, 0.08], - }); - }); - - it("throws for invalid timestamp granularity", async () => { - const { pipe } = makeUnitPipe(); - await expect( - pipe(new Float32Array(16000), { - return_timestamps: true, - timestamp_granularity: "frame", - }), - ).rejects.toThrow("Invalid `timestamp_granularity`"); }); }); }); From c75ebd27f53584e38c3fa98262e3661d7d9cd9f1 Mon Sep 17 00:00:00 2001 From: ysdede Date: Mon, 2 Mar 2026 22:47:51 +0300 Subject: [PATCH 11/33] fix(nemo-conformer-tdt): harden decoding and feature utilities Address reviewer findings except the return_metrics policy decision. 
- Fix temporal delta concatenation to interleave per frame and add dtype validation. - Validate preemphasis range and clamp normalization variance in feature extraction. - Remove unsafe encoder layout inference; require explicit encoder_output_layout. - Redesign decode loop to read frame data on-demand instead of eager frame materialization. - Deduplicate word finalization and avoid zero-filling missing word confidences. - Tighten tests for delta layout/type checks, explicit layout requirement, call counts, and naming accuracy. --- .../feature_extraction_nemo_conformer_tdt.js | 23 ++-- .../modeling_nemo_conformer_tdt.js | 130 +++++++++--------- .../nemo_conformer_tdt/transducer_deltas.js | 29 ++-- .../nemo_conformer_tdt/transducer_text.js | 69 +++++----- ...t_feature_extraction_nemo_conformer_tdt.js | 12 ++ .../test_modeling_nemo_conformer_tdt.js | 42 ++++-- .../test_feature_extraction_parakeet.js | 2 +- ..._pipelines_automatic_speech_recognition.js | 2 + 8 files changed, 185 insertions(+), 124 deletions(-) diff --git a/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js index e230ff50c..aba71c485 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js @@ -50,15 +50,15 @@ export class NemoConformerTDTFeatureExtractor extends FeatureExtractor { if (this.delta_order > 0 && !this.delta_concatenate) { logger.warn( 'NemoConformerTDTFeatureExtractor: `delta_concatenate=false` is set. ' + - '`input_features` will remain base features and deltas are returned in extra fields.', + '`input_features` will remain base features and deltas are returned in extra fields.', ); } this.feature_cache = this.use_feature_cache ? new FeatureLRUCache({ - max_entries: this.config.feature_cache_max_entries ?? 
128, - max_size_mb: this.config.feature_cache_max_size_mb ?? 64, - }) + max_entries: this.config.feature_cache_max_entries ?? 128, + max_size_mb: this.config.feature_cache_max_size_mb ?? 64, + }) : null; } @@ -69,10 +69,17 @@ export class NemoConformerTDTFeatureExtractor extends FeatureExtractor { */ async _extract_fbank_features(waveform) { // Parakeet uses a custom preemphasis strategy: Apply preemphasis to entire waveform at once - const preemphasis = this.config.preemphasis; + const preemphasis = this.config.preemphasis ?? 0; + if (!Number.isFinite(preemphasis) || preemphasis < 0 || preemphasis >= 1) { + throw new Error( + `NemoConformerTDTFeatureExtractor expected \`preemphasis\` in [0, 1), got ${this.config.preemphasis}.`, + ); + } waveform = new Float64Array(waveform); // Clone to avoid destructive changes - for (let j = waveform.length - 1; j >= 1; --j) { - waveform[j] -= preemphasis * waveform[j - 1]; + if (preemphasis !== 0) { + for (let j = waveform.length - 1; j >= 1; --j) { + waveform[j] -= preemphasis * waveform[j - 1]; + } } const features = await spectrogram( @@ -159,7 +166,7 @@ export class NemoConformerTDTFeatureExtractor extends FeatureExtractor { for (let j = 0; j < num_features; ++j) { const mean = sum[j] / features_length; const variance = (sum_sq[j] - features_length * mean * mean) / divisor; - const std = Math.sqrt(variance) + EPSILON; + const std = Math.sqrt(Math.max(variance, 0)) + EPSILON; const inv_std = 1 / std; for (let i = 0; i < features_length; ++i) { diff --git a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js index be25afe42..370328e77 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js @@ -82,17 +82,6 @@ function confidenceFromLogits(logits, tokenId, vocabSize) { }; } -function 
inferEncoderOutputLayout(outputTensor) { - if (outputTensor.dims.length !== 3 || outputTensor.dims[0] !== 1) { - throw new Error( - `Nemo Conformer TDT expected encoder output dims [1, D, T] or [1, T, D], got [${outputTensor.dims.join(', ')}].`, - ); - } - - // Heuristic fallback: in most Nemo exports D > T. - return outputTensor.dims[1] >= outputTensor.dims[2] ? 'BDT' : 'BTD'; -} - function resolveTransducerConfig(config, sessions) { const transducerConfig = config['transformers.js_config']?.transducer; if (!transducerConfig) { @@ -137,7 +126,7 @@ function resolveTransducerConfig(config, sessions) { if (missingDecoderInputs.length > 0) { throw new Error( `Nemo Conformer TDT decoder session is missing expected inputs: ${missingDecoderInputs.join(', ')}. ` + - 'Override I/O names via `transformers.js_config.transducer.io` if your export uses different names.', + 'Override I/O names via `transformers.js_config.transducer.io` if your export uses different names.', ); } const missingDecoderOutputs = [io.decoder_output, io.decoder_output_state_1, io.decoder_output_state_2].filter( @@ -146,7 +135,7 @@ function resolveTransducerConfig(config, sessions) { if (missingDecoderOutputs.length > 0) { throw new Error( `Nemo Conformer TDT decoder session is missing expected outputs: ${missingDecoderOutputs.join(', ')}. ` + - 'Override I/O names via `transformers.js_config.transducer.io` if your export uses different names.', + 'Override I/O names via `transformers.js_config.transducer.io` if your export uses different names.', ); } @@ -157,7 +146,7 @@ function resolveTransducerConfig(config, sessions) { if (!(encoderSession.outputNames ?? []).includes(io.encoder_output)) { throw new Error( `Nemo Conformer TDT encoder session is missing expected output: ${io.encoder_output}. 
` + - 'Override `transformers.js_config.transducer.io.encoder_output` if your export uses a different name.', + 'Override `transformers.js_config.transducer.io.encoder_output` if your export uses a different name.', ); } @@ -165,6 +154,7 @@ function resolveTransducerConfig(config, sessions) { const subsamplingFactor = transducerConfig.subsampling_factor ?? 8; const frameShiftS = transducerConfig.frame_shift_s ?? 0.01; const blankTokenId = transducerConfig.blank_token_id ?? 0; + const encoderOutputLayout = transducerConfig.encoder_output_layout; const decoderTokenDType = transducerConfig.decoder_token_dtype ?? 'int32'; const decoderTokenLengthDType = transducerConfig.decoder_token_length_dtype ?? 'int32'; @@ -182,6 +172,9 @@ function resolveTransducerConfig(config, sessions) { if (!Number.isFinite(frameShiftS) || frameShiftS <= 0) { throw new Error('Invalid `transformers.js_config.transducer.frame_shift_s`: expected a positive number.'); } + if (encoderOutputLayout !== 'BDT' && encoderOutputLayout !== 'BTD') { + throw new Error('Invalid `transformers.js_config.transducer.encoder_output_layout`: expected "BDT" or "BTD".'); + } if (!['int32', 'int64'].includes(decoderTokenDType)) { throw new Error( 'Invalid `transformers.js_config.transducer.decoder_token_dtype`: expected "int32" or "int64".', @@ -201,7 +194,7 @@ function resolveTransducerConfig(config, sessions) { vocab_size: transducerConfig.vocab_size ?? config.vocab_size ?? null, duration_start_index: transducerConfig.duration_start_index ?? null, encoder_input_layout: transducerConfig.encoder_input_layout ?? 'BTF', - encoder_output_layout: transducerConfig.encoder_output_layout ?? null, + encoder_output_layout: encoderOutputLayout, encoder_frame_layout: transducerConfig.encoder_frame_layout ?? 
'BD1', decoder_token_dtype: decoderTokenDType, decoder_token_length_dtype: decoderTokenLengthDType, @@ -264,7 +257,7 @@ export class NemoConformerTDTPreTrainedModel extends PreTrainedModel { if (options.model_file_name && options.model_file_name !== 'encoder_model') { throw new Error( 'NemoConformerForTDT does not support `model_file_name` override. ' + - 'Expected canonical files: `encoder_model{suffix}.onnx` and `decoder_model_merged{suffix}.onnx`.', + 'Expected canonical files: `encoder_model{suffix}.onnx` and `decoder_model_merged{suffix}.onnx`.', ); } @@ -283,8 +276,8 @@ export class NemoConformerTDTPreTrainedModel extends PreTrainedModel { const reason = error?.message ?? String(error); throw new Error( 'Failed to load Nemo Conformer TDT sessions. Expected canonical v4 files under `onnx/`: ' + - '`encoder_model{suffix}.onnx` and `decoder_model_merged{suffix}.onnx`. ' + - `Original error: ${reason}`, + '`encoder_model{suffix}.onnx` and `decoder_model_merged{suffix}.onnx`. ' + + `Original error: ${reason}`, ); } @@ -316,33 +309,45 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { return outputs[name] ?? Object.values(outputs)[0]; } - _encoderOutputToFrames(encoderOutput) { - const layout = this.transducer.encoder_output_layout ?? inferEncoderOutputLayout(encoderOutput); - const dims = encoderOutput.dims; - const data = encoderOutput.data; - const frames = []; + _getEncoderFrameCount(encoderOutput) { + if (encoderOutput.dims.length !== 3 || encoderOutput.dims[0] !== 1) { + throw new Error( + `Nemo Conformer TDT expected encoder output dims [1, D, T] or [1, T, D], got [${encoderOutput.dims.join(', ')}].`, + ); + } + const layout = this.transducer.encoder_output_layout; + if (layout === 'BDT') { + return encoderOutput.dims[2]; + } + if (layout === 'BTD') { + return encoderOutput.dims[1]; + } + throw new Error( + `Unsupported encoder output layout "${layout}". 
Use 'BDT' or 'BTD' in transformers.js_config.transducer.`, + ); + } + + _getFrameData(encoderOutput, frameIndex, reusableFrame) { + const layout = this.transducer.encoder_output_layout; + if (encoderOutput.type !== 'float32') { + throw new Error(`Nemo Conformer TDT expected encoder output type "float32", got "${encoderOutput.type}".`); + } + const data = /** @type {Float32Array} */ (encoderOutput.data); if (layout === 'BDT') { - const D = dims[1]; - const T = dims[2]; - for (let t = 0; t < T; ++t) { - const frame = new Float32Array(D); - for (let d = 0; d < D; ++d) { - frame[d] = data[d * T + t]; - } - frames.push(frame); + const D = encoderOutput.dims[1]; + const T = encoderOutput.dims[2]; + const frame = reusableFrame && reusableFrame.length === D ? reusableFrame : new Float32Array(D); + for (let d = 0; d < D; ++d) { + frame[d] = data[d * T + frameIndex]; } - return frames; + return frame; } if (layout === 'BTD') { - const T = dims[1]; - const D = dims[2]; - for (let t = 0; t < T; ++t) { - const offset = t * D; - frames.push(new Float32Array(data.subarray(offset, offset + D))); - } - return frames; + const D = encoderOutput.dims[2]; + const offset = frameIndex * D; + return data.subarray(offset, offset + D); } throw new Error( @@ -530,18 +535,7 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { const encodeMs = nowMs() - encodeStart; const encoderOutput = this._getEncoderOutput(encoderOutputs); - let frames; - try { - frames = this._encoderOutputToFrames(encoderOutput); - } finally { - const seen = new Set(); - for (const value of Object.values(encoderOutputs)) { - if (value instanceof Tensor && !seen.has(value)) { - value.dispose(); - seen.add(value); - } - } - } + const frameCount = this._getEncoderFrameCount(encoderOutput); const frameTime = this.transducer.subsampling_factor * this.transducer.frame_shift_s; const numLayers = this.transducer.decoder.num_layers; @@ -568,6 +562,7 @@ export class NemoConformerForTDT extends 
NemoConformerTDTPreTrainedModel { let decoderState; let targetLengthTensor; + let reusableFrame = null; let emittedOnFrame = 0; const decodeStart = nowMs(); @@ -583,8 +578,12 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { ? new Tensor('int64', BigInt64Array.from([1n]), [1]) : new Tensor('int32', new Int32Array([1]), [1]); - for (let frameIndex = 0; frameIndex < frames.length;) { - const frameTensor = this._createFrameTensor(frames[frameIndex]); + for (let frameIndex = 0; frameIndex < frameCount; ) { + const frameData = this._getFrameData(encoderOutput, frameIndex, reusableFrame); + if (this.transducer.encoder_output_layout === 'BDT') { + reusableFrame = frameData; + } + const frameTensor = this._createFrameTensor(frameData); const prevTokenId = tokenIds.length > 0 ? tokenIds[tokenIds.length - 1] : blankId; const tokenTensor = this.transducer.decoder_token_dtype === 'int64' @@ -681,6 +680,13 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { } finally { if (targetLengthTensor) targetLengthTensor.dispose(); if (decoderState) this._disposeDecoderState(decoderState); + const seen = new Set(); + for (const value of Object.values(encoderOutputs)) { + if (value instanceof Tensor && !seen.has(value)) { + value.dispose(); + seen.add(value); + } + } } const decodeMs = nowMs() - decodeStart; @@ -704,13 +710,13 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { result.utterance_timestamp = tokenTimestamps.length > 0 ? 
/** @type {[number, number]} */ ([ - tokenTimestamps[0][0], - tokenTimestamps[tokenTimestamps.length - 1][1], - ]) + tokenTimestamps[0][0], + tokenTimestamps[tokenTimestamps.length - 1][1], + ]) : /** @type {[number, number]} */ ([ - roundTs(timeOffset), - roundTs(frames.length * frameTime + timeOffset), - ]); + roundTs(timeOffset), + roundTs(frameCount * frameTime + timeOffset), + ]); if (detailed) { if (return_words) result.words = detailed.words; @@ -753,7 +759,7 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { const totalMs = nowMs() - totalStart; const utteranceDuration = result.utterance_timestamp ? Math.max(result.utterance_timestamp[1] - result.utterance_timestamp[0], 1e-8) - : Math.max(frames.length * frameTime, 1e-8); + : Math.max(frameCount * frameTime, 1e-8); const rtf = totalMs / 1000 / utteranceDuration; result.metrics = { preprocess_ms: 0.0, @@ -780,8 +786,8 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { // Register with ModelRegistry so get_model_files / progress_callback enumerate // the correct ONNX files: encoder_model + decoder_model_merged. 
-MODEL_TYPE_MAPPING.set('nemo-conformer-tdt', MODEL_TYPES.NemoConformerTDT); // model_type key -MODEL_TYPE_MAPPING.set('NemoConformerForTDT', MODEL_TYPES.NemoConformerTDT); // architecture key +MODEL_TYPE_MAPPING.set('nemo-conformer-tdt', MODEL_TYPES.NemoConformerTDT); // model_type key +MODEL_TYPE_MAPPING.set('NemoConformerForTDT', MODEL_TYPES.NemoConformerTDT); // architecture key MODEL_NAME_TO_CLASS_MAPPING.set('NemoConformerTDTPreTrainedModel', NemoConformerTDTPreTrainedModel); MODEL_NAME_TO_CLASS_MAPPING.set('NemoConformerForTDT', NemoConformerForTDT); MODEL_CLASS_TO_NAME_MAPPING.set(NemoConformerTDTPreTrainedModel, 'NemoConformerTDTPreTrainedModel'); diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_deltas.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_deltas.js index 1abe48139..50651f82f 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/transducer_deltas.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_deltas.js @@ -19,6 +19,9 @@ export function computeTemporalDeltas(input_features, { order = 1, window = 2, c if (order !== 1 && order !== 2) { throw new Error('computeTemporalDeltas expects `order` to be 1 or 2.'); } + if (input_features.type !== 'float32') { + throw new Error(`computeTemporalDeltas expects input tensor type "float32", got "${input_features.type}".`); + } const [batch, T, F] = input_features.dims; const base = /** @type {Float32Array} */ (input_features.data); @@ -43,7 +46,7 @@ export function computeTemporalDeltas(input_features, { order = 1, window = 2, c if (!concatenate) { return { delta: delta_tensor }; } - return new Tensor('float32', concatFloat32([base, delta]), [batch, T, F * 2]); + return new Tensor('float32', interleaveByFrame([base, delta], T, F), [batch, T, F * 2]); } const recursive_result = /** @type {{delta: Tensor}} */ ( @@ -58,16 +61,26 @@ export function computeTemporalDeltas(input_features, { order = 1, window = 2, c } const delta_delta = 
/** @type {Float32Array} */ (delta_delta_tensor.data); - return new Tensor('float32', concatFloat32([base, delta, delta_delta]), [batch, T, F * 3]); + return new Tensor('float32', interleaveByFrame([base, delta, delta_delta], T, F), [batch, T, F * 3]); } -function concatFloat32(items) { - const total = items.reduce((sum, arr) => sum + arr.length, 0); - const output = new Float32Array(total); - let offset = 0; +function interleaveByFrame(items, T, F) { + const chunkSize = T * F; for (const arr of items) { - output.set(arr, offset); - offset += arr.length; + if (arr.length !== chunkSize) { + throw new Error( + `computeTemporalDeltas expected concatenation arrays with length ${chunkSize}, got ${arr.length}.`, + ); + } + } + + const output = new Float32Array(chunkSize * items.length); + for (let t = 0; t < T; ++t) { + const srcOffset = t * F; + const dstOffset = t * F * items.length; + for (let i = 0; i < items.length; ++i) { + output.set(items[i].subarray(srcOffset, srcOffset + F), dstOffset + i * F); + } } return output; } diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js index 33729d394..a68bb828d 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js @@ -55,6 +55,28 @@ function resolveTokenPiece(tokenizer, id) { return { raw: rawToken || decoded, clean, startsNewWord }; } +/** + * @param {Array<{ text: string, start_time: number, end_time: number, confidence?: number }>} words + * @param {{ text: string, start: number, end: number, confs: number[] } | null} current + */ +function finalizeAndPushWord(words, current) { + if (!current) return; + + const text = current.text.trim(); + if (!text) return; + + /** @type {{ text: string, start_time: number, end_time: number, confidence?: number }} */ + const word = { + text, + start_time: current.start, + end_time: 
current.end, + }; + if (current.confs.length > 0) { + word.confidence = Math.round((current.confs.reduce((a, b) => a + b, 0) / current.confs.length) * 1e6) / 1e6; + } + words.push(word); +} + /** * Decode token ids into final transcription text. * @param {any} tokenizer @@ -76,7 +98,7 @@ export function decodeTransducerText(tokenizer, token_ids) { * @returns {{ * words: Array<{ text: string, start_time: number, end_time: number, confidence?: number }>, * tokens: Array<{ token: string, raw_token: string, is_word_start: boolean, start_time: number, end_time: number, confidence?: number }>, - * word_confidences: number[] | null, + * word_confidences: (number | null)[] | null, * word_avg: number | null, * }} */ @@ -127,22 +149,7 @@ export function buildTransducerDetailedOutputs(tokenizer, token_ids, token_times tokens.push(tok); if (!current || startsNewWord) { - if (current) { - const text = current.text.trim(); - if (text) { - /** @type {{ text: string, start_time: number, end_time: number, confidence?: number }} */ - const word = { - text, - start_time: current.start, - end_time: current.end, - }; - if (current.confs.length > 0) { - word.confidence = - Math.round((current.confs.reduce((a, b) => a + b, 0) / current.confs.length) * 1e6) / 1e6; - } - words.push(word); - } - } + finalizeAndPushWord(words, current); current = { text: clean, start: ts[0], @@ -158,28 +165,16 @@ export function buildTransducerDetailedOutputs(tokenizer, token_ids, token_times } } - if (current) { - const text = current.text.trim(); - if (text) { - /** @type {{ text: string, start_time: number, end_time: number, confidence?: number }} */ - const word = { - text, - start_time: current.start, - end_time: current.end, - }; - if (current.confs.length > 0) { - word.confidence = - Math.round((current.confs.reduce((a, b) => a + b, 0) / current.confs.length) * 1e6) / 1e6; - } - words.push(word); + finalizeAndPushWord(words, current); + + const word_confidences = words.some((x) => x.confidence != 
null) ? words.map((x) => x.confidence ?? null) : null; + let word_avg = null; + if (word_confidences) { + const validConfidences = word_confidences.filter((x) => x != null); + if (validConfidences.length > 0) { + word_avg = Math.round((validConfidences.reduce((a, b) => a + b, 0) / validConfidences.length) * 1e6) / 1e6; } } - const word_confidences = words.some((x) => x.confidence != null) ? words.map((x) => x.confidence ?? 0) : null; - const word_avg = - word_confidences && word_confidences.length > 0 - ? Math.round((word_confidences.reduce((a, b) => a + b, 0) / word_confidences.length) * 1e6) / 1e6 - : null; - return { words, tokens, word_confidences, word_avg }; } diff --git a/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js b/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js index cee9568d0..aa0d6ffad 100644 --- a/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js +++ b/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js @@ -74,5 +74,17 @@ export default () => { }, MAX_TEST_EXECUTION_TIME, ); + + it( + "validates preemphasis range", + async () => { + const invalidHigh = new NemoConformerTDTFeatureExtractor({ ...base, feature_size: 80, preemphasis: 1 }); + await expect(invalidHigh(audio)).rejects.toThrow("preemphasis"); + + const invalidLow = new NemoConformerTDTFeatureExtractor({ ...base, feature_size: 80, preemphasis: -0.1 }); + await expect(invalidLow(audio)).rejects.toThrow("preemphasis"); + }, + MAX_TEST_EXECUTION_TIME, + ); }); }; diff --git a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js index 1106a9332..4214bbd7b 100644 --- a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js +++ 
b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js @@ -111,14 +111,8 @@ export default () => { expect(output.text).toBe("hello world"); expect(output.utterance_timestamp).toEqual([0, 0.12]); - expect(output.words).toEqual([ - expect.objectContaining({ text: "hello", start_time: 0, end_time: 0.04 }), - expect.objectContaining({ text: "world", start_time: 0.04, end_time: 0.12 }), - ]); - expect(output.tokens).toEqual([ - expect.objectContaining({ id: 1, start_time: 0, end_time: 0.04 }), - expect.objectContaining({ id: 2, start_time: 0.04, end_time: 0.12 }), - ]); + expect(output.words).toEqual([expect.objectContaining({ text: "hello", start_time: 0, end_time: 0.04 }), expect.objectContaining({ text: "world", start_time: 0.04, end_time: 0.12 })]); + expect(output.tokens).toEqual([expect.objectContaining({ id: 1, start_time: 0, end_time: 0.04 }), expect.objectContaining({ id: 2, start_time: 0.04, end_time: 0.12 })]); }, MAX_TEST_EXECUTION_TIME, ); @@ -127,6 +121,20 @@ export default () => { const invalidConfig = { model_type: "nemo-conformer-tdt" }; expect(() => new NemoConformerForTDT(invalidConfig, BASE_SESSIONS, {})).toThrow("Missing `transformers.js_config.transducer`"); }); + + it("requires explicit encoder_output_layout in transducer config", () => { + const invalidConfig = { + ...BASE_CONFIG, + "transformers.js_config": { + ...BASE_CONFIG["transformers.js_config"], + transducer: { + ...BASE_CONFIG["transformers.js_config"].transducer, + encoder_output_layout: undefined, + }, + }, + }; + expect(() => new NemoConformerForTDT(invalidConfig, BASE_SESSIONS, {})).toThrow("encoder_output_layout"); + }); }); describe("Nemo Conformer TDT utilities", () => { @@ -146,12 +154,30 @@ export default () => { expect(split.delta.dims).toEqual([1, 4, 2]); expect(split.delta_delta.dims).toEqual([1, 4, 2]); + const concatOrder1 = computeTemporalDeltas(input, { order: 1, window: 1, concatenate: true }); + expect(concatOrder1.dims).toEqual([1, 
4, 4]); + expect(Array.from(concatOrder1.data.slice(0, 8))).toEqual([ + 1, + 2, + 0.5, + 1, // t0: base + delta + 2, + 4, + 1, + 2, // t1: base + delta + ]); + const concat = computeTemporalDeltas(input, { order: 2, window: 1, concatenate: true }); expect(concat.dims).toEqual([1, 4, 6]); }, MAX_TEST_EXECUTION_TIME, ); + it("rejects non-float32 tensors for temporal deltas", () => { + const input = new Tensor("float64", Float64Array.from([1, 2, 2, 4]), [1, 2, 2]); + expect(() => computeTemporalDeltas(input, { order: 1, window: 1, concatenate: true })).toThrow('type "float32"'); + }); + it( "creates stable audio cache keys", async () => { diff --git a/packages/transformers/tests/models/parakeet/test_feature_extraction_parakeet.js b/packages/transformers/tests/models/parakeet/test_feature_extraction_parakeet.js index fab1861d9..c0e67a58b 100644 --- a/packages/transformers/tests/models/parakeet/test_feature_extraction_parakeet.js +++ b/packages/transformers/tests/models/parakeet/test_feature_extraction_parakeet.js @@ -20,7 +20,7 @@ export default () => { }); it( - "extracts normalized features and mask from synthetic audio", + "extracts features and mask from synthetic audio", async () => { const duration_s = 1.0; const total = Math.floor(config.sampling_rate * duration_s); diff --git a/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js b/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js index 0763795d6..8159c6098 100644 --- a/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js +++ b/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js @@ -195,6 +195,7 @@ export default () => { confidence_scores: { token_avg: 0.95, word_avg: 0.94 }, metrics: { total_ms: 42, rtf: 0.01 }, }); + expect(calls).toHaveLength(1); expect(calls[0]).toMatchObject({ return_timestamps: true, return_words: true, @@ -211,6 +212,7 @@ export default () => { words: 
expect.any(Array), metrics: expect.any(Object), }); + expect(calls).toHaveLength(1); expect(calls[0]).toMatchObject({ return_timestamps: true, return_words: true, From 493a5881d60f138848af3861e637e7d7dfb8288b Mon Sep 17 00:00:00 2001 From: ysdede Date: Tue, 3 Mar 2026 23:33:52 +0300 Subject: [PATCH 12/33] fix(nemo-conformer-tdt): address critical review issues Fixes high-impact issues found in PR review validation:
- force NemoConformerForTDT to MODEL_TYPES.NemoConformerTDT in registry overrides
- ensure encoder outputs are disposed when pre-decode validation throws
- remove stride sampling from audio cache key hashing to prevent false cache hits
- use encoder_model selector key in get_model_files for Nemo per-component dtype/device overrides

Also adds targeted regression tests for mapping, disposal behavior, file selection, and cache key correctness. --- .../modeling_nemo_conformer_tdt.js | 6 +- .../nemo_conformer_tdt/transducer_cache.js | 5 +- packages/transformers/src/models/registry.js | 3 + .../utils/model_registry/get_model_files.js | 2 +- .../test_modeling_nemo_conformer_tdt.js | 79 +++++++++++++++++++ 5 files changed, 89 insertions(+), 6 deletions(-) diff --git a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js index 370328e77..597b7e7fa 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js @@ -534,8 +534,8 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { } const encodeMs = nowMs() - encodeStart; - const encoderOutput = this._getEncoderOutput(encoderOutputs); - const frameCount = this._getEncoderFrameCount(encoderOutput); + let frameCount = 0; + let encoderOutput = null; const frameTime = this.transducer.subsampling_factor * this.transducer.frame_shift_s; const numLayers =
this.transducer.decoder.num_layers; @@ -568,6 +568,8 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { const decodeStart = nowMs(); try { + encoderOutput = this._getEncoderOutput(encoderOutputs); + frameCount = this._getEncoderFrameCount(encoderOutput); decoderState = { state1: new Tensor('float32', new Float32Array(numLayers * hiddenSize), [numLayers, 1, hiddenSize]), state2: new Tensor('float32', new Float32Array(numLayers * hiddenSize), [numLayers, 1, hiddenSize]), diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js index 5a3af06ef..bf71afcad 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js @@ -14,9 +14,8 @@ export function createAudioCacheKey(audio, sampling_rate = 16000) { hash ^= sampling_rate; hash = Math.imul(hash, 16777619); - // Sample stride hash to keep keying cheap for long audio. - const stride = Math.max(1, Math.floor(audio.length / 4096)); - for (let i = 0; i < audio.length; i += stride) { + // Hash all quantized samples to minimize false cache hits across waveforms. + for (let i = 0; i < audio.length; ++i) { const sample = Number.isFinite(audio[i]) ? 
audio[i] : 0; const q = Math.max(-32768, Math.min(32767, Math.round(sample * 32768))); hash ^= q; diff --git a/packages/transformers/src/models/registry.js b/packages/transformers/src/models/registry.js index 5c08bdded..eabe1e2fa 100644 --- a/packages/transformers/src/models/registry.js +++ b/packages/transformers/src/models/registry.js @@ -577,6 +577,9 @@ const CUSTOM_MAPPING = [ ], ['SupertonicForConditionalGeneration', ALL_MODEL_FILES.SupertonicForConditionalGeneration, MODEL_TYPES.Supertonic], ['ChatterboxModel', ALL_MODEL_FILES.ChatterboxModel, MODEL_TYPES.Chatterbox], + // Keep AutoModel lookup in MODEL_MAPPING_NAMES_ENCODER_ONLY while forcing the + // correct runtime model type for two-artifact Nemo Conformer TDT loading. + ['NemoConformerForTDT', ALL_MODEL_FILES.NemoConformerForTDT, MODEL_TYPES.NemoConformerTDT], ]; for (const [name, model, type] of CUSTOM_MAPPING) { MODEL_TYPE_MAPPING.set(name, type); diff --git a/packages/transformers/src/utils/model_registry/get_model_files.js b/packages/transformers/src/utils/model_registry/get_model_files.js index b0cf029e1..ed64a5309 100644 --- a/packages/transformers/src/utils/model_registry/get_model_files.js +++ b/packages/transformers/src/utils/model_registry/get_model_files.js @@ -167,7 +167,7 @@ export async function get_model_files( add_model_file('conditional_decoder'); files.push('generation_config.json'); } else if (modelType === MODEL_TYPES.NemoConformerTDT) { - add_model_file('model', 'encoder_model'); + add_model_file('encoder_model'); add_model_file('decoder_model_merged'); } else if (modelType === MODEL_TYPES.AutoEncoder) { add_model_file('encoder_model'); diff --git a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js index 4214bbd7b..522b7db74 100644 --- a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js +++ 
b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js @@ -1,6 +1,8 @@ import { NemoConformerForTDT, Tensor } from "../../../src/transformers.js"; import { createAudioCacheKey, FeatureLRUCache } from "../../../src/models/nemo_conformer_tdt/transducer_cache.js"; import { computeTemporalDeltas } from "../../../src/models/nemo_conformer_tdt/transducer_deltas.js"; +import { MODEL_TYPE_MAPPING, MODEL_TYPES } from "../../../src/models/modeling_utils.js"; +import { get_model_files } from "../../../src/utils/model_registry/get_model_files.js"; import { MAX_TEST_EXECUTION_TIME } from "../../init.js"; @@ -73,6 +75,11 @@ const BASE_CONFIG = { export default () => { describe("NemoConformerForTDT", () => { + it("maps NemoConformerForTDT to MODEL_TYPES.NemoConformerTDT", () => { + expect(MODEL_TYPE_MAPPING.get("NemoConformerForTDT")).toBe(MODEL_TYPES.NemoConformerTDT); + expect(MODEL_TYPE_MAPPING.get("nemo-conformer-tdt")).toBe(MODEL_TYPES.NemoConformerTDT); + }); + it( "greedily decodes scripted token and duration logits", async () => { @@ -135,6 +142,43 @@ export default () => { }; expect(() => new NemoConformerForTDT(invalidConfig, BASE_SESSIONS, {})).toThrow("encoder_output_layout"); }); + + it( + "disposes encoder outputs when frame-count validation fails before decode", + async () => { + class BadEncoderOutputModel extends NemoConformerForTDT { + constructor(config, sessions, encoderOutput) { + super(config, sessions, {}); + this.encoderOutput = encoderOutput; + } + + async _runEncoder() { + return { outputs: this.encoderOutput }; + } + } + + const badEncoderOutput = new Tensor("float32", new Float32Array([0, 1, 2, 3]), [2, 2]); + let disposed = 0; + const originalDispose = badEncoderOutput.dispose.bind(badEncoderOutput); + badEncoderOutput.dispose = () => { + disposed += 1; + originalDispose(); + }; + + const model = new BadEncoderOutputModel(BASE_CONFIG, BASE_SESSIONS, badEncoderOutput); + const inputs = { + input_features: new 
Tensor("float32", new Float32Array([0, 0]), [1, 1, 2]), + }; + + await expect( + model.transcribe(inputs, { + tokenizer: { decode: () => "" }, + }), + ).rejects.toThrow("expected encoder output dims"); + expect(disposed).toBe(1); + }, + MAX_TEST_EXECUTION_TIME, + ); }); describe("Nemo Conformer TDT utilities", () => { @@ -193,6 +237,41 @@ export default () => { MAX_TEST_EXECUTION_TIME, ); + it("uses Nemo encoder selector key when resolving model files", async () => { + const files = await get_model_files("dummy/nemo", { + local_files_only: true, + config: { + architectures: ["UnknownArch"], + model_type: "nemo-conformer-tdt", + "transformers.js_config": {}, + }, + dtype: { + model: "int8", + encoder_model: "fp16", + decoder_model_merged: "q4", + }, + }); + expect(files).toEqual([ + "config.json", + "onnx/encoder_model_fp16.onnx", + "onnx/decoder_model_merged_q4.onnx", + ]); + }); + + it( + "distinguishes long waveforms that differ at unsampled indices", + async () => { + const a = new Float32Array(10000); + const b = new Float32Array(10000); + b[1] = 0.12345; // Index 1 was skipped by the prior stride-based hash for this length. + + const ka = createAudioCacheKey(a, 16000); + const kb = createAudioCacheKey(b, 16000); + expect(ka).not.toEqual(kb); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "evicts least-recently-used entries when full", + async () => { From 5b4cdabd48a2179b4aaf254e9c10deff04397145 Mon Sep 17 00:00:00 2001 From: ysdede Date: Tue, 3 Mar 2026 23:45:28 +0300 Subject: [PATCH 13/33] fix(nemo-conformer-tdt): clamp timestamps and validate cache limits - Clamp token end timestamps to encoder frame bounds during TDT decoding.
- Validate FeatureLRUCache constructor limits to fail fast on invalid settings.
- Add regression tests for timestamp clamping and cache limit validation.
--- .../modeling_nemo_conformer_tdt.js | 5 ++- .../nemo_conformer_tdt/transducer_cache.js | 6 +++ .../test_modeling_nemo_conformer_tdt.js | 39 +++++++++++++++++++ 3 files changed, 48 insertions(+), 2 deletions(-) diff --git a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js index 597b7e7fa..401a30a54 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js @@ -650,10 +650,11 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { tokenIds.push(tokenId); // TDT duration convention: step=0 means "stay on current frame" (duration index 0 = no advance). // We still associate the token with this frame, so durationFrames is at least 1. - const durationFrames = step > 0 ? step : 1; + const durationFrames = Math.max(1, step > 0 ? step : 1); + const endFrame = Math.min(frameCount, frameIndex + durationFrames); tokenTimestamps.push([ roundTs(frameIndex * frameTime + timeOffset), - roundTs((frameIndex + durationFrames) * frameTime + timeOffset), + roundTs(endFrame * frameTime + timeOffset), ]); if (tokenConfidences && maybeConfidence) { tokenConfidences.push(maybeConfidence.confidence); diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js index bf71afcad..4ea17da0e 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js @@ -33,6 +33,12 @@ export class FeatureLRUCache { * @param {{max_entries?: number, max_size_mb?: number}} [options] */ constructor({ max_entries = 128, max_size_mb = 64 } = {}) { + if (!Number.isInteger(max_entries) || max_entries < 0) { + throw new Error('FeatureLRUCache expected 
`max_entries` to be a non-negative integer.'); + } + if (!Number.isFinite(max_size_mb) || max_size_mb < 0) { + throw new Error('FeatureLRUCache expected `max_size_mb` to be a non-negative number.'); + } this.max_entries = max_entries; this.max_size_mb = max_size_mb; this.cache = new Map(); diff --git a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js index 522b7db74..d28e6ceae 100644 --- a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js +++ b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js @@ -124,6 +124,38 @@ export default () => { MAX_TEST_EXECUTION_TIME, ); + it( + "clamps token timestamps when step jumps beyond remaining frames", + async () => { + const tokenizer = { + decode(ids) { + const idArray = Array.isArray(ids) ? ids : [ids]; + return idArray.map((id) => (id === 1 || id === 1n ? " token" : "")).join(""); + }, + }; + + const model = new MockNemoConformerForTDT(BASE_CONFIG, BASE_SESSIONS, [ + // Emit token=1 with duration index choosing a large step (argmax at tail). 
+ { logits: [0.1, 10.0, 0.0, 0.0, 0.0, 0.0, 12.0] }, + ]); + + const inputs = { + input_features: new Tensor("float32", new Float32Array([0, 0, 0, 0, 0, 0]), [1, 3, 2]), + }; + + const output = await model.transcribe(inputs, { + tokenizer, + return_timestamps: true, + return_tokens: true, + }); + + expect(output.tokens).toHaveLength(1); + expect(output.tokens[0]).toEqual(expect.objectContaining({ start_time: 0, end_time: 0.12 })); + expect(output.utterance_timestamp).toEqual([0, 0.12]); + }, + MAX_TEST_EXECUTION_TIME, + ); + it("fails fast when transducer config is missing", () => { const invalidConfig = { model_type: "nemo-conformer-tdt" }; expect(() => new NemoConformerForTDT(invalidConfig, BASE_SESSIONS, {})).toThrow("Missing `transformers.js_config.transducer`"); @@ -288,5 +320,12 @@ export default () => { }, MAX_TEST_EXECUTION_TIME, ); + + it("rejects invalid cache limits", () => { + expect(() => new FeatureLRUCache({ max_entries: -1 })).toThrow("max_entries"); + expect(() => new FeatureLRUCache({ max_entries: 1.25 })).toThrow("max_entries"); + expect(() => new FeatureLRUCache({ max_size_mb: -1 })).toThrow("max_size_mb"); + expect(() => new FeatureLRUCache({ max_size_mb: Number.POSITIVE_INFINITY })).toThrow("max_size_mb"); + }); }); }; From 7690227443ce4f7047a07bf185501cb4fa2379c2 Mon Sep 17 00:00:00 2001 From: ysdede Date: Wed, 4 Mar 2026 00:05:18 +0300 Subject: [PATCH 14/33] fix(nemo-conformer-tdt): close remaining tensor disposal leaks Dispose intermediate tensors in computeTemporalDeltas concatenate paths and dispose replaced base input features when delta concatenation returns a new tensor.

Add regression tests that assert disposal behavior for delta concatenate flows and feature extractor reassignment.
--- .../feature_extraction_nemo_conformer_tdt.js | 1 + .../nemo_conformer_tdt/transducer_deltas.js | 9 +++-- ...t_feature_extraction_nemo_conformer_tdt.js | 33 ++++++++++++++++++- .../test_modeling_nemo_conformer_tdt.js | 22 +++++++++++++ 4 files changed, 62 insertions(+), 3 deletions(-) diff --git a/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js index aba71c485..8907b2ab8 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js @@ -194,6 +194,7 @@ export class NemoConformerTDTFeatureExtractor extends FeatureExtractor { concatenate: this.delta_concatenate, }); if (delta_result instanceof Tensor) { + input_features.dispose(); input_features = delta_result; result.input_features = input_features; } else { diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_deltas.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_deltas.js index 50651f82f..957fa0776 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/transducer_deltas.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_deltas.js @@ -46,7 +46,9 @@ export function computeTemporalDeltas(input_features, { order = 1, window = 2, c if (!concatenate) { return { delta: delta_tensor }; } - return new Tensor('float32', interleaveByFrame([base, delta], T, F), [batch, T, F * 2]); + const result = new Tensor('float32', interleaveByFrame([base, delta], T, F), [batch, T, F * 2]); + delta_tensor.dispose(); + return result; } const recursive_result = /** @type {{delta: Tensor}} */ ( @@ -61,7 +63,10 @@ export function computeTemporalDeltas(input_features, { order = 1, window = 2, c } const delta_delta = /** @type {Float32Array} */ (delta_delta_tensor.data); - return new 
Tensor('float32', interleaveByFrame([base, delta, delta_delta], T, F), [batch, T, F * 3]); + const result = new Tensor('float32', interleaveByFrame([base, delta, delta_delta], T, F), [batch, T, F * 3]); + delta_delta_tensor.dispose(); + delta_tensor.dispose(); + return result; } function interleaveByFrame(items, T, F) { diff --git a/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js b/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js index aa0d6ffad..71b67e60d 100644 --- a/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js +++ b/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js @@ -1,4 +1,4 @@ -import { NemoConformerTDTFeatureExtractor } from "../../../src/transformers.js"; +import { NemoConformerTDTFeatureExtractor, Tensor } from "../../../src/transformers.js"; import { MAX_TEST_EXECUTION_TIME } from "../../init.js"; @@ -54,6 +54,37 @@ export default () => { MAX_TEST_EXECUTION_TIME, ); + it( + "disposes replaced base features when concatenated delta output is used", + async () => { + const extractor = new NemoConformerTDTFeatureExtractor({ + ...base, + feature_size: 80, + delta_order: 1, + delta_window: 2, + delta_concatenate: true, + }); + + const originalDispose = Tensor.prototype.dispose; + let disposeCalls = 0; + Tensor.prototype.dispose = function () { + disposeCalls += 1; + return originalDispose.call(this); + }; + + try { + const { input_features } = await extractor(audio); + expect(input_features.dims[2]).toBe(80 * 2); + } finally { + Tensor.prototype.dispose = originalDispose; + } + + // One dispose from computeTemporalDeltas intermediate tensor, one from replacing base features tensor. 
+ expect(disposeCalls).toBe(2); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( "uses feature cache when enabled", async () => { diff --git a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js index d28e6ceae..2199fd42f 100644 --- a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js +++ b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js @@ -254,6 +254,28 @@ export default () => { expect(() => computeTemporalDeltas(input, { order: 1, window: 1, concatenate: true })).toThrow('type "float32"'); }); + it("disposes intermediate delta tensors in concatenate paths", () => { + const input = new Tensor("float32", Float32Array.from([1, 2, 2, 4, 3, 6, 4, 8]), [1, 4, 2]); + const originalDispose = Tensor.prototype.dispose; + let disposeCalls = 0; + Tensor.prototype.dispose = function () { + disposeCalls += 1; + return originalDispose.call(this); + }; + + try { + const order1 = computeTemporalDeltas(input, { order: 1, window: 1, concatenate: true }); + const order2 = computeTemporalDeltas(input, { order: 2, window: 1, concatenate: true }); + expect(order1.dims).toEqual([1, 4, 4]); + expect(order2.dims).toEqual([1, 4, 6]); + } finally { + Tensor.prototype.dispose = originalDispose; + } + + // order=1 concat disposes one intermediate tensor, order=2 concat disposes two. + expect(disposeCalls).toBe(3); + }); + it( "creates stable audio cache keys", async () => { From 1f065c367870e6e854ab5672bbcfc92648595148 Mon Sep 17 00:00:00 2001 From: ysdede Date: Wed, 4 Mar 2026 00:26:30 +0300 Subject: [PATCH 15/33] fix(nemo-conformer-tdt): dispose auxiliary decoder outputs Dispose non-essential Tensor outputs returned by decoder steps to prevent cumulative memory growth. 
Keep logits/state tensors alive for decoding/state transitions and dispose extras immediately.

Add regression test to assert auxiliary decoder tensor outputs are disposed each step. --- .../modeling_nemo_conformer_tdt.js | 19 ++++++-- .../test_modeling_nemo_conformer_tdt.js | 44 +++++++++++++++++++ 2 files changed, 59 insertions(+), 4 deletions(-) diff --git a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js index 401a30a54..92e52bdbc 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js @@ -609,12 +609,23 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { } const logits = decoderOutput[io.decoder_output] ?? Object.values(decoderOutput)[0]; + const outputState1 = decoderOutput[io.decoder_output_state_1]; + const outputState2 = decoderOutput[io.decoder_output_state_2]; + const seenDecoderTensors = new Set(); + for (const value of Object.values(decoderOutput)) { + if (!(value instanceof Tensor) || seenDecoderTensors.has(value)) continue; + seenDecoderTensors.add(value); + if (value === logits || value === outputState1 || value === outputState2) { + continue; + } + value.dispose(); + } const logitsData = logits.data; if (logitsData.length < vocabSize) { logits.dispose(); this._disposeDecoderState({ - state1: decoderOutput[io.decoder_output_state_1], - state2: decoderOutput[io.decoder_output_state_2], + state1: outputState1, + state2: outputState2, }); throw new Error( `Nemo Conformer TDT decoder output is too small (${logitsData.length}) for vocab_size=${vocabSize}.`, @@ -639,8 +650,8 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { } const newState = { - state1: decoderOutput[io.decoder_output_state_1] ??
decoderState.state1, - state2: decoderOutput[io.decoder_output_state_2] ?? decoderState.state2, + state1: outputState1 ?? decoderState.state1, + state2: outputState2 ?? decoderState.state2, }; if (tokenId !== blankId) { diff --git a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js index 2199fd42f..c7ae1a98c 100644 --- a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js +++ b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js @@ -211,6 +211,50 @@ export default () => { }, MAX_TEST_EXECUTION_TIME, ); + + it( + "disposes auxiliary decoder tensor outputs per decode step", + async () => { + class AuxDecoderOutputModel extends NemoConformerForTDT { + constructor(config, sessions) { + super(config, sessions, {}); + this.auxDisposals = 0; + } + + async _runEncoder() { + return { + outputs: new Tensor("float32", new Float32Array([0.1, 0.2]), [1, 2, 1]), + }; + } + + async _runDecoder() { + const stateShape = [1, 1, 2]; + const aux = new Tensor("float32", new Float32Array([1, 2, 3]), [1, 1, 3]); + const originalDispose = aux.dispose.bind(aux); + aux.dispose = () => { + this.auxDisposals += 1; + originalDispose(); + }; + return { + outputs: new Tensor("float32", new Float32Array([10.0, 0.0, 0.0, 8.0, 0.0]), [1, 1, 5]), + output_states_1: new Tensor("float32", new Float32Array([0, 0]), stateShape), + output_states_2: new Tensor("float32", new Float32Array([0, 0]), stateShape), + auxiliary_scores: aux, + }; + } + } + + const model = new AuxDecoderOutputModel(BASE_CONFIG, BASE_SESSIONS); + const inputs = { + input_features: new Tensor("float32", new Float32Array([0, 0]), [1, 1, 2]), + }; + + const output = await model.transcribe(inputs, { return_timestamps: false }); + expect(output).toEqual(expect.objectContaining({ text: "" })); + expect(model.auxDisposals).toBe(1); + 
}, + MAX_TEST_EXECUTION_TIME, + ); }); describe("Nemo Conformer TDT utilities", () => { From ec09a090c9c6950a24da00a31e9192a994e6bc0e Mon Sep 17 00:00:00 2001 From: ysdede Date: Wed, 4 Mar 2026 00:41:21 +0300 Subject: [PATCH 16/33] perf(nemo-conformer-tdt): avoid tolist in length hot path Compute encoder length directly from attention_mask.data instead of attention_mask.tolist() to avoid large transient array allocations in ASR decode hot path. --- .../nemo_conformer_tdt/modeling_nemo_conformer_tdt.js | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js index 92e52bdbc..a29349d53 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js @@ -411,8 +411,12 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { let length = null; const attentionMask = model_inputs.attention_mask; if (attentionMask instanceof Tensor) { - const mask = attentionMask.tolist(); - length = mask[0].reduce((acc, x) => acc + toInt(x), 0); + const maskData = attentionMask.data; + let sum = 0; + for (let i = 0; i < maskData.length; ++i) { + sum += toInt(maskData[i]); + } + length = sum; } else { length = inputFeatures.dims[1]; } From 8a90a7c90cf5f434cbe63e52be4c513c6e20992b Mon Sep 17 00:00:00 2001 From: ysdede Date: Wed, 4 Mar 2026 01:36:48 +0300 Subject: [PATCH 17/33] fix(nemo-conformer-tdt): harden duration and audio validation Fail fast when duration logits are required but missing in decoder output, and enforce positive-integer vocab size at runtime config validation. Validate prepared Nemo pipeline audio for non-empty finite samples before processor/model calls. Add regression tests for missing duration logits and non-finite audio rejection. 
--- .../modeling_nemo_conformer_tdt.js | 15 +++++++++++++ .../pipelines/automatic-speech-recognition.js | 21 ++++++++++++++++++ .../test_modeling_nemo_conformer_tdt.js | 22 +++++++++++++++++++ ..._pipelines_automatic_speech_recognition.js | 7 ++++++ 4 files changed, 65 insertions(+) diff --git a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js index a29349d53..d16eac520 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js @@ -457,6 +457,11 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { } _validateRuntimeConfig(vocabSize) { + if (!Number.isInteger(vocabSize) || vocabSize <= 0) { + throw new Error( + `Invalid Nemo Conformer TDT config: vocab_size=${vocabSize} must be a positive integer.`, + ); + } if (this.transducer.blank_token_id >= vocabSize) { throw new Error( `Invalid Nemo Conformer TDT config: blank_token_id=${this.transducer.blank_token_id} must be < vocab_size=${vocabSize}.`, @@ -638,6 +643,16 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { const tokenId = argmax(logitsData, 0, vocabSize); const durationStart = this.transducer.duration_start_index ?? vocabSize; const hasDurationLogits = logitsData.length > durationStart; + if (this.transducer.duration_start_index != null && !hasDurationLogits) { + logits.dispose(); + this._disposeDecoderState({ + state1: outputState1, + state2: outputState2, + }); + throw new Error( + `Nemo Conformer TDT decoder output is missing duration logits: expected values beyond index ${durationStart - 1}, got length=${logitsData.length}.`, + ); + } const step = hasDurationLogits ? 
argmax(logitsData, durationStart, logitsData.length - durationStart) - durationStart : 0; diff --git a/packages/transformers/src/pipelines/automatic-speech-recognition.js b/packages/transformers/src/pipelines/automatic-speech-recognition.js index 882dfb68f..92f16c7ac 100644 --- a/packages/transformers/src/pipelines/automatic-speech-recognition.js +++ b/packages/transformers/src/pipelines/automatic-speech-recognition.js @@ -140,6 +140,24 @@ export class AutomaticSpeechRecognitionPipeline Pipeline ) { + _validateNemoAudio(audio, index) { + if (!(audio instanceof Float32Array || audio instanceof Float64Array)) { + throw new TypeError( + `Nemo Conformer TDT pipeline expected audio at index ${index} to be Float32Array or Float64Array.`, + ); + } + if (audio.length === 0) { + throw new Error(`Nemo Conformer TDT pipeline expected non-empty audio at index ${index}.`); + } + for (let i = 0; i < audio.length; ++i) { + if (!Number.isFinite(audio[i])) { + throw new Error( + `Nemo Conformer TDT pipeline expected finite audio samples; found ${audio[i]} at index ${index}:${i}.`, + ); + } + } + } + async _call(audio, kwargs = {}) { switch (this.model.config.model_type) { case 'whisper': @@ -339,6 +357,9 @@ export class AutomaticSpeechRecognitionPipeline const batchedAudio = single ? 
[audio] : audio; const sampling_rate = this.processor.feature_extractor.config.sampling_rate; const preparedAudios = await prepareAudios(batchedAudio, sampling_rate); + for (let i = 0; i < preparedAudios.length; ++i) { + this._validateNemoAudio(preparedAudios[i], i); + } const toReturn = []; for (const aud of preparedAudios) { diff --git a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js index c7ae1a98c..c87bd4038 100644 --- a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js +++ b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js @@ -156,6 +156,28 @@ export default () => { MAX_TEST_EXECUTION_TIME, ); + it( + "fails fast when duration logits are required but missing", + async () => { + const model = new MockNemoConformerForTDT(BASE_CONFIG, BASE_SESSIONS, [ + // Only vocab logits are returned; duration head is missing. 
+ { logits: [0.1, 10.0, 0.0] }, + ]); + + const inputs = { + input_features: new Tensor("float32", new Float32Array([0, 0, 0, 0, 0, 0]), [1, 3, 2]), + }; + + await expect( + model.transcribe(inputs, { + tokenizer: { decode: () => "" }, + return_timestamps: false, + }), + ).rejects.toThrow("missing duration logits"); + }, + MAX_TEST_EXECUTION_TIME, + ); + it("fails fast when transducer config is missing", () => { const invalidConfig = { model_type: "nemo-conformer-tdt" }; expect(() => new NemoConformerForTDT(invalidConfig, BASE_SESSIONS, {})).toThrow("Missing `transformers.js_config.transducer`"); diff --git a/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js b/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js index 8159c6098..06f8cec16 100644 --- a/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js +++ b/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js @@ -219,6 +219,13 @@ export default () => { return_metrics: true, }); }); + + it("rejects non-finite audio samples before Nemo decoding", async () => { + const { pipe } = makeUnitPipe(); + await expect(pipe(Float32Array.from([0, Number.NaN, 0]), { return_timestamps: false })).rejects.toThrow( + "finite audio samples", + ); + }); }); }); }; From ce0a3eb91113322ffefe0ada6268542c724a76eb Mon Sep 17 00:00:00 2001 From: ysdede Date: Wed, 4 Mar 2026 01:48:50 +0300 Subject: [PATCH 18/33] fix: address prioritized review findings Fix placeholder interpolation in _prepare_model_inputs error text. Add fail-fast validation for Nemo delta_window and reject duplicate decoder output aliases in transducer io config. Add regression tests for delta_window validation and duplicate decoder output alias rejection. 
--- .../transformers/src/models/modeling_utils.js | 5 +++-- .../feature_extraction_nemo_conformer_tdt.js | 5 +++++ .../modeling_nemo_conformer_tdt.js | 7 +++++++ ...st_feature_extraction_nemo_conformer_tdt.js | 15 +++++++++++++++ .../test_modeling_nemo_conformer_tdt.js | 18 ++++++++++++++++++ 5 files changed, 48 insertions(+), 2 deletions(-) diff --git a/packages/transformers/src/models/modeling_utils.js b/packages/transformers/src/models/modeling_utils.js index ef6e5a99a..d537b559a 100644 --- a/packages/transformers/src/models/modeling_utils.js +++ b/packages/transformers/src/models/modeling_utils.js @@ -857,8 +857,9 @@ export class PreTrainedModel extends Callable { if (input_name in model_inputs) { if (inputs) { throw new Error( - '`inputs`: {inputs}` were passed alongside {input_name} which is not allowed. ' + - 'Make sure to either pass {inputs} or {input_name}=...', + '`inputs` was passed alongside ' + + `\`${input_name}\` which is not allowed. ` + + `Make sure to either pass \`inputs\` or \`${input_name}\`=...`, ); } } else { diff --git a/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js index 8907b2ab8..3c5e5bd97 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js @@ -47,6 +47,11 @@ export class NemoConformerTDTFeatureExtractor extends FeatureExtractor { `NemoConformerTDTFeatureExtractor expected delta_order in {0,1,2}, got ${this.delta_order}.`, ); } + if (!Number.isInteger(this.delta_window) || this.delta_window < 1) { + throw new Error( + `NemoConformerTDTFeatureExtractor expected \`delta_window\` as a positive integer, got ${this.delta_window}.`, + ); + } if (this.delta_order > 0 && !this.delta_concatenate) { logger.warn( 'NemoConformerTDTFeatureExtractor: 
`delta_concatenate=false` is set. ' + diff --git a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js index d16eac520..efc9a1576 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js @@ -107,6 +107,13 @@ function resolveTransducerConfig(config, sessions) { ...DEFAULT_TRANSDUCER_IO, ...(transducerConfig.io ?? {}), }; + const requiredDecoderOutputs = [io.decoder_output, io.decoder_output_state_1, io.decoder_output_state_2]; + if (new Set(requiredDecoderOutputs).size !== requiredDecoderOutputs.length) { + throw new Error( + 'Invalid `transformers.js_config.transducer.io`: decoder output names must be distinct ' + + '(decoder_output, decoder_output_state_1, decoder_output_state_2).', + ); + } const decoderSession = sessions?.decoder_model_merged; if (!decoderSession) { diff --git a/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js b/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js index 71b67e60d..6ded8dda4 100644 --- a/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js +++ b/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js @@ -117,5 +117,20 @@ export default () => { }, MAX_TEST_EXECUTION_TIME, ); + + it("validates delta_window at construction time", () => { + expect( + () => new NemoConformerTDTFeatureExtractor({ ...base, feature_size: 80, delta_order: 1, delta_window: 0 }), + ).toThrow("delta_window"); + expect( + () => + new NemoConformerTDTFeatureExtractor({ + ...base, + feature_size: 80, + delta_order: 1, + delta_window: 1.5, + }), + ).toThrow("delta_window"); + }); }); }; diff --git 
a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js index c87bd4038..342d06f26 100644 --- a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js +++ b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js @@ -197,6 +197,24 @@ export default () => { expect(() => new NemoConformerForTDT(invalidConfig, BASE_SESSIONS, {})).toThrow("encoder_output_layout"); }); + it("rejects duplicate decoder output aliases in transducer io config", () => { + const invalidConfig = { + ...BASE_CONFIG, + "transformers.js_config": { + ...BASE_CONFIG["transformers.js_config"], + transducer: { + ...BASE_CONFIG["transformers.js_config"].transducer, + io: { + decoder_output: "outputs", + decoder_output_state_1: "outputs", + decoder_output_state_2: "output_states_2", + }, + }, + }, + }; + expect(() => new NemoConformerForTDT(invalidConfig, BASE_SESSIONS, {})).toThrow("must be distinct"); + }); + it( "disposes encoder outputs when frame-count validation fails before decode", async () => { From 5d91d396ec463481844351d613f61f88c8b41646 Mon Sep 17 00:00:00 2001 From: ysdede Date: Wed, 4 Mar 2026 01:58:28 +0300 Subject: [PATCH 19/33] fix(nemo-conformer-tdt): apply low-risk hardening follow-ups Validate transcribe timeOffset as finite and guard encoderOutputs cleanup path to avoid masking primary failures. Align transducer_text JSDoc token type with runtime shape (include id). Harden Parakeet feature extractor test by using direct mask data and explicit tensor disposal via try/finally; add timeOffset validation regression test. 
--- .../modeling_nemo_conformer_tdt.js | 15 +++++++---- .../nemo_conformer_tdt/transducer_text.js | 2 +- .../test_modeling_nemo_conformer_tdt.js | 19 ++++++++++++++ .../test_feature_extraction_parakeet.js | 26 +++++++++++-------- 4 files changed, 45 insertions(+), 17 deletions(-) diff --git a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js index efc9a1576..eda732983 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js @@ -533,6 +533,9 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { timeOffset = 0, } = {}, ) { + if (!Number.isFinite(timeOffset)) { + throw new Error('NemoConformerForTDT.transcribe expected `timeOffset` to be a finite number.'); + } const totalStart = nowMs(); const io = this.transducer.io; const vocabSize = this._resolveVocabSize(tokenizer); @@ -720,11 +723,13 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { } finally { if (targetLengthTensor) targetLengthTensor.dispose(); if (decoderState) this._disposeDecoderState(decoderState); - const seen = new Set(); - for (const value of Object.values(encoderOutputs)) { - if (value instanceof Tensor && !seen.has(value)) { - value.dispose(); - seen.add(value); + if (encoderOutputs && typeof encoderOutputs === 'object') { + const seen = new Set(); + for (const value of Object.values(encoderOutputs)) { + if (value instanceof Tensor && !seen.has(value)) { + value.dispose(); + seen.add(value); + } } } } diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js index a68bb828d..d12a27553 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js +++ 
b/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js @@ -97,7 +97,7 @@ export function decodeTransducerText(tokenizer, token_ids) { * @param {number[] | null} token_confidences * @returns {{ * words: Array<{ text: string, start_time: number, end_time: number, confidence?: number }>, - * tokens: Array<{ token: string, raw_token: string, is_word_start: boolean, start_time: number, end_time: number, confidence?: number }>, + * tokens: Array<{ id: number, token: string, raw_token: string, is_word_start: boolean, start_time: number, end_time: number, confidence?: number }>, * word_confidences: (number | null)[] | null, * word_avg: number | null, * }} diff --git a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js index 342d06f26..29b060605 100644 --- a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js +++ b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js @@ -156,6 +156,25 @@ export default () => { MAX_TEST_EXECUTION_TIME, ); + it( + "rejects non-finite timeOffset", + async () => { + const model = new MockNemoConformerForTDT(BASE_CONFIG, BASE_SESSIONS, [{ logits: [9.0, 0.0, 0.0, 1.0] }]); + const inputs = { + input_features: new Tensor("float32", new Float32Array([0, 0, 0, 0, 0, 0]), [1, 3, 2]), + }; + + await expect( + model.transcribe(inputs, { + tokenizer: { decode: () => "" }, + return_timestamps: true, + timeOffset: Number.NaN, + }), + ).rejects.toThrow("timeOffset"); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( "fails fast when duration logits are required but missing", async () => { diff --git a/packages/transformers/tests/models/parakeet/test_feature_extraction_parakeet.js b/packages/transformers/tests/models/parakeet/test_feature_extraction_parakeet.js index c0e67a58b..82ece82a9 100644 --- 
a/packages/transformers/tests/models/parakeet/test_feature_extraction_parakeet.js +++ b/packages/transformers/tests/models/parakeet/test_feature_extraction_parakeet.js @@ -27,17 +27,21 @@ export default () => { const audio = Float32Array.from({ length: total }, (_, i) => Math.sin((2 * Math.PI * 220 * i) / config.sampling_rate)); const { input_features, attention_mask } = await feature_extractor(audio); - - expect(input_features.dims[0]).toBe(1); - expect(input_features.dims[2]).toBe(config.feature_size); - expect(attention_mask.dims).toEqual([1, input_features.dims[1]]); - - const validFrames = attention_mask.tolist()[0].reduce((acc, x) => acc + Number(x), 0); - expect(validFrames).toBeGreaterThan(0); - expect(validFrames).toBeLessThanOrEqual(input_features.dims[1]); - - const preview = Array.from(input_features.data.slice(0, 256)); - expect(preview.every(Number.isFinite)).toBe(true); + try { + expect(input_features.dims[0]).toBe(1); + expect(input_features.dims[2]).toBe(config.feature_size); + expect(attention_mask.dims).toEqual([1, input_features.dims[1]]); + + const validFrames = attention_mask.data.reduce((acc, x) => acc + Number(x), 0); + expect(validFrames).toBeGreaterThan(0); + expect(validFrames).toBeLessThanOrEqual(input_features.dims[1]); + + const preview = Array.from(input_features.data.slice(0, 256)); + expect(preview.every(Number.isFinite)).toBe(true); + } finally { + input_features.dispose(); + attention_mask.dispose(); + } }, MAX_TEST_EXECUTION_TIME, ); From dfc2c130aa3b599d76e15a807fa15101893c4fa1 Mon Sep 17 00:00:00 2001 From: ysdede Date: Wed, 4 Mar 2026 02:24:23 +0300 Subject: [PATCH 20/33] fix(nemo-conformer-tdt): enforce named outputs and frame-level confidences --- .../modeling_nemo_conformer_tdt.js | 44 +++++-- ...t_feature_extraction_nemo_conformer_tdt.js | 28 +++-- .../test_modeling_nemo_conformer_tdt.js | 112 ++++++++++++++++++ 3 files changed, 170 insertions(+), 14 deletions(-) diff --git 
a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js index eda732983..c47407a56 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js @@ -313,7 +313,15 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { _getEncoderOutput(outputs) { const name = this.transducer.io.encoder_output; - return outputs[name] ?? Object.values(outputs)[0]; + const out = outputs?.[name]; + if (!(out instanceof Tensor)) { + const available = outputs && typeof outputs === 'object' ? Object.keys(outputs).join(', ') : '(none)'; + throw new Error( + `Nemo Conformer TDT encoder output "${name}" was not returned by the session. ` + + `Available outputs: ${available}.`, + ); + } + return out; } _getEncoderFrameCount(encoderOutput) { @@ -570,8 +578,8 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { const tokenTimestamps = []; /** @type {number[] | null} */ const tokenConfidences = needConfidences ? [] : null; - /** @type {number[] | null} */ - const frameConfidences = returnFrameConfidences ? [] : null; + /** @type {Map | null} */ + const frameConfidenceStats = returnFrameConfidences ? new Map() : null; /** @type {number[] | null} */ const frameIndices = returnFrameIndices ? [] : null; /** @type {number[] | null} */ @@ -627,7 +635,7 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { frameTensor.dispose(); } - const logits = decoderOutput[io.decoder_output] ?? 
Object.values(decoderOutput)[0]; + const logits = decoderOutput[io.decoder_output]; const outputState1 = decoderOutput[io.decoder_output_state_1]; const outputState2 = decoderOutput[io.decoder_output_state_2]; const seenDecoderTensors = new Set(); @@ -639,6 +647,18 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { } value.dispose(); } + if (!(logits instanceof Tensor)) { + this._disposeDecoderState( + { + state1: outputState1, + state2: outputState2, + }, + decoderState, + ); + throw new Error( + `Nemo Conformer TDT decoder output "${io.decoder_output}" was not returned by the session.`, + ); + } const logitsData = logits.data; if (logitsData.length < vocabSize) { logits.dispose(); @@ -674,8 +694,14 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { needConfidences || returnLogProbs || returnFrameConfidences ? confidenceFromLogits(logitsData, tokenId, vocabSize) : null; - if (frameConfidences && maybeConfidence) { - frameConfidences.push(maybeConfidence.confidence); + if (frameConfidenceStats && maybeConfidence) { + const stats = frameConfidenceStats.get(frameIndex); + if (stats) { + stats.sum += maybeConfidence.confidence; + stats.count += 1; + } else { + frameConfidenceStats.set(frameIndex, { sum: maybeConfidence.confidence, count: 1 }); + } } const newState = { @@ -779,7 +805,11 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { } // Frame confidences are independent of return_timestamps — emit whenever requested. 
- if (returnFrameConfidences && frameConfidences && frameConfidences.length > 0) { + if (returnFrameConfidences && frameConfidenceStats && frameConfidenceStats.size > 0) { + const frameConfidences = []; + for (const { sum, count } of frameConfidenceStats.values()) { + frameConfidences.push(sum / count); + } if (!result.confidence_scores) { result.confidence_scores = {}; } diff --git a/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js b/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js index 6ded8dda4..8c184800d 100644 --- a/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js +++ b/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js @@ -19,9 +19,14 @@ export default () => { async () => { const extractor = new NemoConformerTDTFeatureExtractor({ ...base, feature_size: 80 }); const { input_features, attention_mask } = await extractor(audio); - expect(input_features.dims[0]).toBe(1); - expect(input_features.dims[2]).toBe(80); - expect(attention_mask.dims).toEqual([1, input_features.dims[1]]); + try { + expect(input_features.dims[0]).toBe(1); + expect(input_features.dims[2]).toBe(80); + expect(attention_mask.dims).toEqual([1, input_features.dims[1]]); + } finally { + input_features.dispose(); + attention_mask.dispose(); + } }, MAX_TEST_EXECUTION_TIME, ); @@ -31,9 +36,14 @@ export default () => { async () => { const extractor = new NemoConformerTDTFeatureExtractor({ ...base, feature_size: 128 }); const { input_features, attention_mask } = await extractor(audio); - expect(input_features.dims[0]).toBe(1); - expect(input_features.dims[2]).toBe(128); - expect(attention_mask.dims).toEqual([1, input_features.dims[1]]); + try { + expect(input_features.dims[0]).toBe(1); + expect(input_features.dims[2]).toBe(128); + expect(attention_mask.dims).toEqual([1, input_features.dims[1]]); + } 
finally { + input_features.dispose(); + attention_mask.dispose(); + } }, MAX_TEST_EXECUTION_TIME, ); @@ -49,7 +59,11 @@ export default () => { delta_concatenate: true, }); const { input_features } = await extractor(audio); - expect(input_features.dims[2]).toBe(128 * 3); + try { + expect(input_features.dims[2]).toBe(128 * 3); + } finally { + input_features.dispose(); + } }, MAX_TEST_EXECUTION_TIME, ); diff --git a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js index 29b060605..2e185e41f 100644 --- a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js +++ b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js @@ -156,6 +156,37 @@ export default () => { MAX_TEST_EXECUTION_TIME, ); + it( + "aggregates frame confidences per encoder frame (not per decode step)", + async () => { + const model = new MockNemoConformerForTDT(BASE_CONFIG, BASE_SESSIONS, [ + // Frame 0: emit token=1, step=0 + { logits: [0.0, 4.0, -2.0, 9.0, 1.0, 0.0] }, + // Frame 0: emit token=2, step=0 (hits max_symbols_per_step and advances frame) + { logits: [0.0, -1.0, 3.0, 9.0, 1.0, 0.0] }, + // Frame 1: emit blank, step=2 -> exits decode loop + { logits: [5.0, 0.0, 0.0, 0.0, 1.0, 9.0] }, + ]); + + const inputs = { + input_features: new Tensor("float32", new Float32Array([0, 0, 0, 0, 0, 0]), [1, 3, 2]), + }; + + const output = await model.transcribe(inputs, { + return_timestamps: false, + returnFrameConfidences: true, + }); + + expect(output.confidence_scores.frame).toHaveLength(2); + expect(output.confidence_scores.frame[0]).toBeCloseTo(0.9579343795, 6); + expect(output.confidence_scores.frame_avg).toBeCloseTo( + (output.confidence_scores.frame[0] + output.confidence_scores.frame[1]) / 2, + 6, + ); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( "rejects non-finite timeOffset", async () => { @@ -216,6 
+247,87 @@ export default () => { expect(() => new NemoConformerForTDT(invalidConfig, BASE_SESSIONS, {})).toThrow("encoder_output_layout"); }); + it( + "fails fast when named encoder output is missing at runtime", + async () => { + class MissingEncoderOutputModel extends NemoConformerForTDT { + async _runEncoder() { + return { + outputs: new Tensor("float32", new Float32Array([0.1, 0.2]), [1, 2, 1]), + }; + } + + async _runDecoder() { + const stateShape = [1, 1, 2]; + return { + outputs: new Tensor("float32", new Float32Array([9.0, 0.0, 0.0, 8.0]), [1, 1, 4]), + output_states_1: new Tensor("float32", new Float32Array([0, 0]), stateShape), + output_states_2: new Tensor("float32", new Float32Array([0, 0]), stateShape), + }; + } + } + + const config = { + ...BASE_CONFIG, + "transformers.js_config": { + ...BASE_CONFIG["transformers.js_config"], + transducer: { + ...BASE_CONFIG["transformers.js_config"].transducer, + io: { encoder_output: "encoder_out" }, + }, + }, + }; + const sessions = { + ...BASE_SESSIONS, + encoder_model: { + ...BASE_SESSIONS.encoder_model, + outputNames: ["encoder_out"], + }, + }; + const model = new MissingEncoderOutputModel(config, sessions, {}); + const inputs = { + input_features: new Tensor("float32", new Float32Array([0, 0, 0, 0, 0, 0]), [1, 3, 2]), + }; + + await expect(model.transcribe(inputs, { tokenizer: { decode: () => "" } })).rejects.toThrow( + 'encoder output "encoder_out" was not returned', + ); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "fails fast when named decoder logits output is missing at runtime", + async () => { + class MissingDecoderOutputModel extends NemoConformerForTDT { + async _runEncoder() { + return { + outputs: new Tensor("float32", new Float32Array([0.1, 0.2]), [1, 2, 1]), + }; + } + + async _runDecoder() { + const stateShape = [1, 1, 2]; + return { + unexpected_logits: new Tensor("float32", new Float32Array([9.0, 0.0, 0.0, 8.0]), [1, 1, 4]), + output_states_1: new Tensor("float32", new Float32Array([0, 0]), 
stateShape), + output_states_2: new Tensor("float32", new Float32Array([0, 0]), stateShape), + }; + } + } + + const model = new MissingDecoderOutputModel(BASE_CONFIG, BASE_SESSIONS, {}); + const inputs = { + input_features: new Tensor("float32", new Float32Array([0, 0, 0, 0, 0, 0]), [1, 3, 2]), + }; + + await expect(model.transcribe(inputs, { tokenizer: { decode: () => "" } })).rejects.toThrow( + 'decoder output "outputs" was not returned', + ); + }, + MAX_TEST_EXECUTION_TIME, + ); + it("rejects duplicate decoder output aliases in transducer io config", () => { const invalidConfig = { ...BASE_CONFIG, From a5bd2cf71dd85fe1472f479117ace8f173bab7b5 Mon Sep 17 00:00:00 2001 From: ysdede Date: Wed, 4 Mar 2026 02:29:57 +0300 Subject: [PATCH 21/33] docs(nemo-conformer-tdt): clarify cached tensor sharing semantics --- .../feature_extraction_nemo_conformer_tdt.js | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js index 3c5e5bd97..3e66d492c 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js @@ -119,6 +119,8 @@ export class NemoConformerTDTFeatureExtractor extends FeatureExtractor { * delta_features?: Tensor; * delta_delta_features?: Tensor; * }>} A Promise resolving to an object containing extracted model inputs. + * When cache is enabled, tensor instances are shared with cached entries. + * Do not mutate or dispose returned tensors unless cache is disabled/cleared. 
*/ async _call(audio) { validate_audio_inputs(audio, 'NemoConformerTDTFeatureExtractor'); @@ -127,7 +129,7 @@ export class NemoConformerTDTFeatureExtractor extends FeatureExtractor { const key = `${createAudioCacheKey(audio, this.config.sampling_rate)}:${this.delta_order}:${this.delta_window}:${this.delta_concatenate}`; const cached = this.feature_cache.get(key); if (cached) { - return cached; + return { ...cached }; } const extracted = await this._extract(audio); From abada622a54ee4f2784a5b2ad5abf48397c1b2f9 Mon Sep 17 00:00:00 2001 From: ysdede Date: Wed, 4 Mar 2026 02:47:17 +0300 Subject: [PATCH 22/33] fix(nemo-conformer-tdt): harden decoder I/O validation and feed cleanup --- .../modeling_nemo_conformer_tdt.js | 16 +++++++++++++++ ...t_feature_extraction_nemo_conformer_tdt.js | 15 +++++++++----- .../test_modeling_nemo_conformer_tdt.js | 20 +++++++++++++++++++ 3 files changed, 46 insertions(+), 5 deletions(-) diff --git a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js index c47407a56..b29359799 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js @@ -107,6 +107,19 @@ function resolveTransducerConfig(config, sessions) { ...DEFAULT_TRANSDUCER_IO, ...(transducerConfig.io ?? 
{}), }; + const requiredDecoderInputs = [ + io.decoder_encoder, + io.decoder_token, + io.decoder_token_length, + io.decoder_state_1, + io.decoder_state_2, + ]; + if (new Set(requiredDecoderInputs).size !== requiredDecoderInputs.length) { + throw new Error( + 'Invalid `transformers.js_config.transducer.io`: decoder input names must be distinct ' + + '(decoder_encoder, decoder_token, decoder_token_length, decoder_state_1, decoder_state_2).', + ); + } const requiredDecoderOutputs = [io.decoder_output, io.decoder_output_state_1, io.decoder_output_state_2]; if (new Set(requiredDecoderOutputs).size !== requiredDecoderOutputs.length) { throw new Error( @@ -445,6 +458,9 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { } if (missingInputs.length > 0) { + for (const tensor of disposables) { + tensor.dispose(); + } throw new Error( `Nemo Conformer TDT encoder session expects additional inputs that are not available: ${missingInputs.join(', ')}.`, ); diff --git a/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js b/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js index 8c184800d..ef852a3b5 100644 --- a/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js +++ b/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js @@ -109,12 +109,17 @@ export default () => { feature_cache_max_entries: 8, feature_cache_max_size_mb: 8, }); - const first = await extractor(audio); - const second = await extractor(audio); + try { + const first = await extractor(audio); + const second = await extractor(audio); - expect(first).toBe(second); - expect(extractor.get_cache_stats().entries).toBe(1); - extractor.clear_cache(); + expect(first).not.toBe(second); + expect(first.input_features).toBe(second.input_features); + expect(first.attention_mask).toBe(second.attention_mask); + 
expect(extractor.get_cache_stats().entries).toBe(1); + } finally { + extractor.clear_cache(); + } expect(extractor.get_cache_stats().entries).toBe(0); }, MAX_TEST_EXECUTION_TIME, diff --git a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js index 2e185e41f..652038463 100644 --- a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js +++ b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js @@ -346,6 +346,26 @@ export default () => { expect(() => new NemoConformerForTDT(invalidConfig, BASE_SESSIONS, {})).toThrow("must be distinct"); }); + it("rejects duplicate decoder input aliases in transducer io config", () => { + const invalidConfig = { + ...BASE_CONFIG, + "transformers.js_config": { + ...BASE_CONFIG["transformers.js_config"], + transducer: { + ...BASE_CONFIG["transformers.js_config"].transducer, + io: { + decoder_encoder: "encoder_outputs", + decoder_token: "targets", + decoder_token_length: "target_length", + decoder_state_1: "input_states_1", + decoder_state_2: "input_states_1", + }, + }, + }, + }; + expect(() => new NemoConformerForTDT(invalidConfig, BASE_SESSIONS, {})).toThrow("must be distinct"); + }); + it( "disposes encoder outputs when frame-count validation fails before decode", async () => { From 62d8bc0d0d3ab0634cf2b460827122cd8a53575a Mon Sep 17 00:00:00 2001 From: ysdede Date: Thu, 5 Mar 2026 23:18:07 +0300 Subject: [PATCH 23/33] fix(nemo-conformer-tdt): address bot review findings - fail fast on missing decoder state outputs and invalid encoder layout enums\n- make FeatureLRUCache own cached tensor lifetimes (replace/evict/clear) with deduped disposal and deterministic size fallback\n- validate n_fft/win_length in Nemo feature extractor\n- align Nemo ASR pipeline docs with actual forwarded options\n- add regression coverage for runtime config 
validation, non-concatenated deltas/cache behavior, missing decoder state outputs, and cache disposal semantics\n\nValidation:\n- pnpm test -- tests/models.test.js --filter nemo_conformer_tdt\n- pnpm test -- tests/pipelines.test.js --filter automatic_speech_recognition --- .../feature_extraction_nemo_conformer_tdt.js | 17 +- .../modeling_nemo_conformer_tdt.js | 35 +++- .../nemo_conformer_tdt/transducer_cache.js | 63 +++++- .../pipelines/automatic-speech-recognition.js | 6 +- ...t_feature_extraction_nemo_conformer_tdt.js | 74 ++++++- .../test_modeling_nemo_conformer_tdt.js | 197 ++++++++++++++++++ 6 files changed, 375 insertions(+), 17 deletions(-) diff --git a/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js index 3e66d492c..1f569365b 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js @@ -17,6 +17,21 @@ export class NemoConformerTDTFeatureExtractor extends FeatureExtractor { constructor(config) { super(config); + if (!Number.isInteger(this.config.n_fft) || this.config.n_fft <= 0) { + throw new Error( + `NemoConformerTDTFeatureExtractor expected \`n_fft\` as a positive integer, got ${this.config.n_fft}.`, + ); + } + if ( + !Number.isInteger(this.config.win_length) || + this.config.win_length <= 0 || + this.config.win_length > this.config.n_fft + ) { + throw new Error( + `NemoConformerTDTFeatureExtractor expected \`win_length\` in [1, n_fft], got win_length=${this.config.win_length}, n_fft=${this.config.n_fft}.`, + ); + } + // Prefer given `mel_filters` from preprocessor_config.json, or calculate them if they don't exist. 
this.config.mel_filters ??= mel_filter_bank( Math.floor(1 + this.config.n_fft / 2), // num_frequency_bins @@ -119,7 +134,7 @@ export class NemoConformerTDTFeatureExtractor extends FeatureExtractor { * delta_features?: Tensor; * delta_delta_features?: Tensor; * }>} A Promise resolving to an object containing extracted model inputs. - * When cache is enabled, tensor instances are shared with cached entries. + * When cache is enabled, tensor instances are shared and owned by the cache. * Do not mutate or dispose returned tensors unless cache is disabled/cleared. */ async _call(audio) { diff --git a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js index b29359799..02e3e3c42 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js @@ -175,6 +175,8 @@ function resolveTransducerConfig(config, sessions) { const frameShiftS = transducerConfig.frame_shift_s ?? 0.01; const blankTokenId = transducerConfig.blank_token_id ?? 0; const encoderOutputLayout = transducerConfig.encoder_output_layout; + const encoderInputLayout = transducerConfig.encoder_input_layout ?? 'BTF'; + const encoderFrameLayout = transducerConfig.encoder_frame_layout ?? 'BD1'; const decoderTokenDType = transducerConfig.decoder_token_dtype ?? 'int32'; const decoderTokenLengthDType = transducerConfig.decoder_token_length_dtype ?? 
'int32'; @@ -195,6 +197,12 @@ function resolveTransducerConfig(config, sessions) { if (encoderOutputLayout !== 'BDT' && encoderOutputLayout !== 'BTD') { throw new Error('Invalid `transformers.js_config.transducer.encoder_output_layout`: expected "BDT" or "BTD".'); } + if (encoderInputLayout !== 'BTF' && encoderInputLayout !== 'BFT') { + throw new Error('Invalid `transformers.js_config.transducer.encoder_input_layout`: expected "BTF" or "BFT".'); + } + if (encoderFrameLayout !== 'BD1' && encoderFrameLayout !== 'B1D') { + throw new Error('Invalid `transformers.js_config.transducer.encoder_frame_layout`: expected "BD1" or "B1D".'); + } if (!['int32', 'int64'].includes(decoderTokenDType)) { throw new Error( 'Invalid `transformers.js_config.transducer.decoder_token_dtype`: expected "int32" or "int64".', @@ -213,9 +221,9 @@ function resolveTransducerConfig(config, sessions) { frame_shift_s: frameShiftS, vocab_size: transducerConfig.vocab_size ?? config.vocab_size ?? null, duration_start_index: transducerConfig.duration_start_index ?? null, - encoder_input_layout: transducerConfig.encoder_input_layout ?? 'BTF', + encoder_input_layout: encoderInputLayout, encoder_output_layout: encoderOutputLayout, - encoder_frame_layout: transducerConfig.encoder_frame_layout ?? 
'BD1', + encoder_frame_layout: encoderFrameLayout, decoder_token_dtype: decoderTokenDType, decoder_token_length_dtype: decoderTokenLengthDType, decoder: { @@ -316,10 +324,10 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { _disposeDecoderState(state, keepState = null) { if (!state) return; - if (state.state1 && state.state1 !== keepState?.state1) { + if (state.state1 instanceof Tensor && state.state1 !== keepState?.state1) { state.state1.dispose(); } - if (state.state2 && state.state2 !== keepState?.state2) { + if (state.state2 instanceof Tensor && state.state2 !== keepState?.state2) { state.state2.dispose(); } } @@ -470,7 +478,7 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { } _resolveVocabSize(tokenizer) { - if (Number.isInteger(this.transducer.vocab_size) && this.transducer.vocab_size > 0) { + if (Number.isInteger(this.transducer.vocab_size)) { return this.transducer.vocab_size; } @@ -675,6 +683,19 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { `Nemo Conformer TDT decoder output "${io.decoder_output}" was not returned by the session.`, ); } + if (!(outputState1 instanceof Tensor) || !(outputState2 instanceof Tensor)) { + logits.dispose(); + this._disposeDecoderState( + { + state1: outputState1, + state2: outputState2, + }, + decoderState, + ); + throw new Error( + `Nemo Conformer TDT decoder state outputs "${io.decoder_output_state_1}" and "${io.decoder_output_state_2}" were not returned by the session.`, + ); + } const logitsData = logits.data; if (logitsData.length < vocabSize) { logits.dispose(); @@ -721,8 +742,8 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { } const newState = { - state1: outputState1 ?? decoderState.state1, - state2: outputState2 ?? 
decoderState.state2, + state1: outputState1, + state2: outputState2, }; if (tokenId !== blankId) { diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js index 4ea17da0e..6dded2d5d 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js @@ -26,7 +26,7 @@ export function createAudioCacheKey(audio, sampling_rate = 16000) { /** * Lightweight LRU cache for extracted features. - * Stores values as-is and tracks approximate memory usage. + * Stores values as-is, owns cached tensor lifetimes, and tracks approximate memory usage. */ export class FeatureLRUCache { /** @@ -65,6 +65,7 @@ export class FeatureLRUCache { set(key, value) { const existing = this.cache.get(key); if (existing) { + disposeCachedValue(existing.value); this.current_size_bytes -= existing.size_bytes; this.cache.delete(key); } @@ -76,6 +77,9 @@ export class FeatureLRUCache { } clear() { + for (const { value } of this.cache.values()) { + disposeCachedValue(value); + } this.cache.clear(); this.current_size_bytes = 0; } @@ -96,25 +100,72 @@ export class FeatureLRUCache { if (oldest_key === undefined) break; const oldest = this.cache.get(oldest_key); this.cache.delete(oldest_key); + disposeCachedValue(oldest?.value); this.current_size_bytes -= oldest?.size_bytes ?? 0; } } } +function tensorByteSize(tensor) { + let byteLength = null; + try { + byteLength = /** @type {any} */ (tensor.data)?.byteLength ?? null; + } catch { + byteLength = null; + } + if (typeof byteLength === 'number' && byteLength >= 0) { + return byteLength; + } + + const bytesPerElement = { + bool: 1, + int8: 1, + uint8: 1, + int16: 2, + uint16: 2, + int32: 4, + uint32: 4, + int64: 8, + uint64: 8, + float16: 2, + float32: 4, + float64: 8, + }; + return tensor.size * (bytesPerElement[tensor.type] ?? 
4); +} + +function collectCachedTensors(value, out = new Set()) { + if (value instanceof Tensor) { + out.add(value); + return out; + } + if (value?.input_features instanceof Tensor) out.add(value.input_features); + if (value?.attention_mask instanceof Tensor) out.add(value.attention_mask); + if (value?.delta_features instanceof Tensor) out.add(value.delta_features); + if (value?.delta_delta_features instanceof Tensor) out.add(value.delta_delta_features); + return out; +} + +function disposeCachedValue(value) { + for (const tensor of collectCachedTensors(value)) { + tensor.dispose(); + } +} + function estimateSizeBytes(value) { if (value instanceof Tensor) { - return /** @type {any} */ (value.data)?.byteLength ?? 0; + return tensorByteSize(value); } if (value?.input_features instanceof Tensor) { - let bytes = /** @type {any} */ (value.input_features.data)?.byteLength ?? 0; + let bytes = tensorByteSize(value.input_features); if (value.attention_mask instanceof Tensor) { - bytes += /** @type {any} */ (value.attention_mask.data)?.byteLength ?? 0; + bytes += tensorByteSize(value.attention_mask); } if (value.delta_features instanceof Tensor) { - bytes += /** @type {any} */ (value.delta_features.data)?.byteLength ?? 0; + bytes += tensorByteSize(value.delta_features); } if (value.delta_delta_features instanceof Tensor) { - bytes += /** @type {any} */ (value.delta_delta_features.data)?.byteLength ?? 0; + bytes += tensorByteSize(value.delta_delta_features); } return bytes; } diff --git a/packages/transformers/src/pipelines/automatic-speech-recognition.js b/packages/transformers/src/pipelines/automatic-speech-recognition.js index 92f16c7ac..e4b0f8f90 100644 --- a/packages/transformers/src/pipelines/automatic-speech-recognition.js +++ b/packages/transformers/src/pipelines/automatic-speech-recognition.js @@ -325,8 +325,10 @@ export class AutomaticSpeechRecognitionPipeline * * Delegates to model.transcribe() and returns its output directly. 
* Use `return_timestamps: true` on the pipeline call to get utterance-level data. - * For words/tokens/metrics/debug, call model.transcribe() directly with the - * extended options (return_words, return_tokens, return_metrics, etc.). + * This pipeline always requests metrics, and enables word details when + * timestamps are requested. + * For token-level and debug controls, call `model.transcribe()` directly with + * extended options. */ async _call_nemo_conformer_tdt(audio, kwargs) { if (typeof (/** @type {any} */ (this.model).transcribe) !== 'function') { diff --git a/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js b/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js index ef852a3b5..74f60d8d0 100644 --- a/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js +++ b/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js @@ -68,6 +68,35 @@ export default () => { MAX_TEST_EXECUTION_TIME, ); + it( + "supports non-concatenated delta and delta-delta features", + async () => { + const extractor = new NemoConformerTDTFeatureExtractor({ + ...base, + feature_size: 80, + delta_order: 2, + delta_window: 2, + delta_concatenate: false, + }); + const { input_features, delta_features, delta_delta_features, attention_mask } = await extractor(audio); + try { + expect(input_features.dims[0]).toBe(1); + expect(input_features.dims[2]).toBe(80); + expect(delta_features).toBeDefined(); + expect(delta_delta_features).toBeDefined(); + expect(delta_features.dims).toEqual(input_features.dims); + expect(delta_delta_features.dims).toEqual(input_features.dims); + expect(attention_mask.dims).toEqual([1, input_features.dims[1]]); + } finally { + input_features.dispose(); + delta_features?.dispose(); + delta_delta_features?.dispose(); + attention_mask.dispose(); + } + }, + MAX_TEST_EXECUTION_TIME, + ); + 
it( "disposes replaced base features when concatenated delta output is used", async () => { @@ -86,11 +115,13 @@ export default () => { return originalDispose.call(this); }; + let input_features; try { - const { input_features } = await extractor(audio); + ({ input_features } = await extractor(audio)); expect(input_features.dims[2]).toBe(80 * 2); } finally { Tensor.prototype.dispose = originalDispose; + input_features?.dispose(); } // One dispose from computeTemporalDeltas intermediate tensor, one from replacing base features tensor. @@ -125,6 +156,37 @@ export default () => { MAX_TEST_EXECUTION_TIME, ); + it( + "uses feature cache when enabled for non-concatenated delta outputs", + async () => { + const extractor = new NemoConformerTDTFeatureExtractor({ + ...base, + feature_size: 80, + delta_order: 2, + delta_window: 2, + delta_concatenate: false, + use_feature_cache: true, + feature_cache_max_entries: 8, + feature_cache_max_size_mb: 8, + }); + try { + const first = await extractor(audio); + const second = await extractor(audio); + + expect(first).not.toBe(second); + expect(first.input_features).toBe(second.input_features); + expect(first.attention_mask).toBe(second.attention_mask); + expect(first.delta_features).toBe(second.delta_features); + expect(first.delta_delta_features).toBe(second.delta_delta_features); + expect(extractor.get_cache_stats().entries).toBe(1); + } finally { + extractor.clear_cache(); + } + expect(extractor.get_cache_stats().entries).toBe(0); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( "validates preemphasis range", async () => { @@ -151,5 +213,15 @@ export default () => { }), ).toThrow("delta_window"); }); + + it("validates n_fft and win_length at construction time", () => { + expect(() => new NemoConformerTDTFeatureExtractor({ ...base, feature_size: 80, n_fft: 0 })).toThrow("n_fft"); + expect(() => new NemoConformerTDTFeatureExtractor({ ...base, feature_size: 80, win_length: 0 })).toThrow( + "win_length", + ); + expect(() => new 
NemoConformerTDTFeatureExtractor({ ...base, feature_size: 80, win_length: 1024 })).toThrow( + "win_length", + ); + }); }); }; diff --git a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js index 652038463..20f64da77 100644 --- a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js +++ b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js @@ -80,6 +80,96 @@ export default () => { expect(MODEL_TYPE_MAPPING.get("nemo-conformer-tdt")).toBe(MODEL_TYPES.NemoConformerTDT); }); + it( + "throws on invalid runtime config: vocab_size must be > 0", + async () => { + const invalidConfig = { + ...BASE_CONFIG, + "transformers.js_config": { + ...BASE_CONFIG["transformers.js_config"], + transducer: { + ...BASE_CONFIG["transformers.js_config"].transducer, + vocab_size: 0, + }, + }, + }; + const model = new MockNemoConformerForTDT(invalidConfig, BASE_SESSIONS, []); + const inputs = { + input_features: new Tensor("float32", new Float32Array([0, 0]), [1, 1, 2]), + }; + + await expect( + model.transcribe(inputs, { + tokenizer: { + decode: () => "", + get_vocab: () => new Map([["a", 0]]), + }, + }), + ).rejects.toThrow("vocab_size"); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "throws on invalid runtime config: blank_token_id must be < vocab_size", + async () => { + const invalidConfig = { + ...BASE_CONFIG, + "transformers.js_config": { + ...BASE_CONFIG["transformers.js_config"], + transducer: { + ...BASE_CONFIG["transformers.js_config"].transducer, + blank_token_id: 3, + }, + }, + }; + const model = new MockNemoConformerForTDT(invalidConfig, BASE_SESSIONS, []); + const inputs = { + input_features: new Tensor("float32", new Float32Array([0, 0]), [1, 1, 2]), + }; + + await expect( + model.transcribe(inputs, { + tokenizer: { + decode: () => "", + get_vocab: () => new Map([["a", 0], 
["b", 1], ["c", 2]]), + }, + }), + ).rejects.toThrow("blank_token_id"); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "throws on invalid runtime config: duration_start_index must be >= vocab_size", + async () => { + const invalidConfig = { + ...BASE_CONFIG, + "transformers.js_config": { + ...BASE_CONFIG["transformers.js_config"], + transducer: { + ...BASE_CONFIG["transformers.js_config"].transducer, + duration_start_index: 2, + }, + }, + }; + const model = new MockNemoConformerForTDT(invalidConfig, BASE_SESSIONS, []); + const inputs = { + input_features: new Tensor("float32", new Float32Array([0, 0]), [1, 1, 2]), + }; + + await expect( + model.transcribe(inputs, { + tokenizer: { + decode: () => "", + get_vocab: () => new Map([["a", 0], ["b", 1], ["c", 2]]), + }, + }), + ).rejects.toThrow("duration_start_index"); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( "greedily decodes scripted token and duration logits", async () => { @@ -247,6 +337,34 @@ export default () => { expect(() => new NemoConformerForTDT(invalidConfig, BASE_SESSIONS, {})).toThrow("encoder_output_layout"); }); + it("rejects invalid encoder_input_layout at construction time", () => { + const invalidConfig = { + ...BASE_CONFIG, + "transformers.js_config": { + ...BASE_CONFIG["transformers.js_config"], + transducer: { + ...BASE_CONFIG["transformers.js_config"].transducer, + encoder_input_layout: "BAD", + }, + }, + }; + expect(() => new NemoConformerForTDT(invalidConfig, BASE_SESSIONS, {})).toThrow("encoder_input_layout"); + }); + + it("rejects invalid encoder_frame_layout at construction time", () => { + const invalidConfig = { + ...BASE_CONFIG, + "transformers.js_config": { + ...BASE_CONFIG["transformers.js_config"], + transducer: { + ...BASE_CONFIG["transformers.js_config"].transducer, + encoder_frame_layout: "BAD", + }, + }, + }; + expect(() => new NemoConformerForTDT(invalidConfig, BASE_SESSIONS, {})).toThrow("encoder_frame_layout"); + }); + it( "fails fast when named encoder output is missing at 
runtime", async () => { @@ -328,6 +446,35 @@ export default () => { MAX_TEST_EXECUTION_TIME, ); + it( + "fails fast when named decoder state outputs are missing at runtime", + async () => { + class MissingDecoderStateOutputsModel extends NemoConformerForTDT { + async _runEncoder() { + return { + outputs: new Tensor("float32", new Float32Array([0.1, 0.2]), [1, 2, 1]), + }; + } + + async _runDecoder() { + return { + outputs: new Tensor("float32", new Float32Array([9.0, 0.0, 0.0, 8.0]), [1, 1, 4]), + }; + } + } + + const model = new MissingDecoderStateOutputsModel(BASE_CONFIG, BASE_SESSIONS, {}); + const inputs = { + input_features: new Tensor("float32", new Float32Array([0, 0, 0, 0, 0, 0]), [1, 3, 2]), + }; + + await expect(model.transcribe(inputs, { tokenizer: { decode: () => "" } })).rejects.toThrow( + 'decoder state outputs "output_states_1" and "output_states_2" were not returned', + ); + }, + MAX_TEST_EXECUTION_TIME, + ); + it("rejects duplicate decoder output aliases in transducer io config", () => { const invalidConfig = { ...BASE_CONFIG, @@ -578,6 +725,56 @@ export default () => { MAX_TEST_EXECUTION_TIME, ); + it("disposes replaced cache entries", () => { + const cache = new FeatureLRUCache({ max_entries: 4, max_size_mb: 4 }); + const originalDispose = Tensor.prototype.dispose; + let disposeCalls = 0; + Tensor.prototype.dispose = function () { + disposeCalls += 1; + return originalDispose.call(this); + }; + + try { + cache.set("x", new Tensor("float32", new Float32Array([1, 2, 3]), [1, 3])); + cache.set("x", new Tensor("float32", new Float32Array([4, 5, 6]), [1, 3])); + expect(disposeCalls).toBe(1); + } finally { + Tensor.prototype.dispose = originalDispose; + cache.clear(); + } + }); + + it("disposes tensors on eviction and clear without double-disposing shared refs", () => { + const cache = new FeatureLRUCache({ max_entries: 1, max_size_mb: 4 }); + const originalDispose = Tensor.prototype.dispose; + let disposeCalls = 0; + Tensor.prototype.dispose = 
function () { + disposeCalls += 1; + return originalDispose.call(this); + }; + + try { + const sharedA = new Tensor("float32", new Float32Array([1, 2, 3]), [1, 3]); + cache.set("a", { + input_features: sharedA, + attention_mask: sharedA, + }); + const sharedB = new Tensor("float32", new Float32Array([4, 5, 6]), [1, 3]); + cache.set("b", { + input_features: sharedB, + attention_mask: sharedB, + }); + // Eviction of "a" should dispose sharedA once, despite duplicate field references. + expect(disposeCalls).toBe(1); + + cache.clear(); + // Clear should dispose sharedB once. + expect(disposeCalls).toBe(2); + } finally { + Tensor.prototype.dispose = originalDispose; + } + }); + it("rejects invalid cache limits", () => { expect(() => new FeatureLRUCache({ max_entries: -1 })).toThrow("max_entries"); expect(() => new FeatureLRUCache({ max_entries: 1.25 })).toThrow("max_entries"); From 03fb8bd223c62e25f428f2e5d74f17d64bd9954e Mon Sep 17 00:00:00 2001 From: ysdede Date: Thu, 5 Mar 2026 23:26:02 +0300 Subject: [PATCH 24/33] style(nemo-conformer-tdt): simplify duration frame expression Apply Gemini review nit in Nemo decode loop by replacing a redundant duration expression with Math.max(1, step).\n\nValidation:\n- pnpm test -- tests/models.test.js --filter nemo_conformer_tdt --- .../models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js index 02e3e3c42..92aedc2ac 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js @@ -753,7 +753,7 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { tokenIds.push(tokenId); // TDT duration convention: step=0 means "stay on current frame" (duration index 0 = 
no advance). // We still associate the token with this frame, so durationFrames is at least 1. - const durationFrames = Math.max(1, step > 0 ? step : 1); + const durationFrames = Math.max(1, step); const endFrame = Math.min(frameCount, frameIndex + durationFrames); tokenTimestamps.push([ roundTs(frameIndex * frameTime + timeOffset), From 426061e5109b1953987c2a58873c6739b67cdbba Mon Sep 17 00:00:00 2001 From: ysdede Date: Thu, 5 Mar 2026 23:42:42 +0300 Subject: [PATCH 25/33] fix(nemo-tdt): address PR10 follow-up review comments Checklist (bot comment IDs): - [x] 2892132356: guard tokenizer.get_vocab() return type before Object.keys in _resolveVocabSize. - [x] 2892132367: treat zero cache limits as explicit no-cache mode; do not store/dispose just-produced values. - [x] 2892132372: dispose processor tensors in Nemo ASR pipeline when cache does not own lifetimes. Added regression tests for vocab resolution fallback, zero-limit cache semantics, and Nemo pipeline tensor ownership behavior. Validation: - pnpm test -- tests/models.test.js --filter nemo_conformer_tdt - pnpm test -- tests/pipelines.test.js --filter automatic_speech_recognition --- .../modeling_nemo_conformer_tdt.js | 12 +- .../nemo_conformer_tdt/transducer_cache.js | 8 ++ .../pipelines/automatic-speech-recognition.js | 22 ++- .../test_modeling_nemo_conformer_tdt.js | 62 +++++++++ ..._pipelines_automatic_speech_recognition.js | 126 +++++++++++++++++- 5 files changed, 224 insertions(+), 6 deletions(-) diff --git a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js index 92aedc2ac..cfe0cfc6b 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js @@ -484,9 +484,15 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { if (tokenizer?.get_vocab) { 
const vocab = tokenizer.get_vocab(); - const size = vocab instanceof Map ? vocab.size : Object.keys(vocab).length; - if (size > 0) { - return size; + if (vocab instanceof Map) { + if (vocab.size > 0) { + return vocab.size; + } + } else if (vocab && typeof vocab === 'object') { + const size = Object.keys(vocab).length; + if (size > 0) { + return size; + } } } diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js index 6dded2d5d..1b82f71f1 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js @@ -63,6 +63,14 @@ export class FeatureLRUCache { * @returns {void} */ set(key, value) { + // Explicit no-cache mode: keep caller ownership of current values. + if (this.max_entries === 0 || this.max_size_mb === 0) { + if (this.cache.size > 0) { + this.clear(); + } + return; + } + const existing = this.cache.get(key); if (existing) { disposeCachedValue(existing.value); diff --git a/packages/transformers/src/pipelines/automatic-speech-recognition.js b/packages/transformers/src/pipelines/automatic-speech-recognition.js index e4b0f8f90..4717ca279 100644 --- a/packages/transformers/src/pipelines/automatic-speech-recognition.js +++ b/packages/transformers/src/pipelines/automatic-speech-recognition.js @@ -363,11 +363,29 @@ export class AutomaticSpeechRecognitionPipeline this._validateNemoAudio(preparedAudios[i], i); } + const featureCache = this.processor.feature_extractor?.feature_cache; + const cacheOwnsTensors = !!( + featureCache && + featureCache.max_entries > 0 && + featureCache.max_size_mb > 0 + ); const toReturn = []; for (const aud of preparedAudios) { const inputs = await this.processor(aud); - const output = await /** @type {any} */ (this.model).transcribe(inputs, decodeOptions); - toReturn.push(output); + try { + const output = await /** @type {any} */ 
(this.model).transcribe(inputs, decodeOptions); + toReturn.push(output); + } finally { + if (!cacheOwnsTensors) { + const seen = new Set(); + for (const value of Object.values(inputs ?? {})) { + if (value instanceof Tensor && !seen.has(value)) { + value.dispose(); + seen.add(value); + } + } + } + } } return single ? toReturn[0] : toReturn; diff --git a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js index 20f64da77..63d88cb07 100644 --- a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js +++ b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js @@ -170,6 +170,36 @@ export default () => { MAX_TEST_EXECUTION_TIME, ); + it( + "throws explicit vocab resolution error when tokenizer.get_vocab returns a non-object", + async () => { + const configWithoutVocab = { + ...BASE_CONFIG, + "transformers.js_config": { + ...BASE_CONFIG["transformers.js_config"], + transducer: { + ...BASE_CONFIG["transformers.js_config"].transducer, + vocab_size: undefined, + }, + }, + }; + const model = new MockNemoConformerForTDT(configWithoutVocab, BASE_SESSIONS, []); + const inputs = { + input_features: new Tensor("float32", new Float32Array([0, 0]), [1, 1, 2]), + }; + + await expect( + model.transcribe(inputs, { + tokenizer: { + decode: () => "", + get_vocab: () => null, + }, + }), + ).rejects.toThrow("Unable to resolve vocabulary size"); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( "greedily decodes scripted token and duration logits", async () => { @@ -775,6 +805,38 @@ export default () => { } }); + it("treats zero cache limits as explicit no-cache mode without disposing inserted values", () => { + const byEntries = new FeatureLRUCache({ max_entries: 0, max_size_mb: 4 }); + const bySize = new FeatureLRUCache({ max_entries: 4, max_size_mb: 0 }); + const t1 = new Tensor("float32", new 
Float32Array([1, 2, 3]), [1, 3]); + const t2 = new Tensor("float32", new Float32Array([4, 5, 6]), [1, 3]); + + let t1Disposals = 0; + const t1Dispose = t1.dispose.bind(t1); + t1.dispose = () => { + t1Disposals += 1; + t1Dispose(); + }; + let t2Disposals = 0; + const t2Dispose = t2.dispose.bind(t2); + t2.dispose = () => { + t2Disposals += 1; + t2Dispose(); + }; + + byEntries.set("x", t1); + bySize.set("y", t2); + expect(byEntries.get("x")).toBeNull(); + expect(bySize.get("y")).toBeNull(); + expect(t1Disposals).toBe(0); + expect(t2Disposals).toBe(0); + + t1.dispose(); + t2.dispose(); + expect(t1Disposals).toBe(1); + expect(t2Disposals).toBe(1); + }); + it("rejects invalid cache limits", () => { expect(() => new FeatureLRUCache({ max_entries: -1 })).toThrow("max_entries"); expect(() => new FeatureLRUCache({ max_entries: 1.25 })).toThrow("max_entries"); diff --git a/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js b/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js index 06f8cec16..9f227a9b3 100644 --- a/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js +++ b/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js @@ -1,4 +1,4 @@ -import { pipeline, AutomaticSpeechRecognitionPipeline } from "../../src/transformers.js"; +import { pipeline, AutomaticSpeechRecognitionPipeline, Tensor } from "../../src/transformers.js"; import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; @@ -226,6 +226,130 @@ export default () => { "finite audio samples", ); }); + + it("disposes processor tensors after Nemo transcription when feature cache is disabled", async () => { + let disposeCalls = 0; + const model = { + config: { model_type: "nemo-conformer-tdt" }, + async transcribe() { + return { text: "ok" }; + }, + async dispose() {}, + }; + const processor = Object.assign(async () => { + const 
input_features = new Tensor("float32", new Float32Array([0, 0]), [1, 1, 2]); + const attention_mask = new Tensor("int64", BigInt64Array.from([1n]), [1, 1]); + const trackDispose = (tensor) => { + const originalDispose = tensor.dispose.bind(tensor); + tensor.dispose = () => { + disposeCalls += 1; + originalDispose(); + }; + }; + trackDispose(input_features); + trackDispose(attention_mask); + return { input_features, attention_mask }; + }, { + feature_extractor: { config: { sampling_rate: 16000 } }, + }); + const pipe = new AutomaticSpeechRecognitionPipeline({ + task: PIPELINE_ID, + model, + tokenizer: {}, + processor, + }); + + const output = await pipe(new Float32Array(16000), { return_timestamps: false }); + expect(output).toEqual({ text: "ok" }); + expect(disposeCalls).toBe(2); + }); + + it("keeps processor tensors alive when Nemo feature cache owns tensor lifetimes", async () => { + let disposeCalls = 0; + let lastInputs = null; + const model = { + config: { model_type: "nemo-conformer-tdt" }, + async transcribe() { + return { text: "ok" }; + }, + async dispose() {}, + }; + const processor = Object.assign(async () => { + const input_features = new Tensor("float32", new Float32Array([0, 0]), [1, 1, 2]); + const attention_mask = new Tensor("int64", BigInt64Array.from([1n]), [1, 1]); + const trackDispose = (tensor) => { + const originalDispose = tensor.dispose.bind(tensor); + tensor.dispose = () => { + disposeCalls += 1; + originalDispose(); + }; + }; + trackDispose(input_features); + trackDispose(attention_mask); + lastInputs = { input_features, attention_mask }; + return lastInputs; + }, { + feature_extractor: { + config: { sampling_rate: 16000 }, + feature_cache: { max_entries: 2, max_size_mb: 8 }, + }, + }); + const pipe = new AutomaticSpeechRecognitionPipeline({ + task: PIPELINE_ID, + model, + tokenizer: {}, + processor, + }); + + try { + const output = await pipe(new Float32Array(16000), { return_timestamps: false }); + expect(output).toEqual({ text: "ok" }); 
+ expect(disposeCalls).toBe(0); + } finally { + lastInputs?.input_features.dispose(); + lastInputs?.attention_mask.dispose(); + } + }); + + it("disposes processor tensors when Nemo feature cache limits disable caching", async () => { + let disposeCalls = 0; + const model = { + config: { model_type: "nemo-conformer-tdt" }, + async transcribe() { + return { text: "ok" }; + }, + async dispose() {}, + }; + const processor = Object.assign(async () => { + const input_features = new Tensor("float32", new Float32Array([0, 0]), [1, 1, 2]); + const attention_mask = new Tensor("int64", BigInt64Array.from([1n]), [1, 1]); + const trackDispose = (tensor) => { + const originalDispose = tensor.dispose.bind(tensor); + tensor.dispose = () => { + disposeCalls += 1; + originalDispose(); + }; + }; + trackDispose(input_features); + trackDispose(attention_mask); + return { input_features, attention_mask }; + }, { + feature_extractor: { + config: { sampling_rate: 16000 }, + feature_cache: { max_entries: 0, max_size_mb: 8 }, + }, + }); + const pipe = new AutomaticSpeechRecognitionPipeline({ + task: PIPELINE_ID, + model, + tokenizer: {}, + processor, + }); + + const output = await pipe(new Float32Array(16000), { return_timestamps: false }); + expect(output).toEqual({ text: "ok" }); + expect(disposeCalls).toBe(2); + }); }); }); }; From d7476a6d5c29716268bebb956e28063e626eac63 Mon Sep 17 00:00:00 2001 From: ysdede Date: Fri, 6 Mar 2026 00:06:11 +0300 Subject: [PATCH 26/33] fix(transformers): resolve Nemo TDT typegen regressions - widen confidenceFromLogits input type to Tensor data arrays - narrow feature_cache access with explicit typed cast in ASR pipeline --- .../models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js | 2 +- .../src/pipelines/automatic-speech-recognition.js | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js 
b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js index cfe0cfc6b..b79b57cae 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js @@ -57,7 +57,7 @@ function roundTs(value) { } /** - * @param {Float32Array|number[]} logits + * @param {import('../../utils/tensor.js').Tensor['data']} logits * @param {number} tokenId * @param {number} vocabSize * @returns {{ confidence: number, logProb: number }} diff --git a/packages/transformers/src/pipelines/automatic-speech-recognition.js b/packages/transformers/src/pipelines/automatic-speech-recognition.js index 4717ca279..b4c468a4c 100644 --- a/packages/transformers/src/pipelines/automatic-speech-recognition.js +++ b/packages/transformers/src/pipelines/automatic-speech-recognition.js @@ -363,7 +363,9 @@ export class AutomaticSpeechRecognitionPipeline this._validateNemoAudio(preparedAudios[i], i); } - const featureCache = this.processor.feature_extractor?.feature_cache; + const featureCache = /** @type {{ max_entries: number, max_size_mb: number }|null|undefined} */ ( + /** @type {any} */ (this.processor.feature_extractor)?.feature_cache + ); const cacheOwnsTensors = !!( featureCache && featureCache.max_entries > 0 && From 0989f7abf31a5e5e322c335c1d11e5e4cdd5365f Mon Sep 17 00:00:00 2001 From: ysdede Date: Fri, 6 Mar 2026 00:16:52 +0300 Subject: [PATCH 27/33] fix(nemo-tdt): address PR11 cache and vocab review feedback Checklist (bot comment IDs): - [x] 2892287484: handle array-returning tokenizer vocab in _resolveVocabSize. - [x] 2892322884: avoid disposing when re-setting the same object for an existing cache key. - [x] 2892322906: skip caching oversized values to prevent insert-then-dispose of caller-owned tensors. - [x] 2892322910: guard byteLength type in estimateSizeBytes. 
Added regression tests for array vocab sizing, same-object set behavior, oversized value skipping, and non-numeric byteLength handling. Validation: - pnpm test -- tests/models.test.js --filter nemo_conformer_tdt - pnpm test -- tests/pipelines.test.js --filter automatic_speech_recognition --- .../modeling_nemo_conformer_tdt.js | 4 ++ .../nemo_conformer_tdt/transducer_cache.js | 29 ++++++++- .../test_modeling_nemo_conformer_tdt.js | 64 +++++++++++++++++++ 3 files changed, 94 insertions(+), 3 deletions(-) diff --git a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js index b79b57cae..2ae145058 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js @@ -488,6 +488,10 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { if (vocab.size > 0) { return vocab.size; } + } else if (Array.isArray(vocab)) { + if (vocab.length > 0) { + return vocab.length; + } } else if (vocab && typeof vocab === 'object') { const size = Object.keys(vocab).length; if (size > 0) { diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js index 1b82f71f1..02fee3f28 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js @@ -71,14 +71,36 @@ export class FeatureLRUCache { return; } + const max_bytes = this.max_size_mb * 1024 * 1024; const existing = this.cache.get(key); + if (existing?.value === value) { + // Refresh recency for unchanged value without invalidating caller-owned references. 
+ this.cache.delete(key); + if (existing.size_bytes <= max_bytes) { + this.cache.set(key, existing); + } else { + this.current_size_bytes -= existing.size_bytes; + } + return; + } + + const size_bytes = estimateSizeBytes(value); + if (size_bytes > max_bytes) { + // Cannot fit in cache: keep caller ownership and skip caching. + if (existing) { + disposeCachedValue(existing.value); + this.current_size_bytes -= existing.size_bytes; + this.cache.delete(key); + } + return; + } + if (existing) { disposeCachedValue(existing.value); this.current_size_bytes -= existing.size_bytes; this.cache.delete(key); } - const size_bytes = estimateSizeBytes(value); this.cache.set(key, { value, size_bytes }); this.current_size_bytes += size_bytes; this._evict(); @@ -177,8 +199,9 @@ function estimateSizeBytes(value) { } return bytes; } - if (value?.byteLength) { - return value.byteLength; + const byteLength = value?.byteLength; + if (typeof byteLength === 'number' && Number.isFinite(byteLength) && byteLength >= 0) { + return byteLength; } return 0; } diff --git a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js index 63d88cb07..01e977bf4 100644 --- a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js +++ b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js @@ -200,6 +200,25 @@ export default () => { MAX_TEST_EXECUTION_TIME, ); + it("resolves vocab size from array tokenizers when config vocab_size is not set", () => { + const configWithoutVocab = { + ...BASE_CONFIG, + "transformers.js_config": { + ...BASE_CONFIG["transformers.js_config"], + transducer: { + ...BASE_CONFIG["transformers.js_config"].transducer, + vocab_size: undefined, + }, + }, + }; + const model = new MockNemoConformerForTDT(configWithoutVocab, BASE_SESSIONS, []); + expect( + model._resolveVocabSize({ + get_vocab: () 
=> ["", "hello", "world"], + }), + ).toBe(3); + }); + it( "greedily decodes scripted token and duration logits", async () => { @@ -774,6 +793,25 @@ export default () => { } }); + it("does not dispose when re-setting the same value object for an existing key", () => { + const cache = new FeatureLRUCache({ max_entries: 4, max_size_mb: 4 }); + const tensor = new Tensor("float32", new Float32Array([1, 2, 3]), [1, 3]); + let disposeCalls = 0; + const originalDispose = tensor.dispose.bind(tensor); + tensor.dispose = () => { + disposeCalls += 1; + originalDispose(); + }; + + cache.set("x", tensor); + cache.set("x", tensor); + expect(cache.get("x")).toBe(tensor); + expect(disposeCalls).toBe(0); + + cache.clear(); + expect(disposeCalls).toBe(1); + }); + it("disposes tensors on eviction and clear without double-disposing shared refs", () => { const cache = new FeatureLRUCache({ max_entries: 1, max_size_mb: 4 }); const originalDispose = Tensor.prototype.dispose; @@ -837,6 +875,32 @@ export default () => { expect(t2Disposals).toBe(1); }); + it("skips caching oversized values without disposing caller-owned tensors", () => { + const cache = new FeatureLRUCache({ max_entries: 4, max_size_mb: 0.000001 }); + const tensor = new Tensor("float32", new Float32Array([1, 2]), [1, 2]); + let disposeCalls = 0; + const originalDispose = tensor.dispose.bind(tensor); + tensor.dispose = () => { + disposeCalls += 1; + originalDispose(); + }; + + cache.set("big", tensor); + expect(cache.get("big")).toBeNull(); + expect(disposeCalls).toBe(0); + + tensor.dispose(); + expect(disposeCalls).toBe(1); + }); + + it("ignores non-numeric byteLength values in size estimation", () => { + const cache = new FeatureLRUCache({ max_entries: 4, max_size_mb: 4 }); + cache.set("x", { byteLength: "invalid" }); + expect(cache.stats().entries).toBe(1); + expect(cache.stats().size_mb).toBe(0); + cache.clear(); + }); + it("rejects invalid cache limits", () => { expect(() => new FeatureLRUCache({ max_entries: -1 
})).toThrow("max_entries"); expect(() => new FeatureLRUCache({ max_entries: 1.25 })).toThrow("max_entries"); From ee819a1c824d496434bf7dd3a0c15547e18fb624 Mon Sep 17 00:00:00 2001 From: ysdede Date: Fri, 6 Mar 2026 01:28:49 +0300 Subject: [PATCH 28/33] fix(nemo-tdt): add supports() for ASR model class selection --- .../models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js index 2ae145058..15a8c2a57 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js @@ -243,6 +243,10 @@ export class NemoConformerTDTPreTrainedModel extends PreTrainedModel { this.transducer = resolveTransducerConfig(config, sessions); } + static supports(model_type) { + return model_type === NEMO_CONFORMER_TDT_MODEL_TYPE; + } + /** * Load Nemo Conformer TDT sessions using v4 canonical ONNX filenames. 
* @type {typeof PreTrainedModel.from_pretrained} From b44f7f3ff1b6308cb57e084a10edc4b7dfc32bfd Mon Sep 17 00:00:00 2001 From: ysdede Date: Fri, 6 Mar 2026 01:44:32 +0300 Subject: [PATCH 29/33] fix(model-registry): include processor files for text-to-audio pipelines --- packages/transformers/package.json | 2 ++ .../src/utils/model_registry/get_pipeline_files.js | 6 +++++- packages/transformers/tests/utils/cache.test.js | 11 +++++++++++ 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/packages/transformers/package.json b/packages/transformers/package.json index c00268823..031efea78 100644 --- a/packages/transformers/package.json +++ b/packages/transformers/package.json @@ -28,6 +28,8 @@ "dev": "node scripts/dev.mjs", "build": "node scripts/build.mjs && pnpm typegen", "test": "node --experimental-vm-modules --expose-gc node_modules/jest/bin/jest.js --verbose --logHeapUsage", + "test:models": "node --experimental-vm-modules --expose-gc node_modules/jest/bin/jest.js --verbose --logHeapUsage tests/models.test.js --runInBand", + "test:nemo-tdt": "node --experimental-vm-modules --expose-gc node_modules/jest/bin/jest.js --verbose --logHeapUsage tests/models.test.js --runInBand --testNamePattern nemo_conformer_tdt", "readme": "python ./docs/scripts/build_readme.py", "docs-api": "node ./docs/scripts/generate.js", "docs-build": "doc-builder build transformers.js ./docs/source/ --not_python_module --build_dir ./docs/build/", diff --git a/packages/transformers/src/utils/model_registry/get_pipeline_files.js b/packages/transformers/src/utils/model_registry/get_pipeline_files.js index cff073b7d..bcac4cf56 100644 --- a/packages/transformers/src/utils/model_registry/get_pipeline_files.js +++ b/packages/transformers/src/utils/model_registry/get_pipeline_files.js @@ -32,9 +32,13 @@ export async function get_pipeline_files(task, modelId, options = {}) { // - 'text' tasks: always check tokenizer, skip processor (text models rarely have one) // - 'audio'/'image' tasks: skip 
tokenizer, always check processor // - 'multimodal' tasks: check both + // + // NOTE: + // `text-to-audio` may load `AutoModelForTextToSpectrogram` models (e.g., SpeechT5), + // which require processor files. Keep processor detection enabled for this task. const { type } = taskConfig; const include_tokenizer = type !== 'audio' && type !== 'image'; - const include_processor = type !== 'text'; + const include_processor = type !== 'text' || task === 'text-to-audio'; return get_files(modelId, { ...options, diff --git a/packages/transformers/tests/utils/cache.test.js b/packages/transformers/tests/utils/cache.test.js index 36f8880a7..1370d9896 100644 --- a/packages/transformers/tests/utils/cache.test.js +++ b/packages/transformers/tests/utils/cache.test.js @@ -6,6 +6,7 @@ import { MAX_TEST_EXECUTION_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; const LLAMA_MODEL_ID = "hf-internal-testing/tiny-random-LlamaForCausalLM"; const BERT_MODEL_ID = "hf-internal-testing/tiny-random-BertModel"; const VIT_MODEL_ID = "hf-internal-testing/tiny-random-vit"; +const SPEECHT5_MODEL_ID = "Xenova/speecht5_tts"; // Dedicated model IDs for cache clearing tests to avoid interference with other parallel tests. // These must NOT be used in any other test file. 
@@ -201,6 +202,16 @@ describe("Cache", () => { }, MAX_TEST_EXECUTION_TIME, ); + + it( + "should include processor files for text-to-audio when model provides them", + async () => { + const files = await ModelRegistry.get_pipeline_files("text-to-audio", SPEECHT5_MODEL_ID, DEFAULT_MODEL_OPTIONS); + expect(files).toContain("preprocessor_config.json"); + expect(files).toContain("tokenizer.json"); + }, + MAX_TEST_EXECUTION_TIME, + ); }); describe("is_cached", () => { From bfa97e6ec750bb193d2306898bd31fd05b91392b Mon Sep 17 00:00:00 2001 From: ysdede Date: Fri, 6 Mar 2026 01:54:44 +0300 Subject: [PATCH 30/33] Revert "fix(model-registry): include processor files for text-to-audio pipelines" This reverts commit b44f7f3ff1b6308cb57e084a10edc4b7dfc32bfd. --- packages/transformers/package.json | 2 -- .../src/utils/model_registry/get_pipeline_files.js | 6 +----- packages/transformers/tests/utils/cache.test.js | 11 ----------- 3 files changed, 1 insertion(+), 18 deletions(-) diff --git a/packages/transformers/package.json b/packages/transformers/package.json index 031efea78..c00268823 100644 --- a/packages/transformers/package.json +++ b/packages/transformers/package.json @@ -28,8 +28,6 @@ "dev": "node scripts/dev.mjs", "build": "node scripts/build.mjs && pnpm typegen", "test": "node --experimental-vm-modules --expose-gc node_modules/jest/bin/jest.js --verbose --logHeapUsage", - "test:models": "node --experimental-vm-modules --expose-gc node_modules/jest/bin/jest.js --verbose --logHeapUsage tests/models.test.js --runInBand", - "test:nemo-tdt": "node --experimental-vm-modules --expose-gc node_modules/jest/bin/jest.js --verbose --logHeapUsage tests/models.test.js --runInBand --testNamePattern nemo_conformer_tdt", "readme": "python ./docs/scripts/build_readme.py", "docs-api": "node ./docs/scripts/generate.js", "docs-build": "doc-builder build transformers.js ./docs/source/ --not_python_module --build_dir ./docs/build/", diff --git 
a/packages/transformers/src/utils/model_registry/get_pipeline_files.js b/packages/transformers/src/utils/model_registry/get_pipeline_files.js index bcac4cf56..cff073b7d 100644 --- a/packages/transformers/src/utils/model_registry/get_pipeline_files.js +++ b/packages/transformers/src/utils/model_registry/get_pipeline_files.js @@ -32,13 +32,9 @@ export async function get_pipeline_files(task, modelId, options = {}) { // - 'text' tasks: always check tokenizer, skip processor (text models rarely have one) // - 'audio'/'image' tasks: skip tokenizer, always check processor // - 'multimodal' tasks: check both - // - // NOTE: - // `text-to-audio` may load `AutoModelForTextToSpectrogram` models (e.g., SpeechT5), - // which require processor files. Keep processor detection enabled for this task. const { type } = taskConfig; const include_tokenizer = type !== 'audio' && type !== 'image'; - const include_processor = type !== 'text' || task === 'text-to-audio'; + const include_processor = type !== 'text'; return get_files(modelId, { ...options, diff --git a/packages/transformers/tests/utils/cache.test.js b/packages/transformers/tests/utils/cache.test.js index 1370d9896..36f8880a7 100644 --- a/packages/transformers/tests/utils/cache.test.js +++ b/packages/transformers/tests/utils/cache.test.js @@ -6,7 +6,6 @@ import { MAX_TEST_EXECUTION_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; const LLAMA_MODEL_ID = "hf-internal-testing/tiny-random-LlamaForCausalLM"; const BERT_MODEL_ID = "hf-internal-testing/tiny-random-BertModel"; const VIT_MODEL_ID = "hf-internal-testing/tiny-random-vit"; -const SPEECHT5_MODEL_ID = "Xenova/speecht5_tts"; // Dedicated model IDs for cache clearing tests to avoid interference with other parallel tests. // These must NOT be used in any other test file. 
@@ -202,16 +201,6 @@ describe("Cache", () => { }, MAX_TEST_EXECUTION_TIME, ); - - it( - "should include processor files for text-to-audio when model provides them", - async () => { - const files = await ModelRegistry.get_pipeline_files("text-to-audio", SPEECHT5_MODEL_ID, DEFAULT_MODEL_OPTIONS); - expect(files).toContain("preprocessor_config.json"); - expect(files).toContain("tokenizer.json"); - }, - MAX_TEST_EXECUTION_TIME, - ); }); describe("is_cached", () => { From a85dff25ba04e7b223dc89367e9e3da3a6424774 Mon Sep 17 00:00:00 2001 From: ysdede Date: Fri, 6 Mar 2026 02:43:07 +0300 Subject: [PATCH 31/33] fix(nemo-tdt): address PR #12 reviewer feedback --- .../feature_extraction_nemo_conformer_tdt.js | 2 +- .../modeling_nemo_conformer_tdt.js | 27 +++++++++++++------ 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js index 1f569365b..3ecea708a 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js @@ -149,7 +149,7 @@ export class NemoConformerTDTFeatureExtractor extends FeatureExtractor { const extracted = await this._extract(audio); this.feature_cache.set(key, extracted); - return extracted; + return { ...extracted }; } return await this._extract(audio); diff --git a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js index 15a8c2a57..64a6372eb 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js @@ -460,6 +460,11 @@ export class NemoConformerForTDT extends 
NemoConformerTDTPreTrainedModel { } else { length = inputFeatures.dims[1]; } + if (!Number.isInteger(length) || length < 0) { + throw new Error( + `Nemo Conformer TDT expected a non-negative integer encoder length, got: ${length}.`, + ); + } const lengthTensor = new Tensor('int64', BigInt64Array.from([BigInt(length)]), [1]); disposables.push(lengthTensor); feeds[name] = lengthTensor; @@ -713,10 +718,13 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { const logitsData = logits.data; if (logitsData.length < vocabSize) { logits.dispose(); - this._disposeDecoderState({ - state1: outputState1, - state2: outputState2, - }); + this._disposeDecoderState( + { + state1: outputState1, + state2: outputState2, + }, + decoderState, + ); throw new Error( `Nemo Conformer TDT decoder output is too small (${logitsData.length}) for vocab_size=${vocabSize}.`, ); @@ -726,10 +734,13 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { const hasDurationLogits = logitsData.length > durationStart; if (this.transducer.duration_start_index != null && !hasDurationLogits) { logits.dispose(); - this._disposeDecoderState({ - state1: outputState1, - state2: outputState2, - }); + this._disposeDecoderState( + { + state1: outputState1, + state2: outputState2, + }, + decoderState, + ); throw new Error( `Nemo Conformer TDT decoder output is missing duration logits: expected values beyond index ${durationStart - 1}, got length=${logitsData.length}.`, ); From 8dfccddc4dd30aa6fb66108f35382c74a25dbf65 Mon Sep 17 00:00:00 2001 From: ysdede Date: Sun, 8 Mar 2026 00:45:06 +0300 Subject: [PATCH 32/33] feat(nemo-tdt): align asr pipeline outputs and long-audio handling Align the Nemo ASR pipeline with the shared task contract by returning text-only results by default and chunk-based timestamps for segment and word modes. Add automatic long-audio windowing, decoded-text-driven word reconstruction, and model-local helpers for window merge and chunk assembly. 
Also add regression coverage for numeric/punctuation word boundaries, windowed merge behavior, and auto-windowed long-form pipeline decoding. --- .../pipeline_nemo_conformer_tdt.js | 167 ++++++++++ .../transducer_segment_offsets.js | 87 ++++++ .../nemo_conformer_tdt/transducer_text.js | 155 +--------- .../transducer_window_merge.js | 179 +++++++++++ .../transducer_word_offsets.js | 216 +++++++++++++ .../utils_nemo_conformer_tdt.js | 4 + .../pipelines/automatic-speech-recognition.js | 96 +----- .../test_modeling_nemo_conformer_tdt.js | 66 ++++ ..._pipelines_automatic_speech_recognition.js | 292 ++++++++++++++++-- 9 files changed, 1007 insertions(+), 255 deletions(-) create mode 100644 packages/transformers/src/models/nemo_conformer_tdt/pipeline_nemo_conformer_tdt.js create mode 100644 packages/transformers/src/models/nemo_conformer_tdt/transducer_segment_offsets.js create mode 100644 packages/transformers/src/models/nemo_conformer_tdt/transducer_window_merge.js create mode 100644 packages/transformers/src/models/nemo_conformer_tdt/transducer_word_offsets.js diff --git a/packages/transformers/src/models/nemo_conformer_tdt/pipeline_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/pipeline_nemo_conformer_tdt.js new file mode 100644 index 000000000..8762fc1d1 --- /dev/null +++ b/packages/transformers/src/models/nemo_conformer_tdt/pipeline_nemo_conformer_tdt.js @@ -0,0 +1,167 @@ +import { Tensor } from '../../utils/tensor.js'; +import { + buildWordChunks, + buildNemoSegmentChunks, + buildNemoWindowSpecs, + mergeNemoWindowResults, +} from './utils_nemo_conformer_tdt.js'; + +const NEMO_AUTO_WINDOW_THRESHOLD_S = 180; +const NEMO_AUTO_CHUNK_LENGTH_S = 90; +const NEMO_AUTO_STRIDE_LENGTH_S = 10; + +function validateNemoAudio(audio, index) { + if (!(audio instanceof Float32Array || audio instanceof Float64Array)) { + throw new TypeError( + `Nemo Conformer TDT pipeline expected audio at index ${index} to be Float32Array or Float64Array.`, + ); + } + if 
(audio.length === 0) { + throw new Error(`Nemo Conformer TDT pipeline expected non-empty audio at index ${index}.`); + } + for (let i = 0; i < audio.length; ++i) { + if (!Number.isFinite(audio[i])) { + throw new Error( + `Nemo Conformer TDT pipeline expected finite audio samples; found ${audio[i]} at index ${index}:${i}.`, + ); + } + } +} + +/** + * Run the ASR pipeline adapter for Nemo Conformer TDT models. + * Keeps the public contract task-shaped while delegating rich outputs to `model.transcribe()`. + * + * @param {{ + * model: any, + * processor: any, + * tokenizer: any, + * audio: Float32Array|Float64Array|Array, + * kwargs: Record, + * prepareAudios: (audio: any[], sampling_rate: number) => Promise<(Float32Array|Float64Array)[]>, + * }} options + */ +export async function runNemoConformerTDTPipeline({ + model, + processor, + tokenizer, + audio, + kwargs, + prepareAudios, +}) { + if (typeof model?.transcribe !== 'function') { + throw new Error('Nemo Conformer TDT model does not expose a `transcribe` method.'); + } + if (!processor) { + throw new Error('Nemo Conformer TDT pipeline requires a processor.'); + } + if (!tokenizer) { + throw new Error('Nemo Conformer TDT pipeline requires a tokenizer.'); + } + if (!processor.feature_extractor?.config?.sampling_rate) { + throw new Error( + 'Nemo Conformer TDT pipeline requires `processor.feature_extractor.config.sampling_rate` to prepare audio.', + ); + } + + const return_timestamps = kwargs.return_timestamps ?? false; + const wantWordTimestamps = return_timestamps === 'word'; + const wantTimestampChunks = return_timestamps === true || wantWordTimestamps; + const requested_chunk_length_s = kwargs.chunk_length_s ?? 0; + const requested_stride_length_s = kwargs.stride_length_s ?? null; + + const single = !Array.isArray(audio); + const batchedAudio = single ? 
[audio] : audio; + const sampling_rate = processor.feature_extractor.config.sampling_rate; + const preparedAudios = await prepareAudios(batchedAudio, sampling_rate); + for (let i = 0; i < preparedAudios.length; ++i) { + validateNemoAudio(preparedAudios[i], i); + } + + const featureCache = /** @type {{ max_entries: number, max_size_mb: number }|null|undefined} */ ( + /** @type {any} */ (processor.feature_extractor)?.feature_cache + ); + const cacheOwnsTensors = !!( + featureCache && + featureCache.max_entries > 0 && + featureCache.max_size_mb > 0 + ); + + const runNemoTranscribe = async (windowAudio, decodeOptions) => { + const inputs = await processor(windowAudio); + try { + return await model.transcribe(inputs, decodeOptions); + } finally { + if (!cacheOwnsTensors) { + const seen = new Set(); + for (const value of Object.values(inputs ?? {})) { + if (value instanceof Tensor && !seen.has(value)) { + value.dispose(); + seen.add(value); + } + } + } + } + }; + + const toReturn = []; + for (const aud of preparedAudios) { + const audio_duration_s = aud.length / sampling_rate; + const autoWindowing = requested_chunk_length_s <= 0 && audio_duration_s > NEMO_AUTO_WINDOW_THRESHOLD_S; + const chunk_length_s = + requested_chunk_length_s > 0 + ? requested_chunk_length_s + : autoWindowing + ? NEMO_AUTO_CHUNK_LENGTH_S + : 0; + const stride_length_s = + requested_chunk_length_s > 0 + ? requested_stride_length_s + : autoWindowing + ? 
NEMO_AUTO_STRIDE_LENGTH_S + : null; + + if (chunk_length_s > 0) { + const windows = buildNemoWindowSpecs(aud, sampling_rate, chunk_length_s, stride_length_s); + const windowResults = []; + for (const window of windows) { + const output = await runNemoTranscribe(window.audio, { + tokenizer, + return_timestamps: true, + return_words: true, + return_tokens: true, + return_metrics: false, + timeOffset: window.start_s, + }); + windowResults.push({ window, output }); + } + + const merged = mergeNemoWindowResults(tokenizer, windowResults); + const result = { text: merged.text || windowResults.map((x) => x.output.text ?? '').join(' ').trim() }; + if (wantWordTimestamps) { + result.chunks = buildWordChunks(merged.words); + } else if (wantTimestampChunks) { + result.chunks = buildNemoSegmentChunks(merged.words, merged.utterance_timestamp, result.text); + } + toReturn.push(result); + continue; + } + + const output = await runNemoTranscribe(aud, { + tokenizer, + return_timestamps: wantTimestampChunks, + return_words: wantTimestampChunks, + return_metrics: false, + }); + + const result = { text: output.text ?? '' }; + if (wantWordTimestamps) { + result.chunks = buildWordChunks(output.words ?? []); + } else if (wantTimestampChunks) { + result.chunks = buildNemoSegmentChunks(output.words ?? [], output.utterance_timestamp ?? null, result.text); + } + toReturn.push(result); + } + + return single ? 
toReturn[0] : toReturn; +} diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_segment_offsets.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_segment_offsets.js new file mode 100644 index 000000000..0cb3e01a8 --- /dev/null +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_segment_offsets.js @@ -0,0 +1,87 @@ +const NEMO_SEGMENT_BREAK_REGEX = /[.!?;:]["')\]]*$/; +const NEMO_MAX_WORD_GAP_S = 0.8; + +/** + * @param {Array<{ text: string, start_time: number, end_time: number }>} words + * @returns {string} + */ +export function joinTimedWords(words) { + let text = ''; + for (const word of words) { + const part = word.text ?? ''; + if (!part) continue; + if (!text) { + text = part; + } else if (/^[,.;:!?)}\]]+$/.test(part)) { + text += part; + } else { + text += ` ${part}`; + } + } + return text; +} + +/** + * @param {Array<{ text: string, start_time: number, end_time: number }>} words + * @returns {Array<{ text: string, timestamp: [number, number] }>} + */ +export function buildWordChunks(words) { + return words.map((word) => ({ + text: word.text, + timestamp: [word.start_time, word.end_time], + })); +} + +/** + * @param {Array<{ text: string, start_time: number, end_time: number }>} words + * @returns {string} + */ +export function buildSegmentText(words) { + return joinTimedWords(words); +} + +/** + * @param {Array<{ text: string, start_time: number, end_time: number }>} words + * @param {[number, number] | null} utterance_timestamp + * @param {string} text + * @returns {Array<{ text: string, timestamp: [number, number] }>} + */ +export function buildNemoSegmentChunks(words, utterance_timestamp = null, text = '') { + if (!Array.isArray(words) || words.length === 0) { + if (utterance_timestamp) { + return [{ text, timestamp: utterance_timestamp }]; + } + return []; + } + + /** @type {Array<{ text: string, timestamp: [number, number] }>} */ + const chunks = []; + /** @type {typeof words} */ + let current = 
[]; + for (const word of words) { + const prev = current.at(-1); + if (prev) { + const gap_s = Math.max(0, word.start_time - prev.end_time); + const shouldBreak = + NEMO_SEGMENT_BREAK_REGEX.test(prev.text) || + gap_s > NEMO_MAX_WORD_GAP_S; + if (shouldBreak) { + chunks.push({ + text: buildSegmentText(current), + timestamp: [current[0].start_time, current[current.length - 1].end_time], + }); + current = []; + } + } + current.push(word); + } + + if (current.length > 0) { + chunks.push({ + text: buildSegmentText(current), + timestamp: [current[0].start_time, current[current.length - 1].end_time], + }); + } + + return chunks; +} diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js index d12a27553..274a22962 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js @@ -1,81 +1,4 @@ -/** - * Cache tokenizer id->token maps for stable and fast boundary detection. - * @type {WeakMap>} - */ -const TOKEN_ID_TO_TEXT_CACHE = new WeakMap(); - -/** - * @param {any} tokenizer - * @returns {Map} - */ -function getIdToTokenMap(tokenizer) { - let cached = TOKEN_ID_TO_TEXT_CACHE.get(tokenizer); - if (cached) return cached; - - cached = new Map(); - if (tokenizer?.get_vocab) { - const vocab = tokenizer.get_vocab(); - // get_vocab() may return a Map or a plain Object depending on the tokenizer backend. - const entries = vocab instanceof Map ? vocab.entries() : Object.entries(vocab); - for (const [token, id] of entries) { - if (Number.isInteger(id)) { - cached.set(id, token); - } - } - } - TOKEN_ID_TO_TEXT_CACHE.set(tokenizer, cached); - return cached; -} - -/** - * Resolve per-token text and word boundary metadata in a tokenizer-agnostic way. - * Uses raw vocab token (if available) for boundary markers, and decoded token text for display. 
- * @param {any} tokenizer - * @param {number} id - * @returns {{ raw: string, clean: string, startsNewWord: boolean }} - */ -function resolveTokenPiece(tokenizer, id) { - const rawToken = getIdToTokenMap(tokenizer).get(id) ?? ''; - const decoded = tokenizer.decode([id], { - skip_special_tokens: true, - clean_up_tokenization_spaces: false, - }); - - // SentencePiece/BPE boundary markers used by common tokenizers. - const startsWithBoundaryMarker = /^(?:▁|Ġ)+/.test(rawToken); - const startsWithWhitespace = /^\s+/.test(decoded); - const startsNewWord = startsWithBoundaryMarker || startsWithWhitespace; - - // Human readable token text. - let clean = decoded.replace(/^\s+/, ''); - if (!clean) { - clean = rawToken.replace(/^(?:▁|Ġ|Ċ)+/, '').replace(/^ +/, ''); - } - - return { raw: rawToken || decoded, clean, startsNewWord }; -} - -/** - * @param {Array<{ text: string, start_time: number, end_time: number, confidence?: number }>} words - * @param {{ text: string, start: number, end: number, confs: number[] } | null} current - */ -function finalizeAndPushWord(words, current) { - if (!current) return; - - const text = current.text.trim(); - if (!text) return; - - /** @type {{ text: string, start_time: number, end_time: number, confidence?: number }} */ - const word = { - text, - start_time: current.start, - end_time: current.end, - }; - if (current.confs.length > 0) { - word.confidence = Math.round((current.confs.reduce((a, b) => a + b, 0) / current.confs.length) * 1e6) / 1e6; - } - words.push(word); -} +import { buildTransducerWordOffsets } from './transducer_word_offsets.js'; /** * Decode token ids into final transcription text. 
@@ -103,78 +26,6 @@ export function decodeTransducerText(tokenizer, token_ids) { * }} */ export function buildTransducerDetailedOutputs(tokenizer, token_ids, token_timestamps, token_confidences = null) { - if (!tokenizer || token_ids.length === 0 || token_timestamps.length === 0) { - return { words: [], tokens: [], word_confidences: null, word_avg: null }; - } - if (token_ids.length !== token_timestamps.length) { - throw new Error( - `buildTransducerDetailedOutputs expects equal lengths for token_ids (${token_ids.length}) and token_timestamps (${token_timestamps.length}).`, - ); - } - if (token_confidences && token_confidences.length !== token_ids.length) { - throw new Error( - `buildTransducerDetailedOutputs expects token_confidences length (${token_confidences.length}) to match token_ids length (${token_ids.length}).`, - ); - } - - /** @type {Array<{ id: number, token: string, raw_token: string, is_word_start: boolean, start_time: number, end_time: number, confidence?: number }>} */ - const tokens = []; - /** @type {Array<{ text: string, start_time: number, end_time: number, confidence?: number }>} */ - const words = []; - - /** @type {{ text: string, start: number, end: number, confs: number[] } | null} */ - let current = null; - - for (let i = 0; i < token_ids.length; ++i) { - const id = token_ids[i]; - const ts = token_timestamps[i]; - const piece = resolveTokenPiece(tokenizer, id); - const raw = piece.raw; - const startsNewWord = piece.startsNewWord; - const clean = piece.clean; - if (!clean) continue; - - const tok = { - id, - token: clean, - raw_token: raw, - is_word_start: startsNewWord, - start_time: ts[0], - end_time: ts[1], - }; - const conf = token_confidences?.[i]; - if (conf != null && Number.isFinite(conf)) { - tok.confidence = Math.round(conf * 1e6) / 1e6; - } - tokens.push(tok); - - if (!current || startsNewWord) { - finalizeAndPushWord(words, current); - current = { - text: clean, - start: ts[0], - end: ts[1], - confs: conf != null && 
Number.isFinite(conf) ? [conf] : [], - }; - } else { - current.text += clean; - current.end = ts[1]; - if (conf != null && Number.isFinite(conf)) { - current.confs.push(conf); - } - } - } - - finalizeAndPushWord(words, current); - - const word_confidences = words.some((x) => x.confidence != null) ? words.map((x) => x.confidence ?? null) : null; - let word_avg = null; - if (word_confidences) { - const validConfidences = word_confidences.filter((x) => x != null); - if (validConfidences.length > 0) { - word_avg = Math.round((validConfidences.reduce((a, b) => a + b, 0) / validConfidences.length) * 1e6) / 1e6; - } - } - - return { words, tokens, word_confidences, word_avg }; + const fullText = decodeTransducerText(tokenizer, token_ids); + return buildTransducerWordOffsets(tokenizer, token_ids, token_timestamps, token_confidences, fullText); } diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_window_merge.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_window_merge.js new file mode 100644 index 000000000..3eb1f2609 --- /dev/null +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_window_merge.js @@ -0,0 +1,179 @@ +import { decodeTransducerText } from './transducer_text.js'; +import { joinTimedWords } from './transducer_segment_offsets.js'; + +/** + * @param {Float32Array|Float64Array} audio + * @param {number} sampling_rate + * @param {number} chunk_length_s + * @param {number | null} stride_length_s + * @returns {Array<{audio: Float32Array|Float64Array, start_s: number, end_s: number, left_stride_s: number, right_stride_s: number}>} + */ +export function buildNemoWindowSpecs(audio, sampling_rate, chunk_length_s, stride_length_s) { + if (!(chunk_length_s > 0)) { + return [ + { + audio, + start_s: 0, + end_s: audio.length / sampling_rate, + left_stride_s: 0, + right_stride_s: 0, + }, + ]; + } + + if (stride_length_s === null) { + stride_length_s = chunk_length_s / 6; + } else if (!(stride_length_s >= 0)) { + 
throw Error('`stride_length_s` must be non-negative.'); + } + if (chunk_length_s <= 2 * stride_length_s) { + throw Error('`chunk_length_s` must be larger than `2 * stride_length_s` for Nemo windowed decoding.'); + } + + const window = Math.floor(sampling_rate * chunk_length_s); + const stride = Math.floor(sampling_rate * stride_length_s); + const jump = window - 2 * stride; + + /** @type {Array<{audio: Float32Array|Float64Array, start_s: number, end_s: number, left_stride_s: number, right_stride_s: number}>} */ + const windows = []; + let offset = 0; + while (true) { + const offset_end = offset + window; + const subarr = audio.subarray(offset, offset_end); + const is_first = offset === 0; + const is_last = offset_end >= audio.length; + windows.push({ + audio: subarr, + start_s: offset / sampling_rate, + end_s: (offset + subarr.length) / sampling_rate, + left_stride_s: is_first ? 0 : stride / sampling_rate, + right_stride_s: is_last ? 0 : stride / sampling_rate, + }); + if (is_last) break; + offset += jump; + } + + return windows; +} + +function shouldKeepTimedItem(start_time, end_time, keep_start_s, keep_end_s, is_first_window, is_last_window) { + const midpoint = (start_time + end_time) / 2; + if (!is_first_window && midpoint < keep_start_s) { + return false; + } + if (!is_last_window && midpoint >= keep_end_s) { + return false; + } + return true; +} + +function dedupeMergedWords(words) { + /** @type {typeof words} */ + const merged = []; + for (const word of words) { + const prev = merged.at(-1); + if ( + prev && + prev.text === word.text && + word.start_time < prev.end_time + ) { + const prevDuration = prev.end_time - prev.start_time; + const nextDuration = word.end_time - word.start_time; + if (nextDuration > prevDuration) { + merged[merged.length - 1] = word; + } + continue; + } + merged.push(word); + } + return merged; +} + +function dedupeMergedTokens(tokens) { + /** @type {typeof tokens} */ + const merged = []; + for (const token of tokens) { + const prev = 
merged.at(-1); + if ( + prev && + prev.id === token.id && + prev.raw_token === token.raw_token && + token.start_time < prev.end_time + ) { + const prevDuration = prev.end_time - prev.start_time; + const nextDuration = token.end_time - token.start_time; + if (nextDuration > prevDuration) { + merged[merged.length - 1] = token; + } + continue; + } + merged.push(token); + } + return merged; +} + +/** + * @param {any} tokenizer + * @param {Array<{ window: { start_s: number, end_s: number, left_stride_s: number, right_stride_s: number }, output: { words?: any[], tokens?: any[] } }>} windowResults + * @returns {{ text: string, tokens: any[], words: any[], utterance_timestamp: [number, number] | null }} + */ +export function mergeNemoWindowResults(tokenizer, windowResults) { + /** @type {Array<{ id: number, token: string, raw_token: string, is_word_start: boolean, start_time: number, end_time: number, confidence?: number }>} */ + const mergedTokens = []; + /** @type {Array<{ text: string, start_time: number, end_time: number, confidence?: number }>} */ + const mergedWords = []; + + for (const { window, output } of windowResults) { + const keep_start_s = window.start_s + window.left_stride_s; + const keep_end_s = window.end_s - window.right_stride_s; + const is_first_window = window.left_stride_s === 0; + const is_last_window = window.right_stride_s === 0; + + for (const token of output.tokens ?? []) { + if ( + shouldKeepTimedItem( + token.start_time, + token.end_time, + keep_start_s, + keep_end_s, + is_first_window, + is_last_window, + ) + ) { + mergedTokens.push(token); + } + } + + for (const word of output.words ?? []) { + if ( + shouldKeepTimedItem( + word.start_time, + word.end_time, + keep_start_s, + keep_end_s, + is_first_window, + is_last_window, + ) + ) { + mergedWords.push(word); + } + } + } + + const tokens = dedupeMergedTokens(mergedTokens); + const words = dedupeMergedWords(mergedWords); + const text = + words.length > 0 + ? 
joinTimedWords(words) + : tokens.length > 0 && typeof tokenizer?.decode === 'function' + ? decodeTransducerText(tokenizer, tokens.map((token) => token.id)) + : ''; + const utterance_timestamp = + words.length > 0 + ? /** @type {[number, number]} */ ([words[0].start_time, words[words.length - 1].end_time]) + : tokens.length > 0 + ? /** @type {[number, number]} */ ([tokens[0].start_time, tokens[tokens.length - 1].end_time]) + : null; + + return { text, tokens, words, utterance_timestamp }; +} diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_word_offsets.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_word_offsets.js new file mode 100644 index 000000000..5046a71a5 --- /dev/null +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_word_offsets.js @@ -0,0 +1,216 @@ +/** + * Cache tokenizer id->token maps for stable and fast boundary detection. + * @type {WeakMap>} + */ +const TOKEN_ID_TO_TEXT_CACHE = new WeakMap(); + +/** + * @param {any} tokenizer + * @returns {Map} + */ +function getIdToTokenMap(tokenizer) { + let cached = TOKEN_ID_TO_TEXT_CACHE.get(tokenizer); + if (cached) return cached; + + cached = new Map(); + if (tokenizer?.get_vocab) { + const vocab = tokenizer.get_vocab(); + const entries = vocab instanceof Map ? vocab.entries() : Object.entries(vocab); + for (const [token, id] of entries) { + if (Number.isInteger(id)) { + cached.set(id, token); + } + } + } + TOKEN_ID_TO_TEXT_CACHE.set(tokenizer, cached); + return cached; +} + +/** + * Resolve per-token text and word boundary metadata in a tokenizer-agnostic way. + * @param {any} tokenizer + * @param {number} id + * @returns {{ raw: string, clean: string, startsNewWord: boolean }} + */ +function resolveTokenPiece(tokenizer, id) { + const rawToken = getIdToTokenMap(tokenizer).get(id) ?? 
''; + const decoded = tokenizer.decode([id], { + skip_special_tokens: true, + clean_up_tokenization_spaces: false, + }); + + const startsWithBoundaryMarker = /^(?:▁|Ġ)+/.test(rawToken); + const startsWithWhitespace = /^\s+/.test(decoded); + const startsNewWord = startsWithBoundaryMarker || startsWithWhitespace; + + let clean = decoded.replace(/^\s+/, ''); + if (!clean) { + clean = rawToken.replace(/^(?:▁|Ġ|Ċ)+/, '').replace(/^ +/, ''); + } + + return { raw: rawToken || decoded, clean, startsNewWord }; +} + +/** + * @param {string} fullText + * @param {number} cursor + * @param {string} tokenText + * @returns {{ cursor: number, text: string, skippedWhitespace: boolean }} + */ +function consumeAlignedTokenText(fullText, cursor, tokenText) { + let skippedWhitespace = false; + while (cursor < fullText.length && /\s/.test(fullText[cursor])) { + skippedWhitespace = true; + cursor += 1; + } + + if (!tokenText) { + return { cursor, text: '', skippedWhitespace }; + } + + if (fullText.startsWith(tokenText, cursor)) { + return { + cursor: cursor + tokenText.length, + text: fullText.slice(cursor, cursor + tokenText.length), + skippedWhitespace, + }; + } + + const next = fullText.indexOf(tokenText, cursor); + if (next !== -1 && /^\s*$/.test(fullText.slice(cursor, next))) { + return { + cursor: next + tokenText.length, + text: fullText.slice(next, next + tokenText.length), + skippedWhitespace: skippedWhitespace || next > cursor, + }; + } + + return { + cursor: cursor + tokenText.length, + text: tokenText, + skippedWhitespace, + }; +} + +/** + * @param {Array<{ text: string, start_time: number, end_time: number, confidence?: number }>} words + * @param {{ text: string, start: number, end: number, confs: number[] } | null} current + */ +function finalizeAndPushWord(words, current) { + if (!current) return; + + const text = current.text.trim(); + if (!text) return; + + /** @type {{ text: string, start_time: number, end_time: number, confidence?: number }} */ + const word = { + 
text, + start_time: current.start, + end_time: current.end, + }; + if (current.confs.length > 0) { + word.confidence = Math.round((current.confs.reduce((a, b) => a + b, 0) / current.confs.length) * 1e6) / 1e6; + } + words.push(word); +} + +/** + * @param {any} tokenizer + * @param {number[]} token_ids + * @param {[number, number][]} token_timestamps + * @param {number[] | null} token_confidences + * @param {string} fullText + * @returns {{ + * words: Array<{ text: string, start_time: number, end_time: number, confidence?: number }>, + * tokens: Array<{ id: number, token: string, raw_token: string, is_word_start: boolean, start_time: number, end_time: number, confidence?: number }>, + * word_confidences: (number | null)[] | null, + * word_avg: number | null, + * }} + */ +export function buildTransducerWordOffsets( + tokenizer, + token_ids, + token_timestamps, + token_confidences = null, + fullText = '', +) { + if (!tokenizer || token_ids.length === 0 || token_timestamps.length === 0) { + return { words: [], tokens: [], word_confidences: null, word_avg: null }; + } + if (token_ids.length !== token_timestamps.length) { + throw new Error( + `buildTransducerWordOffsets expects equal lengths for token_ids (${token_ids.length}) and token_timestamps (${token_timestamps.length}).`, + ); + } + if (token_confidences && token_confidences.length !== token_ids.length) { + throw new Error( + `buildTransducerWordOffsets expects token_confidences length (${token_confidences.length}) to match token_ids length (${token_ids.length}).`, + ); + } + + /** @type {Array<{ id: number, token: string, raw_token: string, is_word_start: boolean, start_time: number, end_time: number, confidence?: number }>} */ + const tokens = []; + /** @type {Array<{ text: string, start_time: number, end_time: number, confidence?: number }>} */ + const words = []; + let textCursor = 0; + + /** @type {{ text: string, start: number, end: number, confs: number[] } | null} */ + let current = null; + + for (let i = 
0; i < token_ids.length; ++i) { + const id = token_ids[i]; + const ts = token_timestamps[i]; + const piece = resolveTokenPiece(tokenizer, id); + const raw = piece.raw; + const clean = piece.clean; + if (!clean) continue; + + const aligned = consumeAlignedTokenText(fullText, textCursor, clean); + textCursor = aligned.cursor; + const tokenText = aligned.text || clean; + const startsNewWord = !current || aligned.skippedWhitespace || piece.startsNewWord; + + const tok = { + id, + token: tokenText, + raw_token: raw, + is_word_start: startsNewWord, + start_time: ts[0], + end_time: ts[1], + }; + const conf = token_confidences?.[i]; + if (conf != null && Number.isFinite(conf)) { + tok.confidence = Math.round(conf * 1e6) / 1e6; + } + tokens.push(tok); + + if (!current || startsNewWord) { + finalizeAndPushWord(words, current); + current = { + text: tokenText, + start: ts[0], + end: ts[1], + confs: conf != null && Number.isFinite(conf) ? [conf] : [], + }; + } else { + current.text += tokenText; + current.end = ts[1]; + if (conf != null && Number.isFinite(conf)) { + current.confs.push(conf); + } + } + } + + finalizeAndPushWord(words, current); + + const word_confidences = words.some((x) => x.confidence != null) ? words.map((x) => x.confidence ?? 
null) : null; + let word_avg = null; + if (word_confidences) { + const validConfidences = word_confidences.filter((x) => x != null); + if (validConfidences.length > 0) { + word_avg = Math.round((validConfidences.reduce((a, b) => a + b, 0) / validConfidences.length) * 1e6) / 1e6; + } + } + + return { words, tokens, word_confidences, word_avg }; +} diff --git a/packages/transformers/src/models/nemo_conformer_tdt/utils_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/utils_nemo_conformer_tdt.js index 935336b1e..fdf5e7e67 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/utils_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/utils_nemo_conformer_tdt.js @@ -2,3 +2,7 @@ export { createAudioCacheKey, FeatureLRUCache } from './transducer_cache.js'; export { computeTemporalDeltas } from './transducer_deltas.js'; export { decodeTransducerText, buildTransducerDetailedOutputs } from './transducer_text.js'; +export { buildTransducerWordOffsets } from './transducer_word_offsets.js'; +export { joinTimedWords, buildWordChunks, buildSegmentText, buildNemoSegmentChunks } from './transducer_segment_offsets.js'; +export { buildNemoWindowSpecs, mergeNemoWindowResults } from './transducer_window_merge.js'; +export { runNemoConformerTDTPipeline } from './pipeline_nemo_conformer_tdt.js'; diff --git a/packages/transformers/src/pipelines/automatic-speech-recognition.js b/packages/transformers/src/pipelines/automatic-speech-recognition.js index b4c468a4c..6415f5857 100644 --- a/packages/transformers/src/pipelines/automatic-speech-recognition.js +++ b/packages/transformers/src/pipelines/automatic-speech-recognition.js @@ -3,6 +3,9 @@ import { Pipeline, prepareAudios } from './_base.js'; import { Tensor } from '../utils/tensor.js'; import { max, round } from '../utils/maths.js'; import { logger } from '../utils/logger.js'; +import { + runNemoConformerTDTPipeline, +} from 
'../models/nemo_conformer_tdt/utils_nemo_conformer_tdt.js'; /** * @typedef {import('./_base.js').TextAudioPipelineConstructorArgs} TextAudioPipelineConstructorArgs @@ -140,24 +143,6 @@ export class AutomaticSpeechRecognitionPipeline Pipeline ) { - _validateNemoAudio(audio, index) { - if (!(audio instanceof Float32Array || audio instanceof Float64Array)) { - throw new TypeError( - `Nemo Conformer TDT pipeline expected audio at index ${index} to be Float32Array or Float64Array.`, - ); - } - if (audio.length === 0) { - throw new Error(`Nemo Conformer TDT pipeline expected non-empty audio at index ${index}.`); - } - for (let i = 0; i < audio.length; ++i) { - if (!Number.isFinite(audio[i])) { - throw new Error( - `Nemo Conformer TDT pipeline expected finite audio samples; found ${audio[i]} at index ${index}:${i}.`, - ); - } - } - } - async _call(audio, kwargs = {}) { switch (this.model.config.model_type) { case 'whisper': @@ -323,74 +308,19 @@ export class AutomaticSpeechRecognitionPipeline /** * Nemo Conformer TDT ASR pipeline. * - * Delegates to model.transcribe() and returns its output directly. - * Use `return_timestamps: true` on the pipeline call to get utterance-level data. - * This pipeline always requests metrics, and enables word details when - * timestamps are requested. - * For token-level and debug controls, call `model.transcribe()` directly with - * extended options. + * Keeps the pipeline surface aligned with the shared ASR task contract: + * `{ text }` by default and `{ text, chunks }` when timestamps are requested. + * Rich Nemo-specific outputs remain available on direct `model.transcribe()`. 
*/ async _call_nemo_conformer_tdt(audio, kwargs) { - if (typeof (/** @type {any} */ (this.model).transcribe) !== 'function') { - throw new Error('Nemo Conformer TDT model does not expose a `transcribe` method.'); - } - if (!this.processor) { - throw new Error('Nemo Conformer TDT pipeline requires a processor.'); - } - if (!this.tokenizer) { - throw new Error('Nemo Conformer TDT pipeline requires a tokenizer.'); - } - if (!this.processor.feature_extractor?.config?.sampling_rate) { - throw new Error( - 'Nemo Conformer TDT pipeline requires `processor.feature_extractor.config.sampling_rate` to prepare audio.', - ); - } - - const return_timestamps = !!(kwargs.return_timestamps); - - const decodeOptions = { + return runNemoConformerTDTPipeline({ + model: this.model, + processor: this.processor, tokenizer: this.tokenizer, - return_timestamps, - return_words: return_timestamps, - return_metrics: true, - }; - - const single = !Array.isArray(audio); - const batchedAudio = single ? [audio] : audio; - const sampling_rate = this.processor.feature_extractor.config.sampling_rate; - const preparedAudios = await prepareAudios(batchedAudio, sampling_rate); - for (let i = 0; i < preparedAudios.length; ++i) { - this._validateNemoAudio(preparedAudios[i], i); - } - - const featureCache = /** @type {{ max_entries: number, max_size_mb: number }|null|undefined} */ ( - /** @type {any} */ (this.processor.feature_extractor)?.feature_cache - ); - const cacheOwnsTensors = !!( - featureCache && - featureCache.max_entries > 0 && - featureCache.max_size_mb > 0 - ); - const toReturn = []; - for (const aud of preparedAudios) { - const inputs = await this.processor(aud); - try { - const output = await /** @type {any} */ (this.model).transcribe(inputs, decodeOptions); - toReturn.push(output); - } finally { - if (!cacheOwnsTensors) { - const seen = new Set(); - for (const value of Object.values(inputs ?? 
{})) { - if (value instanceof Tensor && !seen.has(value)) { - value.dispose(); - seen.add(value); - } - } - } - } - } - - return single ? toReturn[0] : toReturn; + audio, + kwargs, + prepareAudios, + }); } async _call_moonshine(audio, kwargs) { diff --git a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js index 01e977bf4..c52df3667 100644 --- a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js +++ b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js @@ -1,6 +1,7 @@ import { NemoConformerForTDT, Tensor } from "../../../src/transformers.js"; import { createAudioCacheKey, FeatureLRUCache } from "../../../src/models/nemo_conformer_tdt/transducer_cache.js"; import { computeTemporalDeltas } from "../../../src/models/nemo_conformer_tdt/transducer_deltas.js"; +import { buildTransducerDetailedOutputs } from "../../../src/models/nemo_conformer_tdt/transducer_text.js"; import { MODEL_TYPE_MAPPING, MODEL_TYPES } from "../../../src/models/modeling_utils.js"; import { get_model_files } from "../../../src/utils/model_registry/get_model_files.js"; @@ -645,6 +646,71 @@ export default () => { }); describe("Nemo Conformer TDT utilities", () => { + it("keeps word boundaries from the final decoded text for numeric and punctuation tokens", () => { + const rawById = { + 1: "▁score", + 2: ".", + 3: "48", + 4: "-", + 5: "year", + 6: "-", + 7: "old", + 8: "▁with", + 9: "0", + 10: ".", + 11: "5", + }; + const tokenizer = { + get_vocab() { + return rawById; + }, + decode(ids) { + if (ids.length === 1) { + return rawById[ids[0]].replace(/^▁/, ""); + } + return "score. 
48-year-old with 0.5"; + }, + }; + + const output = buildTransducerDetailedOutputs( + tokenizer, + [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], + [ + [0.0, 0.3], + [0.3, 0.4], + [0.5, 0.8], + [0.8, 0.85], + [0.85, 1.05], + [1.05, 1.1], + [1.1, 1.3], + [1.4, 1.7], + [1.8, 1.9], + [1.9, 1.95], + [1.95, 2.05], + ], + ); + + expect(output.words.map((x) => x.text)).toEqual([ + "score.", + "48-year-old", + "with", + "0.5", + ]); + expect(output.tokens.map((x) => x.token)).toEqual([ + "score", + ".", + "48", + "-", + "year", + "-", + "old", + "with", + "0", + ".", + "5", + ]); + }); + it( "computes delta and delta-delta features", async () => { diff --git a/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js b/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js index 9f227a9b3..5a377ece1 100644 --- a/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js +++ b/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js @@ -156,7 +156,17 @@ export default () => { const processor = Object.assign(async () => ({ input_features: {} }), { feature_extractor: { config: { sampling_rate: 16000 } }, }); - const tokenizer = {}; + const tokenizer = { + decode(ids) { + const pieces = { + 1: "hello", + 2: "world", + 3: "again", + 4: "today", + }; + return ids.map((id) => pieces[id] ?? 
"").filter(Boolean).join(" "); + }, + }; return { pipe: new AutomaticSpeechRecognitionPipeline({ @@ -169,57 +179,299 @@ export default () => { }; }; - it("returns text and metrics when timestamps disabled", async () => { + it("returns text when timestamps disabled", async () => { const { pipe, calls } = makeUnitPipe(); const output = await pipe(new Float32Array(16000), { return_timestamps: false }); - expect(output).toEqual({ text: "hello world", metrics: { total_ms: 42, rtf: 0.01 } }); + expect(output).toEqual({ text: "hello world" }); expect(calls).toHaveLength(1); expect(calls[0]).toMatchObject({ return_timestamps: false, return_words: false, - return_metrics: true, + return_metrics: false, }); }); - it("returns full output with words when return_timestamps is true", async () => { + it("returns timestamped chunks when return_timestamps is true", async () => { const { pipe, calls } = makeUnitPipe(); const output = await pipe(new Float32Array(16000), { return_timestamps: true }); - expect(output).toMatchObject({ + expect(output).toEqual({ text: "hello world", - utterance_timestamp: [0, 0.08], - utterance_confidence: 0.95, - words: [ - { text: "hello", start_time: 0, end_time: 0.04 }, - { text: "world", start_time: 0.04, end_time: 0.08 }, + chunks: [ + { text: "hello world", timestamp: [0, 0.08] }, ], - confidence_scores: { token_avg: 0.95, word_avg: 0.94 }, - metrics: { total_ms: 42, rtf: 0.01 }, }); expect(calls).toHaveLength(1); expect(calls[0]).toMatchObject({ return_timestamps: true, return_words: true, - return_metrics: true, + return_metrics: false, }); }); - it("treats return_timestamps 'word' as truthy (same as true)", async () => { + it("returns word chunks when return_timestamps is 'word'", async () => { const { pipe, calls } = makeUnitPipe(); const output = await pipe(new Float32Array(16000), { return_timestamps: "word" }); - expect(output).toMatchObject({ + expect(output).toEqual({ text: "hello world", - utterance_timestamp: [0, 0.08], - words: 
expect.any(Array), - metrics: expect.any(Object), + chunks: [ + { text: "hello", timestamp: [0, 0.04] }, + { text: "world", timestamp: [0.04, 0.08] }, + ], }); expect(calls).toHaveLength(1); expect(calls[0]).toMatchObject({ return_timestamps: true, return_words: true, - return_metrics: true, + return_metrics: false, }); }); + it("merges overlapping windows when Nemo chunking is enabled", async () => { + const calls = []; + const model = { + config: { model_type: "nemo-conformer-tdt" }, + async transcribe(_inputs, options) { + calls.push(options); + if (options.timeOffset === 0) { + return { + text: "hello world again", + words: [ + { text: "hello", start_time: 0, end_time: 0.5 }, + { text: "world", start_time: 0.5, end_time: 1.1 }, + { text: "again", start_time: 1.2, end_time: 1.8 }, + ], + tokens: [ + { id: 1, token: "hello", raw_token: "hello", is_word_start: true, start_time: 0, end_time: 0.5 }, + { id: 2, token: "world", raw_token: "world", is_word_start: true, start_time: 0.5, end_time: 1.1 }, + { id: 3, token: "again", raw_token: "again", is_word_start: true, start_time: 1.2, end_time: 1.8 }, + ], + }; + } + return { + text: "again today", + words: [ + { text: "again", start_time: 1.2, end_time: 1.8 }, + { text: "today", start_time: 1.8, end_time: 2.4 }, + ], + tokens: [ + { id: 3, token: "again", raw_token: "again", is_word_start: true, start_time: 1.2, end_time: 1.8 }, + { id: 4, token: "today", raw_token: "today", is_word_start: true, start_time: 1.8, end_time: 2.4 }, + ], + }; + }, + async dispose() {}, + }; + const processor = Object.assign(async () => ({ input_features: {} }), { + feature_extractor: { config: { sampling_rate: 16000 } }, + }); + const tokenizer = { + decode(ids) { + const pieces = { + 1: "hello", + 2: "world", + 3: "again", + 4: "today", + }; + return ids.map((id) => pieces[id] ?? 
"").filter(Boolean).join(" "); + }, + }; + const pipe = new AutomaticSpeechRecognitionPipeline({ + task: PIPELINE_ID, + model, + tokenizer, + processor, + }); + + const output = await pipe(new Float32Array(3 * 16000), { + return_timestamps: "word", + chunk_length_s: 2, + stride_length_s: 0.5, + }); + + expect(output).toEqual({ + text: "hello world again today", + chunks: [ + { text: "hello", timestamp: [0, 0.5] }, + { text: "world", timestamp: [0.5, 1.1] }, + { text: "again", timestamp: [1.2, 1.8] }, + { text: "today", timestamp: [1.8, 2.4] }, + ], + }); + expect(calls).toHaveLength(2); + expect(calls[0]).toMatchObject({ + return_timestamps: true, + return_words: true, + return_tokens: true, + return_metrics: false, + timeOffset: 0, + }); + expect(calls[1]).toMatchObject({ + return_timestamps: true, + return_words: true, + return_tokens: true, + return_metrics: false, + timeOffset: 1, + }); + }); + + it("reconstructs windowed Nemo text from merged words when token decode drops spaces", async () => { + const model = { + config: { model_type: "nemo-conformer-tdt" }, + async transcribe(_inputs, options) { + if (options.timeOffset === 0) { + return { + text: "score. 
48-year-old", + words: [ + { text: "score.", start_time: 0, end_time: 0.4 }, + { text: "48-year-old", start_time: 0.5, end_time: 1.3 }, + ], + tokens: [ + { id: 1, token: "score", raw_token: "▁score", is_word_start: true, start_time: 0, end_time: 0.3 }, + { id: 2, token: ".", raw_token: ".", is_word_start: false, start_time: 0.3, end_time: 0.4 }, + { id: 3, token: "48", raw_token: "48", is_word_start: false, start_time: 0.5, end_time: 0.8 }, + { id: 4, token: "-", raw_token: "-", is_word_start: false, start_time: 0.8, end_time: 0.85 }, + { id: 5, token: "year", raw_token: "year", is_word_start: false, start_time: 0.85, end_time: 1.05 }, + { id: 4, token: "-", raw_token: "-", is_word_start: false, start_time: 1.05, end_time: 1.1 }, + { id: 6, token: "old", raw_token: "old", is_word_start: false, start_time: 1.1, end_time: 1.3 }, + ], + }; + } + return { + text: "with 0.5", + words: [ + { text: "with", start_time: 1.4, end_time: 1.7 }, + { text: "0.5", start_time: 1.8, end_time: 2.05 }, + ], + tokens: [ + { id: 7, token: "with", raw_token: "▁with", is_word_start: true, start_time: 1.4, end_time: 1.7 }, + { id: 8, token: "0", raw_token: "0", is_word_start: false, start_time: 1.8, end_time: 1.9 }, + { id: 2, token: ".", raw_token: ".", is_word_start: false, start_time: 1.9, end_time: 1.95 }, + { id: 9, token: "5", raw_token: "5", is_word_start: false, start_time: 1.95, end_time: 2.05 }, + ], + }; + }, + async dispose() {}, + }; + const processor = Object.assign(async () => ({ input_features: {} }), { + feature_extractor: { config: { sampling_rate: 16000 } }, + }); + const tokenizer = { + decode(ids) { + const pieces = { + 1: "score", + 2: ".", + 3: "48", + 4: "-", + 5: "year", + 6: "old", + 7: "with", + 8: "0", + 9: "5", + }; + return ids.map((id) => pieces[id] ?? 
"").join(""); + }, + }; + const pipe = new AutomaticSpeechRecognitionPipeline({ + task: PIPELINE_ID, + model, + tokenizer, + processor, + }); + + const output = await pipe(new Float32Array(3 * 16000), { + return_timestamps: "word", + chunk_length_s: 2, + stride_length_s: 0.5, + }); + + expect(output.text).toBe("score. 48-year-old with 0.5"); + expect(output.chunks).toEqual([ + { text: "score.", timestamp: [0, 0.4] }, + { text: "48-year-old", timestamp: [0.5, 1.3] }, + { text: "with", timestamp: [1.4, 1.7] }, + { text: "0.5", timestamp: [1.8, 2.05] }, + ]); + }); + + it("auto-window long Nemo audio with 90s chunks and 10s stride", async () => { + const calls = []; + const wordsByOffset = new Map([ + [0, { id: 1, text: "alpha", start: 0, end: 1 }], + [70, { id: 2, text: "beta", start: 85, end: 86 }], + [140, { id: 3, text: "gamma", start: 155, end: 156 }], + [210, { id: 4, text: "delta", start: 225, end: 226 }], + ]); + const model = { + config: { model_type: "nemo-conformer-tdt" }, + async transcribe(_inputs, options) { + calls.push(options); + const item = wordsByOffset.get(options.timeOffset); + if (!item) { + throw new Error(`Unexpected timeOffset ${options.timeOffset}`); + } + return { + text: item.text, + words: [ + { text: item.text, start_time: item.start, end_time: item.end }, + ], + tokens: [ + { + id: item.id, + token: item.text, + raw_token: item.text, + is_word_start: true, + start_time: item.start, + end_time: item.end, + }, + ], + }; + }, + async dispose() {}, + }; + const processor = Object.assign(async () => ({ input_features: {} }), { + feature_extractor: { config: { sampling_rate: 16000 } }, + }); + const tokenizer = { + decode(ids) { + const pieces = { + 1: "alpha", + 2: "beta", + 3: "gamma", + 4: "delta", + }; + return ids.map((id) => pieces[id] ?? 
"").filter(Boolean).join(" "); + }, + }; + const pipe = new AutomaticSpeechRecognitionPipeline({ + task: PIPELINE_ID, + model, + tokenizer, + processor, + }); + + const output = await pipe(new Float32Array(300 * 16000), { return_timestamps: "word" }); + + expect(output).toEqual({ + text: "alpha beta gamma delta", + chunks: [ + { text: "alpha", timestamp: [0, 1] }, + { text: "beta", timestamp: [85, 86] }, + { text: "gamma", timestamp: [155, 156] }, + { text: "delta", timestamp: [225, 226] }, + ], + }); + expect(calls).toHaveLength(4); + expect(calls.map((x) => x.timeOffset)).toEqual([0, 70, 140, 210]); + for (const call of calls) { + expect(call).toMatchObject({ + return_timestamps: true, + return_words: true, + return_tokens: true, + return_metrics: false, + }); + } + }); + it("rejects non-finite audio samples before Nemo decoding", async () => { const { pipe } = makeUnitPipe(); await expect(pipe(Float32Array.from([0, Number.NaN, 0]), { return_timestamps: false })).rejects.toThrow( From 816f581180012e3ce017876da0b150d02046d2a4 Mon Sep 17 00:00:00 2001 From: ysdede Date: Sun, 8 Mar 2026 00:58:29 +0300 Subject: [PATCH 33/33] chore(tests): drop unrelated parakeet feature extractor coverage Remove the standalone parakeet feature extractor test from this branch. It exercises an existing parakeet_ctc path that is outside the scope of Conformer TDT integration and makes the PR look broader than it is. 
--- .../test_feature_extraction_parakeet.js | 49 ------------------- 1 file changed, 49 deletions(-) delete mode 100644 packages/transformers/tests/models/parakeet/test_feature_extraction_parakeet.js diff --git a/packages/transformers/tests/models/parakeet/test_feature_extraction_parakeet.js b/packages/transformers/tests/models/parakeet/test_feature_extraction_parakeet.js deleted file mode 100644 index 82ece82a9..000000000 --- a/packages/transformers/tests/models/parakeet/test_feature_extraction_parakeet.js +++ /dev/null @@ -1,49 +0,0 @@ -import { ParakeetFeatureExtractor } from "../../../src/transformers.js"; - -import { MAX_TEST_EXECUTION_TIME } from "../../init.js"; - -export default () => { - describe("ParakeetFeatureExtractor", () => { - const config = { - feature_size: 80, - sampling_rate: 16000, - n_fft: 512, - win_length: 400, - hop_length: 160, - preemphasis: 0.97, - }; - - /** @type {ParakeetFeatureExtractor} */ - let feature_extractor; - beforeAll(() => { - feature_extractor = new ParakeetFeatureExtractor(config); - }); - - it( - "extracts features and mask from synthetic audio", - async () => { - const duration_s = 1.0; - const total = Math.floor(config.sampling_rate * duration_s); - const audio = Float32Array.from({ length: total }, (_, i) => Math.sin((2 * Math.PI * 220 * i) / config.sampling_rate)); - - const { input_features, attention_mask } = await feature_extractor(audio); - try { - expect(input_features.dims[0]).toBe(1); - expect(input_features.dims[2]).toBe(config.feature_size); - expect(attention_mask.dims).toEqual([1, input_features.dims[1]]); - - const validFrames = attention_mask.data.reduce((acc, x) => acc + Number(x), 0); - expect(validFrames).toBeGreaterThan(0); - expect(validFrames).toBeLessThanOrEqual(input_features.dims[1]); - - const preview = Array.from(input_features.data.slice(0, 256)); - expect(preview.every(Number.isFinite)).toBe(true); - } finally { - input_features.dispose(); - attention_mask.dispose(); - } - }, - 
MAX_TEST_EXECUTION_TIME, - ); - }); -};