From d0176023a8ea4a48343f62131c5d4f2ff447da3e Mon Sep 17 00:00:00 2001 From: ysdede Date: Sat, 28 Feb 2026 22:52:41 +0300 Subject: [PATCH 01/40] feat(nemo-conformer-tdt): port Nemo Conformer TDT model and ASR pipeline --- .../src/models/feature_extractors.js | 1 + packages/transformers/src/models/models.js | 1 + .../feature_extraction_nemo_conformer_tdt.js | 204 ++++++ .../modeling_nemo_conformer_tdt.js | 597 ++++++++++++++++++ .../processing_nemo_conformer_tdt.js | 19 + .../nemo_conformer_tdt/transducer_cache.js | 119 ++++ .../nemo_conformer_tdt/transducer_deltas.js | 69 ++ .../nemo_conformer_tdt/transducer_text.js | 73 +++ .../utils_nemo_conformer_tdt.js | 4 + .../transformers/src/models/processors.js | 1 + packages/transformers/src/models/registry.js | 1 + packages/transformers/src/pipelines.js | 3 +- .../pipelines/automatic-speech-recognition.js | 139 ++++ ...t_feature_extraction_nemo_conformer_tdt.js | 78 +++ .../test_modeling_nemo_conformer_tdt.js | 188 ++++++ .../test_feature_extraction_parakeet.js | 45 ++ ..._pipelines_automatic_speech_recognition.js | 133 ++++ 17 files changed, 1674 insertions(+), 1 deletion(-) create mode 100644 packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js create mode 100644 packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js create mode 100644 packages/transformers/src/models/nemo_conformer_tdt/processing_nemo_conformer_tdt.js create mode 100644 packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js create mode 100644 packages/transformers/src/models/nemo_conformer_tdt/transducer_deltas.js create mode 100644 packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js create mode 100644 packages/transformers/src/models/nemo_conformer_tdt/utils_nemo_conformer_tdt.js create mode 100644 packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js create mode 100644 packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js create mode 100644 packages/transformers/tests/models/parakeet/test_feature_extraction_parakeet.js diff --git a/packages/transformers/src/models/feature_extractors.js b/packages/transformers/src/models/feature_extractors.js index 589b96cd7..48ecc6ff6 100644 --- a/packages/transformers/src/models/feature_extractors.js +++ b/packages/transformers/src/models/feature_extractors.js @@ -5,6 +5,7 @@ export * from './clap/feature_extraction_clap.js'; export * from './dac/feature_extraction_dac.js'; export * from './gemma3n/feature_extraction_gemma3n.js'; export * from './moonshine/feature_extraction_moonshine.js'; +export * from './nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js'; export * from './parakeet/feature_extraction_parakeet.js'; export * from './pyannote/feature_extraction_pyannote.js'; export * from './seamless_m4t/feature_extraction_seamless_m4t.js'; diff --git a/packages/transformers/src/models/models.js b/packages/transformers/src/models/models.js index 9c6d6f2dd..2fe9055a0 100644 --- a/packages/transformers/src/models/models.js +++ b/packages/transformers/src/models/models.js @@ -102,6 +102,7 @@ export * from './mpt/modeling_mpt.js'; export * from './mt5/modeling_mt5.js'; export * from './multi_modality/modeling_multi_modality.js'; export * from './musicgen/modeling_musicgen.js'; +export * from './nemo_conformer_tdt/modeling_nemo_conformer_tdt.js'; export * from './nanochat/modeling_nanochat.js'; export * from './neobert/modeling_neobert.js'; export * 
from './nomic_bert/modeling_nomic_bert.js'; diff --git a/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js new file mode 100644 index 000000000..f1bfe6b76 --- /dev/null +++ b/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js @@ -0,0 +1,204 @@ +import { FeatureExtractor, validate_audio_inputs } from '../../feature_extraction_utils.js'; +import { Tensor } from '../../utils/tensor.js'; +import { mel_filter_bank, spectrogram, window_function } from '../../utils/audio.js'; +import { FeatureLRUCache, createAudioCacheKey } from './transducer_cache.js'; +import { computeTemporalDeltas } from './transducer_deltas.js'; + +const EPSILON = 1e-5; + +/** + * Feature extractor for Nemo Conformer TDT models. + * + * Mirrors NeMo-style log-mel extraction used by Parakeet with configurable + * `feature_size` (e.g. 80 or 128 mel bins via `preprocessor_config.json`). + */ +export class NemoConformerTDTFeatureExtractor extends FeatureExtractor { + constructor(config) { + super(config); + + // Prefer given `mel_filters` from preprocessor_config.json, or calculate them if they don't exist. + this.config.mel_filters ??= mel_filter_bank( + Math.floor(1 + this.config.n_fft / 2), // num_frequency_bins + this.config.feature_size, // num_mel_filters + 0.0, // min_frequency + this.config.sampling_rate / 2, // max_frequency + this.config.sampling_rate, // sampling_rate + 'slaney', // norm + 'slaney', // mel_scale + ); + + const window = window_function(this.config.win_length, 'hann', { + periodic: false, + }); + + this.window = new Float64Array(this.config.n_fft); + const offset = Math.floor((this.config.n_fft - this.config.win_length) / 2); + this.window.set(window, offset); + + // Optional feature-level cache and delta/delta-delta post-processing. + this.use_feature_cache = this.config.use_feature_cache ?? false; + this.delta_order = this.config.delta_order ?? 0; + this.delta_window = this.config.delta_window ?? 2; + this.delta_concatenate = this.config.delta_concatenate ?? true; + + if (![0, 1, 2].includes(this.delta_order)) { + throw new Error( + `NemoConformerTDTFeatureExtractor expected delta_order in {0,1,2}, got ${this.delta_order}.`, + ); + } + if (this.delta_order > 0 && !this.delta_concatenate) { + console.warn( + 'NemoConformerTDTFeatureExtractor: `delta_concatenate=false` is set. ' + + '`input_features` will remain base features and deltas are returned in extra fields.', + ); + } + + this.feature_cache = this.use_feature_cache + ? new FeatureLRUCache({ + max_entries: this.config.feature_cache_max_entries ?? 128, + max_size_mb: this.config.feature_cache_max_size_mb ?? 64, + }) + : null; + } + + /** + * Computes the log-Mel spectrogram of the provided audio waveform. + * @param {Float32Array|Float64Array} waveform The audio waveform to process. + * @returns {Promise} An object containing the log-Mel spectrogram data as a Float32Array and its dimensions as an array of numbers. 
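+ *
+ * The steps mirror the constructor settings above: preemphasis applied over the
+ * whole waveform, a Hann window of `win_length` samples centered inside an `n_fft`
+ * buffer, a power spectrogram projected onto slaney-normalized mel filters, then
+ * log compression, with a small `mel_offset` of 2 ** -24 used for numerical stability.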
+ */ + async _extract_fbank_features(waveform) { + // Parakeet uses a custom preemphasis strategy: Apply preemphasis to entire waveform at once + const preemphasis = this.config.preemphasis; + waveform = new Float64Array(waveform); // Clone to avoid destructive changes + for (let j = waveform.length - 1; j >= 1; --j) { + waveform[j] -= preemphasis * waveform[j - 1]; + } + + const features = await spectrogram( + waveform, + this.window, // window + this.window.length, // frame_length + this.config.hop_length, // hop_length + { + fft_length: this.config.n_fft, + power: 2.0, + mel_filters: this.config.mel_filters, + log_mel: 'log', + mel_floor: -Infinity, + pad_mode: 'constant', + center: true, + + // Custom + transpose: true, + mel_offset: 2 ** -24, + }, + ); + + return features; + } + + /** + * Asynchronously extracts features from a given audio using the provided configuration. + * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array. + * @returns {Promise<{ + * input_features: Tensor; + * attention_mask: Tensor; + * delta_features?: Tensor; + * delta_delta_features?: Tensor; + * }>} A Promise resolving to an object containing extracted model inputs. + */ + async _call(audio) { + validate_audio_inputs(audio, 'NemoConformerTDTFeatureExtractor'); + + if (this.feature_cache) { + const key = `${createAudioCacheKey(audio, this.config.sampling_rate)}:${this.delta_order}:${this.delta_window}:${this.delta_concatenate}`; + const cached = this.feature_cache.get(key); + if (cached) { + return cached; + } + + const extracted = await this._extract(audio); + this.feature_cache.set(key, extracted); + return extracted; + } + + return await this._extract(audio); + } + + async _extract(audio) { + const features = await this._extract_fbank_features(audio); + + const features_length = Math.floor( + (audio.length + Math.floor(this.config.n_fft / 2) * 2 - this.config.n_fft) / this.config.hop_length, + ); + + const features_data = /** @type {Float32Array} */ (features.data); + features_data.fill(0, features_length * features.dims[1]); + + // normalize mel features, ignoring padding + const [num_frames, num_features] = features.dims; + const sum = new Float64Array(num_features); + const sum_sq = new Float64Array(num_features); + + for (let i = 0; i < features_length; ++i) { + const offset = i * num_features; + for (let j = 0; j < num_features; ++j) { + const val = features_data[offset + j]; + sum[j] += val; + sum_sq[j] += val * val; + } + } + + // Calculate mean and standard deviation, then normalize + const divisor = features_length > 1 ? 
features_length - 1 : 1; + for (let j = 0; j < num_features; ++j) { + const mean = sum[j] / features_length; + const variance = (sum_sq[j] - features_length * mean * mean) / divisor; + const std = Math.sqrt(variance) + EPSILON; + const inv_std = 1 / std; + + for (let i = 0; i < features_length; ++i) { + const index = i * num_features + j; + features_data[index] = (features_data[index] - mean) * inv_std; + } + } + + const mask_data = new BigInt64Array(num_frames); + mask_data.fill(1n, 0, features_length); + + let input_features = features.unsqueeze_(0); + const attention_mask = new Tensor('int64', mask_data, [1, num_frames]); + + const result = { + input_features, + attention_mask, + }; + + if (this.delta_order > 0) { + const delta_result = computeTemporalDeltas(input_features, { + order: this.delta_order, + window: this.delta_window, + concatenate: this.delta_concatenate, + }); + if (delta_result instanceof Tensor) { + input_features = delta_result; + result.input_features = input_features; + } else { + result.delta_features = delta_result.delta; + if (delta_result.delta_delta) { + result.delta_delta_features = delta_result.delta_delta; + } + } + } + + return result; + } + + clear_cache() { + this.feature_cache?.clear(); + } + + get_cache_stats() { + return this.feature_cache?.stats() ?? null; + } +} diff --git a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js new file mode 100644 index 000000000..c0874b21b --- /dev/null +++ b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js @@ -0,0 +1,597 @@ +import { AutoConfig } from '../../configs.js'; +import { Tensor } from '../../utils/tensor.js'; +import { PreTrainedModel } from '../modeling_utils.js'; +import { constructSessions, sessionRun } from '../session.js'; +import { buildTransducerWordTimestamps, decodeTransducerText } from './transducer_text.js'; + +const NEMO_CONFORMER_TDT_MODEL_TYPE = 'nemo-conformer-tdt'; + +const DEFAULT_TRANSDUCER_IO = Object.freeze({ + encoder_output: 'outputs', + decoder_encoder: 'encoder_outputs', + decoder_token: 'targets', + decoder_token_length: 'target_length', + decoder_state_1: 'input_states_1', + decoder_state_2: 'input_states_2', + decoder_output: 'outputs', + decoder_output_state_1: 'output_states_1', + decoder_output_state_2: 'output_states_2', +}); + +function argmax(values, offset = 0, length = values.length - offset) { + let maxIndex = offset; + let maxValue = Number.NEGATIVE_INFINITY; + const end = offset + length; + for (let i = offset; i < end; ++i) { + const v = values[i]; + if (v > maxValue) { + maxValue = v; + maxIndex = i; + } + } + return maxIndex; +} + +function toInt(value) { + return typeof value === 'bigint' ? Number(value) : value; +} + +function inferEncoderOutputLayout(outputTensor) { + if (outputTensor.dims.length !== 3 || outputTensor.dims[0] !== 1) { + throw new Error( + `Nemo Conformer TDT expected encoder output dims [1, D, T] or [1, T, D], got [${outputTensor.dims.join(', ')}].`, + ); + } + + // Heuristic fallback: in most Nemo exports D > T. + return outputTensor.dims[1] >= outputTensor.dims[2] ? 'BDT' : 'BTD'; +} + +function resolveTransducerConfig(config, sessions) { + const transducerConfig = config['transformers.js_config']?.transducer; + if (!transducerConfig) { + throw new Error( + 'Missing `transformers.js_config.transducer` in config.json for nemo-conformer-tdt. 
See external model repo contract.', + ); + } + + const decoderConfig = transducerConfig.decoder ?? {}; + const numLayers = decoderConfig.num_layers; + const hiddenSize = decoderConfig.hidden_size; + + if (!Number.isInteger(numLayers) || numLayers <= 0) { + throw new Error('Invalid `transformers.js_config.transducer.decoder.num_layers`: expected a positive integer.'); + } + if (!Number.isInteger(hiddenSize) || hiddenSize <= 0) { + throw new Error( + 'Invalid `transformers.js_config.transducer.decoder.hidden_size`: expected a positive integer.', + ); + } + + const io = { + ...DEFAULT_TRANSDUCER_IO, + ...(transducerConfig.io ?? {}), + }; + + const decoderSession = sessions?.decoder_model_merged; + if (!decoderSession) { + throw new Error('Missing required session `decoder_model_merged` for Nemo Conformer TDT.'); + } + + const decoderInputNames = decoderSession.inputNames ?? []; + const decoderOutputNames = decoderSession.outputNames ?? []; + const missingDecoderInputs = [ + io.decoder_encoder, + io.decoder_token, + io.decoder_token_length, + io.decoder_state_1, + io.decoder_state_2, + ].filter((name) => !decoderInputNames.includes(name)); + + if (missingDecoderInputs.length > 0) { + throw new Error( + `Nemo Conformer TDT decoder session is missing expected inputs: ${missingDecoderInputs.join(', ')}. ` + + 'Override I/O names via `transformers.js_config.transducer.io` if your export uses different names.', + ); + } + const missingDecoderOutputs = [io.decoder_output, io.decoder_output_state_1, io.decoder_output_state_2].filter( + (name) => !decoderOutputNames.includes(name), + ); + if (missingDecoderOutputs.length > 0) { + throw new Error( + `Nemo Conformer TDT decoder session is missing expected outputs: ${missingDecoderOutputs.join(', ')}. ` + + 'Override I/O names via `transformers.js_config.transducer.io` if your export uses different names.', + ); + } + + const encoderSession = sessions?.encoder_model; + if (!encoderSession) { + throw new Error('Missing required session `encoder_model` for Nemo Conformer TDT.'); + } + if (!(encoderSession.outputNames ?? []).includes(io.encoder_output)) { + throw new Error( + `Nemo Conformer TDT encoder session is missing expected output: ${io.encoder_output}. ` + + 'Override `transformers.js_config.transducer.io.encoder_output` if your export uses a different name.', + ); + } + + const maxSymbolsPerStep = transducerConfig.max_symbols_per_step ?? 10; + const subsamplingFactor = transducerConfig.subsampling_factor ?? 8; + const frameShiftS = transducerConfig.frame_shift_s ?? 0.01; + const blankTokenId = transducerConfig.blank_token_id ?? 0; + const decoderTokenDType = transducerConfig.decoder_token_dtype ?? 'int32'; + const decoderTokenLengthDType = transducerConfig.decoder_token_length_dtype ?? 
'int32'; + + if (!Number.isInteger(blankTokenId) || blankTokenId < 0) { + throw new Error('Invalid `transformers.js_config.transducer.blank_token_id`: expected a non-negative integer.'); + } + if (!Number.isInteger(maxSymbolsPerStep) || maxSymbolsPerStep <= 0) { + throw new Error( + 'Invalid `transformers.js_config.transducer.max_symbols_per_step`: expected a positive integer.', + ); + } + if (!Number.isFinite(subsamplingFactor) || subsamplingFactor <= 0) { + throw new Error('Invalid `transformers.js_config.transducer.subsampling_factor`: expected a positive number.'); + } + if (!Number.isFinite(frameShiftS) || frameShiftS <= 0) { + throw new Error('Invalid `transformers.js_config.transducer.frame_shift_s`: expected a positive number.'); + } + if (!['int32', 'int64'].includes(decoderTokenDType)) { + throw new Error( + 'Invalid `transformers.js_config.transducer.decoder_token_dtype`: expected "int32" or "int64".', + ); + } + if (!['int32', 'int64'].includes(decoderTokenLengthDType)) { + throw new Error( + 'Invalid `transformers.js_config.transducer.decoder_token_length_dtype`: expected "int32" or "int64".', + ); + } + + return { + blank_token_id: blankTokenId, + max_symbols_per_step: maxSymbolsPerStep, + subsampling_factor: subsamplingFactor, + frame_shift_s: frameShiftS, + vocab_size: transducerConfig.vocab_size ?? config.vocab_size ?? null, + duration_start_index: transducerConfig.duration_start_index ?? null, + encoder_input_layout: transducerConfig.encoder_input_layout ?? 'BTF', + encoder_output_layout: transducerConfig.encoder_output_layout ?? null, + encoder_frame_layout: transducerConfig.encoder_frame_layout ?? 'BD1', + decoder_token_dtype: decoderTokenDType, + decoder_token_length_dtype: decoderTokenLengthDType, + decoder: { + num_layers: numLayers, + hidden_size: hiddenSize, + }, + io, + }; +} + +export class NemoConformerTDTPreTrainedModel extends PreTrainedModel { + main_input_name = 'input_features'; + forward_params = ['input_features', 'attention_mask']; + + constructor(config, sessions, configs) { + super(config, sessions, configs); + this.transducer = resolveTransducerConfig(config, sessions); + } + + /** + * Load Nemo Conformer TDT sessions using v4 canonical ONNX filenames. + * @type {typeof PreTrainedModel.from_pretrained} + */ + static async from_pretrained( + pretrained_model_name_or_path, + { + progress_callback = null, + config = null, + cache_dir = null, + local_files_only = false, + revision = 'main', + model_file_name = null, + subfolder = 'onnx', + device = null, + dtype = null, + use_external_data_format = null, + session_options = {}, + } = {}, + ) { + const options = { + progress_callback, + config, + cache_dir, + local_files_only, + revision, + model_file_name, + subfolder, + device, + dtype, + use_external_data_format, + session_options, + }; + + config = options.config = await AutoConfig.from_pretrained(pretrained_model_name_or_path, options); + if (config.model_type !== NEMO_CONFORMER_TDT_MODEL_TYPE) { + throw new Error(`Unsupported model type: ${config.model_type}`); + } + + if (options.model_file_name && options.model_file_name !== 'encoder_model') { + throw new Error( + 'NemoConformerForTDT does not support `model_file_name` override. 
' + + 'Expected canonical files: `encoder_model{suffix}.onnx` and `decoder_model_merged{suffix}.onnx`.', + ); + } + + let sessions; + try { + sessions = await constructSessions( + pretrained_model_name_or_path, + { + encoder_model: 'encoder_model', + decoder_model_merged: 'decoder_model_merged', + }, + options, + 'decoder_model_merged', + ); + } catch (error) { + const reason = error?.message ?? String(error); + throw new Error( + 'Failed to load Nemo Conformer TDT sessions. Expected canonical v4 files under `onnx/`: ' + + '`encoder_model{suffix}.onnx` and `decoder_model_merged{suffix}.onnx`. ' + + `Original error: ${reason}`, + ); + } + + return new this(config, sessions, {}); + } +} + +export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { + async _runEncoder(feeds) { + return await sessionRun(this.sessions.encoder_model, feeds); + } + + async _runDecoder(feeds) { + return await sessionRun(this.sessions.decoder_model_merged, feeds); + } + + _disposeDecoderState(state, keepState = null) { + if (!state) return; + if (state.state1 && state.state1 !== keepState?.state1) { + state.state1.dispose(); + } + if (state.state2 && state.state2 !== keepState?.state2) { + state.state2.dispose(); + } + } + + _getEncoderOutput(outputs) { + const name = this.transducer.io.encoder_output; + return outputs[name] ?? Object.values(outputs)[0]; + } + + _encoderOutputToFrames(encoderOutput) { + const layout = this.transducer.encoder_output_layout ?? inferEncoderOutputLayout(encoderOutput); + const dims = encoderOutput.dims; + const data = encoderOutput.data; + const frames = []; + + if (layout === 'BDT') { + const D = dims[1]; + const T = dims[2]; + for (let t = 0; t < T; ++t) { + const frame = new Float32Array(D); + for (let d = 0; d < D; ++d) { + frame[d] = data[d * T + t]; + } + frames.push(frame); + } + return frames; + } + + if (layout === 'BTD') { + const T = dims[1]; + const D = dims[2]; + for (let t = 0; t < T; ++t) { + const offset = t * D; + frames.push(new Float32Array(data.subarray(offset, offset + D))); + } + return frames; + } + + throw new Error( + `Unsupported encoder output layout "${layout}". Use 'BDT' or 'BTD' in transformers.js_config.transducer.`, + ); + } + + _createFrameTensor(frameData) { + const layout = this.transducer.encoder_frame_layout; + if (layout === 'BD1') { + return new Tensor('float32', frameData, [1, frameData.length, 1]); + } else if (layout === 'B1D') { + return new Tensor('float32', frameData, [1, 1, frameData.length]); + } + throw new Error( + `Unsupported encoder frame layout "${layout}". 
Use 'BD1' or 'B1D' in transformers.js_config.transducer.`, + ); + } + + _buildEncoderFeeds(model_inputs) { + const encoderSession = this.sessions.encoder_model; + const feeds = {}; + const disposables = []; + const inputFeatures = model_inputs.input_features; + + if (!(inputFeatures instanceof Tensor)) { + throw new Error( + 'NemoConformerForTDT.transcribe expected `model_inputs.input_features` as a Tensor from the processor.', + ); + } + + const missingInputs = []; + for (const name of encoderSession.inputNames) { + if (model_inputs[name] instanceof Tensor) { + feeds[name] = model_inputs[name]; + continue; + } + + if (name === 'input_features') { + feeds[name] = inputFeatures; + continue; + } + + if (name === 'audio_signal') { + const layout = this.transducer.encoder_input_layout; + if (layout === 'BTF') { + feeds[name] = inputFeatures; + } else if (layout === 'BFT') { + const transposed = inputFeatures.transpose(0, 2, 1); + disposables.push(transposed); + feeds[name] = transposed; + } else { + throw new Error( + `Unsupported encoder input layout "${layout}". Use 'BTF' or 'BFT' in transformers.js_config.transducer.`, + ); + } + continue; + } + + if (name === 'length') { + let length = null; + const attentionMask = model_inputs.attention_mask; + if (attentionMask instanceof Tensor) { + const mask = attentionMask.tolist(); + length = mask[0].reduce((acc, x) => acc + toInt(x), 0); + } else { + length = inputFeatures.dims[1]; + } + const lengthTensor = new Tensor('int64', BigInt64Array.from([BigInt(length)]), [1]); + disposables.push(lengthTensor); + feeds[name] = lengthTensor; + continue; + } + + missingInputs.push(name); + } + + if (missingInputs.length > 0) { + throw new Error( + `Nemo Conformer TDT encoder session expects additional inputs that are not available: ${missingInputs.join(', ')}.`, + ); + } + + return { feeds, disposables }; + } + + _resolveVocabSize(tokenizer) { + if (Number.isInteger(this.transducer.vocab_size) && this.transducer.vocab_size > 0) { + return this.transducer.vocab_size; + } + + if (tokenizer?.get_vocab) { + const size = Object.keys(tokenizer.get_vocab()).length; + if (size > 0) { + return size; + } + } + + throw new Error( + 'Unable to resolve vocabulary size for Nemo Conformer TDT. Set `vocab_size` in config.json or provide tokenizer with a vocab.', + ); + } + + _validateRuntimeConfig(vocabSize) { + if (this.transducer.blank_token_id >= vocabSize) { + throw new Error( + `Invalid Nemo Conformer TDT config: blank_token_id=${this.transducer.blank_token_id} must be < vocab_size=${vocabSize}.`, + ); + } + const durationStart = this.transducer.duration_start_index ?? vocabSize; + if (!Number.isInteger(durationStart) || durationStart < vocabSize) { + throw new Error( + `Invalid Nemo Conformer TDT config: duration_start_index=${durationStart} must be an integer >= vocab_size=${vocabSize}.`, + ); + } + } + + /** + * Transcribe model-ready features using TDT decoding. + * @param {Object} model_inputs Processor outputs (must include `input_features`). + * @param {Object} [decode_options] + * @param {any} [decode_options.tokenizer] Tokenizer used for text reconstruction and word timestamps. 
+ * @param {boolean} [decode_options.return_token_timestamps=true] + * @param {boolean} [decode_options.return_word_timestamps=true] + * @param {boolean} [decode_options.return_utterance_timestamp=true] + * @returns {Promise<{ + * text: string, + * token_ids: number[], + * token_timestamps?: [number, number][], + * word_timestamps?: { text: string, timestamp: [number, number]}[], + * utterance_timestamp?: [number, number], + * }>} + */ + async transcribe( + model_inputs, + { + tokenizer = null, + return_token_timestamps = true, + return_word_timestamps = true, + return_utterance_timestamp = true, + } = {}, + ) { + const io = this.transducer.io; + const vocabSize = this._resolveVocabSize(tokenizer); + this._validateRuntimeConfig(vocabSize); + + const { feeds: encoderFeeds, disposables } = this._buildEncoderFeeds(model_inputs); + let encoderOutputs; + try { + encoderOutputs = await this._runEncoder(encoderFeeds); + } finally { + for (const tensor of disposables) { + tensor.dispose(); + } + } + + const encoderOutput = this._getEncoderOutput(encoderOutputs); + let frames; + try { + frames = this._encoderOutputToFrames(encoderOutput); + } finally { + const seen = new Set(); + for (const value of Object.values(encoderOutputs)) { + if (value instanceof Tensor && !seen.has(value)) { + value.dispose(); + seen.add(value); + } + } + } + const frameTime = this.transducer.subsampling_factor * this.transducer.frame_shift_s; + + const numLayers = this.transducer.decoder.num_layers; + const hiddenSize = this.transducer.decoder.hidden_size; + const blankId = this.transducer.blank_token_id; + const maxSymbolsPerStep = this.transducer.max_symbols_per_step; + + /** @type {number[]} */ + const tokenIds = []; + /** @type {[number, number][]} */ + const tokenTimestamps = []; + + let decoderState = { + state1: new Tensor('float32', new Float32Array(numLayers * hiddenSize), [numLayers, 1, hiddenSize]), + state2: new Tensor('float32', new Float32Array(numLayers * hiddenSize), [numLayers, 1, hiddenSize]), + }; + + const targetLengthTensor = + this.transducer.decoder_token_length_dtype === 'int64' + ? new Tensor('int64', BigInt64Array.from([1n]), [1]) + : new Tensor('int32', new Int32Array([1]), [1]); + let emittedOnFrame = 0; + + try { + for (let frameIndex = 0; frameIndex < frames.length; ) { + const frameTensor = this._createFrameTensor(frames[frameIndex]); + const prevTokenId = tokenIds.length > 0 ? tokenIds[tokenIds.length - 1] : blankId; + const tokenTensor = + this.transducer.decoder_token_dtype === 'int64' + ? new Tensor('int64', BigInt64Array.from([BigInt(prevTokenId)]), [1, 1]) + : new Tensor('int32', new Int32Array([prevTokenId]), [1, 1]); + + const decoderFeeds = { + [io.decoder_encoder]: frameTensor, + [io.decoder_token]: tokenTensor, + [io.decoder_token_length]: targetLengthTensor, + [io.decoder_state_1]: decoderState.state1, + [io.decoder_state_2]: decoderState.state2, + }; + + let decoderOutput; + try { + decoderOutput = await this._runDecoder(decoderFeeds); + } finally { + tokenTensor.dispose(); + frameTensor.dispose(); + } + + const logits = decoderOutput[io.decoder_output] ?? Object.values(decoderOutput)[0]; + const logitsData = logits.data; + if (logitsData.length < vocabSize) { + throw new Error( + `Nemo Conformer TDT decoder output is too small (${logitsData.length}) for vocab_size=${vocabSize}.`, + ); + } + const tokenId = argmax(logitsData, 0, vocabSize); + + const durationStart = this.transducer.duration_start_index ?? 
vocabSize; + const hasDurationLogits = logitsData.length > durationStart; + const step = hasDurationLogits + ? argmax(logitsData, durationStart, logitsData.length - durationStart) - durationStart + : 0; + + const newState = { + state1: decoderOutput[io.decoder_output_state_1] ?? decoderState.state1, + state2: decoderOutput[io.decoder_output_state_2] ?? decoderState.state2, + }; + + if (tokenId !== blankId) { + this._disposeDecoderState(decoderState, newState); + decoderState = newState; + + tokenIds.push(tokenId); + const durationFrames = step > 0 ? step : 1; + tokenTimestamps.push([frameIndex * frameTime, (frameIndex + durationFrames) * frameTime]); + emittedOnFrame += 1; + } else { + this._disposeDecoderState(newState, decoderState); + } + + logits.dispose(); + + if (step > 0) { + frameIndex += step; + emittedOnFrame = 0; + } else if (tokenId === blankId || emittedOnFrame >= maxSymbolsPerStep) { + frameIndex += 1; + emittedOnFrame = 0; + } + } + } finally { + targetLengthTensor.dispose(); + this._disposeDecoderState(decoderState); + } + + const text = decodeTransducerText(tokenizer, tokenIds); + + const result = { + text, + token_ids: tokenIds, + }; + + if (return_token_timestamps) { + result.token_timestamps = tokenTimestamps; + } + + if (return_word_timestamps) { + result.word_timestamps = buildTransducerWordTimestamps(tokenizer, tokenIds, tokenTimestamps); + } + + if (return_utterance_timestamp) { + if (tokenTimestamps.length > 0) { + result.utterance_timestamp = [tokenTimestamps[0][0], tokenTimestamps[tokenTimestamps.length - 1][1]]; + } else { + result.utterance_timestamp = [0, frames.length * frameTime]; + } + } + + return result; + } + + /** + * Runs TDT transcription when called directly. + * @param {Object} model_inputs + */ + async _call(model_inputs) { + return await this.transcribe(model_inputs); + } +} diff --git a/packages/transformers/src/models/nemo_conformer_tdt/processing_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/processing_nemo_conformer_tdt.js new file mode 100644 index 000000000..4c2d0a7eb --- /dev/null +++ b/packages/transformers/src/models/nemo_conformer_tdt/processing_nemo_conformer_tdt.js @@ -0,0 +1,19 @@ +import { AutoFeatureExtractor } from '../auto/feature_extraction_auto.js'; +import { AutoTokenizer } from '../auto/tokenization_auto.js'; +import { Processor } from '../../processing_utils.js'; + +/** + * Processor for Nemo Conformer TDT models. + */ +export class NemoConformerTDTProcessor extends Processor { + static tokenizer_class = AutoTokenizer; + static feature_extractor_class = AutoFeatureExtractor; + + /** + * Preprocess raw audio for Nemo Conformer TDT models. + * @param {Float32Array|Float64Array} audio + */ + async _call(audio) { + return await this.feature_extractor(audio); + } +} diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js new file mode 100644 index 000000000..7f46eeb6d --- /dev/null +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js @@ -0,0 +1,119 @@ +import { Tensor } from '../../utils/tensor.js'; + +/** + * Create a stable hash key for audio samples, used by feature caches. + * @param {Float32Array|Float64Array} audio + * @param {number} [sampling_rate=16000] + * @returns {string} + */ +export function createAudioCacheKey(audio, sampling_rate = 16000) { + // FNV-1a 32-bit over quantized values for deterministic cross-runtime keys. 
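+ // 2166136261 is the 32-bit FNV offset basis and 16777619 the FNV prime; each
+ // value is folded in as an XOR followed by a 32-bit multiply (Math.imul).
+ // Samples are quantized to ~16-bit integers and visited with a stride, so only on
+ // the order of a few thousand samples are hashed regardless of clip length,
+ // trading a small chance of collisions between near-identical waveforms for speed.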
+ let hash = 2166136261; + hash ^= audio.length; + hash = Math.imul(hash, 16777619); + hash ^= sampling_rate; + hash = Math.imul(hash, 16777619); + + // Sample stride hash to keep keying cheap for long audio. + const stride = Math.max(1, Math.floor(audio.length / 4096)); + for (let i = 0; i < audio.length; i += stride) { + const q = (audio[i] * 32768) | 0; + hash ^= q; + hash = Math.imul(hash, 16777619); + } + return `${sampling_rate}:${audio.length}:${(hash >>> 0).toString(16)}`; +} + +/** + * Lightweight LRU cache for extracted features. + * Stores values as-is and tracks approximate memory usage. + */ +export class FeatureLRUCache { + /** + * @param {{max_entries?: number, max_size_mb?: number}} [options] + */ + constructor({ max_entries = 128, max_size_mb = 64 } = {}) { + this.max_entries = max_entries; + this.max_size_mb = max_size_mb; + this.cache = new Map(); + this.current_size_bytes = 0; + } + + /** + * @param {string} key + * @returns {any|null} + */ + get(key) { + const entry = this.cache.get(key); + if (!entry) return null; + this.cache.delete(key); + this.cache.set(key, entry); + return entry.value; + } + + /** + * @param {string} key + * @param {any} value + * @returns {void} + */ + set(key, value) { + const existing = this.cache.get(key); + if (existing) { + this.current_size_bytes -= existing.size_bytes; + this.cache.delete(key); + } + + const size_bytes = estimateSizeBytes(value); + this.cache.set(key, { value, size_bytes }); + this.current_size_bytes += size_bytes; + this._evict(); + } + + clear() { + this.cache.clear(); + this.current_size_bytes = 0; + } + + stats() { + return { + entries: this.cache.size, + size_mb: this.current_size_bytes / (1024 * 1024), + max_entries: this.max_entries, + max_size_mb: this.max_size_mb, + }; + } + + _evict() { + const max_bytes = this.max_size_mb * 1024 * 1024; + while (this.cache.size > this.max_entries || this.current_size_bytes > max_bytes) { + const oldest_key = this.cache.keys().next().value; + if (oldest_key === undefined) break; + const oldest = this.cache.get(oldest_key); + this.cache.delete(oldest_key); + this.current_size_bytes -= oldest?.size_bytes ?? 0; + } + } +} + +function estimateSizeBytes(value) { + if (value instanceof Tensor) { + return value.data?.byteLength ?? 0; + } + if (value?.input_features instanceof Tensor) { + let bytes = value.input_features.data?.byteLength ?? 0; + if (value.attention_mask instanceof Tensor) { + bytes += value.attention_mask.data?.byteLength ?? 0; + } + if (value.delta_features instanceof Tensor) { + bytes += value.delta_features.data?.byteLength ?? 0; + } + if (value.delta_delta_features instanceof Tensor) { + bytes += value.delta_delta_features.data?.byteLength ?? 0; + } + return bytes; + } + if (value?.byteLength) { + return value.byteLength; + } + return 0; +} diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_deltas.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_deltas.js new file mode 100644 index 000000000..80a85f8be --- /dev/null +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_deltas.js @@ -0,0 +1,69 @@ +import { Tensor } from '../../utils/tensor.js'; + +/** + * Compute temporal deltas (and optionally delta-deltas) for [1, T, F] features. 
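+ *
+ * Deltas use the standard regression formula over a symmetric window of size N
+ * (= `window`), with edge frames clamped:
+ *   delta[t][f] = sum_{n=1..N} n * (x[t+n][f] - x[t-n][f]) / (2 * sum_{n=1..N} n^2)
+ * Delta-deltas apply the same operator to the first-order deltas. With
+ * `concatenate=true`, base features and deltas are stacked along the feature axis.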
+ * @param {Tensor} input_features + * @param {{order?: 1|2, window?: number, concatenate?: boolean}} [options] + * @returns {Tensor|{delta: Tensor, delta_delta?: Tensor}} + */ +export function computeTemporalDeltas(input_features, { order = 1, window = 2, concatenate = false } = {}) { + if (!(input_features instanceof Tensor)) { + throw new Error('computeTemporalDeltas expects `input_features` as a Tensor.'); + } + if (input_features.dims.length !== 3 || input_features.dims[0] !== 1) { + throw new Error(`computeTemporalDeltas expects dims [1, T, F], got [${input_features.dims.join(', ')}].`); + } + if (!Number.isInteger(window) || window <= 0) { + throw new Error('computeTemporalDeltas expects `window` to be a positive integer.'); + } + + const [batch, T, F] = input_features.dims; + const base = /** @type {Float32Array} */ (input_features.data); + const delta = new Float32Array(base.length); + const denom = 2 * Array.from({ length: window }, (_, i) => (i + 1) ** 2).reduce((a, b) => a + b, 0); + + const at = (t, f) => base[t * F + f]; + for (let t = 0; t < T; ++t) { + for (let f = 0; f < F; ++f) { + let num = 0; + for (let n = 1; n <= window; ++n) { + const tp = Math.min(T - 1, t + n); + const tm = Math.max(0, t - n); + num += n * (at(tp, f) - at(tm, f)); + } + delta[t * F + f] = num / denom; + } + } + + const delta_tensor = new Tensor('float32', delta, [batch, T, F]); + if (order === 1) { + if (!concatenate) { + return { delta: delta_tensor }; + } + return new Tensor('float32', concatFloat32([base, delta]), [batch, T, F * 2]); + } + + const delta_delta = /** @type {{delta: Tensor}} */ ( + computeTemporalDeltas(delta_tensor, { order: 1, window, concatenate: false }) + ).delta.data; + const delta_delta_tensor = new Tensor('float32', delta_delta, [batch, T, F]); + if (!concatenate) { + return { + delta: delta_tensor, + delta_delta: delta_delta_tensor, + }; + } + + return new Tensor('float32', concatFloat32([base, delta, delta_delta]), [batch, T, F * 3]); +} + +function concatFloat32(items) { + const total = items.reduce((sum, arr) => sum + arr.length, 0); + const output = new Float32Array(total); + let offset = 0; + for (const arr of items) { + output.set(arr, offset); + offset += arr.length; + } + return output; +} diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js new file mode 100644 index 000000000..1234e7d82 --- /dev/null +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js @@ -0,0 +1,73 @@ +/** + * Decode token ids into final transcription text. + * @param {any} tokenizer + * @param {number[]} token_ids + * @returns {string} + */ +export function decodeTransducerText(tokenizer, token_ids) { + if (!tokenizer) return token_ids.join(' '); + return tokenizer.decode(token_ids, { skip_special_tokens: true }).trim(); +} + +/** + * Build word-level timestamps from token ids and token-level timestamps. 
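+ *
+ * Each token id is decoded individually; a piece that begins with whitespace or
+ * the SentencePiece '▁' marker starts a new word, and subsequent pieces are
+ * appended to the current word. A word's span runs from its first token's start
+ * time to its last token's end time.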
+ * @param {any} tokenizer + * @param {number[]} token_ids + * @param {[number, number][]} token_timestamps + * @returns {{ text: string, timestamp: [number, number] }[]} + */ +export function buildTransducerWordTimestamps(tokenizer, token_ids, token_timestamps) { + if (!tokenizer || token_ids.length === 0 || token_timestamps.length === 0) { + return []; + } + + const words = []; + let current = null; + + for (let i = 0; i < token_ids.length; ++i) { + const id = token_ids[i]; + const ts = token_timestamps[i]; + const piece = tokenizer.decode([id], { + skip_special_tokens: true, + clean_up_tokenization_spaces: false, + }); + + if (!piece) continue; + + const startsNewWord = /^\s+/.test(piece) || piece.startsWith('▁'); + const normalizedPiece = piece.replace(/^\s+/, '').replace(/^▁+/, ''); + if (!normalizedPiece) continue; + + if (!current || startsNewWord) { + if (current) { + const text = current.text.trim(); + if (text) { + words.push({ + text, + timestamp: [current.start, current.end], + }); + } + } + current = { + text: normalizedPiece, + start: ts[0], + end: ts[1], + }; + } else { + current.text += normalizedPiece; + current.end = ts[1]; + } + } + + if (current) { + const text = current.text.trim(); + if (text) { + words.push({ + text, + timestamp: [current.start, current.end], + }); + } + } + + return words; +} diff --git a/packages/transformers/src/models/nemo_conformer_tdt/utils_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/utils_nemo_conformer_tdt.js new file mode 100644 index 000000000..24859bc16 --- /dev/null +++ b/packages/transformers/src/models/nemo_conformer_tdt/utils_nemo_conformer_tdt.js @@ -0,0 +1,4 @@ +// Backwards-compatible barrel for older internal imports. +export { createAudioCacheKey, FeatureLRUCache } from './transducer_cache.js'; +export { computeTemporalDeltas } from './transducer_deltas.js'; +export { decodeTransducerText, buildTransducerWordTimestamps } from './transducer_text.js'; diff --git a/packages/transformers/src/models/processors.js b/packages/transformers/src/models/processors.js index e00b1c71a..28b0efca0 100644 --- a/packages/transformers/src/models/processors.js +++ b/packages/transformers/src/models/processors.js @@ -8,6 +8,7 @@ export * from './jina_clip/processing_jina_clip.js'; export * from './llava/processing_llava.js'; export * from './mgp_str/processing_mgp_str.js'; export * from './moonshine/processing_moonshine.js'; +export * from './nemo_conformer_tdt/processing_nemo_conformer_tdt.js'; export * from './owlvit/processing_owlvit.js'; export * from './paligemma/processing_paligemma.js'; export * from './phi3_v/processing_phi3_v.js'; diff --git a/packages/transformers/src/models/registry.js b/packages/transformers/src/models/registry.js index 4f0dad6de..5c08bdded 100644 --- a/packages/transformers/src/models/registry.js +++ b/packages/transformers/src/models/registry.js @@ -41,6 +41,7 @@ const MODEL_MAPPING_NAMES_ENCODER_ONLY = new Map([ ['unispeech-sat', 'UniSpeechSatModel'], ['hubert', 'HubertModel'], ['wavlm', 'WavLMModel'], + ['nemo-conformer-tdt', 'NemoConformerForTDT'], ['audio-spectrogram-transformer', 'ASTModel'], ['vits', 'VitsModel'], ['pyannote', 'PyAnnoteModel'], diff --git a/packages/transformers/src/pipelines.js b/packages/transformers/src/pipelines.js index 60dae8a86..7c2cca700 100644 --- a/packages/transformers/src/pipelines.js +++ b/packages/transformers/src/pipelines.js @@ -40,6 +40,7 @@ import { AutoModelForDepthEstimation, AutoModelForImageFeatureExtraction, } from 
'./models/auto/modeling_auto.js'; +import { NemoConformerForTDT } from './models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js'; import { dispatchCallback } from './utils/core.js'; import { logger } from './utils/logger.js'; @@ -195,7 +196,7 @@ const SUPPORTED_TASKS = Object.freeze({ 'automatic-speech-recognition': { tokenizer: AutoTokenizer, pipeline: AutomaticSpeechRecognitionPipeline, - model: [AutoModelForSpeechSeq2Seq, AutoModelForCTC], + model: [AutoModelForSpeechSeq2Seq, AutoModelForCTC, NemoConformerForTDT], processor: AutoProcessor, default: { // TODO: replace with original diff --git a/packages/transformers/src/pipelines/automatic-speech-recognition.js b/packages/transformers/src/pipelines/automatic-speech-recognition.js index d4ab074a2..06d795af6 100644 --- a/packages/transformers/src/pipelines/automatic-speech-recognition.js +++ b/packages/transformers/src/pipelines/automatic-speech-recognition.js @@ -16,14 +16,21 @@ import { logger } from '../utils/logger.js'; * @property {string} text The recognized text. */ +/** + * @typedef {'utterance' | 'word' | 'token' | 'all'} TimestampGranularity + */ + /** * @typedef {Object} AutomaticSpeechRecognitionOutput * @property {string} text The recognized text. * @property {Chunk[]} [chunks] When using `return_timestamps`, the `chunks` will become a list * containing all the various text chunks identified by the model. + * @property {Chunk[]} [tokens] Optional token-level timestamp chunks for models that support them. + * @property {[number, number]} [utterance] Optional utterance-level timestamp span. * * @typedef {Object} AutomaticSpeechRecognitionSpecificParams Parameters specific to automatic-speech-recognition pipelines. * @property {boolean|'word'} [return_timestamps] Whether to return timestamps or not. Default is `false`. + * @property {TimestampGranularity} [timestamp_granularity] Granularity used when `return_timestamps` is enabled for Parakeet TDT models. Default is `'word'`. * @property {number} [chunk_length_s] The length of audio chunks to process in seconds. Default is 0 (no chunking). * @property {number} [stride_length_s] The length of overlap between consecutive audio chunks in seconds. If not provided, defaults to `chunk_length_s / 6`. * @property {boolean} [force_full_sequences] Whether to force outputting full sequences or not. Default is `false`. @@ -152,6 +159,8 @@ export class AutomaticSpeechRecognitionPipeline case 'hubert': case 'parakeet_ctc': return this._call_wav2vec2(audio, kwargs); + case 'nemo-conformer-tdt': + return this._call_nemo_conformer_tdt(audio, kwargs); case 'moonshine': return this._call_moonshine(audio, kwargs); default: @@ -300,6 +309,136 @@ export class AutomaticSpeechRecognitionPipeline return single ? toReturn[0] : toReturn; } + /** + * @param {any} return_timestamps + * @param {any} timestamp_granularity + * @returns {TimestampGranularity|null} + */ + _normalizeNemoConformerTimestampGranularity(return_timestamps, timestamp_granularity) { + if (!return_timestamps) { + return null; + } + + const granularity = timestamp_granularity ?? 'word'; + const allowed = ['utterance', 'word', 'token', 'all']; + if (!allowed.includes(granularity)) { + throw new Error( + `Invalid \`timestamp_granularity\`: "${granularity}". 
Expected one of: ${allowed.join(', ')}.`, + ); + } + return /** @type {TimestampGranularity} */ (granularity); + } + + /** + * @param {any} result + * @param {TimestampGranularity|null} granularity + * @returns {AutomaticSpeechRecognitionOutput} + */ + _formatNemoConformerTDTResult(result, granularity) { + const text = result.text ?? ''; + if (!granularity) { + return { text }; + } + + const wordChunks = (result.word_timestamps ?? []).map((item) => ({ + text: item.text, + timestamp: item.timestamp, + })); + const tokenChunks = (result.token_timestamps ?? []).map((timestamp, index) => { + const tokenId = result.token_ids?.[index]; + const decodedToken = + tokenId == null + ? '' + : (this.tokenizer?.decode([tokenId], { + skip_special_tokens: true, + clean_up_tokenization_spaces: false, + }) ?? ''); + return { + text: decodedToken || (tokenId == null ? '' : `${tokenId}`), + timestamp, + }; + }); + const utterance = result.utterance_timestamp; + + if (granularity === 'utterance') { + if (!utterance) { + return { text, chunks: [] }; + } + return { + text, + chunks: [{ text, timestamp: utterance }], + }; + } + + if (granularity === 'word') { + return { text, chunks: wordChunks }; + } + + if (granularity === 'token') { + return { text, chunks: tokenChunks }; + } + + return { + text, + chunks: wordChunks, + tokens: tokenChunks, + ...(utterance ? { utterance } : {}), + }; + } + + /** + * Nemo Conformer TDT ASR output rules: + * - `return_timestamps=false`: `{ text }` + * - `timestamp_granularity='utterance'`: `chunks` contains a single utterance span + * - `timestamp_granularity='word'`: `chunks` contains word-level spans + * - `timestamp_granularity='token'`: `chunks` contains token-level spans + * - `timestamp_granularity='all'`: returns `chunks` (word), `tokens`, and `utterance` + */ + async _call_nemo_conformer_tdt(audio, kwargs) { + if (typeof this.model.transcribe !== 'function') { + throw new Error('Nemo Conformer TDT model does not expose a `transcribe` method.'); + } + if (!this.processor) { + throw new Error('Nemo Conformer TDT pipeline requires a processor.'); + } + if (!this.tokenizer) { + throw new Error('Nemo Conformer TDT pipeline requires a tokenizer.'); + } + if (!this.processor.feature_extractor?.config?.sampling_rate) { + throw new Error( + 'Nemo Conformer TDT pipeline requires `processor.feature_extractor.config.sampling_rate` to prepare audio.', + ); + } + + const return_timestamps = kwargs.return_timestamps ?? false; + const withTimestamps = return_timestamps !== false; + const granularity = this._normalizeNemoConformerTimestampGranularity( + withTimestamps, + kwargs.timestamp_granularity, + ); + + const decodeOptions = { + tokenizer: this.tokenizer, + return_token_timestamps: granularity === 'token' || granularity === 'all', + return_word_timestamps: granularity === 'word' || granularity === 'all', + return_utterance_timestamp: granularity === 'utterance' || granularity === 'all', + }; + + const single = !Array.isArray(audio); + const batchedAudio = single ? [audio] : audio; + const sampling_rate = this.processor.feature_extractor.config.sampling_rate; + const preparedAudios = await prepareAudios(batchedAudio, sampling_rate); + + const toReturn = []; + for (const aud of preparedAudios) { + const inputs = await this.processor(aud); + const output = await this.model.transcribe(inputs, decodeOptions); + toReturn.push(this._formatNemoConformerTDTResult(output, granularity)); + } + + return single ? 
toReturn[0] : toReturn; + } + async _call_moonshine(audio, kwargs) { const single = !Array.isArray(audio); const batchedAudio = single ? [audio] : audio; diff --git a/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js b/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js new file mode 100644 index 000000000..cee9568d0 --- /dev/null +++ b/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js @@ -0,0 +1,78 @@ +import { NemoConformerTDTFeatureExtractor } from "../../../src/transformers.js"; + +import { MAX_TEST_EXECUTION_TIME } from "../../init.js"; + +export default () => { + describe("NemoConformerTDTFeatureExtractor", () => { + const base = { + sampling_rate: 16000, + n_fft: 512, + win_length: 400, + hop_length: 160, + preemphasis: 0.97, + }; + + const audio = Float32Array.from({ length: 16000 }, (_, i) => Math.sin((2 * Math.PI * 220 * i) / 16000)); + + it( + "supports 80 mel bins", + async () => { + const extractor = new NemoConformerTDTFeatureExtractor({ ...base, feature_size: 80 }); + const { input_features, attention_mask } = await extractor(audio); + expect(input_features.dims[0]).toBe(1); + expect(input_features.dims[2]).toBe(80); + expect(attention_mask.dims).toEqual([1, input_features.dims[1]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "supports 128 mel bins", + async () => { + const extractor = new NemoConformerTDTFeatureExtractor({ ...base, feature_size: 128 }); + const { input_features, attention_mask } = await extractor(audio); + expect(input_features.dims[0]).toBe(1); + expect(input_features.dims[2]).toBe(128); + expect(attention_mask.dims).toEqual([1, input_features.dims[1]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "supports concatenated delta and delta-delta features", + async () => { + const extractor = new NemoConformerTDTFeatureExtractor({ + ...base, + feature_size: 128, + delta_order: 2, + delta_window: 2, + delta_concatenate: true, + }); + const { input_features } = await extractor(audio); + expect(input_features.dims[2]).toBe(128 * 3); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "uses feature cache when enabled", + async () => { + const extractor = new NemoConformerTDTFeatureExtractor({ + ...base, + feature_size: 80, + use_feature_cache: true, + feature_cache_max_entries: 8, + feature_cache_max_size_mb: 8, + }); + const first = await extractor(audio); + const second = await extractor(audio); + + expect(first).toBe(second); + expect(extractor.get_cache_stats().entries).toBe(1); + extractor.clear_cache(); + expect(extractor.get_cache_stats().entries).toBe(0); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); +}; diff --git a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js new file mode 100644 index 000000000..83a1523e2 --- /dev/null +++ b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js @@ -0,0 +1,188 @@ +import { NemoConformerForTDT, Tensor } from "../../../src/transformers.js"; +import { createAudioCacheKey, FeatureLRUCache } from "../../../src/models/nemo_conformer_tdt/transducer_cache.js"; +import { computeTemporalDeltas } from "../../../src/models/nemo_conformer_tdt/transducer_deltas.js"; + +import { MAX_TEST_EXECUTION_TIME } from "../../init.js"; + +class MockNemoConformerForTDT extends NemoConformerForTDT { + constructor(config, sessions, 
decoderScript) { + super(config, sessions, {}); + this.decoderScript = decoderScript; + this.decoderCalls = 0; + } + + async _runEncoder() { + return { + outputs: new Tensor( + "float32", + new Float32Array([ + // D=2, T=3 (BDT) + 0.1, + 0.2, + 0.3, // d0 over t + 0.4, + 0.5, + 0.6, // d1 over t + ]), + [1, 2, 3], + ), + }; + } + + async _runDecoder() { + const step = this.decoderScript[this.decoderCalls++]; + const stateShape = [1, 1, 2]; + return { + outputs: new Tensor("float32", new Float32Array(step.logits), [1, 1, step.logits.length]), + output_states_1: new Tensor("float32", new Float32Array([this.decoderCalls, 0]), stateShape), + output_states_2: new Tensor("float32", new Float32Array([0, this.decoderCalls]), stateShape), + }; + } +} + +const BASE_SESSIONS = { + encoder_model: { + inputNames: ["input_features"], + outputNames: ["outputs"], + }, + decoder_model_merged: { + inputNames: ["encoder_outputs", "targets", "target_length", "input_states_1", "input_states_2"], + outputNames: ["outputs", "output_states_1", "output_states_2"], + }, +}; + +const BASE_CONFIG = { + model_type: "nemo-conformer-tdt", + "transformers.js_config": { + transducer: { + blank_token_id: 0, + max_symbols_per_step: 2, + subsampling_factor: 4, + frame_shift_s: 0.01, + vocab_size: 3, + duration_start_index: 3, + encoder_output_layout: "BDT", + encoder_frame_layout: "BD1", + decoder: { + num_layers: 1, + hidden_size: 2, + }, + }, + }, +}; + +export default () => { + describe("NemoConformerForTDT", () => { + it( + "greedily decodes scripted token and duration logits", + async () => { + const tokenizer = { + decode(ids) { + const idArray = Array.isArray(ids) ? ids : [ids]; + return idArray + .map((id) => { + if (id === 1 || id === 1n) return " hello"; + if (id === 2 || id === 2n) return " world"; + return ""; + }) + .join(""); + }, + }; + + const model = new MockNemoConformerForTDT(BASE_CONFIG, BASE_SESSIONS, [ + // step 1: emit token=1, duration=0 + { logits: [0.1, 10.0, 0.0, 8.0, 1.0, 0.5] }, + // step 2: emit blank, duration=1 -> move to next frame + { logits: [9.0, 0.0, 0.0, 0.0, 8.0, 0.0] }, + // step 3: emit token=2, duration=2 -> jump to end + { logits: [0.0, 0.0, 10.0, 0.0, 0.0, 9.0] }, + ]); + + const inputs = { + input_features: new Tensor("float32", new Float32Array([0, 0, 0, 0, 0, 0]), [1, 3, 2]), + }; + + const output = await model.transcribe(inputs, { + tokenizer, + return_token_timestamps: true, + return_word_timestamps: true, + return_utterance_timestamp: true, + }); + + expect(output.text).toBe("hello world"); + expect(output.token_ids).toEqual([1, 2]); + expect(output.token_timestamps).toEqual([ + [0, 0.04], + [0.04, 0.12], + ]); + expect(output.word_timestamps).toEqual([ + { text: "hello", timestamp: [0, 0.04] }, + { text: "world", timestamp: [0.04, 0.12] }, + ]); + expect(output.utterance_timestamp).toEqual([0, 0.12]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it("fails fast when transducer config is missing", () => { + const invalidConfig = { model_type: "nemo-conformer-tdt" }; + expect(() => new NemoConformerForTDT(invalidConfig, BASE_SESSIONS, {})).toThrow("Missing `transformers.js_config.transducer`"); + }); + }); + + describe("Nemo Conformer TDT utilities", () => { + it( + "computes delta and delta-delta features", + async () => { + const input = new Tensor( + "float32", + Float32Array.from([ + // T=4, F=2 + 1, 2, 2, 4, 3, 6, 4, 8, + ]), + [1, 4, 2], + ); + + const split = computeTemporalDeltas(input, { order: 2, window: 1, concatenate: false }); + expect(split.delta.dims).toEqual([1, 4, 
2]); + expect(split.delta_delta.dims).toEqual([1, 4, 2]); + + const concat = computeTemporalDeltas(input, { order: 2, window: 1, concatenate: true }); + expect(concat.dims).toEqual([1, 4, 6]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "creates stable audio cache keys", + async () => { + const a = Float32Array.from([0, 0.1, 0.2, 0.3]); + const b = Float32Array.from([0, 0.1, 0.2, 0.4]); + const ka1 = createAudioCacheKey(a, 16000); + const ka2 = createAudioCacheKey(a, 16000); + const kb = createAudioCacheKey(b, 16000); + + expect(ka1).toEqual(ka2); + expect(ka1).not.toEqual(kb); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "evicts least-recently-used entries when full", + async () => { + const cache = new FeatureLRUCache({ max_entries: 2, max_size_mb: 4 }); + cache.set("a", new Tensor("float32", new Float32Array([1, 2, 3]), [1, 3])); + cache.set("b", new Tensor("float32", new Float32Array([4, 5, 6]), [1, 3])); + expect(cache.get("a")).not.toBeNull(); + + cache.set("c", new Tensor("float32", new Float32Array([7, 8, 9]), [1, 3])); + // `b` should be evicted because `a` was recently accessed. + expect(cache.get("b")).toBeNull(); + expect(cache.get("a")).not.toBeNull(); + expect(cache.get("c")).not.toBeNull(); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); +}; diff --git a/packages/transformers/tests/models/parakeet/test_feature_extraction_parakeet.js b/packages/transformers/tests/models/parakeet/test_feature_extraction_parakeet.js new file mode 100644 index 000000000..fab1861d9 --- /dev/null +++ b/packages/transformers/tests/models/parakeet/test_feature_extraction_parakeet.js @@ -0,0 +1,45 @@ +import { ParakeetFeatureExtractor } from "../../../src/transformers.js"; + +import { MAX_TEST_EXECUTION_TIME } from "../../init.js"; + +export default () => { + describe("ParakeetFeatureExtractor", () => { + const config = { + feature_size: 80, + sampling_rate: 16000, + n_fft: 512, + win_length: 400, + hop_length: 160, + preemphasis: 0.97, + }; + + /** @type {ParakeetFeatureExtractor} */ + let feature_extractor; + beforeAll(() => { + feature_extractor = new ParakeetFeatureExtractor(config); + }); + + it( + "extracts normalized features and mask from synthetic audio", + async () => { + const duration_s = 1.0; + const total = Math.floor(config.sampling_rate * duration_s); + const audio = Float32Array.from({ length: total }, (_, i) => Math.sin((2 * Math.PI * 220 * i) / config.sampling_rate)); + + const { input_features, attention_mask } = await feature_extractor(audio); + + expect(input_features.dims[0]).toBe(1); + expect(input_features.dims[2]).toBe(config.feature_size); + expect(attention_mask.dims).toEqual([1, input_features.dims[1]]); + + const validFrames = attention_mask.tolist()[0].reduce((acc, x) => acc + Number(x), 0); + expect(validFrames).toBeGreaterThan(0); + expect(validFrames).toBeLessThanOrEqual(input_features.dims[1]); + + const preview = Array.from(input_features.data.slice(0, 256)); + expect(preview.every(Number.isFinite)).toBe(true); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); +}; diff --git a/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js b/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js index 963306f7f..831a885ac 100644 --- a/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js +++ b/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js @@ -125,5 +125,138 @@ export default () => { await pipe?.dispose(); }, MAX_MODEL_DISPOSE_TIME); }); + + 
describe("nemo-conformer-tdt (unit)", () => { + const makeUnitPipe = (modelType = "nemo-conformer-tdt") => { + const calls = []; + const model = { + config: { model_type: modelType }, + async transcribe(_inputs, options) { + calls.push(options); + return { + text: "hello world", + token_ids: [1, 2], + token_timestamps: [ + [0, 0.04], + [0.04, 0.08], + ], + word_timestamps: [ + { text: "hello", timestamp: [0, 0.04] }, + { text: "world", timestamp: [0.04, 0.08] }, + ], + utterance_timestamp: [0, 0.08], + }; + }, + async dispose() {}, + }; + + const processor = Object.assign(async () => ({ input_features: {} }), { + feature_extractor: { config: { sampling_rate: 16000 } }, + }); + const tokenizer = { + decode(ids) { + const idArray = Array.isArray(ids) ? ids : [ids]; + return idArray + .map((id) => { + if (id === 1 || id === 1n) return " hello"; + if (id === 2 || id === 2n) return " world"; + return ""; + }) + .join(""); + }, + }; + + return { + pipe: new AutomaticSpeechRecognitionPipeline({ + task: PIPELINE_ID, + model, + tokenizer, + processor, + }), + calls, + }; + }; + + it("dispatches to nemo-conformer-tdt path", async () => { + const { pipe, calls } = makeUnitPipe(); + const output = await pipe(new Float32Array(16000), { return_timestamps: false }); + expect(output).toEqual({ text: "hello world" }); + expect(calls).toHaveLength(1); + }); + + it("default timestamps use word granularity", async () => { + const { pipe, calls } = makeUnitPipe(); + const output = await pipe(new Float32Array(16000), { return_timestamps: true }); + expect(output).toEqual({ + text: "hello world", + chunks: [ + { text: "hello", timestamp: [0, 0.04] }, + { text: "world", timestamp: [0.04, 0.08] }, + ], + }); + expect(calls[0]).toMatchObject({ + return_word_timestamps: true, + return_token_timestamps: false, + return_utterance_timestamp: false, + }); + }); + + it("supports utterance granularity", async () => { + const { pipe } = makeUnitPipe(); + const output = await pipe(new Float32Array(16000), { + return_timestamps: true, + timestamp_granularity: "utterance", + }); + expect(output).toEqual({ + text: "hello world", + chunks: [{ text: "hello world", timestamp: [0, 0.08] }], + }); + }); + + it("supports token granularity", async () => { + const { pipe } = makeUnitPipe(); + const output = await pipe(new Float32Array(16000), { + return_timestamps: true, + timestamp_granularity: "token", + }); + expect(output).toEqual({ + text: "hello world", + chunks: [ + { text: " hello", timestamp: [0, 0.04] }, + { text: " world", timestamp: [0.04, 0.08] }, + ], + }); + }); + + it("supports all granularities at once", async () => { + const { pipe } = makeUnitPipe(); + const output = await pipe(new Float32Array(16000), { + return_timestamps: true, + timestamp_granularity: "all", + }); + expect(output).toEqual({ + text: "hello world", + chunks: [ + { text: "hello", timestamp: [0, 0.04] }, + { text: "world", timestamp: [0.04, 0.08] }, + ], + tokens: [ + { text: " hello", timestamp: [0, 0.04] }, + { text: " world", timestamp: [0.04, 0.08] }, + ], + utterance: [0, 0.08], + }); + }); + + it("throws for invalid timestamp granularity", async () => { + const { pipe } = makeUnitPipe(); + await expect( + pipe(new Float32Array(16000), { + return_timestamps: true, + timestamp_granularity: "frame", + }), + ).rejects.toThrow("Invalid `timestamp_granularity`"); + }); + }); }); }; From 964bc8fa79baf04d0f6db17fefde041f0e4e9928 Mon Sep 17 00:00:00 2001 From: ysdede Date: Sat, 28 Feb 2026 23:20:46 +0300 Subject: [PATCH 02/40] fix(nemo-conformer-tdt): 
handle empty token decode output

---
 .../src/models/nemo_conformer_tdt/transducer_text.js | 1 +
 1 file changed, 1 insertion(+)

diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js
index 1234e7d82..c3b15c630 100644
--- a/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js
+++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js
@@ -5,6 +5,7 @@
  * @returns {string}
  */
 export function decodeTransducerText(tokenizer, token_ids) {
+    if (!Array.isArray(token_ids) || token_ids.length === 0) return '';
     if (!tokenizer) return token_ids.join(' ');
     return tokenizer.decode(token_ids, { skip_special_tokens: true }).trim();
 }

From fa9bc25465557deffb915e86dbd573ed4e14680b Mon Sep 17 00:00:00 2001
From: ysdede
Date: Sun, 1 Mar 2026 16:57:38 +0300
Subject: [PATCH 03/40] chore(nemo-conformer-tdt): keep typegen compatibility for transcribe/cache helpers

Carry over non-runtime typing fixes from the prior branch while intentionally excluding the WebGPU disable_prepacking workaround in session.js.

- Cast dynamic model.transcribe access for Nemo TDT pipeline method checks/calls.
- Cast Tensor data byteLength access in transducer cache utilities.
- Add explicit tuple/object JSDoc annotations in transducer timestamp builder.

This keeps main-based v4 work clean with latest ORT-Web on origin/main and avoids retaining the temporary encoder prepacking workaround.
---
 .../src/models/nemo_conformer_tdt/transducer_cache.js | 10 +++++-----
 .../src/models/nemo_conformer_tdt/transducer_text.js  |  4 ++++
 .../src/pipelines/automatic-speech-recognition.js     |  4 ++--
 3 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js
index 7f46eeb6d..3042e9e8f 100644
--- a/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js
+++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js
@@ -97,18 +97,18 @@ export class FeatureLRUCache {
 
 function estimateSizeBytes(value) {
     if (value instanceof Tensor) {
-        return value.data?.byteLength ?? 0;
+        return /** @type {any} */ (value.data)?.byteLength ?? 0;
     }
     if (value?.input_features instanceof Tensor) {
-        let bytes = value.input_features.data?.byteLength ?? 0;
+        let bytes = /** @type {any} */ (value.input_features.data)?.byteLength ?? 0;
         if (value.attention_mask instanceof Tensor) {
-            bytes += value.attention_mask.data?.byteLength ?? 0;
+            bytes += /** @type {any} */ (value.attention_mask.data)?.byteLength ?? 0;
         }
         if (value.delta_features instanceof Tensor) {
-            bytes += value.delta_features.data?.byteLength ?? 0;
+            bytes += /** @type {any} */ (value.delta_features.data)?.byteLength ?? 0;
         }
         if (value.delta_delta_features instanceof Tensor) {
-            bytes += value.delta_delta_features.data?.byteLength ?? 0;
+            bytes += /** @type {any} */ (value.delta_delta_features.data)?.byteLength ?? 
0; } return bytes; } diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js index c3b15c630..34c31e2a3 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js @@ -22,7 +22,9 @@ export function buildTransducerWordTimestamps(tokenizer, token_ids, token_timest return []; } + /** @type {{ text: string, timestamp: [number, number] }[]} */ const words = []; + /** @type {{ text: string, start: number, end: number } | null} */ let current = null; for (let i = 0; i < token_ids.length; ++i) { @@ -45,6 +47,7 @@ export function buildTransducerWordTimestamps(tokenizer, token_ids, token_timest if (text) { words.push({ text, + // Keep tuple shape for TS consumers. timestamp: [current.start, current.end], }); } @@ -65,6 +68,7 @@ export function buildTransducerWordTimestamps(tokenizer, token_ids, token_timest if (text) { words.push({ text, + // Keep tuple shape for TS consumers. timestamp: [current.start, current.end], }); } diff --git a/packages/transformers/src/pipelines/automatic-speech-recognition.js b/packages/transformers/src/pipelines/automatic-speech-recognition.js index 06d795af6..3333063e4 100644 --- a/packages/transformers/src/pipelines/automatic-speech-recognition.js +++ b/packages/transformers/src/pipelines/automatic-speech-recognition.js @@ -395,7 +395,7 @@ export class AutomaticSpeechRecognitionPipeline * - `timestamp_granularity='all'`: returns `chunks` (word), `tokens`, and `utterance` */ async _call_nemo_conformer_tdt(audio, kwargs) { - if (typeof this.model.transcribe !== 'function') { + if (typeof /** @type {any} */ (this.model).transcribe !== 'function') { throw new Error('Nemo Conformer TDT model does not expose a `transcribe` method.'); } if (!this.processor) { @@ -432,7 +432,7 @@ export class AutomaticSpeechRecognitionPipeline const toReturn = []; for (const aud of preparedAudios) { const inputs = await this.processor(aud); - const output = await this.model.transcribe(inputs, decodeOptions); + const output = await /** @type {any} */ (this.model).transcribe(inputs, decodeOptions); toReturn.push(this._formatNemoConformerTDTResult(output, granularity)); } From 63aeee8347f73df0d8c75c51771f81967cb3c180 Mon Sep 17 00:00:00 2001 From: ysdede Date: Sun, 1 Mar 2026 19:34:00 +0300 Subject: [PATCH 04/40] refactor(nemo-conformer-tdt): redesign transcribe output shape and API - Replace legacy per-feature flags (return_token_timestamps, return_word_timestamps, return_utterance_timestamp) with a layered API: return_timestamps (utterance-level), return_words, return_tokens - Merge duplicate outputs: words absorbs word_timestamps, tokens absorbs token_timestamps and token_ids - Add per-token confidence, word-level confidence aggregation, utterance_confidence, and confidence_scores summary - Gate frame confidences behind returnFrameConfidences flag - Add return_metrics with encode/decode/total timing and RTF - Add debug flags: returnFrameIndices, returnLogProbs, returnTdtSteps - Fix vocab Map handling in getIdToTokenMap and _resolveVocabSize (tokenizer.get_vocab() returns Map in WASM binding) - Update ASR pipeline to wire timestamp_granularity to new model flags - Format all changed files with Prettier per CONTRIBUTING.md --- .../modeling_nemo_conformer_tdt.js | 208 +++++++++++++++--- .../nemo_conformer_tdt/transducer_text.js | 147 ++++++++++--- .../utils_nemo_conformer_tdt.js | 2 +- 
.../pipelines/automatic-speech-recognition.js | 41 ++-- 4 files changed, 315 insertions(+), 83 deletions(-) diff --git a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js index c0874b21b..766c189d8 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js @@ -2,7 +2,7 @@ import { AutoConfig } from '../../configs.js'; import { Tensor } from '../../utils/tensor.js'; import { PreTrainedModel } from '../modeling_utils.js'; import { constructSessions, sessionRun } from '../session.js'; -import { buildTransducerWordTimestamps, decodeTransducerText } from './transducer_text.js'; +import { buildTransducerDetailedOutputs, decodeTransducerText } from './transducer_text.js'; const NEMO_CONFORMER_TDT_MODEL_TYPE = 'nemo-conformer-tdt'; @@ -36,6 +36,42 @@ function toInt(value) { return typeof value === 'bigint' ? Number(value) : value; } +function nowMs() { + return typeof performance !== 'undefined' && typeof performance.now === 'function' ? performance.now() : Date.now(); +} + +function roundMetric(value, digits = 2) { + if (!Number.isFinite(value)) return 0; + const factor = 10 ** digits; + return Math.round(value * factor) / factor; +} + +/** + * @param {Float32Array|number[]} logits + * @param {number} tokenId + * @param {number} vocabSize + * @returns {{ confidence: number, logProb: number }} + */ +function confidenceFromLogits(logits, tokenId, vocabSize) { + let maxLogit = Number.NEGATIVE_INFINITY; + for (let i = 0; i < vocabSize; ++i) { + if (logits[i] > maxLogit) { + maxLogit = logits[i]; + } + } + + let expSum = 0; + for (let i = 0; i < vocabSize; ++i) { + expSum += Math.exp(logits[i] - maxLogit); + } + const logSumExp = maxLogit + Math.log(expSum); + const logProb = logits[tokenId] - logSumExp; + return { + confidence: Math.exp(logProb), + logProb, + }; +} + function inferEncoderOutputLayout(outputTensor) { if (outputTensor.dims.length !== 3 || outputTensor.dims[0] !== 1) { throw new Error( @@ -389,7 +425,8 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { } if (tokenizer?.get_vocab) { - const size = Object.keys(tokenizer.get_vocab()).length; + const vocab = tokenizer.get_vocab(); + const size = vocab instanceof Map ? vocab.size : Object.keys(vocab).length; if (size > 0) { return size; } @@ -416,35 +453,63 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { /** * Transcribe model-ready features using TDT decoding. + * + * - `return_timestamps: false` → `{ text, is_final }` (+ metrics if `return_metrics`) + * - `return_timestamps: true` → adds `utterance_confidence`, `utterance_timestamp`, `confidence_scores` + * - `return_words: true` (requires `return_timestamps`) → adds `words` list + * - `return_tokens: true` (requires `return_timestamps`) → adds `tokens` list + * - `return_metrics` is independent and can be combined with either level. + * - Debug flags (`returnFrameConfidences`, `returnFrameIndices`, `returnLogProbs`, `returnTdtSteps`) are independent. + * * @param {Object} model_inputs Processor outputs (must include `input_features`). * @param {Object} [decode_options] - * @param {any} [decode_options.tokenizer] Tokenizer used for text reconstruction and word timestamps. 
- * @param {boolean} [decode_options.return_token_timestamps=true] - * @param {boolean} [decode_options.return_word_timestamps=true] - * @param {boolean} [decode_options.return_utterance_timestamp=true] + * @param {any} [decode_options.tokenizer] Tokenizer for text reconstruction and word boundaries. + * @param {boolean} [decode_options.return_timestamps=true] Include utterance-level timestamps and confidence averages. + * @param {boolean} [decode_options.return_words=false] Include word-level list (requires return_timestamps). + * @param {boolean} [decode_options.return_tokens=false] Include token-level list (requires return_timestamps). + * @param {boolean} [decode_options.return_metrics=false] Include encoding/decoding timing metrics. + * @param {boolean} [decode_options.returnFrameConfidences=false] Include per-frame confidence scores in confidence_scores. + * @param {boolean} [decode_options.returnFrameIndices=false] Include per-token encoder frame indices. + * @param {boolean} [decode_options.returnLogProbs=false] Include per-token log probabilities. + * @param {boolean} [decode_options.returnTdtSteps=false] Include raw TDT duration steps. + * @param {number} [decode_options.timeOffset=0] Offset added to all timestamps (seconds). * @returns {Promise<{ * text: string, - * token_ids: number[], - * token_timestamps?: [number, number][], - * word_timestamps?: { text: string, timestamp: [number, number]}[], + * is_final: boolean, + * utterance_confidence?: number, * utterance_timestamp?: [number, number], + * words?: Array<{ text: string, start_time: number, end_time: number, confidence?: number }>, + * tokens?: Array<{ id: number, token: string, raw_token: string, is_word_start: boolean, start_time: number, end_time: number, confidence?: number }>, + * confidence_scores?: { token_avg: number|null, word_avg: number|null, frame: number[]|null, frame_avg: number|null, overall_log_prob: number|null }, + * metrics?: { preprocess_ms: number, encode_ms: number, decode_ms: number, tokenize_ms: number, total_ms: number, rtf: number, rtf_x: number }, + * frameIndices?: number[] | null, + * logProbs?: number[] | null, + * tdtSteps?: number[] | null, * }>} */ async transcribe( model_inputs, { tokenizer = null, - return_token_timestamps = true, - return_word_timestamps = true, - return_utterance_timestamp = true, + return_timestamps = true, + return_words = false, + return_tokens = false, + return_metrics = false, + returnFrameConfidences = false, + returnFrameIndices = false, + returnLogProbs = false, + returnTdtSteps = false, + timeOffset = 0, } = {}, ) { + const totalStart = nowMs(); const io = this.transducer.io; const vocabSize = this._resolveVocabSize(tokenizer); this._validateRuntimeConfig(vocabSize); const { feeds: encoderFeeds, disposables } = this._buildEncoderFeeds(model_inputs); let encoderOutputs; + const encodeStart = nowMs(); try { encoderOutputs = await this._runEncoder(encoderFeeds); } finally { @@ -452,6 +517,7 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { tensor.dispose(); } } + const encodeMs = nowMs() - encodeStart; const encoderOutput = this._getEncoderOutput(encoderOutputs); let frames; @@ -473,10 +539,22 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { const blankId = this.transducer.blank_token_id; const maxSymbolsPerStep = this.transducer.max_symbols_per_step; + const needConfidences = !!return_timestamps; + /** @type {number[]} */ const tokenIds = []; /** @type {[number, number][]} */ const tokenTimestamps = []; + 
/** @type {number[] | null} */ + const tokenConfidences = needConfidences ? [] : null; + /** @type {number[] | null} */ + const frameConfidences = returnFrameConfidences ? [] : null; + /** @type {number[] | null} */ + const frameIndices = returnFrameIndices ? [] : null; + /** @type {number[] | null} */ + const logProbs = returnLogProbs || needConfidences ? [] : null; + /** @type {number[] | null} */ + const tdtSteps = returnTdtSteps ? [] : null; let decoderState = { state1: new Tensor('float32', new Float32Array(numLayers * hiddenSize), [numLayers, 1, hiddenSize]), @@ -488,6 +566,7 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { ? new Tensor('int64', BigInt64Array.from([1n]), [1]) : new Tensor('int32', new Int32Array([1]), [1]); let emittedOnFrame = 0; + const decodeStart = nowMs(); try { for (let frameIndex = 0; frameIndex < frames.length; ) { @@ -522,12 +601,22 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { ); } const tokenId = argmax(logitsData, 0, vocabSize); - const durationStart = this.transducer.duration_start_index ?? vocabSize; const hasDurationLogits = logitsData.length > durationStart; const step = hasDurationLogits ? argmax(logitsData, durationStart, logitsData.length - durationStart) - durationStart : 0; + if (tdtSteps) { + tdtSteps.push(step); + } + + const maybeConfidence = + needConfidences || returnLogProbs || returnFrameConfidences + ? confidenceFromLogits(logitsData, tokenId, vocabSize) + : null; + if (frameConfidences && maybeConfidence) { + frameConfidences.push(maybeConfidence.confidence); + } const newState = { state1: decoderOutput[io.decoder_output_state_1] ?? decoderState.state1, @@ -540,7 +629,19 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { tokenIds.push(tokenId); const durationFrames = step > 0 ? step : 1; - tokenTimestamps.push([frameIndex * frameTime, (frameIndex + durationFrames) * frameTime]); + tokenTimestamps.push([ + frameIndex * frameTime + timeOffset, + (frameIndex + durationFrames) * frameTime + timeOffset, + ]); + if (tokenConfidences && maybeConfidence) { + tokenConfidences.push(maybeConfidence.confidence); + } + if (frameIndices) { + frameIndices.push(frameIndex); + } + if (logProbs && maybeConfidence) { + logProbs.push(maybeConfidence.logProb); + } emittedOnFrame += 1; } else { this._disposeDecoderState(newState, decoderState); @@ -560,28 +661,77 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { targetLengthTensor.dispose(); this._disposeDecoderState(decoderState); } + const decodeMs = nowMs() - decodeStart; + const tokenizeStart = nowMs(); const text = decodeTransducerText(tokenizer, tokenIds); + const needDetailed = return_timestamps && (return_words || return_tokens); + const detailed = needDetailed + ? buildTransducerDetailedOutputs(tokenizer, tokenIds, tokenTimestamps, tokenConfidences) + : null; + const tokenizeMs = nowMs() - tokenizeStart; + + /** @type {any} */ + const result = { text, is_final: true }; + + if (return_timestamps) { + result.utterance_confidence = + tokenConfidences && tokenConfidences.length > 0 + ? tokenConfidences.reduce((a, b) => a + b, 0) / tokenConfidences.length + : null; + + result.utterance_timestamp = + tokenTimestamps.length > 0 + ? 
/** @type {[number, number]} */ ([ + tokenTimestamps[0][0], + tokenTimestamps[tokenTimestamps.length - 1][1], + ]) + : /** @type {[number, number]} */ ([timeOffset, frames.length * frameTime + timeOffset]); + + if (detailed) { + if (return_words) result.words = detailed.words; + if (return_tokens) result.tokens = detailed.tokens; + } - const result = { - text, - token_ids: tokenIds, - }; - - if (return_token_timestamps) { - result.token_timestamps = tokenTimestamps; + result.confidence_scores = { + token_avg: result.utterance_confidence, + word_avg: detailed?.word_avg ?? null, + overall_log_prob: + logProbs && logProbs.length > 0 ? logProbs.reduce((a, b) => a + b, 0) / logProbs.length : null, + }; + + if (frameConfidences && frameConfidences.length > 0) { + result.confidence_scores.frame = frameConfidences; + result.confidence_scores.frame_avg = + frameConfidences.reduce((a, b) => a + b, 0) / frameConfidences.length; + } } - if (return_word_timestamps) { - result.word_timestamps = buildTransducerWordTimestamps(tokenizer, tokenIds, tokenTimestamps); + if (returnFrameIndices) { + result.frameIndices = frameIndices; + } + if (returnLogProbs) { + result.logProbs = logProbs; + } + if (returnTdtSteps) { + result.tdtSteps = tdtSteps; } - if (return_utterance_timestamp) { - if (tokenTimestamps.length > 0) { - result.utterance_timestamp = [tokenTimestamps[0][0], tokenTimestamps[tokenTimestamps.length - 1][1]]; - } else { - result.utterance_timestamp = [0, frames.length * frameTime]; - } + if (return_metrics) { + const totalMs = nowMs() - totalStart; + const utteranceDuration = result.utterance_timestamp + ? Math.max(result.utterance_timestamp[1] - result.utterance_timestamp[0], 1e-8) + : Math.max(frames.length * frameTime, 1e-8); + const rtf = totalMs / 1000 / utteranceDuration; + result.metrics = { + preprocess_ms: 0.0, + encode_ms: roundMetric(encodeMs, 2), + decode_ms: roundMetric(decodeMs, 2), + tokenize_ms: roundMetric(tokenizeMs, 2), + total_ms: roundMetric(totalMs, 2), + rtf: roundMetric(rtf, 4), + rtf_x: roundMetric(1 / Math.max(rtf, 1e-8), 2), + }; } return result; diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js index 34c31e2a3..0550f323f 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js @@ -1,3 +1,60 @@ +/** + * Cache tokenizer id->token maps for stable and fast boundary detection. + * @type {WeakMap>} + */ +const TOKEN_ID_TO_TEXT_CACHE = new WeakMap(); + +/** + * @param {any} tokenizer + * @returns {Map} + */ +function getIdToTokenMap(tokenizer) { + let cached = TOKEN_ID_TO_TEXT_CACHE.get(tokenizer); + if (cached) return cached; + + cached = new Map(); + if (tokenizer?.get_vocab) { + const vocab = tokenizer.get_vocab(); + // get_vocab() may return a Map or a plain Object depending on the tokenizer backend. + const entries = vocab instanceof Map ? vocab.entries() : Object.entries(vocab); + for (const [token, id] of entries) { + if (Number.isInteger(id)) { + cached.set(id, token); + } + } + } + TOKEN_ID_TO_TEXT_CACHE.set(tokenizer, cached); + return cached; +} + +/** + * Resolve per-token text and word boundary metadata in a tokenizer-agnostic way. + * Uses raw vocab token (if available) for boundary markers, and decoded token text for display. 
+ * @param {any} tokenizer + * @param {number} id + * @returns {{ raw: string, clean: string, startsNewWord: boolean }} + */ +function resolveTokenPiece(tokenizer, id) { + const rawToken = getIdToTokenMap(tokenizer).get(id) ?? ''; + const decoded = tokenizer.decode([id], { + skip_special_tokens: true, + clean_up_tokenization_spaces: false, + }); + + // SentencePiece/BPE boundary markers used by common tokenizers. + const startsWithBoundaryMarker = /^(?:▁|Ġ)+/.test(rawToken); + const startsWithWhitespace = /^\s+/.test(decoded); + const startsNewWord = startsWithBoundaryMarker || startsWithWhitespace; + + // Human readable token text. + let clean = decoded.replace(/^\s+/, ''); + if (!clean) { + clean = rawToken.replace(/^(?:▁|Ġ|Ċ)+/, '').replace(/^ +/, ''); + } + + return { raw: rawToken || decoded, clean, startsNewWord }; +} + /** * Decode token ids into final transcription text. * @param {any} tokenizer @@ -11,68 +68,106 @@ export function decodeTransducerText(tokenizer, token_ids) { } /** - * Build word-level timestamps from token ids and token-level timestamps. + * Build detailed word/token outputs with optional confidence aggregation. * @param {any} tokenizer * @param {number[]} token_ids * @param {[number, number][]} token_timestamps - * @returns {{ text: string, timestamp: [number, number] }[]} + * @param {number[] | null} token_confidences + * @returns {{ + * words: Array<{ text: string, start_time: number, end_time: number, confidence?: number }>, + * tokens: Array<{ token: string, raw_token: string, is_word_start: boolean, start_time: number, end_time: number, confidence?: number }>, + * word_confidences: number[] | null, + * word_avg: number | null, + * }} */ -export function buildTransducerWordTimestamps(tokenizer, token_ids, token_timestamps) { +export function buildTransducerDetailedOutputs(tokenizer, token_ids, token_timestamps, token_confidences = null) { if (!tokenizer || token_ids.length === 0 || token_timestamps.length === 0) { - return []; + return { words: [], tokens: [], word_confidences: null, word_avg: null }; } - /** @type {{ text: string, timestamp: [number, number] }[]} */ + /** @type {Array<{ id: number, token: string, raw_token: string, is_word_start: boolean, start_time: number, end_time: number, confidence?: number }>} */ + const tokens = []; + /** @type {Array<{ text: string, start_time: number, end_time: number, confidence?: number }>} */ const words = []; - /** @type {{ text: string, start: number, end: number } | null} */ + + /** @type {{ text: string, start: number, end: number, confs: number[] } | null} */ let current = null; for (let i = 0; i < token_ids.length; ++i) { const id = token_ids[i]; const ts = token_timestamps[i]; - const piece = tokenizer.decode([id], { - skip_special_tokens: true, - clean_up_tokenization_spaces: false, - }); - - if (!piece) continue; + const piece = resolveTokenPiece(tokenizer, id); + const raw = piece.raw; + const startsNewWord = piece.startsNewWord; + const clean = piece.clean; + if (!clean) continue; - const startsNewWord = /^\s+/.test(piece) || piece.startsWith('▁'); - const normalizedPiece = piece.replace(/^\s+/, '').replace(/^▁+/, ''); - if (!normalizedPiece) continue; + const tok = { + id, + token: clean, + raw_token: raw, + is_word_start: startsNewWord, + start_time: ts[0], + end_time: ts[1], + }; + const conf = token_confidences?.[i]; + if (conf != null && Number.isFinite(conf)) { + tok.confidence = conf; + } + tokens.push(tok); if (!current || startsNewWord) { if (current) { const text = current.text.trim(); if 
(text) { - words.push({ + /** @type {{ text: string, start_time: number, end_time: number, confidence?: number }} */ + const word = { text, - // Keep tuple shape for TS consumers. - timestamp: [current.start, current.end], - }); + start_time: current.start, + end_time: current.end, + }; + if (current.confs.length > 0) { + word.confidence = current.confs.reduce((a, b) => a + b, 0) / current.confs.length; + } + words.push(word); } } current = { - text: normalizedPiece, + text: clean, start: ts[0], end: ts[1], + confs: conf != null && Number.isFinite(conf) ? [conf] : [], }; } else { - current.text += normalizedPiece; + current.text += clean; current.end = ts[1]; + if (conf != null && Number.isFinite(conf)) { + current.confs.push(conf); + } } } if (current) { const text = current.text.trim(); if (text) { - words.push({ + /** @type {{ text: string, start_time: number, end_time: number, confidence?: number }} */ + const word = { text, - // Keep tuple shape for TS consumers. - timestamp: [current.start, current.end], - }); + start_time: current.start, + end_time: current.end, + }; + if (current.confs.length > 0) { + word.confidence = current.confs.reduce((a, b) => a + b, 0) / current.confs.length; + } + words.push(word); } } - return words; + const word_confidences = words.some((x) => x.confidence != null) ? words.map((x) => x.confidence ?? 0) : null; + const word_avg = + word_confidences && word_confidences.length > 0 + ? word_confidences.reduce((a, b) => a + b, 0) / word_confidences.length + : null; + + return { words, tokens, word_confidences, word_avg }; } diff --git a/packages/transformers/src/models/nemo_conformer_tdt/utils_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/utils_nemo_conformer_tdt.js index 24859bc16..935336b1e 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/utils_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/utils_nemo_conformer_tdt.js @@ -1,4 +1,4 @@ // Backwards-compatible barrel for older internal imports. export { createAudioCacheKey, FeatureLRUCache } from './transducer_cache.js'; export { computeTemporalDeltas } from './transducer_deltas.js'; -export { decodeTransducerText, buildTransducerWordTimestamps } from './transducer_text.js'; +export { decodeTransducerText, buildTransducerDetailedOutputs } from './transducer_text.js'; diff --git a/packages/transformers/src/pipelines/automatic-speech-recognition.js b/packages/transformers/src/pipelines/automatic-speech-recognition.js index 3333063e4..5a2d088b3 100644 --- a/packages/transformers/src/pipelines/automatic-speech-recognition.js +++ b/packages/transformers/src/pipelines/automatic-speech-recognition.js @@ -340,24 +340,14 @@ export class AutomaticSpeechRecognitionPipeline return { text }; } - const wordChunks = (result.word_timestamps ?? []).map((item) => ({ - text: item.text, - timestamp: item.timestamp, + const wordChunks = (result.words ?? []).map((w) => ({ + text: w.text, + timestamp: [w.start_time, w.end_time], + })); + const tokenChunks = (result.tokens ?? []).map((t) => ({ + text: t.token ?? t.text ?? '', + timestamp: [t.start_time, t.end_time], })); - const tokenChunks = (result.token_timestamps ?? []).map((timestamp, index) => { - const tokenId = result.token_ids?.[index]; - const decodedToken = - tokenId == null - ? '' - : (this.tokenizer?.decode([tokenId], { - skip_special_tokens: true, - clean_up_tokenization_spaces: false, - }) ?? ''); - return { - text: decodedToken || (tokenId == null ? 
'' : `${tokenId}`), - timestamp, - }; - }); const utterance = result.utterance_timestamp; if (granularity === 'utterance') { @@ -389,13 +379,10 @@ export class AutomaticSpeechRecognitionPipeline /** * Nemo Conformer TDT ASR output rules: * - `return_timestamps=false`: `{ text }` - * - `timestamp_granularity='utterance'`: `chunks` contains a single utterance span - * - `timestamp_granularity='word'`: `chunks` contains word-level spans - * - `timestamp_granularity='token'`: `chunks` contains token-level spans - * - `timestamp_granularity='all'`: returns `chunks` (word), `tokens`, and `utterance` + * - `return_timestamps=true`: return full raw model transcription payload. */ async _call_nemo_conformer_tdt(audio, kwargs) { - if (typeof /** @type {any} */ (this.model).transcribe !== 'function') { + if (typeof (/** @type {any} */ (this.model).transcribe) !== 'function') { throw new Error('Nemo Conformer TDT model does not expose a `transcribe` method.'); } if (!this.processor) { @@ -411,17 +398,17 @@ export class AutomaticSpeechRecognitionPipeline } const return_timestamps = kwargs.return_timestamps ?? false; - const withTimestamps = return_timestamps !== false; const granularity = this._normalizeNemoConformerTimestampGranularity( - withTimestamps, + return_timestamps, kwargs.timestamp_granularity, ); + const withTimestamps = granularity !== null; const decodeOptions = { tokenizer: this.tokenizer, - return_token_timestamps: granularity === 'token' || granularity === 'all', - return_word_timestamps: granularity === 'word' || granularity === 'all', - return_utterance_timestamp: granularity === 'utterance' || granularity === 'all', + return_timestamps: withTimestamps, + return_words: granularity === 'word' || granularity === 'all', + return_tokens: granularity === 'token' || granularity === 'all', }; const single = !Array.isArray(audio); From f6835ad84a98036204028fc9ce02f55e9ec2047e Mon Sep 17 00:00:00 2001 From: ysdede Date: Sun, 1 Mar 2026 21:01:10 +0300 Subject: [PATCH 05/40] fix(nemo-conformer-tdt): round timestamps and confidences, simplify pipeline - Add roundTs() for millisecond-precision timestamp rounding at source - Round all confidence averages to 6 decimal places - Round per-token and per-word confidence values - Remove timestamp_granularity and formatting helpers from pipeline - Pipeline returns model.transcribe() output directly - Auto-enable return_words and return_metrics when return_timestamps is true --- .../modeling_nemo_conformer_tdt.js | 27 +++-- .../nemo_conformer_tdt/transducer_text.js | 10 +- .../pipelines/automatic-speech-recognition.js | 98 +++---------------- 3 files changed, 36 insertions(+), 99 deletions(-) diff --git a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js index 766c189d8..2c48ef8e4 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js @@ -46,6 +46,10 @@ function roundMetric(value, digits = 2) { return Math.round(value * factor) / factor; } +function roundTs(value) { + return Math.round(value * 1000) / 1000; +} + /** * @param {Float32Array|number[]} logits * @param {number} tokenId @@ -630,8 +634,8 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { tokenIds.push(tokenId); const durationFrames = step > 0 ? 
step : 1; tokenTimestamps.push([ - frameIndex * frameTime + timeOffset, - (frameIndex + durationFrames) * frameTime + timeOffset, + roundTs(frameIndex * frameTime + timeOffset), + roundTs((frameIndex + durationFrames) * frameTime + timeOffset), ]); if (tokenConfidences && maybeConfidence) { tokenConfidences.push(maybeConfidence.confidence); @@ -677,7 +681,7 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { if (return_timestamps) { result.utterance_confidence = tokenConfidences && tokenConfidences.length > 0 - ? tokenConfidences.reduce((a, b) => a + b, 0) / tokenConfidences.length + ? roundMetric(tokenConfidences.reduce((a, b) => a + b, 0) / tokenConfidences.length, 6) : null; result.utterance_timestamp = @@ -686,7 +690,10 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { tokenTimestamps[0][0], tokenTimestamps[tokenTimestamps.length - 1][1], ]) - : /** @type {[number, number]} */ ([timeOffset, frames.length * frameTime + timeOffset]); + : /** @type {[number, number]} */ ([ + roundTs(timeOffset), + roundTs(frames.length * frameTime + timeOffset), + ]); if (detailed) { if (return_words) result.words = detailed.words; @@ -695,15 +702,19 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { result.confidence_scores = { token_avg: result.utterance_confidence, - word_avg: detailed?.word_avg ?? null, + word_avg: detailed?.word_avg != null ? roundMetric(detailed.word_avg, 6) : null, overall_log_prob: - logProbs && logProbs.length > 0 ? logProbs.reduce((a, b) => a + b, 0) / logProbs.length : null, + logProbs && logProbs.length > 0 + ? roundMetric(logProbs.reduce((a, b) => a + b, 0) / logProbs.length, 6) + : null, }; if (frameConfidences && frameConfidences.length > 0) { result.confidence_scores.frame = frameConfidences; - result.confidence_scores.frame_avg = - frameConfidences.reduce((a, b) => a + b, 0) / frameConfidences.length; + result.confidence_scores.frame_avg = roundMetric( + frameConfidences.reduce((a, b) => a + b, 0) / frameConfidences.length, + 6, + ); } } diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js index 0550f323f..e98938224 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js @@ -112,7 +112,7 @@ export function buildTransducerDetailedOutputs(tokenizer, token_ids, token_times }; const conf = token_confidences?.[i]; if (conf != null && Number.isFinite(conf)) { - tok.confidence = conf; + tok.confidence = Math.round(conf * 1e6) / 1e6; } tokens.push(tok); @@ -127,7 +127,8 @@ export function buildTransducerDetailedOutputs(tokenizer, token_ids, token_times end_time: current.end, }; if (current.confs.length > 0) { - word.confidence = current.confs.reduce((a, b) => a + b, 0) / current.confs.length; + word.confidence = + Math.round((current.confs.reduce((a, b) => a + b, 0) / current.confs.length) * 1e6) / 1e6; } words.push(word); } @@ -157,7 +158,8 @@ export function buildTransducerDetailedOutputs(tokenizer, token_ids, token_times end_time: current.end, }; if (current.confs.length > 0) { - word.confidence = current.confs.reduce((a, b) => a + b, 0) / current.confs.length; + word.confidence = + Math.round((current.confs.reduce((a, b) => a + b, 0) / current.confs.length) * 1e6) / 1e6; } words.push(word); } @@ -166,7 +168,7 @@ export function buildTransducerDetailedOutputs(tokenizer, token_ids, token_times 
const word_confidences = words.some((x) => x.confidence != null) ? words.map((x) => x.confidence ?? 0) : null; const word_avg = word_confidences && word_confidences.length > 0 - ? word_confidences.reduce((a, b) => a + b, 0) / word_confidences.length + ? Math.round((word_confidences.reduce((a, b) => a + b, 0) / word_confidences.length) * 1e6) / 1e6 : null; return { words, tokens, word_confidences, word_avg }; diff --git a/packages/transformers/src/pipelines/automatic-speech-recognition.js b/packages/transformers/src/pipelines/automatic-speech-recognition.js index 5a2d088b3..4194bdb9a 100644 --- a/packages/transformers/src/pipelines/automatic-speech-recognition.js +++ b/packages/transformers/src/pipelines/automatic-speech-recognition.js @@ -16,21 +16,14 @@ import { logger } from '../utils/logger.js'; * @property {string} text The recognized text. */ -/** - * @typedef {'utterance' | 'word' | 'token' | 'all'} TimestampGranularity - */ - /** * @typedef {Object} AutomaticSpeechRecognitionOutput * @property {string} text The recognized text. * @property {Chunk[]} [chunks] When using `return_timestamps`, the `chunks` will become a list * containing all the various text chunks identified by the model. - * @property {Chunk[]} [tokens] Optional token-level timestamp chunks for models that support them. - * @property {[number, number]} [utterance] Optional utterance-level timestamp span. * * @typedef {Object} AutomaticSpeechRecognitionSpecificParams Parameters specific to automatic-speech-recognition pipelines. * @property {boolean|'word'} [return_timestamps] Whether to return timestamps or not. Default is `false`. - * @property {TimestampGranularity} [timestamp_granularity] Granularity used when `return_timestamps` is enabled for Parakeet TDT models. Default is `'word'`. * @property {number} [chunk_length_s] The length of audio chunks to process in seconds. Default is 0 (no chunking). * @property {number} [stride_length_s] The length of overlap between consecutive audio chunks in seconds. If not provided, defaults to `chunk_length_s / 6`. * @property {boolean} [force_full_sequences] Whether to force outputting full sequences or not. Default is `false`. @@ -310,76 +303,12 @@ export class AutomaticSpeechRecognitionPipeline } /** - * @param {any} return_timestamps - * @param {any} timestamp_granularity - * @returns {TimestampGranularity|null} - */ - _normalizeNemoConformerTimestampGranularity(return_timestamps, timestamp_granularity) { - if (!return_timestamps) { - return null; - } - - const granularity = timestamp_granularity ?? 'word'; - const allowed = ['utterance', 'word', 'token', 'all']; - if (!allowed.includes(granularity)) { - throw new Error( - `Invalid \`timestamp_granularity\`: "${granularity}". Expected one of: ${allowed.join(', ')}.`, - ); - } - return /** @type {TimestampGranularity} */ (granularity); - } - - /** - * @param {any} result - * @param {TimestampGranularity|null} granularity - * @returns {AutomaticSpeechRecognitionOutput} - */ - _formatNemoConformerTDTResult(result, granularity) { - const text = result.text ?? ''; - if (!granularity) { - return { text }; - } - - const wordChunks = (result.words ?? []).map((w) => ({ - text: w.text, - timestamp: [w.start_time, w.end_time], - })); - const tokenChunks = (result.tokens ?? []).map((t) => ({ - text: t.token ?? t.text ?? 
'', - timestamp: [t.start_time, t.end_time], - })); - const utterance = result.utterance_timestamp; - - if (granularity === 'utterance') { - if (!utterance) { - return { text, chunks: [] }; - } - return { - text, - chunks: [{ text, timestamp: utterance }], - }; - } - - if (granularity === 'word') { - return { text, chunks: wordChunks }; - } - - if (granularity === 'token') { - return { text, chunks: tokenChunks }; - } - - return { - text, - chunks: wordChunks, - tokens: tokenChunks, - ...(utterance ? { utterance } : {}), - }; - } - - /** - * Nemo Conformer TDT ASR output rules: - * - `return_timestamps=false`: `{ text }` - * - `return_timestamps=true`: return full raw model transcription payload. + * Nemo Conformer TDT ASR pipeline. + * + * Delegates to model.transcribe() and returns its output directly. + * Use `return_timestamps: true` on the pipeline call to get utterance-level data. + * For words/tokens/metrics/debug, call model.transcribe() directly with the + * extended options (return_words, return_tokens, return_metrics, etc.). */ async _call_nemo_conformer_tdt(audio, kwargs) { if (typeof (/** @type {any} */ (this.model).transcribe) !== 'function') { @@ -397,18 +326,13 @@ export class AutomaticSpeechRecognitionPipeline ); } - const return_timestamps = kwargs.return_timestamps ?? false; - const granularity = this._normalizeNemoConformerTimestampGranularity( - return_timestamps, - kwargs.timestamp_granularity, - ); - const withTimestamps = granularity !== null; + const return_timestamps = !!(kwargs.return_timestamps ?? false); const decodeOptions = { tokenizer: this.tokenizer, - return_timestamps: withTimestamps, - return_words: granularity === 'word' || granularity === 'all', - return_tokens: granularity === 'token' || granularity === 'all', + return_timestamps, + return_words: return_timestamps, + return_metrics: true, }; const single = !Array.isArray(audio); @@ -420,7 +344,7 @@ export class AutomaticSpeechRecognitionPipeline for (const aud of preparedAudios) { const inputs = await this.processor(aud); const output = await /** @type {any} */ (this.model).transcribe(inputs, decodeOptions); - toReturn.push(this._formatNemoConformerTDTResult(output, granularity)); + toReturn.push(output); } return single ? toReturn[0] : toReturn; From 2dd36a117147269b93ffea945b0f4076959c211f Mon Sep 17 00:00:00 2001 From: ysdede Date: Sun, 1 Mar 2026 21:13:20 +0300 Subject: [PATCH 06/40] fix: dispose tensors on error path, decouple frame confidences from timestamps, honor return_metrics kwarg - modeling_nemo_conformer_tdt: dispose logits and new decoder state tensors before throwing when logitsData.length < vocabSize to prevent resource leak - modeling_nemo_conformer_tdt: move returnFrameConfidences output block outside the return_timestamps guard so frame/frame_avg are emitted independently - automatic-speech-recognition: change return_metrics from hardcoded true to kwargs.return_metrics ?? 
false to respect user intent and avoid overhead --- .../modeling_nemo_conformer_tdt.js | 47 +++++++++++-------- .../pipelines/automatic-speech-recognition.js | 4 +- 2 files changed, 30 insertions(+), 21 deletions(-) diff --git a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js index 2c48ef8e4..8a46bb014 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js @@ -131,7 +131,7 @@ function resolveTransducerConfig(config, sessions) { if (missingDecoderInputs.length > 0) { throw new Error( `Nemo Conformer TDT decoder session is missing expected inputs: ${missingDecoderInputs.join(', ')}. ` + - 'Override I/O names via `transformers.js_config.transducer.io` if your export uses different names.', + 'Override I/O names via `transformers.js_config.transducer.io` if your export uses different names.', ); } const missingDecoderOutputs = [io.decoder_output, io.decoder_output_state_1, io.decoder_output_state_2].filter( @@ -140,7 +140,7 @@ function resolveTransducerConfig(config, sessions) { if (missingDecoderOutputs.length > 0) { throw new Error( `Nemo Conformer TDT decoder session is missing expected outputs: ${missingDecoderOutputs.join(', ')}. ` + - 'Override I/O names via `transformers.js_config.transducer.io` if your export uses different names.', + 'Override I/O names via `transformers.js_config.transducer.io` if your export uses different names.', ); } @@ -151,7 +151,7 @@ function resolveTransducerConfig(config, sessions) { if (!(encoderSession.outputNames ?? []).includes(io.encoder_output)) { throw new Error( `Nemo Conformer TDT encoder session is missing expected output: ${io.encoder_output}. ` + - 'Override `transformers.js_config.transducer.io.encoder_output` if your export uses a different name.', + 'Override `transformers.js_config.transducer.io.encoder_output` if your export uses a different name.', ); } @@ -258,7 +258,7 @@ export class NemoConformerTDTPreTrainedModel extends PreTrainedModel { if (options.model_file_name && options.model_file_name !== 'encoder_model') { throw new Error( 'NemoConformerForTDT does not support `model_file_name` override. ' + - 'Expected canonical files: `encoder_model{suffix}.onnx` and `decoder_model_merged{suffix}.onnx`.', + 'Expected canonical files: `encoder_model{suffix}.onnx` and `decoder_model_merged{suffix}.onnx`.', ); } @@ -277,8 +277,8 @@ export class NemoConformerTDTPreTrainedModel extends PreTrainedModel { const reason = error?.message ?? String(error); throw new Error( 'Failed to load Nemo Conformer TDT sessions. Expected canonical v4 files under `onnx/`: ' + - '`encoder_model{suffix}.onnx` and `decoder_model_merged{suffix}.onnx`. ' + - `Original error: ${reason}`, + '`encoder_model{suffix}.onnx` and `decoder_model_merged{suffix}.onnx`. ' + + `Original error: ${reason}`, ); } @@ -573,7 +573,7 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { const decodeStart = nowMs(); try { - for (let frameIndex = 0; frameIndex < frames.length; ) { + for (let frameIndex = 0; frameIndex < frames.length;) { const frameTensor = this._createFrameTensor(frames[frameIndex]); const prevTokenId = tokenIds.length > 0 ? 
tokenIds[tokenIds.length - 1] : blankId; const tokenTensor = @@ -600,6 +600,11 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { const logits = decoderOutput[io.decoder_output] ?? Object.values(decoderOutput)[0]; const logitsData = logits.data; if (logitsData.length < vocabSize) { + logits.dispose(); + this._disposeDecoderState({ + state1: decoderOutput[io.decoder_output_state_1], + state2: decoderOutput[io.decoder_output_state_2], + }); throw new Error( `Nemo Conformer TDT decoder output is too small (${logitsData.length}) for vocab_size=${vocabSize}.`, ); @@ -687,13 +692,13 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { result.utterance_timestamp = tokenTimestamps.length > 0 ? /** @type {[number, number]} */ ([ - tokenTimestamps[0][0], - tokenTimestamps[tokenTimestamps.length - 1][1], - ]) + tokenTimestamps[0][0], + tokenTimestamps[tokenTimestamps.length - 1][1], + ]) : /** @type {[number, number]} */ ([ - roundTs(timeOffset), - roundTs(frames.length * frameTime + timeOffset), - ]); + roundTs(timeOffset), + roundTs(frames.length * frameTime + timeOffset), + ]); if (detailed) { if (return_words) result.words = detailed.words; @@ -708,14 +713,18 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { ? roundMetric(logProbs.reduce((a, b) => a + b, 0) / logProbs.length, 6) : null, }; + } - if (frameConfidences && frameConfidences.length > 0) { - result.confidence_scores.frame = frameConfidences; - result.confidence_scores.frame_avg = roundMetric( - frameConfidences.reduce((a, b) => a + b, 0) / frameConfidences.length, - 6, - ); + // Frame confidences are independent of return_timestamps — emit whenever requested. + if (returnFrameConfidences && frameConfidences && frameConfidences.length > 0) { + if (!result.confidence_scores) { + result.confidence_scores = {}; } + result.confidence_scores.frame = frameConfidences; + result.confidence_scores.frame_avg = roundMetric( + frameConfidences.reduce((a, b) => a + b, 0) / frameConfidences.length, + 6, + ); } if (returnFrameIndices) { diff --git a/packages/transformers/src/pipelines/automatic-speech-recognition.js b/packages/transformers/src/pipelines/automatic-speech-recognition.js index 4194bdb9a..77592f5a0 100644 --- a/packages/transformers/src/pipelines/automatic-speech-recognition.js +++ b/packages/transformers/src/pipelines/automatic-speech-recognition.js @@ -332,7 +332,7 @@ export class AutomaticSpeechRecognitionPipeline tokenizer: this.tokenizer, return_timestamps, return_words: return_timestamps, - return_metrics: true, + return_metrics: kwargs.return_metrics ?? false, }; const single = !Array.isArray(audio); @@ -365,7 +365,7 @@ export class AutomaticSpeechRecognitionPipeline const max_new_tokens = Math.floor(aud.length / sampling_rate) * 6; const outputs = await this.model.generate({ max_new_tokens, ...kwargs, ...inputs }); - const text = this.processor.batch_decode(/** @type {Tensor} */ (outputs), { skip_special_tokens: true })[0]; + const text = this.processor.batch_decode(/** @type {Tensor} */(outputs), { skip_special_tokens: true })[0]; toReturn.push({ text }); } return single ? 
toReturn[0] : toReturn; From 39d9be4f47f087d56c3f1c06db638aebf520e143 Mon Sep 17 00:00:00 2001 From: ysdede Date: Sun, 1 Mar 2026 21:28:15 +0300 Subject: [PATCH 07/40] feat: integrate NemoConformerTDT with ModelRegistry API - Add MODEL_TYPES.NemoConformerTDT (id=16) to modeling_utils - Register NemoConformerForTDT in MODEL_TYPE_MAPPING, MODEL_NAME_TO_CLASS_MAPPING, and MODEL_CLASS_TO_NAME_MAPPING so the base class from_pretrained, ModelRegistry, and is_pipeline_cached all recognise the model correctly - Add NemoConformerTDT case to get_model_files so progress_callback receives accurate file size totals for encoder_model.onnx + decoder_model_merged.onnx --- .../transformers/src/models/modeling_utils.js | 3 ++- .../modeling_nemo_conformer_tdt.js | 16 +++++++++++++++- .../src/utils/model_registry/get_model_files.js | 7 +++++-- 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/packages/transformers/src/models/modeling_utils.js b/packages/transformers/src/models/modeling_utils.js index d4c5d6d32..ef6e5a99a 100644 --- a/packages/transformers/src/models/modeling_utils.js +++ b/packages/transformers/src/models/modeling_utils.js @@ -98,6 +98,7 @@ export const MODEL_TYPES = { ImageAudioTextToText: 13, Supertonic: 14, Chatterbox: 15, + NemoConformerTDT: 16, }; const MODEL_TYPE_CONFIG = { @@ -857,7 +858,7 @@ export class PreTrainedModel extends Callable { if (inputs) { throw new Error( '`inputs`: {inputs}` were passed alongside {input_name} which is not allowed. ' + - 'Make sure to either pass {inputs} or {input_name}=...', + 'Make sure to either pass {inputs} or {input_name}=...', ); } } else { diff --git a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js index 8a46bb014..1dceb2947 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js @@ -1,6 +1,12 @@ import { AutoConfig } from '../../configs.js'; import { Tensor } from '../../utils/tensor.js'; -import { PreTrainedModel } from '../modeling_utils.js'; +import { + PreTrainedModel, + MODEL_TYPES, + MODEL_TYPE_MAPPING, + MODEL_NAME_TO_CLASS_MAPPING, + MODEL_CLASS_TO_NAME_MAPPING, +} from '../modeling_utils.js'; import { constructSessions, sessionRun } from '../session.js'; import { buildTransducerDetailedOutputs, decodeTransducerText } from './transducer_text.js'; @@ -765,3 +771,11 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { return await this.transcribe(model_inputs); } } + +// Register with ModelRegistry so get_model_files / progress_callback enumerate +// the correct ONNX files: encoder_model + decoder_model_merged. 
+MODEL_TYPE_MAPPING.set('nemo-conformer-tdt', MODEL_TYPES.NemoConformerTDT); +MODEL_NAME_TO_CLASS_MAPPING.set('NemoConformerTDTPreTrainedModel', NemoConformerTDTPreTrainedModel); +MODEL_NAME_TO_CLASS_MAPPING.set('NemoConformerForTDT', NemoConformerForTDT); +MODEL_CLASS_TO_NAME_MAPPING.set(NemoConformerTDTPreTrainedModel, 'NemoConformerTDTPreTrainedModel'); +MODEL_CLASS_TO_NAME_MAPPING.set(NemoConformerForTDT, 'NemoConformerForTDT'); diff --git a/packages/transformers/src/utils/model_registry/get_model_files.js b/packages/transformers/src/utils/model_registry/get_model_files.js index d60dbfd0a..b0cf029e1 100644 --- a/packages/transformers/src/utils/model_registry/get_model_files.js +++ b/packages/transformers/src/utils/model_registry/get_model_files.js @@ -71,8 +71,8 @@ export async function get_model_files( const archList = architectures.length > 0 ? architectures.join(', ') : '(none)'; logger.warn( `[get_model_files] Architecture(s) not found in MODEL_TYPE_MAPPING: [${archList}] ` + - `for model type '${config.model_type}'. Falling back to EncoderOnly (single model.onnx file). ` + - `If you encounter issues, please report at: ${GITHUB_ISSUE_URL}`, + `for model type '${config.model_type}'. Falling back to EncoderOnly (single model.onnx file). ` + + `If you encounter issues, please report at: ${GITHUB_ISSUE_URL}`, ); // Always fallback to EncoderOnly (single model.onnx file) @@ -166,6 +166,9 @@ export async function get_model_files( add_model_file('model', 'language_model'); add_model_file('conditional_decoder'); files.push('generation_config.json'); + } else if (modelType === MODEL_TYPES.NemoConformerTDT) { + add_model_file('model', 'encoder_model'); + add_model_file('decoder_model_merged'); } else if (modelType === MODEL_TYPES.AutoEncoder) { add_model_file('encoder_model'); add_model_file('decoder_model'); From 3d984e527c733de5aa5a5d0785acd33317baf84c Mon Sep 17 00:00:00 2001 From: ysdede Date: Sun, 1 Mar 2026 21:58:16 +0300 Subject: [PATCH 08/40] style: replace console.warn with logger.warn in feature extractor Standardizes internal logging to follow the upstream convention introduced in ModelRegistry refactor. --- .../feature_extraction_nemo_conformer_tdt.js | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js index f1bfe6b76..0bde33d54 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js @@ -1,6 +1,7 @@ import { FeatureExtractor, validate_audio_inputs } from '../../feature_extraction_utils.js'; import { Tensor } from '../../utils/tensor.js'; import { mel_filter_bank, spectrogram, window_function } from '../../utils/audio.js'; +import { logger } from '../../utils/logger.js'; import { FeatureLRUCache, createAudioCacheKey } from './transducer_cache.js'; import { computeTemporalDeltas } from './transducer_deltas.js'; @@ -47,17 +48,17 @@ export class NemoConformerTDTFeatureExtractor extends FeatureExtractor { ); } if (this.delta_order > 0 && !this.delta_concatenate) { - console.warn( + logger.warn( 'NemoConformerTDTFeatureExtractor: `delta_concatenate=false` is set. 
' + - '`input_features` will remain base features and deltas are returned in extra fields.', + '`input_features` will remain base features and deltas are returned in extra fields.', ); } this.feature_cache = this.use_feature_cache ? new FeatureLRUCache({ - max_entries: this.config.feature_cache_max_entries ?? 128, - max_size_mb: this.config.feature_cache_max_size_mb ?? 64, - }) + max_entries: this.config.feature_cache_max_entries ?? 128, + max_size_mb: this.config.feature_cache_max_size_mb ?? 64, + }) : null; } From 9f3a220284d43bf9a859f3e9d4190bd2636832d0 Mon Sep 17 00:00:00 2001 From: ysdede Date: Sun, 1 Mar 2026 22:22:28 +0300 Subject: [PATCH 09/40] fix(nemo-conformer-tdt): harden edge cases, restore pipeline design - Guard feature extractor against empty/short audio (NaN prevention) - Move decoder tensor init inside try block for safe disposal - Add architecture key to MODEL_TYPE_MAPPING - Add input validation in buildTransducerDetailedOutputs - Harden audio cache hash against NaN samples - Add order validation in computeTemporalDeltas - Restore pipeline: return_timestamps truthy => words + metrics always on --- packages/transformers/src/models/models.js | 2 +- .../feature_extraction_nemo_conformer_tdt.js | 33 +++++++++++-------- .../modeling_nemo_conformer_tdt.js | 29 +++++++++------- .../nemo_conformer_tdt/transducer_cache.js | 3 +- .../nemo_conformer_tdt/transducer_deltas.js | 10 ++++-- .../nemo_conformer_tdt/transducer_text.js | 10 ++++++ .../pipelines/automatic-speech-recognition.js | 4 +-- 7 files changed, 59 insertions(+), 32 deletions(-) diff --git a/packages/transformers/src/models/models.js b/packages/transformers/src/models/models.js index 2fe9055a0..0cac26e2a 100644 --- a/packages/transformers/src/models/models.js +++ b/packages/transformers/src/models/models.js @@ -102,8 +102,8 @@ export * from './mpt/modeling_mpt.js'; export * from './mt5/modeling_mt5.js'; export * from './multi_modality/modeling_multi_modality.js'; export * from './musicgen/modeling_musicgen.js'; -export * from './nemo_conformer_tdt/modeling_nemo_conformer_tdt.js'; export * from './nanochat/modeling_nanochat.js'; +export * from './nemo_conformer_tdt/modeling_nemo_conformer_tdt.js'; export * from './neobert/modeling_neobert.js'; export * from './nomic_bert/modeling_nomic_bert.js'; export * from './olmo/modeling_olmo.js'; diff --git a/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js index 0bde33d54..e230ff50c 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js @@ -129,15 +129,17 @@ export class NemoConformerTDTFeatureExtractor extends FeatureExtractor { async _extract(audio) { const features = await this._extract_fbank_features(audio); - const features_length = Math.floor( + const [num_frames, num_features] = features.dims; + const raw_features_length = Math.floor( (audio.length + Math.floor(this.config.n_fft / 2) * 2 - this.config.n_fft) / this.config.hop_length, ); + // Clamp to [0, num_frames] to avoid a negative fill offset for very short clips. 
+ const features_length = Math.max(0, Math.min(num_frames, raw_features_length)); const features_data = /** @type {Float32Array} */ (features.data); - features_data.fill(0, features_length * features.dims[1]); + features_data.fill(0, features_length * num_features); // normalize mel features, ignoring padding - const [num_frames, num_features] = features.dims; const sum = new Float64Array(num_features); const sum_sq = new Float64Array(num_features); @@ -150,17 +152,20 @@ export class NemoConformerTDTFeatureExtractor extends FeatureExtractor { } } - // Calculate mean and standard deviation, then normalize - const divisor = features_length > 1 ? features_length - 1 : 1; - for (let j = 0; j < num_features; ++j) { - const mean = sum[j] / features_length; - const variance = (sum_sq[j] - features_length * mean * mean) / divisor; - const std = Math.sqrt(variance) + EPSILON; - const inv_std = 1 / std; - - for (let i = 0; i < features_length; ++i) { - const index = i * num_features + j; - features_data[index] = (features_data[index] - mean) * inv_std; + // Skip normalization for empty/very short audio to avoid NaN from divide-by-zero. + if (features_length > 0) { + // Calculate mean and standard deviation, then normalize + const divisor = features_length > 1 ? features_length - 1 : 1; + for (let j = 0; j < num_features; ++j) { + const mean = sum[j] / features_length; + const variance = (sum_sq[j] - features_length * mean * mean) / divisor; + const std = Math.sqrt(variance) + EPSILON; + const inv_std = 1 / std; + + for (let i = 0; i < features_length; ++i) { + const index = i * num_features + j; + features_data[index] = (features_data[index] - mean) * inv_std; + } } } diff --git a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js index 1dceb2947..be25afe42 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js @@ -566,19 +566,23 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { /** @type {number[] | null} */ const tdtSteps = returnTdtSteps ? [] : null; - let decoderState = { - state1: new Tensor('float32', new Float32Array(numLayers * hiddenSize), [numLayers, 1, hiddenSize]), - state2: new Tensor('float32', new Float32Array(numLayers * hiddenSize), [numLayers, 1, hiddenSize]), - }; + let decoderState; + let targetLengthTensor; - const targetLengthTensor = - this.transducer.decoder_token_length_dtype === 'int64' - ? new Tensor('int64', BigInt64Array.from([1n]), [1]) - : new Tensor('int32', new Int32Array([1]), [1]); let emittedOnFrame = 0; const decodeStart = nowMs(); try { + decoderState = { + state1: new Tensor('float32', new Float32Array(numLayers * hiddenSize), [numLayers, 1, hiddenSize]), + state2: new Tensor('float32', new Float32Array(numLayers * hiddenSize), [numLayers, 1, hiddenSize]), + }; + + targetLengthTensor = + this.transducer.decoder_token_length_dtype === 'int64' + ? new Tensor('int64', BigInt64Array.from([1n]), [1]) + : new Tensor('int32', new Int32Array([1]), [1]); + for (let frameIndex = 0; frameIndex < frames.length;) { const frameTensor = this._createFrameTensor(frames[frameIndex]); const prevTokenId = tokenIds.length > 0 ? 
tokenIds[tokenIds.length - 1] : blankId; @@ -643,6 +647,8 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { decoderState = newState; tokenIds.push(tokenId); + // TDT duration convention: step=0 means "stay on current frame" (duration index 0 = no advance). + // We still associate the token with this frame, so durationFrames is at least 1. const durationFrames = step > 0 ? step : 1; tokenTimestamps.push([ roundTs(frameIndex * frameTime + timeOffset), @@ -673,8 +679,8 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { } } } finally { - targetLengthTensor.dispose(); - this._disposeDecoderState(decoderState); + if (targetLengthTensor) targetLengthTensor.dispose(); + if (decoderState) this._disposeDecoderState(decoderState); } const decodeMs = nowMs() - decodeStart; @@ -774,7 +780,8 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { // Register with ModelRegistry so get_model_files / progress_callback enumerate // the correct ONNX files: encoder_model + decoder_model_merged. -MODEL_TYPE_MAPPING.set('nemo-conformer-tdt', MODEL_TYPES.NemoConformerTDT); +MODEL_TYPE_MAPPING.set('nemo-conformer-tdt', MODEL_TYPES.NemoConformerTDT); // model_type key +MODEL_TYPE_MAPPING.set('NemoConformerForTDT', MODEL_TYPES.NemoConformerTDT); // architecture key MODEL_NAME_TO_CLASS_MAPPING.set('NemoConformerTDTPreTrainedModel', NemoConformerTDTPreTrainedModel); MODEL_NAME_TO_CLASS_MAPPING.set('NemoConformerForTDT', NemoConformerForTDT); MODEL_CLASS_TO_NAME_MAPPING.set(NemoConformerTDTPreTrainedModel, 'NemoConformerTDTPreTrainedModel'); diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js index 3042e9e8f..5a3af06ef 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js @@ -17,7 +17,8 @@ export function createAudioCacheKey(audio, sampling_rate = 16000) { // Sample stride hash to keep keying cheap for long audio. const stride = Math.max(1, Math.floor(audio.length / 4096)); for (let i = 0; i < audio.length; i += stride) { - const q = (audio[i] * 32768) | 0; + const sample = Number.isFinite(audio[i]) ? 
audio[i] : 0; + const q = Math.max(-32768, Math.min(32767, Math.round(sample * 32768))); hash ^= q; hash = Math.imul(hash, 16777619); } diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_deltas.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_deltas.js index 80a85f8be..1abe48139 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/transducer_deltas.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_deltas.js @@ -16,6 +16,9 @@ export function computeTemporalDeltas(input_features, { order = 1, window = 2, c if (!Number.isInteger(window) || window <= 0) { throw new Error('computeTemporalDeltas expects `window` to be a positive integer.'); } + if (order !== 1 && order !== 2) { + throw new Error('computeTemporalDeltas expects `order` to be 1 or 2.'); + } const [batch, T, F] = input_features.dims; const base = /** @type {Float32Array} */ (input_features.data); @@ -43,10 +46,10 @@ export function computeTemporalDeltas(input_features, { order = 1, window = 2, c return new Tensor('float32', concatFloat32([base, delta]), [batch, T, F * 2]); } - const delta_delta = /** @type {{delta: Tensor}} */ ( + const recursive_result = /** @type {{delta: Tensor}} */ ( computeTemporalDeltas(delta_tensor, { order: 1, window, concatenate: false }) - ).delta.data; - const delta_delta_tensor = new Tensor('float32', delta_delta, [batch, T, F]); + ); + const delta_delta_tensor = recursive_result.delta; if (!concatenate) { return { delta: delta_tensor, @@ -54,6 +57,7 @@ export function computeTemporalDeltas(input_features, { order = 1, window = 2, c }; } + const delta_delta = /** @type {Float32Array} */ (delta_delta_tensor.data); return new Tensor('float32', concatFloat32([base, delta, delta_delta]), [batch, T, F * 3]); } diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js index e98938224..33729d394 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js @@ -84,6 +84,16 @@ export function buildTransducerDetailedOutputs(tokenizer, token_ids, token_times if (!tokenizer || token_ids.length === 0 || token_timestamps.length === 0) { return { words: [], tokens: [], word_confidences: null, word_avg: null }; } + if (token_ids.length !== token_timestamps.length) { + throw new Error( + `buildTransducerDetailedOutputs expects equal lengths for token_ids (${token_ids.length}) and token_timestamps (${token_timestamps.length}).`, + ); + } + if (token_confidences && token_confidences.length !== token_ids.length) { + throw new Error( + `buildTransducerDetailedOutputs expects token_confidences length (${token_confidences.length}) to match token_ids length (${token_ids.length}).`, + ); + } /** @type {Array<{ id: number, token: string, raw_token: string, is_word_start: boolean, start_time: number, end_time: number, confidence?: number }>} */ const tokens = []; diff --git a/packages/transformers/src/pipelines/automatic-speech-recognition.js b/packages/transformers/src/pipelines/automatic-speech-recognition.js index 77592f5a0..882dfb68f 100644 --- a/packages/transformers/src/pipelines/automatic-speech-recognition.js +++ b/packages/transformers/src/pipelines/automatic-speech-recognition.js @@ -326,13 +326,13 @@ export class AutomaticSpeechRecognitionPipeline ); } - const return_timestamps = !!(kwargs.return_timestamps ?? 
false); + const return_timestamps = !!(kwargs.return_timestamps); const decodeOptions = { tokenizer: this.tokenizer, return_timestamps, return_words: return_timestamps, - return_metrics: kwargs.return_metrics ?? false, + return_metrics: true, }; const single = !Array.isArray(audio); From 3bac1dc0e0b4f843e17e831617d67dc67b7772b3 Mon Sep 17 00:00:00 2001 From: ysdede Date: Sun, 1 Mar 2026 22:43:09 +0300 Subject: [PATCH 10/40] test(nemo-conformer-tdt): rewrite tests to match current API - Remove all timestamp_granularity tests (feature was removed) - Fix option names: return_tokens, return_words, return_timestamps - Fix output fields: tokens/words arrays, not token_ids/word_timestamps - Verify pipeline passes return_words + return_metrics when timestamps on - Add test: return_timestamps 'word' treated as truthy --- .../test_modeling_nemo_conformer_tdt.js | 21 ++- ..._pipelines_automatic_speech_recognition.js | 130 ++++++------------ 2 files changed, 55 insertions(+), 96 deletions(-) diff --git a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js index 83a1523e2..1106a9332 100644 --- a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js +++ b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js @@ -104,22 +104,21 @@ export default () => { const output = await model.transcribe(inputs, { tokenizer, - return_token_timestamps: true, - return_word_timestamps: true, - return_utterance_timestamp: true, + return_timestamps: true, + return_words: true, + return_tokens: true, }); expect(output.text).toBe("hello world"); - expect(output.token_ids).toEqual([1, 2]); - expect(output.token_timestamps).toEqual([ - [0, 0.04], - [0.04, 0.12], + expect(output.utterance_timestamp).toEqual([0, 0.12]); + expect(output.words).toEqual([ + expect.objectContaining({ text: "hello", start_time: 0, end_time: 0.04 }), + expect.objectContaining({ text: "world", start_time: 0.04, end_time: 0.12 }), ]); - expect(output.word_timestamps).toEqual([ - { text: "hello", timestamp: [0, 0.04] }, - { text: "world", timestamp: [0.04, 0.12] }, + expect(output.tokens).toEqual([ + expect.objectContaining({ id: 1, start_time: 0, end_time: 0.04 }), + expect.objectContaining({ id: 2, start_time: 0.04, end_time: 0.12 }), ]); - expect(output.utterance_timestamp).toEqual([0, 0.12]); }, MAX_TEST_EXECUTION_TIME, ); diff --git a/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js b/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js index 831a885ac..0763795d6 100644 --- a/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js +++ b/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js @@ -133,19 +133,22 @@ export default () => { config: { model_type: modelType }, async transcribe(_inputs, options) { calls.push(options); - return { - text: "hello world", - token_ids: [1, 2], - token_timestamps: [ - [0, 0.04], - [0.04, 0.08], - ], - word_timestamps: [ - { text: "hello", timestamp: [0, 0.04] }, - { text: "world", timestamp: [0.04, 0.08] }, - ], - utterance_timestamp: [0, 0.08], - }; + const result = { text: "hello world" }; + if (options.return_timestamps) { + result.utterance_timestamp = [0, 0.08]; + result.utterance_confidence = 0.95; + result.confidence_scores = { token_avg: 0.95, word_avg: 0.94, overall_log_prob: -0.05 
}; + if (options.return_words) { + result.words = [ + { text: "hello", start_time: 0, end_time: 0.04, confidence: 0.96 }, + { text: "world", start_time: 0.04, end_time: 0.08, confidence: 0.93 }, + ]; + } + } + if (options.return_metrics) { + result.metrics = { total_ms: 42, rtf: 0.01 }; + } + return result; }, async dispose() {}, }; @@ -153,18 +156,7 @@ export default () => { const processor = Object.assign(async () => ({ input_features: {} }), { feature_extractor: { config: { sampling_rate: 16000 } }, }); - const tokenizer = { - decode(ids) { - const idArray = Array.isArray(ids) ? ids : [ids]; - return idArray - .map((id) => { - if (id === 1 || id === 1n) return " hello"; - if (id === 2 || id === 2n) return " world"; - return ""; - }) - .join(""); - }, - }; + const tokenizer = {}; return { pipe: new AutomaticSpeechRecognitionPipeline({ @@ -177,85 +169,53 @@ export default () => { }; }; - it("dispatches to nemo-conformer-tdt path", async () => { + it("returns text and metrics when timestamps disabled", async () => { const { pipe, calls } = makeUnitPipe(); const output = await pipe(new Float32Array(16000), { return_timestamps: false }); - expect(output).toEqual({ text: "hello world" }); + expect(output).toEqual({ text: "hello world", metrics: { total_ms: 42, rtf: 0.01 } }); expect(calls).toHaveLength(1); + expect(calls[0]).toMatchObject({ + return_timestamps: false, + return_words: false, + return_metrics: true, + }); }); - it("default timestamps use word granularity", async () => { + it("returns full output with words when return_timestamps is true", async () => { const { pipe, calls } = makeUnitPipe(); const output = await pipe(new Float32Array(16000), { return_timestamps: true }); - expect(output).toEqual({ + expect(output).toMatchObject({ text: "hello world", - chunks: [ - { text: "hello", timestamp: [0, 0.04] }, - { text: "world", timestamp: [0.04, 0.08] }, + utterance_timestamp: [0, 0.08], + utterance_confidence: 0.95, + words: [ + { text: "hello", start_time: 0, end_time: 0.04 }, + { text: "world", start_time: 0.04, end_time: 0.08 }, ], + confidence_scores: { token_avg: 0.95, word_avg: 0.94 }, + metrics: { total_ms: 42, rtf: 0.01 }, }); expect(calls[0]).toMatchObject({ - return_word_timestamps: true, - return_token_timestamps: false, - return_utterance_timestamp: false, - }); - }); - - it("supports utterance granularity", async () => { - const { pipe } = makeUnitPipe(); - const output = await pipe(new Float32Array(16000), { return_timestamps: true, - timestamp_granularity: "utterance", - }); - expect(output).toEqual({ - text: "hello world", - chunks: [{ text: "hello world", timestamp: [0, 0.08] }], + return_words: true, + return_metrics: true, }); }); - it("supports token granularity", async () => { - const { pipe } = makeUnitPipe(); - const output = await pipe(new Float32Array(16000), { - return_timestamps: true, - timestamp_granularity: "token", - }); - expect(output).toEqual({ + it("treats return_timestamps 'word' as truthy (same as true)", async () => { + const { pipe, calls } = makeUnitPipe(); + const output = await pipe(new Float32Array(16000), { return_timestamps: "word" }); + expect(output).toMatchObject({ text: "hello world", - chunks: [ - { text: " hello", timestamp: [0, 0.04] }, - { text: " world", timestamp: [0.04, 0.08] }, - ], + utterance_timestamp: [0, 0.08], + words: expect.any(Array), + metrics: expect.any(Object), }); - }); - - it("supports all granularities at once", async () => { - const { pipe } = makeUnitPipe(); - const output = await pipe(new 
Float32Array(16000), { + expect(calls[0]).toMatchObject({ return_timestamps: true, - timestamp_granularity: "all", + return_words: true, + return_metrics: true, }); - expect(output).toEqual({ - text: "hello world", - chunks: [ - { text: "hello", timestamp: [0, 0.04] }, - { text: "world", timestamp: [0.04, 0.08] }, - ], - tokens: [ - { text: " hello", timestamp: [0, 0.04] }, - { text: " world", timestamp: [0.04, 0.08] }, - ], - utterance: [0, 0.08], - }); - }); - - it("throws for invalid timestamp granularity", async () => { - const { pipe } = makeUnitPipe(); - await expect( - pipe(new Float32Array(16000), { - return_timestamps: true, - timestamp_granularity: "frame", - }), - ).rejects.toThrow("Invalid `timestamp_granularity`"); }); }); }); From c75ebd27f53584e38c3fa98262e3661d7d9cd9f1 Mon Sep 17 00:00:00 2001 From: ysdede Date: Mon, 2 Mar 2026 22:47:51 +0300 Subject: [PATCH 11/40] fix(nemo-conformer-tdt): harden decoding and feature utilities Address reviewer findings except the return_metrics policy decision. - Fix temporal delta concatenation to interleave per frame and add dtype validation. - Validate preemphasis range and clamp normalization variance in feature extraction. - Remove unsafe encoder layout inference; require explicit encoder_output_layout. - Redesign decode loop to read frame data on-demand instead of eager frame materialization. - Deduplicate word finalization and avoid zero-filling missing word confidences. - Tighten tests for delta layout/type checks, explicit layout requirement, call counts, and naming accuracy. --- .../feature_extraction_nemo_conformer_tdt.js | 23 ++-- .../modeling_nemo_conformer_tdt.js | 130 +++++++++--------- .../nemo_conformer_tdt/transducer_deltas.js | 29 ++-- .../nemo_conformer_tdt/transducer_text.js | 69 +++++----- ...t_feature_extraction_nemo_conformer_tdt.js | 12 ++ .../test_modeling_nemo_conformer_tdt.js | 42 ++++-- .../test_feature_extraction_parakeet.js | 2 +- ..._pipelines_automatic_speech_recognition.js | 2 + 8 files changed, 185 insertions(+), 124 deletions(-) diff --git a/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js index e230ff50c..aba71c485 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js @@ -50,15 +50,15 @@ export class NemoConformerTDTFeatureExtractor extends FeatureExtractor { if (this.delta_order > 0 && !this.delta_concatenate) { logger.warn( 'NemoConformerTDTFeatureExtractor: `delta_concatenate=false` is set. ' + - '`input_features` will remain base features and deltas are returned in extra fields.', + '`input_features` will remain base features and deltas are returned in extra fields.', ); } this.feature_cache = this.use_feature_cache ? new FeatureLRUCache({ - max_entries: this.config.feature_cache_max_entries ?? 128, - max_size_mb: this.config.feature_cache_max_size_mb ?? 64, - }) + max_entries: this.config.feature_cache_max_entries ?? 128, + max_size_mb: this.config.feature_cache_max_size_mb ?? 
64, + }) : null; } @@ -69,10 +69,17 @@ export class NemoConformerTDTFeatureExtractor extends FeatureExtractor { */ async _extract_fbank_features(waveform) { // Parakeet uses a custom preemphasis strategy: Apply preemphasis to entire waveform at once - const preemphasis = this.config.preemphasis; + const preemphasis = this.config.preemphasis ?? 0; + if (!Number.isFinite(preemphasis) || preemphasis < 0 || preemphasis >= 1) { + throw new Error( + `NemoConformerTDTFeatureExtractor expected \`preemphasis\` in [0, 1), got ${this.config.preemphasis}.`, + ); + } waveform = new Float64Array(waveform); // Clone to avoid destructive changes - for (let j = waveform.length - 1; j >= 1; --j) { - waveform[j] -= preemphasis * waveform[j - 1]; + if (preemphasis !== 0) { + for (let j = waveform.length - 1; j >= 1; --j) { + waveform[j] -= preemphasis * waveform[j - 1]; + } } const features = await spectrogram( @@ -159,7 +166,7 @@ export class NemoConformerTDTFeatureExtractor extends FeatureExtractor { for (let j = 0; j < num_features; ++j) { const mean = sum[j] / features_length; const variance = (sum_sq[j] - features_length * mean * mean) / divisor; - const std = Math.sqrt(variance) + EPSILON; + const std = Math.sqrt(Math.max(variance, 0)) + EPSILON; const inv_std = 1 / std; for (let i = 0; i < features_length; ++i) { diff --git a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js index be25afe42..370328e77 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js @@ -82,17 +82,6 @@ function confidenceFromLogits(logits, tokenId, vocabSize) { }; } -function inferEncoderOutputLayout(outputTensor) { - if (outputTensor.dims.length !== 3 || outputTensor.dims[0] !== 1) { - throw new Error( - `Nemo Conformer TDT expected encoder output dims [1, D, T] or [1, T, D], got [${outputTensor.dims.join(', ')}].`, - ); - } - - // Heuristic fallback: in most Nemo exports D > T. - return outputTensor.dims[1] >= outputTensor.dims[2] ? 'BDT' : 'BTD'; -} - function resolveTransducerConfig(config, sessions) { const transducerConfig = config['transformers.js_config']?.transducer; if (!transducerConfig) { @@ -137,7 +126,7 @@ function resolveTransducerConfig(config, sessions) { if (missingDecoderInputs.length > 0) { throw new Error( `Nemo Conformer TDT decoder session is missing expected inputs: ${missingDecoderInputs.join(', ')}. ` + - 'Override I/O names via `transformers.js_config.transducer.io` if your export uses different names.', + 'Override I/O names via `transformers.js_config.transducer.io` if your export uses different names.', ); } const missingDecoderOutputs = [io.decoder_output, io.decoder_output_state_1, io.decoder_output_state_2].filter( @@ -146,7 +135,7 @@ function resolveTransducerConfig(config, sessions) { if (missingDecoderOutputs.length > 0) { throw new Error( `Nemo Conformer TDT decoder session is missing expected outputs: ${missingDecoderOutputs.join(', ')}. ` + - 'Override I/O names via `transformers.js_config.transducer.io` if your export uses different names.', + 'Override I/O names via `transformers.js_config.transducer.io` if your export uses different names.', ); } @@ -157,7 +146,7 @@ function resolveTransducerConfig(config, sessions) { if (!(encoderSession.outputNames ?? 
[]).includes(io.encoder_output)) { throw new Error( `Nemo Conformer TDT encoder session is missing expected output: ${io.encoder_output}. ` + - 'Override `transformers.js_config.transducer.io.encoder_output` if your export uses a different name.', + 'Override `transformers.js_config.transducer.io.encoder_output` if your export uses a different name.', ); } @@ -165,6 +154,7 @@ function resolveTransducerConfig(config, sessions) { const subsamplingFactor = transducerConfig.subsampling_factor ?? 8; const frameShiftS = transducerConfig.frame_shift_s ?? 0.01; const blankTokenId = transducerConfig.blank_token_id ?? 0; + const encoderOutputLayout = transducerConfig.encoder_output_layout; const decoderTokenDType = transducerConfig.decoder_token_dtype ?? 'int32'; const decoderTokenLengthDType = transducerConfig.decoder_token_length_dtype ?? 'int32'; @@ -182,6 +172,9 @@ function resolveTransducerConfig(config, sessions) { if (!Number.isFinite(frameShiftS) || frameShiftS <= 0) { throw new Error('Invalid `transformers.js_config.transducer.frame_shift_s`: expected a positive number.'); } + if (encoderOutputLayout !== 'BDT' && encoderOutputLayout !== 'BTD') { + throw new Error('Invalid `transformers.js_config.transducer.encoder_output_layout`: expected "BDT" or "BTD".'); + } if (!['int32', 'int64'].includes(decoderTokenDType)) { throw new Error( 'Invalid `transformers.js_config.transducer.decoder_token_dtype`: expected "int32" or "int64".', @@ -201,7 +194,7 @@ function resolveTransducerConfig(config, sessions) { vocab_size: transducerConfig.vocab_size ?? config.vocab_size ?? null, duration_start_index: transducerConfig.duration_start_index ?? null, encoder_input_layout: transducerConfig.encoder_input_layout ?? 'BTF', - encoder_output_layout: transducerConfig.encoder_output_layout ?? null, + encoder_output_layout: encoderOutputLayout, encoder_frame_layout: transducerConfig.encoder_frame_layout ?? 'BD1', decoder_token_dtype: decoderTokenDType, decoder_token_length_dtype: decoderTokenLengthDType, @@ -264,7 +257,7 @@ export class NemoConformerTDTPreTrainedModel extends PreTrainedModel { if (options.model_file_name && options.model_file_name !== 'encoder_model') { throw new Error( 'NemoConformerForTDT does not support `model_file_name` override. ' + - 'Expected canonical files: `encoder_model{suffix}.onnx` and `decoder_model_merged{suffix}.onnx`.', + 'Expected canonical files: `encoder_model{suffix}.onnx` and `decoder_model_merged{suffix}.onnx`.', ); } @@ -283,8 +276,8 @@ export class NemoConformerTDTPreTrainedModel extends PreTrainedModel { const reason = error?.message ?? String(error); throw new Error( 'Failed to load Nemo Conformer TDT sessions. Expected canonical v4 files under `onnx/`: ' + - '`encoder_model{suffix}.onnx` and `decoder_model_merged{suffix}.onnx`. ' + - `Original error: ${reason}`, + '`encoder_model{suffix}.onnx` and `decoder_model_merged{suffix}.onnx`. ' + + `Original error: ${reason}`, ); } @@ -316,33 +309,45 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { return outputs[name] ?? Object.values(outputs)[0]; } - _encoderOutputToFrames(encoderOutput) { - const layout = this.transducer.encoder_output_layout ?? 
inferEncoderOutputLayout(encoderOutput); - const dims = encoderOutput.dims; - const data = encoderOutput.data; - const frames = []; + _getEncoderFrameCount(encoderOutput) { + if (encoderOutput.dims.length !== 3 || encoderOutput.dims[0] !== 1) { + throw new Error( + `Nemo Conformer TDT expected encoder output dims [1, D, T] or [1, T, D], got [${encoderOutput.dims.join(', ')}].`, + ); + } + const layout = this.transducer.encoder_output_layout; + if (layout === 'BDT') { + return encoderOutput.dims[2]; + } + if (layout === 'BTD') { + return encoderOutput.dims[1]; + } + throw new Error( + `Unsupported encoder output layout "${layout}". Use 'BDT' or 'BTD' in transformers.js_config.transducer.`, + ); + } + + _getFrameData(encoderOutput, frameIndex, reusableFrame) { + const layout = this.transducer.encoder_output_layout; + if (encoderOutput.type !== 'float32') { + throw new Error(`Nemo Conformer TDT expected encoder output type "float32", got "${encoderOutput.type}".`); + } + const data = /** @type {Float32Array} */ (encoderOutput.data); if (layout === 'BDT') { - const D = dims[1]; - const T = dims[2]; - for (let t = 0; t < T; ++t) { - const frame = new Float32Array(D); - for (let d = 0; d < D; ++d) { - frame[d] = data[d * T + t]; - } - frames.push(frame); + const D = encoderOutput.dims[1]; + const T = encoderOutput.dims[2]; + const frame = reusableFrame && reusableFrame.length === D ? reusableFrame : new Float32Array(D); + for (let d = 0; d < D; ++d) { + frame[d] = data[d * T + frameIndex]; } - return frames; + return frame; } if (layout === 'BTD') { - const T = dims[1]; - const D = dims[2]; - for (let t = 0; t < T; ++t) { - const offset = t * D; - frames.push(new Float32Array(data.subarray(offset, offset + D))); - } - return frames; + const D = encoderOutput.dims[2]; + const offset = frameIndex * D; + return data.subarray(offset, offset + D); } throw new Error( @@ -530,18 +535,7 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { const encodeMs = nowMs() - encodeStart; const encoderOutput = this._getEncoderOutput(encoderOutputs); - let frames; - try { - frames = this._encoderOutputToFrames(encoderOutput); - } finally { - const seen = new Set(); - for (const value of Object.values(encoderOutputs)) { - if (value instanceof Tensor && !seen.has(value)) { - value.dispose(); - seen.add(value); - } - } - } + const frameCount = this._getEncoderFrameCount(encoderOutput); const frameTime = this.transducer.subsampling_factor * this.transducer.frame_shift_s; const numLayers = this.transducer.decoder.num_layers; @@ -568,6 +562,7 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { let decoderState; let targetLengthTensor; + let reusableFrame = null; let emittedOnFrame = 0; const decodeStart = nowMs(); @@ -583,8 +578,12 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { ? new Tensor('int64', BigInt64Array.from([1n]), [1]) : new Tensor('int32', new Int32Array([1]), [1]); - for (let frameIndex = 0; frameIndex < frames.length;) { - const frameTensor = this._createFrameTensor(frames[frameIndex]); + for (let frameIndex = 0; frameIndex < frameCount; ) { + const frameData = this._getFrameData(encoderOutput, frameIndex, reusableFrame); + if (this.transducer.encoder_output_layout === 'BDT') { + reusableFrame = frameData; + } + const frameTensor = this._createFrameTensor(frameData); const prevTokenId = tokenIds.length > 0 ? 
tokenIds[tokenIds.length - 1] : blankId; const tokenTensor = this.transducer.decoder_token_dtype === 'int64' @@ -681,6 +680,13 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { } finally { if (targetLengthTensor) targetLengthTensor.dispose(); if (decoderState) this._disposeDecoderState(decoderState); + const seen = new Set(); + for (const value of Object.values(encoderOutputs)) { + if (value instanceof Tensor && !seen.has(value)) { + value.dispose(); + seen.add(value); + } + } } const decodeMs = nowMs() - decodeStart; @@ -704,13 +710,13 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { result.utterance_timestamp = tokenTimestamps.length > 0 ? /** @type {[number, number]} */ ([ - tokenTimestamps[0][0], - tokenTimestamps[tokenTimestamps.length - 1][1], - ]) + tokenTimestamps[0][0], + tokenTimestamps[tokenTimestamps.length - 1][1], + ]) : /** @type {[number, number]} */ ([ - roundTs(timeOffset), - roundTs(frames.length * frameTime + timeOffset), - ]); + roundTs(timeOffset), + roundTs(frameCount * frameTime + timeOffset), + ]); if (detailed) { if (return_words) result.words = detailed.words; @@ -753,7 +759,7 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { const totalMs = nowMs() - totalStart; const utteranceDuration = result.utterance_timestamp ? Math.max(result.utterance_timestamp[1] - result.utterance_timestamp[0], 1e-8) - : Math.max(frames.length * frameTime, 1e-8); + : Math.max(frameCount * frameTime, 1e-8); const rtf = totalMs / 1000 / utteranceDuration; result.metrics = { preprocess_ms: 0.0, @@ -780,8 +786,8 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { // Register with ModelRegistry so get_model_files / progress_callback enumerate // the correct ONNX files: encoder_model + decoder_model_merged. 
-MODEL_TYPE_MAPPING.set('nemo-conformer-tdt', MODEL_TYPES.NemoConformerTDT); // model_type key -MODEL_TYPE_MAPPING.set('NemoConformerForTDT', MODEL_TYPES.NemoConformerTDT); // architecture key +MODEL_TYPE_MAPPING.set('nemo-conformer-tdt', MODEL_TYPES.NemoConformerTDT); // model_type key +MODEL_TYPE_MAPPING.set('NemoConformerForTDT', MODEL_TYPES.NemoConformerTDT); // architecture key MODEL_NAME_TO_CLASS_MAPPING.set('NemoConformerTDTPreTrainedModel', NemoConformerTDTPreTrainedModel); MODEL_NAME_TO_CLASS_MAPPING.set('NemoConformerForTDT', NemoConformerForTDT); MODEL_CLASS_TO_NAME_MAPPING.set(NemoConformerTDTPreTrainedModel, 'NemoConformerTDTPreTrainedModel'); diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_deltas.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_deltas.js index 1abe48139..50651f82f 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/transducer_deltas.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_deltas.js @@ -19,6 +19,9 @@ export function computeTemporalDeltas(input_features, { order = 1, window = 2, c if (order !== 1 && order !== 2) { throw new Error('computeTemporalDeltas expects `order` to be 1 or 2.'); } + if (input_features.type !== 'float32') { + throw new Error(`computeTemporalDeltas expects input tensor type "float32", got "${input_features.type}".`); + } const [batch, T, F] = input_features.dims; const base = /** @type {Float32Array} */ (input_features.data); @@ -43,7 +46,7 @@ export function computeTemporalDeltas(input_features, { order = 1, window = 2, c if (!concatenate) { return { delta: delta_tensor }; } - return new Tensor('float32', concatFloat32([base, delta]), [batch, T, F * 2]); + return new Tensor('float32', interleaveByFrame([base, delta], T, F), [batch, T, F * 2]); } const recursive_result = /** @type {{delta: Tensor}} */ ( @@ -58,16 +61,26 @@ export function computeTemporalDeltas(input_features, { order = 1, window = 2, c } const delta_delta = /** @type {Float32Array} */ (delta_delta_tensor.data); - return new Tensor('float32', concatFloat32([base, delta, delta_delta]), [batch, T, F * 3]); + return new Tensor('float32', interleaveByFrame([base, delta, delta_delta], T, F), [batch, T, F * 3]); } -function concatFloat32(items) { - const total = items.reduce((sum, arr) => sum + arr.length, 0); - const output = new Float32Array(total); - let offset = 0; +function interleaveByFrame(items, T, F) { + const chunkSize = T * F; for (const arr of items) { - output.set(arr, offset); - offset += arr.length; + if (arr.length !== chunkSize) { + throw new Error( + `computeTemporalDeltas expected concatenation arrays with length ${chunkSize}, got ${arr.length}.`, + ); + } + } + + const output = new Float32Array(chunkSize * items.length); + for (let t = 0; t < T; ++t) { + const srcOffset = t * F; + const dstOffset = t * F * items.length; + for (let i = 0; i < items.length; ++i) { + output.set(items[i].subarray(srcOffset, srcOffset + F), dstOffset + i * F); + } } return output; } diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js index 33729d394..a68bb828d 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js @@ -55,6 +55,28 @@ function resolveTokenPiece(tokenizer, id) { return { raw: rawToken || decoded, clean, startsNewWord }; } +/** + * @param {Array<{ text: string, start_time: 
number, end_time: number, confidence?: number }>} words + * @param {{ text: string, start: number, end: number, confs: number[] } | null} current + */ +function finalizeAndPushWord(words, current) { + if (!current) return; + + const text = current.text.trim(); + if (!text) return; + + /** @type {{ text: string, start_time: number, end_time: number, confidence?: number }} */ + const word = { + text, + start_time: current.start, + end_time: current.end, + }; + if (current.confs.length > 0) { + word.confidence = Math.round((current.confs.reduce((a, b) => a + b, 0) / current.confs.length) * 1e6) / 1e6; + } + words.push(word); +} + /** * Decode token ids into final transcription text. * @param {any} tokenizer @@ -76,7 +98,7 @@ export function decodeTransducerText(tokenizer, token_ids) { * @returns {{ * words: Array<{ text: string, start_time: number, end_time: number, confidence?: number }>, * tokens: Array<{ token: string, raw_token: string, is_word_start: boolean, start_time: number, end_time: number, confidence?: number }>, - * word_confidences: number[] | null, + * word_confidences: (number | null)[] | null, * word_avg: number | null, * }} */ @@ -127,22 +149,7 @@ export function buildTransducerDetailedOutputs(tokenizer, token_ids, token_times tokens.push(tok); if (!current || startsNewWord) { - if (current) { - const text = current.text.trim(); - if (text) { - /** @type {{ text: string, start_time: number, end_time: number, confidence?: number }} */ - const word = { - text, - start_time: current.start, - end_time: current.end, - }; - if (current.confs.length > 0) { - word.confidence = - Math.round((current.confs.reduce((a, b) => a + b, 0) / current.confs.length) * 1e6) / 1e6; - } - words.push(word); - } - } + finalizeAndPushWord(words, current); current = { text: clean, start: ts[0], @@ -158,28 +165,16 @@ export function buildTransducerDetailedOutputs(tokenizer, token_ids, token_times } } - if (current) { - const text = current.text.trim(); - if (text) { - /** @type {{ text: string, start_time: number, end_time: number, confidence?: number }} */ - const word = { - text, - start_time: current.start, - end_time: current.end, - }; - if (current.confs.length > 0) { - word.confidence = - Math.round((current.confs.reduce((a, b) => a + b, 0) / current.confs.length) * 1e6) / 1e6; - } - words.push(word); + finalizeAndPushWord(words, current); + + const word_confidences = words.some((x) => x.confidence != null) ? words.map((x) => x.confidence ?? null) : null; + let word_avg = null; + if (word_confidences) { + const validConfidences = word_confidences.filter((x) => x != null); + if (validConfidences.length > 0) { + word_avg = Math.round((validConfidences.reduce((a, b) => a + b, 0) / validConfidences.length) * 1e6) / 1e6; } } - const word_confidences = words.some((x) => x.confidence != null) ? words.map((x) => x.confidence ?? 0) : null; - const word_avg = - word_confidences && word_confidences.length > 0 - ? 
Math.round((word_confidences.reduce((a, b) => a + b, 0) / word_confidences.length) * 1e6) / 1e6 - : null; - return { words, tokens, word_confidences, word_avg }; } diff --git a/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js b/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js index cee9568d0..aa0d6ffad 100644 --- a/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js +++ b/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js @@ -74,5 +74,17 @@ export default () => { }, MAX_TEST_EXECUTION_TIME, ); + + it( + "validates preemphasis range", + async () => { + const invalidHigh = new NemoConformerTDTFeatureExtractor({ ...base, feature_size: 80, preemphasis: 1 }); + await expect(invalidHigh(audio)).rejects.toThrow("preemphasis"); + + const invalidLow = new NemoConformerTDTFeatureExtractor({ ...base, feature_size: 80, preemphasis: -0.1 }); + await expect(invalidLow(audio)).rejects.toThrow("preemphasis"); + }, + MAX_TEST_EXECUTION_TIME, + ); }); }; diff --git a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js index 1106a9332..4214bbd7b 100644 --- a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js +++ b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js @@ -111,14 +111,8 @@ export default () => { expect(output.text).toBe("hello world"); expect(output.utterance_timestamp).toEqual([0, 0.12]); - expect(output.words).toEqual([ - expect.objectContaining({ text: "hello", start_time: 0, end_time: 0.04 }), - expect.objectContaining({ text: "world", start_time: 0.04, end_time: 0.12 }), - ]); - expect(output.tokens).toEqual([ - expect.objectContaining({ id: 1, start_time: 0, end_time: 0.04 }), - expect.objectContaining({ id: 2, start_time: 0.04, end_time: 0.12 }), - ]); + expect(output.words).toEqual([expect.objectContaining({ text: "hello", start_time: 0, end_time: 0.04 }), expect.objectContaining({ text: "world", start_time: 0.04, end_time: 0.12 })]); + expect(output.tokens).toEqual([expect.objectContaining({ id: 1, start_time: 0, end_time: 0.04 }), expect.objectContaining({ id: 2, start_time: 0.04, end_time: 0.12 })]); }, MAX_TEST_EXECUTION_TIME, ); @@ -127,6 +121,20 @@ export default () => { const invalidConfig = { model_type: "nemo-conformer-tdt" }; expect(() => new NemoConformerForTDT(invalidConfig, BASE_SESSIONS, {})).toThrow("Missing `transformers.js_config.transducer`"); }); + + it("requires explicit encoder_output_layout in transducer config", () => { + const invalidConfig = { + ...BASE_CONFIG, + "transformers.js_config": { + ...BASE_CONFIG["transformers.js_config"], + transducer: { + ...BASE_CONFIG["transformers.js_config"].transducer, + encoder_output_layout: undefined, + }, + }, + }; + expect(() => new NemoConformerForTDT(invalidConfig, BASE_SESSIONS, {})).toThrow("encoder_output_layout"); + }); }); describe("Nemo Conformer TDT utilities", () => { @@ -146,12 +154,30 @@ export default () => { expect(split.delta.dims).toEqual([1, 4, 2]); expect(split.delta_delta.dims).toEqual([1, 4, 2]); + const concatOrder1 = computeTemporalDeltas(input, { order: 1, window: 1, concatenate: true }); + expect(concatOrder1.dims).toEqual([1, 4, 4]); + expect(Array.from(concatOrder1.data.slice(0, 8))).toEqual([ + 1, + 
2, + 0.5, + 1, // t0: base + delta + 2, + 4, + 1, + 2, // t1: base + delta + ]); + const concat = computeTemporalDeltas(input, { order: 2, window: 1, concatenate: true }); expect(concat.dims).toEqual([1, 4, 6]); }, MAX_TEST_EXECUTION_TIME, ); + it("rejects non-float32 tensors for temporal deltas", () => { + const input = new Tensor("float64", Float64Array.from([1, 2, 2, 4]), [1, 2, 2]); + expect(() => computeTemporalDeltas(input, { order: 1, window: 1, concatenate: true })).toThrow('type "float32"'); + }); + it( "creates stable audio cache keys", async () => { diff --git a/packages/transformers/tests/models/parakeet/test_feature_extraction_parakeet.js b/packages/transformers/tests/models/parakeet/test_feature_extraction_parakeet.js index fab1861d9..c0e67a58b 100644 --- a/packages/transformers/tests/models/parakeet/test_feature_extraction_parakeet.js +++ b/packages/transformers/tests/models/parakeet/test_feature_extraction_parakeet.js @@ -20,7 +20,7 @@ export default () => { }); it( - "extracts normalized features and mask from synthetic audio", + "extracts features and mask from synthetic audio", async () => { const duration_s = 1.0; const total = Math.floor(config.sampling_rate * duration_s); diff --git a/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js b/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js index 0763795d6..8159c6098 100644 --- a/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js +++ b/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js @@ -195,6 +195,7 @@ export default () => { confidence_scores: { token_avg: 0.95, word_avg: 0.94 }, metrics: { total_ms: 42, rtf: 0.01 }, }); + expect(calls).toHaveLength(1); expect(calls[0]).toMatchObject({ return_timestamps: true, return_words: true, @@ -211,6 +212,7 @@ export default () => { words: expect.any(Array), metrics: expect.any(Object), }); + expect(calls).toHaveLength(1); expect(calls[0]).toMatchObject({ return_timestamps: true, return_words: true, From 493a5881d60f138848af3861e637e7d7dfb8288b Mon Sep 17 00:00:00 2001 From: ysdede Date: Tue, 3 Mar 2026 23:33:52 +0300 Subject: [PATCH 12/40] fix(nemo-conformer-tdt): address critical review issues Fixes high-impact issues found in PR review validation:\n- force NemoConformerForTDT to MODEL_TYPES.NemoConformerTDT in registry overrides\n- ensure encoder outputs are disposed when pre-decode validation throws\n- remove stride sampling from audio cache key hashing to prevent false cache hits\n- use encoder_model selector key in get_model_files for Nemo per-component dtype/device overrides\n\nAlso adds targeted regression tests for mapping, disposal behavior, file selection, and cache key correctness. 
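
For reference, a minimal standalone sketch of the cache-key hashing after this change (FNV-1a style over int16-quantized samples, with no stride skipping, seeded from the waveform length and sampling rate). The seed constant and the returned key format below are assumptions for illustration only and may differ from the shipped createAudioCacheKey; see the diff in transducer_cache.js for the actual implementation.

    // Illustrative sketch, not the exact shipped code.
    function sketchAudioCacheKey(audio, sampling_rate = 16000) {
      let hash = 0x811c9dc5; // FNV-1a offset basis (assumed seed)
      hash ^= audio.length;
      hash = Math.imul(hash, 16777619);
      hash ^= sampling_rate;
      hash = Math.imul(hash, 16777619);
      // Hash every quantized sample so waveforms that differ anywhere produce different keys,
      // treating non-finite samples as 0 and clamping to the int16 range.
      for (let i = 0; i < audio.length; ++i) {
        const sample = Number.isFinite(audio[i]) ? audio[i] : 0;
        const q = Math.max(-32768, Math.min(32767, Math.round(sample * 32768)));
        hash ^= q;
        hash = Math.imul(hash, 16777619);
      }
      return `${audio.length}:${sampling_rate}:${hash >>> 0}`; // key format is an assumption
    }
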
--- .../modeling_nemo_conformer_tdt.js | 6 +- .../nemo_conformer_tdt/transducer_cache.js | 5 +- packages/transformers/src/models/registry.js | 3 + .../utils/model_registry/get_model_files.js | 2 +- .../test_modeling_nemo_conformer_tdt.js | 79 +++++++++++++++++++ 5 files changed, 89 insertions(+), 6 deletions(-) diff --git a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js index 370328e77..597b7e7fa 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js @@ -534,8 +534,8 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { } const encodeMs = nowMs() - encodeStart; - const encoderOutput = this._getEncoderOutput(encoderOutputs); - const frameCount = this._getEncoderFrameCount(encoderOutput); + let frameCount = 0; + let encoderOutput = null; const frameTime = this.transducer.subsampling_factor * this.transducer.frame_shift_s; const numLayers = this.transducer.decoder.num_layers; @@ -568,6 +568,8 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { const decodeStart = nowMs(); try { + encoderOutput = this._getEncoderOutput(encoderOutputs); + frameCount = this._getEncoderFrameCount(encoderOutput); decoderState = { state1: new Tensor('float32', new Float32Array(numLayers * hiddenSize), [numLayers, 1, hiddenSize]), state2: new Tensor('float32', new Float32Array(numLayers * hiddenSize), [numLayers, 1, hiddenSize]), diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js index 5a3af06ef..bf71afcad 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js @@ -14,9 +14,8 @@ export function createAudioCacheKey(audio, sampling_rate = 16000) { hash ^= sampling_rate; hash = Math.imul(hash, 16777619); - // Sample stride hash to keep keying cheap for long audio. - const stride = Math.max(1, Math.floor(audio.length / 4096)); - for (let i = 0; i < audio.length; i += stride) { + // Hash all quantized samples to minimize false cache hits across waveforms. + for (let i = 0; i < audio.length; ++i) { const sample = Number.isFinite(audio[i]) ? audio[i] : 0; const q = Math.max(-32768, Math.min(32767, Math.round(sample * 32768))); hash ^= q; diff --git a/packages/transformers/src/models/registry.js b/packages/transformers/src/models/registry.js index 5c08bdded..eabe1e2fa 100644 --- a/packages/transformers/src/models/registry.js +++ b/packages/transformers/src/models/registry.js @@ -577,6 +577,9 @@ const CUSTOM_MAPPING = [ ], ['SupertonicForConditionalGeneration', ALL_MODEL_FILES.SupertonicForConditionalGeneration, MODEL_TYPES.Supertonic], ['ChatterboxModel', ALL_MODEL_FILES.ChatterboxModel, MODEL_TYPES.Chatterbox], + // Keep AutoModel lookup in MODEL_MAPPING_NAMES_ENCODER_ONLY while forcing the + // correct runtime model type for two-artifact Nemo Conformer TDT loading. 
+ ['NemoConformerForTDT', ALL_MODEL_FILES.NemoConformerForTDT, MODEL_TYPES.NemoConformerTDT], ]; for (const [name, model, type] of CUSTOM_MAPPING) { MODEL_TYPE_MAPPING.set(name, type); diff --git a/packages/transformers/src/utils/model_registry/get_model_files.js b/packages/transformers/src/utils/model_registry/get_model_files.js index b0cf029e1..ed64a5309 100644 --- a/packages/transformers/src/utils/model_registry/get_model_files.js +++ b/packages/transformers/src/utils/model_registry/get_model_files.js @@ -167,7 +167,7 @@ export async function get_model_files( add_model_file('conditional_decoder'); files.push('generation_config.json'); } else if (modelType === MODEL_TYPES.NemoConformerTDT) { - add_model_file('model', 'encoder_model'); + add_model_file('encoder_model'); add_model_file('decoder_model_merged'); } else if (modelType === MODEL_TYPES.AutoEncoder) { add_model_file('encoder_model'); diff --git a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js index 4214bbd7b..522b7db74 100644 --- a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js +++ b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js @@ -1,6 +1,8 @@ import { NemoConformerForTDT, Tensor } from "../../../src/transformers.js"; import { createAudioCacheKey, FeatureLRUCache } from "../../../src/models/nemo_conformer_tdt/transducer_cache.js"; import { computeTemporalDeltas } from "../../../src/models/nemo_conformer_tdt/transducer_deltas.js"; +import { MODEL_TYPE_MAPPING, MODEL_TYPES } from "../../../src/models/modeling_utils.js"; +import { get_model_files } from "../../../src/utils/model_registry/get_model_files.js"; import { MAX_TEST_EXECUTION_TIME } from "../../init.js"; @@ -73,6 +75,11 @@ const BASE_CONFIG = { export default () => { describe("NemoConformerForTDT", () => { + it("maps NemoConformerForTDT to MODEL_TYPES.NemoConformerTDT", () => { + expect(MODEL_TYPE_MAPPING.get("NemoConformerForTDT")).toBe(MODEL_TYPES.NemoConformerTDT); + expect(MODEL_TYPE_MAPPING.get("nemo-conformer-tdt")).toBe(MODEL_TYPES.NemoConformerTDT); + }); + it( "greedily decodes scripted token and duration logits", async () => { @@ -135,6 +142,43 @@ export default () => { }; expect(() => new NemoConformerForTDT(invalidConfig, BASE_SESSIONS, {})).toThrow("encoder_output_layout"); }); + + it( + "disposes encoder outputs when frame-count validation fails before decode", + async () => { + class BadEncoderOutputModel extends NemoConformerForTDT { + constructor(config, sessions, encoderOutput) { + super(config, sessions, {}); + this.encoderOutput = encoderOutput; + } + + async _runEncoder() { + return { outputs: this.encoderOutput }; + } + } + + const badEncoderOutput = new Tensor("float32", new Float32Array([0, 1, 2, 3]), [2, 2]); + let disposed = 0; + const originalDispose = badEncoderOutput.dispose.bind(badEncoderOutput); + badEncoderOutput.dispose = () => { + disposed += 1; + originalDispose(); + }; + + const model = new BadEncoderOutputModel(BASE_CONFIG, BASE_SESSIONS, badEncoderOutput); + const inputs = { + input_features: new Tensor("float32", new Float32Array([0, 0]), [1, 1, 2]), + }; + + await expect( + model.transcribe(inputs, { + tokenizer: { decode: () => "" }, + }), + ).rejects.toThrow("expected encoder output dims"); + expect(disposed).toBe(1); + }, + MAX_TEST_EXECUTION_TIME, + ); }); describe("Nemo Conformer TDT utilities", () => { @@ 
-193,6 +237,41 @@ export default () => { MAX_TEST_EXECUTION_TIME, ); + it("uses Nemo encoder selector key when resolving model files", async () => { + const files = await get_model_files("dummy/nemo", { + local_files_only: true, + config: { + architectures: ["UnknownArch"], + model_type: "nemo-conformer-tdt", + "transformers.js_config": {}, + }, + dtype: { + model: "int8", + encoder_model: "fp16", + decoder_model_merged: "q4", + }, + }); + expect(files).toEqual([ + "config.json", + "onnx/encoder_model_fp16.onnx", + "onnx/decoder_model_merged_q4.onnx", + ]); + }); + + it( + "distinguishes long waveforms that differ at unsampled indices", + async () => { + const a = new Float32Array(10000); + const b = new Float32Array(10000); + b[1] = 0.12345; // Index 1 was skipped by the prior stride-based hash for this length. + + const ka = createAudioCacheKey(a, 16000); + const kb = createAudioCacheKey(b, 16000); + expect(ka).not.toEqual(kb); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( "evicts least-recently-used entries when full", async () => { From 5b4cdabd48a2179b4aaf254e9c10deff04397145 Mon Sep 17 00:00:00 2001 From: ysdede Date: Tue, 3 Mar 2026 23:45:28 +0300 Subject: [PATCH 13/40] fix(nemo-conformer-tdt): clamp timestamps and validate cache limits - Clamp token end timestamps to encoder frame bounds during TDT decoding.\n- Validate FeatureLRUCache constructor limits to fail fast on invalid settings.\n- Add regression tests for timestamp clamping and cache limit validation. --- .../modeling_nemo_conformer_tdt.js | 5 ++- .../nemo_conformer_tdt/transducer_cache.js | 6 +++ .../test_modeling_nemo_conformer_tdt.js | 39 +++++++++++++++++++ 3 files changed, 48 insertions(+), 2 deletions(-) diff --git a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js index 597b7e7fa..401a30a54 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js @@ -650,10 +650,11 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { tokenIds.push(tokenId); // TDT duration convention: step=0 means "stay on current frame" (duration index 0 = no advance). // We still associate the token with this frame, so durationFrames is at least 1. - const durationFrames = step > 0 ? step : 1; + const durationFrames = Math.max(1, step > 0 ? 
step : 1); + const endFrame = Math.min(frameCount, frameIndex + durationFrames); tokenTimestamps.push([ roundTs(frameIndex * frameTime + timeOffset), - roundTs((frameIndex + durationFrames) * frameTime + timeOffset), + roundTs(endFrame * frameTime + timeOffset), ]); if (tokenConfidences && maybeConfidence) { tokenConfidences.push(maybeConfidence.confidence); diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js index bf71afcad..4ea17da0e 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js @@ -33,6 +33,12 @@ export class FeatureLRUCache { * @param {{max_entries?: number, max_size_mb?: number}} [options] */ constructor({ max_entries = 128, max_size_mb = 64 } = {}) { + if (!Number.isInteger(max_entries) || max_entries < 0) { + throw new Error('FeatureLRUCache expected `max_entries` to be a non-negative integer.'); + } + if (!Number.isFinite(max_size_mb) || max_size_mb < 0) { + throw new Error('FeatureLRUCache expected `max_size_mb` to be a non-negative number.'); + } this.max_entries = max_entries; this.max_size_mb = max_size_mb; this.cache = new Map(); diff --git a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js index 522b7db74..d28e6ceae 100644 --- a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js +++ b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js @@ -124,6 +124,38 @@ export default () => { MAX_TEST_EXECUTION_TIME, ); + it( + "clamps token timestamps when step jumps beyond remaining frames", + async () => { + const tokenizer = { + decode(ids) { + const idArray = Array.isArray(ids) ? ids : [ids]; + return idArray.map((id) => (id === 1 || id === 1n ? " token" : "")).join(""); + }, + }; + + const model = new MockNemoConformerForTDT(BASE_CONFIG, BASE_SESSIONS, [ + // Emit token=1 with duration index choosing a large step (argmax at tail). 
+ { logits: [0.1, 10.0, 0.0, 0.0, 0.0, 0.0, 12.0] }, + ]); + + const inputs = { + input_features: new Tensor("float32", new Float32Array([0, 0, 0, 0, 0, 0]), [1, 3, 2]), + }; + + const output = await model.transcribe(inputs, { + tokenizer, + return_timestamps: true, + return_tokens: true, + }); + + expect(output.tokens).toHaveLength(1); + expect(output.tokens[0]).toEqual(expect.objectContaining({ start_time: 0, end_time: 0.12 })); + expect(output.utterance_timestamp).toEqual([0, 0.12]); + }, + MAX_TEST_EXECUTION_TIME, + ); + it("fails fast when transducer config is missing", () => { const invalidConfig = { model_type: "nemo-conformer-tdt" }; expect(() => new NemoConformerForTDT(invalidConfig, BASE_SESSIONS, {})).toThrow("Missing `transformers.js_config.transducer`"); @@ -288,5 +320,12 @@ export default () => { }, MAX_TEST_EXECUTION_TIME, ); + + it("rejects invalid cache limits", () => { + expect(() => new FeatureLRUCache({ max_entries: -1 })).toThrow("max_entries"); + expect(() => new FeatureLRUCache({ max_entries: 1.25 })).toThrow("max_entries"); + expect(() => new FeatureLRUCache({ max_size_mb: -1 })).toThrow("max_size_mb"); + expect(() => new FeatureLRUCache({ max_size_mb: Number.POSITIVE_INFINITY })).toThrow("max_size_mb"); + }); }); }; From 7690227443ce4f7047a07bf185501cb4fa2379c2 Mon Sep 17 00:00:00 2001 From: ysdede Date: Wed, 4 Mar 2026 00:05:18 +0300 Subject: [PATCH 14/40] fix(nemo-conformer-tdt): close remaining tensor disposal leaks Dispose intermediate tensors in computeTemporalDeltas concatenate paths and dispose replaced base input features when delta concatenation returns a new tensor.\n\nAdd regression tests that assert disposal behavior for delta concatenate flows and feature extractor reassignment. --- .../feature_extraction_nemo_conformer_tdt.js | 1 + .../nemo_conformer_tdt/transducer_deltas.js | 9 +++-- ...t_feature_extraction_nemo_conformer_tdt.js | 33 ++++++++++++++++++- .../test_modeling_nemo_conformer_tdt.js | 22 +++++++++++++ 4 files changed, 62 insertions(+), 3 deletions(-) diff --git a/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js index aba71c485..8907b2ab8 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js @@ -194,6 +194,7 @@ export class NemoConformerTDTFeatureExtractor extends FeatureExtractor { concatenate: this.delta_concatenate, }); if (delta_result instanceof Tensor) { + input_features.dispose(); input_features = delta_result; result.input_features = input_features; } else { diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_deltas.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_deltas.js index 50651f82f..957fa0776 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/transducer_deltas.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_deltas.js @@ -46,7 +46,9 @@ export function computeTemporalDeltas(input_features, { order = 1, window = 2, c if (!concatenate) { return { delta: delta_tensor }; } - return new Tensor('float32', interleaveByFrame([base, delta], T, F), [batch, T, F * 2]); + const result = new Tensor('float32', interleaveByFrame([base, delta], T, F), [batch, T, F * 2]); + delta_tensor.dispose(); + return result; } const recursive_result = /** @type {{delta: Tensor}} */ ( @@ 
-61,7 +63,10 @@ export function computeTemporalDeltas(input_features, { order = 1, window = 2, c } const delta_delta = /** @type {Float32Array} */ (delta_delta_tensor.data); - return new Tensor('float32', interleaveByFrame([base, delta, delta_delta], T, F), [batch, T, F * 3]); + const result = new Tensor('float32', interleaveByFrame([base, delta, delta_delta], T, F), [batch, T, F * 3]); + delta_delta_tensor.dispose(); + delta_tensor.dispose(); + return result; } function interleaveByFrame(items, T, F) { diff --git a/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js b/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js index aa0d6ffad..71b67e60d 100644 --- a/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js +++ b/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js @@ -1,4 +1,4 @@ -import { NemoConformerTDTFeatureExtractor } from "../../../src/transformers.js"; +import { NemoConformerTDTFeatureExtractor, Tensor } from "../../../src/transformers.js"; import { MAX_TEST_EXECUTION_TIME } from "../../init.js"; @@ -54,6 +54,37 @@ export default () => { MAX_TEST_EXECUTION_TIME, ); + it( + "disposes replaced base features when concatenated delta output is used", + async () => { + const extractor = new NemoConformerTDTFeatureExtractor({ + ...base, + feature_size: 80, + delta_order: 1, + delta_window: 2, + delta_concatenate: true, + }); + + const originalDispose = Tensor.prototype.dispose; + let disposeCalls = 0; + Tensor.prototype.dispose = function () { + disposeCalls += 1; + return originalDispose.call(this); + }; + + try { + const { input_features } = await extractor(audio); + expect(input_features.dims[2]).toBe(80 * 2); + } finally { + Tensor.prototype.dispose = originalDispose; + } + + // One dispose from computeTemporalDeltas intermediate tensor, one from replacing base features tensor. + expect(disposeCalls).toBe(2); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( "uses feature cache when enabled", async () => { diff --git a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js index d28e6ceae..2199fd42f 100644 --- a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js +++ b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js @@ -254,6 +254,28 @@ export default () => { expect(() => computeTemporalDeltas(input, { order: 1, window: 1, concatenate: true })).toThrow('type "float32"'); }); + it("disposes intermediate delta tensors in concatenate paths", () => { + const input = new Tensor("float32", Float32Array.from([1, 2, 2, 4, 3, 6, 4, 8]), [1, 4, 2]); + const originalDispose = Tensor.prototype.dispose; + let disposeCalls = 0; + Tensor.prototype.dispose = function () { + disposeCalls += 1; + return originalDispose.call(this); + }; + + try { + const order1 = computeTemporalDeltas(input, { order: 1, window: 1, concatenate: true }); + const order2 = computeTemporalDeltas(input, { order: 2, window: 1, concatenate: true }); + expect(order1.dims).toEqual([1, 4, 4]); + expect(order2.dims).toEqual([1, 4, 6]); + } finally { + Tensor.prototype.dispose = originalDispose; + } + + // order=1 concat disposes one intermediate tensor, order=2 concat disposes two. 
+ expect(disposeCalls).toBe(3); + }); + it( "creates stable audio cache keys", async () => { From 1f065c367870e6e854ab5672bbcfc92648595148 Mon Sep 17 00:00:00 2001 From: ysdede Date: Wed, 4 Mar 2026 00:26:30 +0300 Subject: [PATCH 15/40] fix(nemo-conformer-tdt): dispose auxiliary decoder outputs Dispose non-essential Tensor outputs returned by decoder steps to prevent cumulative memory growth. Keep logits/state tensors alive for decoding/state transitions and dispose extras immediately.\n\nAdd regression test to assert auxiliary decoder tensor outputs are disposed each step. --- .../modeling_nemo_conformer_tdt.js | 19 ++++++-- .../test_modeling_nemo_conformer_tdt.js | 44 +++++++++++++++++++ 2 files changed, 59 insertions(+), 4 deletions(-) diff --git a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js index 401a30a54..92e52bdbc 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js @@ -609,12 +609,23 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { } const logits = decoderOutput[io.decoder_output] ?? Object.values(decoderOutput)[0]; + const outputState1 = decoderOutput[io.decoder_output_state_1]; + const outputState2 = decoderOutput[io.decoder_output_state_2]; + const seenDecoderTensors = new Set(); + for (const value of Object.values(decoderOutput)) { + if (!(value instanceof Tensor) || seenDecoderTensors.has(value)) continue; + seenDecoderTensors.add(value); + if (value === logits || value === outputState1 || value === outputState2) { + continue; + } + value.dispose(); + } const logitsData = logits.data; if (logitsData.length < vocabSize) { logits.dispose(); this._disposeDecoderState({ - state1: decoderOutput[io.decoder_output_state_1], - state2: decoderOutput[io.decoder_output_state_2], + state1: outputState1, + state2: outputState2, }); throw new Error( `Nemo Conformer TDT decoder output is too small (${logitsData.length}) for vocab_size=${vocabSize}.`, @@ -639,8 +650,8 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { } const newState = { - state1: decoderOutput[io.decoder_output_state_1] ?? decoderState.state1, - state2: decoderOutput[io.decoder_output_state_2] ?? decoderState.state2, + state1: outputState1 ?? decoderState.state1, + state2: outputState2 ?? 
decoderState.state2, }; if (tokenId !== blankId) { diff --git a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js index 2199fd42f..c7ae1a98c 100644 --- a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js +++ b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js @@ -211,6 +211,50 @@ export default () => { }, MAX_TEST_EXECUTION_TIME, ); + + it( + "disposes auxiliary decoder tensor outputs per decode step", + async () => { + class AuxDecoderOutputModel extends NemoConformerForTDT { + constructor(config, sessions) { + super(config, sessions, {}); + this.auxDisposals = 0; + } + + async _runEncoder() { + return { + outputs: new Tensor("float32", new Float32Array([0.1, 0.2]), [1, 2, 1]), + }; + } + + async _runDecoder() { + const stateShape = [1, 1, 2]; + const aux = new Tensor("float32", new Float32Array([1, 2, 3]), [1, 1, 3]); + const originalDispose = aux.dispose.bind(aux); + aux.dispose = () => { + this.auxDisposals += 1; + originalDispose(); + }; + return { + outputs: new Tensor("float32", new Float32Array([10.0, 0.0, 0.0, 8.0, 0.0]), [1, 1, 5]), + output_states_1: new Tensor("float32", new Float32Array([0, 0]), stateShape), + output_states_2: new Tensor("float32", new Float32Array([0, 0]), stateShape), + auxiliary_scores: aux, + }; + } + } + + const model = new AuxDecoderOutputModel(BASE_CONFIG, BASE_SESSIONS); + const inputs = { + input_features: new Tensor("float32", new Float32Array([0, 0]), [1, 1, 2]), + }; + + const output = await model.transcribe(inputs, { return_timestamps: false }); + expect(output).toEqual(expect.objectContaining({ text: "" })); + expect(model.auxDisposals).toBe(1); + }, + MAX_TEST_EXECUTION_TIME, + ); }); describe("Nemo Conformer TDT utilities", () => { From ec09a090c9c6950a24da00a31e9192a994e6bc0e Mon Sep 17 00:00:00 2001 From: ysdede Date: Wed, 4 Mar 2026 00:41:21 +0300 Subject: [PATCH 16/40] perf(nemo-conformer-tdt): avoid tolist in length hot path Compute encoder length directly from attention_mask.data instead of attention_mask.tolist() to avoid large transient array allocations in ASR decode hot path. 
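As an illustration of the change, a minimal sketch contrasting the two ways of counting valid frames from a [1, T] attention mask (field names follow the surrounding code; `Number` stands in for the file's existing `toInt` helper):

// Before: tolist() materializes a nested JS array just to sum the mask.
const mask = attention_mask.tolist();
const length_via_list = mask[0].reduce((acc, x) => acc + Number(x), 0);

// After: iterate the underlying typed array directly, with no intermediate allocation.
const maskData = attention_mask.data;
let length_via_data = 0;
for (let i = 0; i < maskData.length; ++i) {
  length_via_data += Number(maskData[i]);
}
// Both yield the number of valid encoder frames for the single batch item.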
--- .../nemo_conformer_tdt/modeling_nemo_conformer_tdt.js | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js index 92e52bdbc..a29349d53 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js @@ -411,8 +411,12 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { let length = null; const attentionMask = model_inputs.attention_mask; if (attentionMask instanceof Tensor) { - const mask = attentionMask.tolist(); - length = mask[0].reduce((acc, x) => acc + toInt(x), 0); + const maskData = attentionMask.data; + let sum = 0; + for (let i = 0; i < maskData.length; ++i) { + sum += toInt(maskData[i]); + } + length = sum; } else { length = inputFeatures.dims[1]; } From 8a90a7c90cf5f434cbe63e52be4c513c6e20992b Mon Sep 17 00:00:00 2001 From: ysdede Date: Wed, 4 Mar 2026 01:36:48 +0300 Subject: [PATCH 17/40] fix(nemo-conformer-tdt): harden duration and audio validation Fail fast when duration logits are required but missing in decoder output, and enforce positive-integer vocab size at runtime config validation. Validate prepared Nemo pipeline audio for non-empty finite samples before processor/model calls. Add regression tests for missing duration logits and non-finite audio rejection. --- .../modeling_nemo_conformer_tdt.js | 15 +++++++++++++ .../pipelines/automatic-speech-recognition.js | 21 ++++++++++++++++++ .../test_modeling_nemo_conformer_tdt.js | 22 +++++++++++++++++++ ..._pipelines_automatic_speech_recognition.js | 7 ++++++ 4 files changed, 65 insertions(+) diff --git a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js index a29349d53..d16eac520 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js @@ -457,6 +457,11 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { } _validateRuntimeConfig(vocabSize) { + if (!Number.isInteger(vocabSize) || vocabSize <= 0) { + throw new Error( + `Invalid Nemo Conformer TDT config: vocab_size=${vocabSize} must be a positive integer.`, + ); + } if (this.transducer.blank_token_id >= vocabSize) { throw new Error( `Invalid Nemo Conformer TDT config: blank_token_id=${this.transducer.blank_token_id} must be < vocab_size=${vocabSize}.`, @@ -638,6 +643,16 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { const tokenId = argmax(logitsData, 0, vocabSize); const durationStart = this.transducer.duration_start_index ?? vocabSize; const hasDurationLogits = logitsData.length > durationStart; + if (this.transducer.duration_start_index != null && !hasDurationLogits) { + logits.dispose(); + this._disposeDecoderState({ + state1: outputState1, + state2: outputState2, + }); + throw new Error( + `Nemo Conformer TDT decoder output is missing duration logits: expected values beyond index ${durationStart - 1}, got length=${logitsData.length}.`, + ); + } const step = hasDurationLogits ? 
argmax(logitsData, durationStart, logitsData.length - durationStart) - durationStart : 0; diff --git a/packages/transformers/src/pipelines/automatic-speech-recognition.js b/packages/transformers/src/pipelines/automatic-speech-recognition.js index 882dfb68f..92f16c7ac 100644 --- a/packages/transformers/src/pipelines/automatic-speech-recognition.js +++ b/packages/transformers/src/pipelines/automatic-speech-recognition.js @@ -140,6 +140,24 @@ export class AutomaticSpeechRecognitionPipeline Pipeline ) { + _validateNemoAudio(audio, index) { + if (!(audio instanceof Float32Array || audio instanceof Float64Array)) { + throw new TypeError( + `Nemo Conformer TDT pipeline expected audio at index ${index} to be Float32Array or Float64Array.`, + ); + } + if (audio.length === 0) { + throw new Error(`Nemo Conformer TDT pipeline expected non-empty audio at index ${index}.`); + } + for (let i = 0; i < audio.length; ++i) { + if (!Number.isFinite(audio[i])) { + throw new Error( + `Nemo Conformer TDT pipeline expected finite audio samples; found ${audio[i]} at index ${index}:${i}.`, + ); + } + } + } + async _call(audio, kwargs = {}) { switch (this.model.config.model_type) { case 'whisper': @@ -339,6 +357,9 @@ export class AutomaticSpeechRecognitionPipeline const batchedAudio = single ? [audio] : audio; const sampling_rate = this.processor.feature_extractor.config.sampling_rate; const preparedAudios = await prepareAudios(batchedAudio, sampling_rate); + for (let i = 0; i < preparedAudios.length; ++i) { + this._validateNemoAudio(preparedAudios[i], i); + } const toReturn = []; for (const aud of preparedAudios) { diff --git a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js index c7ae1a98c..c87bd4038 100644 --- a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js +++ b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js @@ -156,6 +156,28 @@ export default () => { MAX_TEST_EXECUTION_TIME, ); + it( + "fails fast when duration logits are required but missing", + async () => { + const model = new MockNemoConformerForTDT(BASE_CONFIG, BASE_SESSIONS, [ + // Only vocab logits are returned; duration head is missing. 
+ { logits: [0.1, 10.0, 0.0] }, + ]); + + const inputs = { + input_features: new Tensor("float32", new Float32Array([0, 0, 0, 0, 0, 0]), [1, 3, 2]), + }; + + await expect( + model.transcribe(inputs, { + tokenizer: { decode: () => "" }, + return_timestamps: false, + }), + ).rejects.toThrow("missing duration logits"); + }, + MAX_TEST_EXECUTION_TIME, + ); + it("fails fast when transducer config is missing", () => { const invalidConfig = { model_type: "nemo-conformer-tdt" }; expect(() => new NemoConformerForTDT(invalidConfig, BASE_SESSIONS, {})).toThrow("Missing `transformers.js_config.transducer`"); diff --git a/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js b/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js index 8159c6098..06f8cec16 100644 --- a/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js +++ b/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js @@ -219,6 +219,13 @@ export default () => { return_metrics: true, }); }); + + it("rejects non-finite audio samples before Nemo decoding", async () => { + const { pipe } = makeUnitPipe(); + await expect(pipe(Float32Array.from([0, Number.NaN, 0]), { return_timestamps: false })).rejects.toThrow( + "finite audio samples", + ); + }); }); }); }; From ce0a3eb91113322ffefe0ada6268542c724a76eb Mon Sep 17 00:00:00 2001 From: ysdede Date: Wed, 4 Mar 2026 01:48:50 +0300 Subject: [PATCH 18/40] fix: address prioritized review findings Fix placeholder interpolation in _prepare_model_inputs error text. Add fail-fast validation for Nemo delta_window and reject duplicate decoder output aliases in transducer io config. Add regression tests for delta_window validation and duplicate decoder output alias rejection. --- .../transformers/src/models/modeling_utils.js | 5 +++-- .../feature_extraction_nemo_conformer_tdt.js | 5 +++++ .../modeling_nemo_conformer_tdt.js | 7 +++++++ ...st_feature_extraction_nemo_conformer_tdt.js | 15 +++++++++++++++ .../test_modeling_nemo_conformer_tdt.js | 18 ++++++++++++++++++ 5 files changed, 48 insertions(+), 2 deletions(-) diff --git a/packages/transformers/src/models/modeling_utils.js b/packages/transformers/src/models/modeling_utils.js index ef6e5a99a..d537b559a 100644 --- a/packages/transformers/src/models/modeling_utils.js +++ b/packages/transformers/src/models/modeling_utils.js @@ -857,8 +857,9 @@ export class PreTrainedModel extends Callable { if (input_name in model_inputs) { if (inputs) { throw new Error( - '`inputs`: {inputs}` were passed alongside {input_name} which is not allowed. ' + - 'Make sure to either pass {inputs} or {input_name}=...', + '`inputs` was passed alongside ' + + `\`${input_name}\` which is not allowed. 
` + + `Make sure to either pass \`inputs\` or \`${input_name}\`=...`, ); } } else { diff --git a/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js index 8907b2ab8..3c5e5bd97 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js @@ -47,6 +47,11 @@ export class NemoConformerTDTFeatureExtractor extends FeatureExtractor { `NemoConformerTDTFeatureExtractor expected delta_order in {0,1,2}, got ${this.delta_order}.`, ); } + if (!Number.isInteger(this.delta_window) || this.delta_window < 1) { + throw new Error( + `NemoConformerTDTFeatureExtractor expected \`delta_window\` as a positive integer, got ${this.delta_window}.`, + ); + } if (this.delta_order > 0 && !this.delta_concatenate) { logger.warn( 'NemoConformerTDTFeatureExtractor: `delta_concatenate=false` is set. ' + diff --git a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js index d16eac520..efc9a1576 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js @@ -107,6 +107,13 @@ function resolveTransducerConfig(config, sessions) { ...DEFAULT_TRANSDUCER_IO, ...(transducerConfig.io ?? {}), }; + const requiredDecoderOutputs = [io.decoder_output, io.decoder_output_state_1, io.decoder_output_state_2]; + if (new Set(requiredDecoderOutputs).size !== requiredDecoderOutputs.length) { + throw new Error( + 'Invalid `transformers.js_config.transducer.io`: decoder output names must be distinct ' + + '(decoder_output, decoder_output_state_1, decoder_output_state_2).', + ); + } const decoderSession = sessions?.decoder_model_merged; if (!decoderSession) { diff --git a/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js b/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js index 71b67e60d..6ded8dda4 100644 --- a/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js +++ b/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js @@ -117,5 +117,20 @@ export default () => { }, MAX_TEST_EXECUTION_TIME, ); + + it("validates delta_window at construction time", () => { + expect( + () => new NemoConformerTDTFeatureExtractor({ ...base, feature_size: 80, delta_order: 1, delta_window: 0 }), + ).toThrow("delta_window"); + expect( + () => + new NemoConformerTDTFeatureExtractor({ + ...base, + feature_size: 80, + delta_order: 1, + delta_window: 1.5, + }), + ).toThrow("delta_window"); + }); }); }; diff --git a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js index c87bd4038..342d06f26 100644 --- a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js +++ b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js @@ -197,6 +197,24 @@ export default () => { expect(() => new NemoConformerForTDT(invalidConfig, BASE_SESSIONS, {})).toThrow("encoder_output_layout"); }); + 
it("rejects duplicate decoder output aliases in transducer io config", () => { + const invalidConfig = { + ...BASE_CONFIG, + "transformers.js_config": { + ...BASE_CONFIG["transformers.js_config"], + transducer: { + ...BASE_CONFIG["transformers.js_config"].transducer, + io: { + decoder_output: "outputs", + decoder_output_state_1: "outputs", + decoder_output_state_2: "output_states_2", + }, + }, + }, + }; + expect(() => new NemoConformerForTDT(invalidConfig, BASE_SESSIONS, {})).toThrow("must be distinct"); + }); + it( "disposes encoder outputs when frame-count validation fails before decode", async () => { From 5d91d396ec463481844351d613f61f88c8b41646 Mon Sep 17 00:00:00 2001 From: ysdede Date: Wed, 4 Mar 2026 01:58:28 +0300 Subject: [PATCH 19/40] fix(nemo-conformer-tdt): apply low-risk hardening follow-ups Validate transcribe timeOffset as finite and guard encoderOutputs cleanup path to avoid masking primary failures. Align transducer_text JSDoc token type with runtime shape (include id). Harden Parakeet feature extractor test by using direct mask data and explicit tensor disposal via try/finally; add timeOffset validation regression test. --- .../modeling_nemo_conformer_tdt.js | 15 +++++++---- .../nemo_conformer_tdt/transducer_text.js | 2 +- .../test_modeling_nemo_conformer_tdt.js | 19 ++++++++++++++ .../test_feature_extraction_parakeet.js | 26 +++++++++++-------- 4 files changed, 45 insertions(+), 17 deletions(-) diff --git a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js index efc9a1576..eda732983 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js @@ -533,6 +533,9 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { timeOffset = 0, } = {}, ) { + if (!Number.isFinite(timeOffset)) { + throw new Error('NemoConformerForTDT.transcribe expected `timeOffset` to be a finite number.'); + } const totalStart = nowMs(); const io = this.transducer.io; const vocabSize = this._resolveVocabSize(tokenizer); @@ -720,11 +723,13 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { } finally { if (targetLengthTensor) targetLengthTensor.dispose(); if (decoderState) this._disposeDecoderState(decoderState); - const seen = new Set(); - for (const value of Object.values(encoderOutputs)) { - if (value instanceof Tensor && !seen.has(value)) { - value.dispose(); - seen.add(value); + if (encoderOutputs && typeof encoderOutputs === 'object') { + const seen = new Set(); + for (const value of Object.values(encoderOutputs)) { + if (value instanceof Tensor && !seen.has(value)) { + value.dispose(); + seen.add(value); + } } } } diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js index a68bb828d..d12a27553 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js @@ -97,7 +97,7 @@ export function decodeTransducerText(tokenizer, token_ids) { * @param {number[] | null} token_confidences * @returns {{ * words: Array<{ text: string, start_time: number, end_time: number, confidence?: number }>, - * tokens: Array<{ token: string, raw_token: string, is_word_start: boolean, start_time: number, end_time: number, confidence?: 
number }>, + * tokens: Array<{ id: number, token: string, raw_token: string, is_word_start: boolean, start_time: number, end_time: number, confidence?: number }>, * word_confidences: (number | null)[] | null, * word_avg: number | null, * }} diff --git a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js index 342d06f26..29b060605 100644 --- a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js +++ b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js @@ -156,6 +156,25 @@ export default () => { MAX_TEST_EXECUTION_TIME, ); + it( + "rejects non-finite timeOffset", + async () => { + const model = new MockNemoConformerForTDT(BASE_CONFIG, BASE_SESSIONS, [{ logits: [9.0, 0.0, 0.0, 1.0] }]); + const inputs = { + input_features: new Tensor("float32", new Float32Array([0, 0, 0, 0, 0, 0]), [1, 3, 2]), + }; + + await expect( + model.transcribe(inputs, { + tokenizer: { decode: () => "" }, + return_timestamps: true, + timeOffset: Number.NaN, + }), + ).rejects.toThrow("timeOffset"); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( "fails fast when duration logits are required but missing", async () => { diff --git a/packages/transformers/tests/models/parakeet/test_feature_extraction_parakeet.js b/packages/transformers/tests/models/parakeet/test_feature_extraction_parakeet.js index c0e67a58b..82ece82a9 100644 --- a/packages/transformers/tests/models/parakeet/test_feature_extraction_parakeet.js +++ b/packages/transformers/tests/models/parakeet/test_feature_extraction_parakeet.js @@ -27,17 +27,21 @@ export default () => { const audio = Float32Array.from({ length: total }, (_, i) => Math.sin((2 * Math.PI * 220 * i) / config.sampling_rate)); const { input_features, attention_mask } = await feature_extractor(audio); - - expect(input_features.dims[0]).toBe(1); - expect(input_features.dims[2]).toBe(config.feature_size); - expect(attention_mask.dims).toEqual([1, input_features.dims[1]]); - - const validFrames = attention_mask.tolist()[0].reduce((acc, x) => acc + Number(x), 0); - expect(validFrames).toBeGreaterThan(0); - expect(validFrames).toBeLessThanOrEqual(input_features.dims[1]); - - const preview = Array.from(input_features.data.slice(0, 256)); - expect(preview.every(Number.isFinite)).toBe(true); + try { + expect(input_features.dims[0]).toBe(1); + expect(input_features.dims[2]).toBe(config.feature_size); + expect(attention_mask.dims).toEqual([1, input_features.dims[1]]); + + const validFrames = attention_mask.data.reduce((acc, x) => acc + Number(x), 0); + expect(validFrames).toBeGreaterThan(0); + expect(validFrames).toBeLessThanOrEqual(input_features.dims[1]); + + const preview = Array.from(input_features.data.slice(0, 256)); + expect(preview.every(Number.isFinite)).toBe(true); + } finally { + input_features.dispose(); + attention_mask.dispose(); + } }, MAX_TEST_EXECUTION_TIME, ); From dfc2c130aa3b599d76e15a807fa15101893c4fa1 Mon Sep 17 00:00:00 2001 From: ysdede Date: Wed, 4 Mar 2026 02:24:23 +0300 Subject: [PATCH 20/40] fix(nemo-conformer-tdt): enforce named outputs and frame-level confidences --- .../modeling_nemo_conformer_tdt.js | 44 +++++-- ...t_feature_extraction_nemo_conformer_tdt.js | 28 +++-- .../test_modeling_nemo_conformer_tdt.js | 112 ++++++++++++++++++ 3 files changed, 170 insertions(+), 14 deletions(-) diff --git a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js 
b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js index eda732983..c47407a56 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js @@ -313,7 +313,15 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { _getEncoderOutput(outputs) { const name = this.transducer.io.encoder_output; - return outputs[name] ?? Object.values(outputs)[0]; + const out = outputs?.[name]; + if (!(out instanceof Tensor)) { + const available = outputs && typeof outputs === 'object' ? Object.keys(outputs).join(', ') : '(none)'; + throw new Error( + `Nemo Conformer TDT encoder output "${name}" was not returned by the session. ` + + `Available outputs: ${available}.`, + ); + } + return out; } _getEncoderFrameCount(encoderOutput) { @@ -570,8 +578,8 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { const tokenTimestamps = []; /** @type {number[] | null} */ const tokenConfidences = needConfidences ? [] : null; - /** @type {number[] | null} */ - const frameConfidences = returnFrameConfidences ? [] : null; + /** @type {Map | null} */ + const frameConfidenceStats = returnFrameConfidences ? new Map() : null; /** @type {number[] | null} */ const frameIndices = returnFrameIndices ? [] : null; /** @type {number[] | null} */ @@ -627,7 +635,7 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { frameTensor.dispose(); } - const logits = decoderOutput[io.decoder_output] ?? Object.values(decoderOutput)[0]; + const logits = decoderOutput[io.decoder_output]; const outputState1 = decoderOutput[io.decoder_output_state_1]; const outputState2 = decoderOutput[io.decoder_output_state_2]; const seenDecoderTensors = new Set(); @@ -639,6 +647,18 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { } value.dispose(); } + if (!(logits instanceof Tensor)) { + this._disposeDecoderState( + { + state1: outputState1, + state2: outputState2, + }, + decoderState, + ); + throw new Error( + `Nemo Conformer TDT decoder output "${io.decoder_output}" was not returned by the session.`, + ); + } const logitsData = logits.data; if (logitsData.length < vocabSize) { logits.dispose(); @@ -674,8 +694,14 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { needConfidences || returnLogProbs || returnFrameConfidences ? confidenceFromLogits(logitsData, tokenId, vocabSize) : null; - if (frameConfidences && maybeConfidence) { - frameConfidences.push(maybeConfidence.confidence); + if (frameConfidenceStats && maybeConfidence) { + const stats = frameConfidenceStats.get(frameIndex); + if (stats) { + stats.sum += maybeConfidence.confidence; + stats.count += 1; + } else { + frameConfidenceStats.set(frameIndex, { sum: maybeConfidence.confidence, count: 1 }); + } } const newState = { @@ -779,7 +805,11 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { } // Frame confidences are independent of return_timestamps — emit whenever requested. 
- if (returnFrameConfidences && frameConfidences && frameConfidences.length > 0) { + if (returnFrameConfidences && frameConfidenceStats && frameConfidenceStats.size > 0) { + const frameConfidences = []; + for (const { sum, count } of frameConfidenceStats.values()) { + frameConfidences.push(sum / count); + } if (!result.confidence_scores) { result.confidence_scores = {}; } diff --git a/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js b/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js index 6ded8dda4..8c184800d 100644 --- a/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js +++ b/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js @@ -19,9 +19,14 @@ export default () => { async () => { const extractor = new NemoConformerTDTFeatureExtractor({ ...base, feature_size: 80 }); const { input_features, attention_mask } = await extractor(audio); - expect(input_features.dims[0]).toBe(1); - expect(input_features.dims[2]).toBe(80); - expect(attention_mask.dims).toEqual([1, input_features.dims[1]]); + try { + expect(input_features.dims[0]).toBe(1); + expect(input_features.dims[2]).toBe(80); + expect(attention_mask.dims).toEqual([1, input_features.dims[1]]); + } finally { + input_features.dispose(); + attention_mask.dispose(); + } }, MAX_TEST_EXECUTION_TIME, ); @@ -31,9 +36,14 @@ export default () => { async () => { const extractor = new NemoConformerTDTFeatureExtractor({ ...base, feature_size: 128 }); const { input_features, attention_mask } = await extractor(audio); - expect(input_features.dims[0]).toBe(1); - expect(input_features.dims[2]).toBe(128); - expect(attention_mask.dims).toEqual([1, input_features.dims[1]]); + try { + expect(input_features.dims[0]).toBe(1); + expect(input_features.dims[2]).toBe(128); + expect(attention_mask.dims).toEqual([1, input_features.dims[1]]); + } finally { + input_features.dispose(); + attention_mask.dispose(); + } }, MAX_TEST_EXECUTION_TIME, ); @@ -49,7 +59,11 @@ export default () => { delta_concatenate: true, }); const { input_features } = await extractor(audio); - expect(input_features.dims[2]).toBe(128 * 3); + try { + expect(input_features.dims[2]).toBe(128 * 3); + } finally { + input_features.dispose(); + } }, MAX_TEST_EXECUTION_TIME, ); diff --git a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js index 29b060605..2e185e41f 100644 --- a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js +++ b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js @@ -156,6 +156,37 @@ export default () => { MAX_TEST_EXECUTION_TIME, ); + it( + "aggregates frame confidences per encoder frame (not per decode step)", + async () => { + const model = new MockNemoConformerForTDT(BASE_CONFIG, BASE_SESSIONS, [ + // Frame 0: emit token=1, step=0 + { logits: [0.0, 4.0, -2.0, 9.0, 1.0, 0.0] }, + // Frame 0: emit token=2, step=0 (hits max_symbols_per_step and advances frame) + { logits: [0.0, -1.0, 3.0, 9.0, 1.0, 0.0] }, + // Frame 1: emit blank, step=2 -> exits decode loop + { logits: [5.0, 0.0, 0.0, 0.0, 1.0, 9.0] }, + ]); + + const inputs = { + input_features: new Tensor("float32", new Float32Array([0, 0, 0, 0, 0, 0]), [1, 3, 2]), + }; + + const output = await model.transcribe(inputs, { + 
return_timestamps: false, + returnFrameConfidences: true, + }); + + expect(output.confidence_scores.frame).toHaveLength(2); + expect(output.confidence_scores.frame[0]).toBeCloseTo(0.9579343795, 6); + expect(output.confidence_scores.frame_avg).toBeCloseTo( + (output.confidence_scores.frame[0] + output.confidence_scores.frame[1]) / 2, + 6, + ); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( "rejects non-finite timeOffset", async () => { @@ -216,6 +247,87 @@ export default () => { expect(() => new NemoConformerForTDT(invalidConfig, BASE_SESSIONS, {})).toThrow("encoder_output_layout"); }); + it( + "fails fast when named encoder output is missing at runtime", + async () => { + class MissingEncoderOutputModel extends NemoConformerForTDT { + async _runEncoder() { + return { + outputs: new Tensor("float32", new Float32Array([0.1, 0.2]), [1, 2, 1]), + }; + } + + async _runDecoder() { + const stateShape = [1, 1, 2]; + return { + outputs: new Tensor("float32", new Float32Array([9.0, 0.0, 0.0, 8.0]), [1, 1, 4]), + output_states_1: new Tensor("float32", new Float32Array([0, 0]), stateShape), + output_states_2: new Tensor("float32", new Float32Array([0, 0]), stateShape), + }; + } + } + + const config = { + ...BASE_CONFIG, + "transformers.js_config": { + ...BASE_CONFIG["transformers.js_config"], + transducer: { + ...BASE_CONFIG["transformers.js_config"].transducer, + io: { encoder_output: "encoder_out" }, + }, + }, + }; + const sessions = { + ...BASE_SESSIONS, + encoder_model: { + ...BASE_SESSIONS.encoder_model, + outputNames: ["encoder_out"], + }, + }; + const model = new MissingEncoderOutputModel(config, sessions, {}); + const inputs = { + input_features: new Tensor("float32", new Float32Array([0, 0, 0, 0, 0, 0]), [1, 3, 2]), + }; + + await expect(model.transcribe(inputs, { tokenizer: { decode: () => "" } })).rejects.toThrow( + 'encoder output "encoder_out" was not returned', + ); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "fails fast when named decoder logits output is missing at runtime", + async () => { + class MissingDecoderOutputModel extends NemoConformerForTDT { + async _runEncoder() { + return { + outputs: new Tensor("float32", new Float32Array([0.1, 0.2]), [1, 2, 1]), + }; + } + + async _runDecoder() { + const stateShape = [1, 1, 2]; + return { + unexpected_logits: new Tensor("float32", new Float32Array([9.0, 0.0, 0.0, 8.0]), [1, 1, 4]), + output_states_1: new Tensor("float32", new Float32Array([0, 0]), stateShape), + output_states_2: new Tensor("float32", new Float32Array([0, 0]), stateShape), + }; + } + } + + const model = new MissingDecoderOutputModel(BASE_CONFIG, BASE_SESSIONS, {}); + const inputs = { + input_features: new Tensor("float32", new Float32Array([0, 0, 0, 0, 0, 0]), [1, 3, 2]), + }; + + await expect(model.transcribe(inputs, { tokenizer: { decode: () => "" } })).rejects.toThrow( + 'decoder output "outputs" was not returned', + ); + }, + MAX_TEST_EXECUTION_TIME, + ); + it("rejects duplicate decoder output aliases in transducer io config", () => { const invalidConfig = { ...BASE_CONFIG, From a5bd2cf71dd85fe1472f479117ace8f173bab7b5 Mon Sep 17 00:00:00 2001 From: ysdede Date: Wed, 4 Mar 2026 02:29:57 +0300 Subject: [PATCH 21/40] docs(nemo-conformer-tdt): clarify cached tensor sharing semantics --- .../feature_extraction_nemo_conformer_tdt.js | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js 
b/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js index 3c5e5bd97..3e66d492c 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js @@ -119,6 +119,8 @@ export class NemoConformerTDTFeatureExtractor extends FeatureExtractor { * delta_features?: Tensor; * delta_delta_features?: Tensor; * }>} A Promise resolving to an object containing extracted model inputs. + * When cache is enabled, tensor instances are shared with cached entries. + * Do not mutate or dispose returned tensors unless cache is disabled/cleared. */ async _call(audio) { validate_audio_inputs(audio, 'NemoConformerTDTFeatureExtractor'); @@ -127,7 +129,7 @@ export class NemoConformerTDTFeatureExtractor extends FeatureExtractor { const key = `${createAudioCacheKey(audio, this.config.sampling_rate)}:${this.delta_order}:${this.delta_window}:${this.delta_concatenate}`; const cached = this.feature_cache.get(key); if (cached) { - return cached; + return { ...cached }; } const extracted = await this._extract(audio); From abada622a54ee4f2784a5b2ad5abf48397c1b2f9 Mon Sep 17 00:00:00 2001 From: ysdede Date: Wed, 4 Mar 2026 02:47:17 +0300 Subject: [PATCH 22/40] fix(nemo-conformer-tdt): harden decoder I/O validation and feed cleanup --- .../modeling_nemo_conformer_tdt.js | 16 +++++++++++++++ ...t_feature_extraction_nemo_conformer_tdt.js | 15 +++++++++----- .../test_modeling_nemo_conformer_tdt.js | 20 +++++++++++++++++++ 3 files changed, 46 insertions(+), 5 deletions(-) diff --git a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js index c47407a56..b29359799 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js @@ -107,6 +107,19 @@ function resolveTransducerConfig(config, sessions) { ...DEFAULT_TRANSDUCER_IO, ...(transducerConfig.io ?? 
{}), }; + const requiredDecoderInputs = [ + io.decoder_encoder, + io.decoder_token, + io.decoder_token_length, + io.decoder_state_1, + io.decoder_state_2, + ]; + if (new Set(requiredDecoderInputs).size !== requiredDecoderInputs.length) { + throw new Error( + 'Invalid `transformers.js_config.transducer.io`: decoder input names must be distinct ' + + '(decoder_encoder, decoder_token, decoder_token_length, decoder_state_1, decoder_state_2).', + ); + } const requiredDecoderOutputs = [io.decoder_output, io.decoder_output_state_1, io.decoder_output_state_2]; if (new Set(requiredDecoderOutputs).size !== requiredDecoderOutputs.length) { throw new Error( @@ -445,6 +458,9 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { } if (missingInputs.length > 0) { + for (const tensor of disposables) { + tensor.dispose(); + } throw new Error( `Nemo Conformer TDT encoder session expects additional inputs that are not available: ${missingInputs.join(', ')}.`, ); diff --git a/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js b/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js index 8c184800d..ef852a3b5 100644 --- a/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js +++ b/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js @@ -109,12 +109,17 @@ export default () => { feature_cache_max_entries: 8, feature_cache_max_size_mb: 8, }); - const first = await extractor(audio); - const second = await extractor(audio); + try { + const first = await extractor(audio); + const second = await extractor(audio); - expect(first).toBe(second); - expect(extractor.get_cache_stats().entries).toBe(1); - extractor.clear_cache(); + expect(first).not.toBe(second); + expect(first.input_features).toBe(second.input_features); + expect(first.attention_mask).toBe(second.attention_mask); + expect(extractor.get_cache_stats().entries).toBe(1); + } finally { + extractor.clear_cache(); + } expect(extractor.get_cache_stats().entries).toBe(0); }, MAX_TEST_EXECUTION_TIME, diff --git a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js index 2e185e41f..652038463 100644 --- a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js +++ b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js @@ -346,6 +346,26 @@ export default () => { expect(() => new NemoConformerForTDT(invalidConfig, BASE_SESSIONS, {})).toThrow("must be distinct"); }); + it("rejects duplicate decoder input aliases in transducer io config", () => { + const invalidConfig = { + ...BASE_CONFIG, + "transformers.js_config": { + ...BASE_CONFIG["transformers.js_config"], + transducer: { + ...BASE_CONFIG["transformers.js_config"].transducer, + io: { + decoder_encoder: "encoder_outputs", + decoder_token: "targets", + decoder_token_length: "target_length", + decoder_state_1: "input_states_1", + decoder_state_2: "input_states_1", + }, + }, + }, + }; + expect(() => new NemoConformerForTDT(invalidConfig, BASE_SESSIONS, {})).toThrow("must be distinct"); + }); + it( "disposes encoder outputs when frame-count validation fails before decode", async () => { From 62d8bc0d0d3ab0634cf2b460827122cd8a53575a Mon Sep 17 00:00:00 2001 From: ysdede Date: Thu, 5 Mar 2026 23:18:07 +0300 Subject: 
[PATCH 23/40] fix(nemo-conformer-tdt): address bot review findings - fail fast on missing decoder state outputs and invalid encoder layout enums\n- make FeatureLRUCache own cached tensor lifetimes (replace/evict/clear) with deduped disposal and deterministic size fallback\n- validate n_fft/win_length in Nemo feature extractor\n- align Nemo ASR pipeline docs with actual forwarded options\n- add regression coverage for runtime config validation, non-concatenated deltas/cache behavior, missing decoder state outputs, and cache disposal semantics\n\nValidation:\n- pnpm test -- tests/models.test.js --filter nemo_conformer_tdt\n- pnpm test -- tests/pipelines.test.js --filter automatic_speech_recognition --- .../feature_extraction_nemo_conformer_tdt.js | 17 +- .../modeling_nemo_conformer_tdt.js | 35 +++- .../nemo_conformer_tdt/transducer_cache.js | 63 +++++- .../pipelines/automatic-speech-recognition.js | 6 +- ...t_feature_extraction_nemo_conformer_tdt.js | 74 ++++++- .../test_modeling_nemo_conformer_tdt.js | 197 ++++++++++++++++++ 6 files changed, 375 insertions(+), 17 deletions(-) diff --git a/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js index 3e66d492c..1f569365b 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js @@ -17,6 +17,21 @@ export class NemoConformerTDTFeatureExtractor extends FeatureExtractor { constructor(config) { super(config); + if (!Number.isInteger(this.config.n_fft) || this.config.n_fft <= 0) { + throw new Error( + `NemoConformerTDTFeatureExtractor expected \`n_fft\` as a positive integer, got ${this.config.n_fft}.`, + ); + } + if ( + !Number.isInteger(this.config.win_length) || + this.config.win_length <= 0 || + this.config.win_length > this.config.n_fft + ) { + throw new Error( + `NemoConformerTDTFeatureExtractor expected \`win_length\` in [1, n_fft], got win_length=${this.config.win_length}, n_fft=${this.config.n_fft}.`, + ); + } + // Prefer given `mel_filters` from preprocessor_config.json, or calculate them if they don't exist. this.config.mel_filters ??= mel_filter_bank( Math.floor(1 + this.config.n_fft / 2), // num_frequency_bins @@ -119,7 +134,7 @@ export class NemoConformerTDTFeatureExtractor extends FeatureExtractor { * delta_features?: Tensor; * delta_delta_features?: Tensor; * }>} A Promise resolving to an object containing extracted model inputs. - * When cache is enabled, tensor instances are shared with cached entries. + * When cache is enabled, tensor instances are shared and owned by the cache. * Do not mutate or dispose returned tensors unless cache is disabled/cleared. */ async _call(audio) { diff --git a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js index b29359799..02e3e3c42 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js @@ -175,6 +175,8 @@ function resolveTransducerConfig(config, sessions) { const frameShiftS = transducerConfig.frame_shift_s ?? 0.01; const blankTokenId = transducerConfig.blank_token_id ?? 
0; const encoderOutputLayout = transducerConfig.encoder_output_layout; + const encoderInputLayout = transducerConfig.encoder_input_layout ?? 'BTF'; + const encoderFrameLayout = transducerConfig.encoder_frame_layout ?? 'BD1'; const decoderTokenDType = transducerConfig.decoder_token_dtype ?? 'int32'; const decoderTokenLengthDType = transducerConfig.decoder_token_length_dtype ?? 'int32'; @@ -195,6 +197,12 @@ function resolveTransducerConfig(config, sessions) { if (encoderOutputLayout !== 'BDT' && encoderOutputLayout !== 'BTD') { throw new Error('Invalid `transformers.js_config.transducer.encoder_output_layout`: expected "BDT" or "BTD".'); } + if (encoderInputLayout !== 'BTF' && encoderInputLayout !== 'BFT') { + throw new Error('Invalid `transformers.js_config.transducer.encoder_input_layout`: expected "BTF" or "BFT".'); + } + if (encoderFrameLayout !== 'BD1' && encoderFrameLayout !== 'B1D') { + throw new Error('Invalid `transformers.js_config.transducer.encoder_frame_layout`: expected "BD1" or "B1D".'); + } if (!['int32', 'int64'].includes(decoderTokenDType)) { throw new Error( 'Invalid `transformers.js_config.transducer.decoder_token_dtype`: expected "int32" or "int64".', @@ -213,9 +221,9 @@ function resolveTransducerConfig(config, sessions) { frame_shift_s: frameShiftS, vocab_size: transducerConfig.vocab_size ?? config.vocab_size ?? null, duration_start_index: transducerConfig.duration_start_index ?? null, - encoder_input_layout: transducerConfig.encoder_input_layout ?? 'BTF', + encoder_input_layout: encoderInputLayout, encoder_output_layout: encoderOutputLayout, - encoder_frame_layout: transducerConfig.encoder_frame_layout ?? 'BD1', + encoder_frame_layout: encoderFrameLayout, decoder_token_dtype: decoderTokenDType, decoder_token_length_dtype: decoderTokenLengthDType, decoder: { @@ -316,10 +324,10 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { _disposeDecoderState(state, keepState = null) { if (!state) return; - if (state.state1 && state.state1 !== keepState?.state1) { + if (state.state1 instanceof Tensor && state.state1 !== keepState?.state1) { state.state1.dispose(); } - if (state.state2 && state.state2 !== keepState?.state2) { + if (state.state2 instanceof Tensor && state.state2 !== keepState?.state2) { state.state2.dispose(); } } @@ -470,7 +478,7 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { } _resolveVocabSize(tokenizer) { - if (Number.isInteger(this.transducer.vocab_size) && this.transducer.vocab_size > 0) { + if (Number.isInteger(this.transducer.vocab_size)) { return this.transducer.vocab_size; } @@ -675,6 +683,19 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { `Nemo Conformer TDT decoder output "${io.decoder_output}" was not returned by the session.`, ); } + if (!(outputState1 instanceof Tensor) || !(outputState2 instanceof Tensor)) { + logits.dispose(); + this._disposeDecoderState( + { + state1: outputState1, + state2: outputState2, + }, + decoderState, + ); + throw new Error( + `Nemo Conformer TDT decoder state outputs "${io.decoder_output_state_1}" and "${io.decoder_output_state_2}" were not returned by the session.`, + ); + } const logitsData = logits.data; if (logitsData.length < vocabSize) { logits.dispose(); @@ -721,8 +742,8 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { } const newState = { - state1: outputState1 ?? decoderState.state1, - state2: outputState2 ?? 
decoderState.state2, + state1: outputState1, + state2: outputState2, }; if (tokenId !== blankId) { diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js index 4ea17da0e..6dded2d5d 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js @@ -26,7 +26,7 @@ export function createAudioCacheKey(audio, sampling_rate = 16000) { /** * Lightweight LRU cache for extracted features. - * Stores values as-is and tracks approximate memory usage. + * Stores values as-is, owns cached tensor lifetimes, and tracks approximate memory usage. */ export class FeatureLRUCache { /** @@ -65,6 +65,7 @@ export class FeatureLRUCache { set(key, value) { const existing = this.cache.get(key); if (existing) { + disposeCachedValue(existing.value); this.current_size_bytes -= existing.size_bytes; this.cache.delete(key); } @@ -76,6 +77,9 @@ export class FeatureLRUCache { } clear() { + for (const { value } of this.cache.values()) { + disposeCachedValue(value); + } this.cache.clear(); this.current_size_bytes = 0; } @@ -96,25 +100,72 @@ export class FeatureLRUCache { if (oldest_key === undefined) break; const oldest = this.cache.get(oldest_key); this.cache.delete(oldest_key); + disposeCachedValue(oldest?.value); this.current_size_bytes -= oldest?.size_bytes ?? 0; } } } +function tensorByteSize(tensor) { + let byteLength = null; + try { + byteLength = /** @type {any} */ (tensor.data)?.byteLength ?? null; + } catch { + byteLength = null; + } + if (typeof byteLength === 'number' && byteLength >= 0) { + return byteLength; + } + + const bytesPerElement = { + bool: 1, + int8: 1, + uint8: 1, + int16: 2, + uint16: 2, + int32: 4, + uint32: 4, + int64: 8, + uint64: 8, + float16: 2, + float32: 4, + float64: 8, + }; + return tensor.size * (bytesPerElement[tensor.type] ?? 4); +} + +function collectCachedTensors(value, out = new Set()) { + if (value instanceof Tensor) { + out.add(value); + return out; + } + if (value?.input_features instanceof Tensor) out.add(value.input_features); + if (value?.attention_mask instanceof Tensor) out.add(value.attention_mask); + if (value?.delta_features instanceof Tensor) out.add(value.delta_features); + if (value?.delta_delta_features instanceof Tensor) out.add(value.delta_delta_features); + return out; +} + +function disposeCachedValue(value) { + for (const tensor of collectCachedTensors(value)) { + tensor.dispose(); + } +} + function estimateSizeBytes(value) { if (value instanceof Tensor) { - return /** @type {any} */ (value.data)?.byteLength ?? 0; + return tensorByteSize(value); } if (value?.input_features instanceof Tensor) { - let bytes = /** @type {any} */ (value.input_features.data)?.byteLength ?? 0; + let bytes = tensorByteSize(value.input_features); if (value.attention_mask instanceof Tensor) { - bytes += /** @type {any} */ (value.attention_mask.data)?.byteLength ?? 0; + bytes += tensorByteSize(value.attention_mask); } if (value.delta_features instanceof Tensor) { - bytes += /** @type {any} */ (value.delta_features.data)?.byteLength ?? 0; + bytes += tensorByteSize(value.delta_features); } if (value.delta_delta_features instanceof Tensor) { - bytes += /** @type {any} */ (value.delta_delta_features.data)?.byteLength ?? 
0; + bytes += tensorByteSize(value.delta_delta_features); } return bytes; } diff --git a/packages/transformers/src/pipelines/automatic-speech-recognition.js b/packages/transformers/src/pipelines/automatic-speech-recognition.js index 92f16c7ac..e4b0f8f90 100644 --- a/packages/transformers/src/pipelines/automatic-speech-recognition.js +++ b/packages/transformers/src/pipelines/automatic-speech-recognition.js @@ -325,8 +325,10 @@ export class AutomaticSpeechRecognitionPipeline * * Delegates to model.transcribe() and returns its output directly. * Use `return_timestamps: true` on the pipeline call to get utterance-level data. - * For words/tokens/metrics/debug, call model.transcribe() directly with the - * extended options (return_words, return_tokens, return_metrics, etc.). + * This pipeline always requests metrics, and enables word details when + * timestamps are requested. + * For token-level and debug controls, call `model.transcribe()` directly with + * extended options. */ async _call_nemo_conformer_tdt(audio, kwargs) { if (typeof (/** @type {any} */ (this.model).transcribe) !== 'function') { diff --git a/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js b/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js index ef852a3b5..74f60d8d0 100644 --- a/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js +++ b/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js @@ -68,6 +68,35 @@ export default () => { MAX_TEST_EXECUTION_TIME, ); + it( + "supports non-concatenated delta and delta-delta features", + async () => { + const extractor = new NemoConformerTDTFeatureExtractor({ + ...base, + feature_size: 80, + delta_order: 2, + delta_window: 2, + delta_concatenate: false, + }); + const { input_features, delta_features, delta_delta_features, attention_mask } = await extractor(audio); + try { + expect(input_features.dims[0]).toBe(1); + expect(input_features.dims[2]).toBe(80); + expect(delta_features).toBeDefined(); + expect(delta_delta_features).toBeDefined(); + expect(delta_features.dims).toEqual(input_features.dims); + expect(delta_delta_features.dims).toEqual(input_features.dims); + expect(attention_mask.dims).toEqual([1, input_features.dims[1]]); + } finally { + input_features.dispose(); + delta_features?.dispose(); + delta_delta_features?.dispose(); + attention_mask.dispose(); + } + }, + MAX_TEST_EXECUTION_TIME, + ); + it( "disposes replaced base features when concatenated delta output is used", async () => { @@ -86,11 +115,13 @@ export default () => { return originalDispose.call(this); }; + let input_features; try { - const { input_features } = await extractor(audio); + ({ input_features } = await extractor(audio)); expect(input_features.dims[2]).toBe(80 * 2); } finally { Tensor.prototype.dispose = originalDispose; + input_features?.dispose(); } // One dispose from computeTemporalDeltas intermediate tensor, one from replacing base features tensor. 
@@ -125,6 +156,37 @@ export default () => { MAX_TEST_EXECUTION_TIME, ); + it( + "uses feature cache when enabled for non-concatenated delta outputs", + async () => { + const extractor = new NemoConformerTDTFeatureExtractor({ + ...base, + feature_size: 80, + delta_order: 2, + delta_window: 2, + delta_concatenate: false, + use_feature_cache: true, + feature_cache_max_entries: 8, + feature_cache_max_size_mb: 8, + }); + try { + const first = await extractor(audio); + const second = await extractor(audio); + + expect(first).not.toBe(second); + expect(first.input_features).toBe(second.input_features); + expect(first.attention_mask).toBe(second.attention_mask); + expect(first.delta_features).toBe(second.delta_features); + expect(first.delta_delta_features).toBe(second.delta_delta_features); + expect(extractor.get_cache_stats().entries).toBe(1); + } finally { + extractor.clear_cache(); + } + expect(extractor.get_cache_stats().entries).toBe(0); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( "validates preemphasis range", async () => { @@ -151,5 +213,15 @@ export default () => { }), ).toThrow("delta_window"); }); + + it("validates n_fft and win_length at construction time", () => { + expect(() => new NemoConformerTDTFeatureExtractor({ ...base, feature_size: 80, n_fft: 0 })).toThrow("n_fft"); + expect(() => new NemoConformerTDTFeatureExtractor({ ...base, feature_size: 80, win_length: 0 })).toThrow( + "win_length", + ); + expect(() => new NemoConformerTDTFeatureExtractor({ ...base, feature_size: 80, win_length: 1024 })).toThrow( + "win_length", + ); + }); }); }; diff --git a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js index 652038463..20f64da77 100644 --- a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js +++ b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js @@ -80,6 +80,96 @@ export default () => { expect(MODEL_TYPE_MAPPING.get("nemo-conformer-tdt")).toBe(MODEL_TYPES.NemoConformerTDT); }); + it( + "throws on invalid runtime config: vocab_size must be > 0", + async () => { + const invalidConfig = { + ...BASE_CONFIG, + "transformers.js_config": { + ...BASE_CONFIG["transformers.js_config"], + transducer: { + ...BASE_CONFIG["transformers.js_config"].transducer, + vocab_size: 0, + }, + }, + }; + const model = new MockNemoConformerForTDT(invalidConfig, BASE_SESSIONS, []); + const inputs = { + input_features: new Tensor("float32", new Float32Array([0, 0]), [1, 1, 2]), + }; + + await expect( + model.transcribe(inputs, { + tokenizer: { + decode: () => "", + get_vocab: () => new Map([["a", 0]]), + }, + }), + ).rejects.toThrow("vocab_size"); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "throws on invalid runtime config: blank_token_id must be < vocab_size", + async () => { + const invalidConfig = { + ...BASE_CONFIG, + "transformers.js_config": { + ...BASE_CONFIG["transformers.js_config"], + transducer: { + ...BASE_CONFIG["transformers.js_config"].transducer, + blank_token_id: 3, + }, + }, + }; + const model = new MockNemoConformerForTDT(invalidConfig, BASE_SESSIONS, []); + const inputs = { + input_features: new Tensor("float32", new Float32Array([0, 0]), [1, 1, 2]), + }; + + await expect( + model.transcribe(inputs, { + tokenizer: { + decode: () => "", + get_vocab: () => new Map([["a", 0], ["b", 1], ["c", 2]]), + }, + }), + ).rejects.toThrow("blank_token_id"); + }, + MAX_TEST_EXECUTION_TIME, + ); 
+ + it( + "throws on invalid runtime config: duration_start_index must be >= vocab_size", + async () => { + const invalidConfig = { + ...BASE_CONFIG, + "transformers.js_config": { + ...BASE_CONFIG["transformers.js_config"], + transducer: { + ...BASE_CONFIG["transformers.js_config"].transducer, + duration_start_index: 2, + }, + }, + }; + const model = new MockNemoConformerForTDT(invalidConfig, BASE_SESSIONS, []); + const inputs = { + input_features: new Tensor("float32", new Float32Array([0, 0]), [1, 1, 2]), + }; + + await expect( + model.transcribe(inputs, { + tokenizer: { + decode: () => "", + get_vocab: () => new Map([["a", 0], ["b", 1], ["c", 2]]), + }, + }), + ).rejects.toThrow("duration_start_index"); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( "greedily decodes scripted token and duration logits", async () => { @@ -247,6 +337,34 @@ export default () => { expect(() => new NemoConformerForTDT(invalidConfig, BASE_SESSIONS, {})).toThrow("encoder_output_layout"); }); + it("rejects invalid encoder_input_layout at construction time", () => { + const invalidConfig = { + ...BASE_CONFIG, + "transformers.js_config": { + ...BASE_CONFIG["transformers.js_config"], + transducer: { + ...BASE_CONFIG["transformers.js_config"].transducer, + encoder_input_layout: "BAD", + }, + }, + }; + expect(() => new NemoConformerForTDT(invalidConfig, BASE_SESSIONS, {})).toThrow("encoder_input_layout"); + }); + + it("rejects invalid encoder_frame_layout at construction time", () => { + const invalidConfig = { + ...BASE_CONFIG, + "transformers.js_config": { + ...BASE_CONFIG["transformers.js_config"], + transducer: { + ...BASE_CONFIG["transformers.js_config"].transducer, + encoder_frame_layout: "BAD", + }, + }, + }; + expect(() => new NemoConformerForTDT(invalidConfig, BASE_SESSIONS, {})).toThrow("encoder_frame_layout"); + }); + it( "fails fast when named encoder output is missing at runtime", async () => { @@ -328,6 +446,35 @@ export default () => { MAX_TEST_EXECUTION_TIME, ); + it( + "fails fast when named decoder state outputs are missing at runtime", + async () => { + class MissingDecoderStateOutputsModel extends NemoConformerForTDT { + async _runEncoder() { + return { + outputs: new Tensor("float32", new Float32Array([0.1, 0.2]), [1, 2, 1]), + }; + } + + async _runDecoder() { + return { + outputs: new Tensor("float32", new Float32Array([9.0, 0.0, 0.0, 8.0]), [1, 1, 4]), + }; + } + } + + const model = new MissingDecoderStateOutputsModel(BASE_CONFIG, BASE_SESSIONS, {}); + const inputs = { + input_features: new Tensor("float32", new Float32Array([0, 0, 0, 0, 0, 0]), [1, 3, 2]), + }; + + await expect(model.transcribe(inputs, { tokenizer: { decode: () => "" } })).rejects.toThrow( + 'decoder state outputs "output_states_1" and "output_states_2" were not returned', + ); + }, + MAX_TEST_EXECUTION_TIME, + ); + it("rejects duplicate decoder output aliases in transducer io config", () => { const invalidConfig = { ...BASE_CONFIG, @@ -578,6 +725,56 @@ export default () => { MAX_TEST_EXECUTION_TIME, ); + it("disposes replaced cache entries", () => { + const cache = new FeatureLRUCache({ max_entries: 4, max_size_mb: 4 }); + const originalDispose = Tensor.prototype.dispose; + let disposeCalls = 0; + Tensor.prototype.dispose = function () { + disposeCalls += 1; + return originalDispose.call(this); + }; + + try { + cache.set("x", new Tensor("float32", new Float32Array([1, 2, 3]), [1, 3])); + cache.set("x", new Tensor("float32", new Float32Array([4, 5, 6]), [1, 3])); + expect(disposeCalls).toBe(1); + } finally { + 
Tensor.prototype.dispose = originalDispose; + cache.clear(); + } + }); + + it("disposes tensors on eviction and clear without double-disposing shared refs", () => { + const cache = new FeatureLRUCache({ max_entries: 1, max_size_mb: 4 }); + const originalDispose = Tensor.prototype.dispose; + let disposeCalls = 0; + Tensor.prototype.dispose = function () { + disposeCalls += 1; + return originalDispose.call(this); + }; + + try { + const sharedA = new Tensor("float32", new Float32Array([1, 2, 3]), [1, 3]); + cache.set("a", { + input_features: sharedA, + attention_mask: sharedA, + }); + const sharedB = new Tensor("float32", new Float32Array([4, 5, 6]), [1, 3]); + cache.set("b", { + input_features: sharedB, + attention_mask: sharedB, + }); + // Eviction of "a" should dispose sharedA once, despite duplicate field references. + expect(disposeCalls).toBe(1); + + cache.clear(); + // Clear should dispose sharedB once. + expect(disposeCalls).toBe(2); + } finally { + Tensor.prototype.dispose = originalDispose; + } + }); + it("rejects invalid cache limits", () => { expect(() => new FeatureLRUCache({ max_entries: -1 })).toThrow("max_entries"); expect(() => new FeatureLRUCache({ max_entries: 1.25 })).toThrow("max_entries"); From 03fb8bd223c62e25f428f2e5d74f17d64bd9954e Mon Sep 17 00:00:00 2001 From: ysdede Date: Thu, 5 Mar 2026 23:26:02 +0300 Subject: [PATCH 24/40] style(nemo-conformer-tdt): simplify duration frame expression Apply Gemini review nit in Nemo decode loop by replacing a redundant duration expression with Math.max(1, step). Validation: - pnpm test -- tests/models.test.js --filter nemo_conformer_tdt --- .../models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js index 02e3e3c42..92aedc2ac 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js @@ -753,7 +753,7 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { tokenIds.push(tokenId); // TDT duration convention: step=0 means "stay on current frame" (duration index 0 = no advance). // We still associate the token with this frame, so durationFrames is at least 1. - const durationFrames = Math.max(1, step > 0 ? step : 1); + const durationFrames = Math.max(1, step); const endFrame = Math.min(frameCount, frameIndex + durationFrames); tokenTimestamps.push([ roundTs(frameIndex * frameTime + timeOffset), From 426061e5109b1953987c2a58873c6739b67cdbba Mon Sep 17 00:00:00 2001 From: ysdede Date: Thu, 5 Mar 2026 23:42:42 +0300 Subject: [PATCH 25/40] fix(nemo-tdt): address PR10 follow-up review comments Checklist (bot comment IDs): - [x] 2892132356: guard tokenizer.get_vocab() return type before Object.keys in _resolveVocabSize. - [x] 2892132367: treat zero cache limits as explicit no-cache mode; do not store/dispose just-produced values. - [x] 2892132372: dispose processor tensors in Nemo ASR pipeline when cache does not own lifetimes. Added regression tests for vocab resolution fallback, zero-limit cache semantics, and Nemo pipeline tensor ownership behavior.
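Illustrative sketch of the ownership contract targeted by the zero-limit change (not part of the diff below; the cache key string and sizes are invented for illustration). With max_entries or max_size_mb set to 0, set() becomes a no-op, get() reports a miss, and the caller remains responsible for disposing the tensor it just produced:

    // Sketch only, assuming the FeatureLRUCache/Tensor APIs introduced earlier in this series.
    import { Tensor } from '../../utils/tensor.js';
    import { FeatureLRUCache } from './transducer_cache.js';

    const cache = new FeatureLRUCache({ max_entries: 0, max_size_mb: 64 });
    const features = new Tensor('float32', new Float32Array([0, 0]), [1, 2]);

    cache.set('clip-0', features);                // explicit no-cache mode: nothing stored, nothing disposed
    console.assert(cache.get('clip-0') === null); // cache reports a miss
    features.dispose();                           // lifetime stays with the caller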
Validation: - pnpm test -- tests/models.test.js --filter nemo_conformer_tdt - pnpm test -- tests/pipelines.test.js --filter automatic_speech_recognition --- .../modeling_nemo_conformer_tdt.js | 12 +- .../nemo_conformer_tdt/transducer_cache.js | 8 ++ .../pipelines/automatic-speech-recognition.js | 22 ++- .../test_modeling_nemo_conformer_tdt.js | 62 +++++++++ ..._pipelines_automatic_speech_recognition.js | 126 +++++++++++++++++- 5 files changed, 224 insertions(+), 6 deletions(-) diff --git a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js index 92aedc2ac..cfe0cfc6b 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js @@ -484,9 +484,15 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { if (tokenizer?.get_vocab) { const vocab = tokenizer.get_vocab(); - const size = vocab instanceof Map ? vocab.size : Object.keys(vocab).length; - if (size > 0) { - return size; + if (vocab instanceof Map) { + if (vocab.size > 0) { + return vocab.size; + } + } else if (vocab && typeof vocab === 'object') { + const size = Object.keys(vocab).length; + if (size > 0) { + return size; + } } } diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js index 6dded2d5d..1b82f71f1 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js @@ -63,6 +63,14 @@ export class FeatureLRUCache { * @returns {void} */ set(key, value) { + // Explicit no-cache mode: keep caller ownership of current values. + if (this.max_entries === 0 || this.max_size_mb === 0) { + if (this.cache.size > 0) { + this.clear(); + } + return; + } + const existing = this.cache.get(key); if (existing) { disposeCachedValue(existing.value); diff --git a/packages/transformers/src/pipelines/automatic-speech-recognition.js b/packages/transformers/src/pipelines/automatic-speech-recognition.js index e4b0f8f90..4717ca279 100644 --- a/packages/transformers/src/pipelines/automatic-speech-recognition.js +++ b/packages/transformers/src/pipelines/automatic-speech-recognition.js @@ -363,11 +363,29 @@ export class AutomaticSpeechRecognitionPipeline this._validateNemoAudio(preparedAudios[i], i); } + const featureCache = this.processor.feature_extractor?.feature_cache; + const cacheOwnsTensors = !!( + featureCache && + featureCache.max_entries > 0 && + featureCache.max_size_mb > 0 + ); const toReturn = []; for (const aud of preparedAudios) { const inputs = await this.processor(aud); - const output = await /** @type {any} */ (this.model).transcribe(inputs, decodeOptions); - toReturn.push(output); + try { + const output = await /** @type {any} */ (this.model).transcribe(inputs, decodeOptions); + toReturn.push(output); + } finally { + if (!cacheOwnsTensors) { + const seen = new Set(); + for (const value of Object.values(inputs ?? {})) { + if (value instanceof Tensor && !seen.has(value)) { + value.dispose(); + seen.add(value); + } + } + } + } } return single ? 
toReturn[0] : toReturn; diff --git a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js index 20f64da77..63d88cb07 100644 --- a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js +++ b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js @@ -170,6 +170,36 @@ export default () => { MAX_TEST_EXECUTION_TIME, ); + it( + "throws explicit vocab resolution error when tokenizer.get_vocab returns a non-object", + async () => { + const configWithoutVocab = { + ...BASE_CONFIG, + "transformers.js_config": { + ...BASE_CONFIG["transformers.js_config"], + transducer: { + ...BASE_CONFIG["transformers.js_config"].transducer, + vocab_size: undefined, + }, + }, + }; + const model = new MockNemoConformerForTDT(configWithoutVocab, BASE_SESSIONS, []); + const inputs = { + input_features: new Tensor("float32", new Float32Array([0, 0]), [1, 1, 2]), + }; + + await expect( + model.transcribe(inputs, { + tokenizer: { + decode: () => "", + get_vocab: () => null, + }, + }), + ).rejects.toThrow("Unable to resolve vocabulary size"); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( "greedily decodes scripted token and duration logits", async () => { @@ -775,6 +805,38 @@ export default () => { } }); + it("treats zero cache limits as explicit no-cache mode without disposing inserted values", () => { + const byEntries = new FeatureLRUCache({ max_entries: 0, max_size_mb: 4 }); + const bySize = new FeatureLRUCache({ max_entries: 4, max_size_mb: 0 }); + const t1 = new Tensor("float32", new Float32Array([1, 2, 3]), [1, 3]); + const t2 = new Tensor("float32", new Float32Array([4, 5, 6]), [1, 3]); + + let t1Disposals = 0; + const t1Dispose = t1.dispose.bind(t1); + t1.dispose = () => { + t1Disposals += 1; + t1Dispose(); + }; + let t2Disposals = 0; + const t2Dispose = t2.dispose.bind(t2); + t2.dispose = () => { + t2Disposals += 1; + t2Dispose(); + }; + + byEntries.set("x", t1); + bySize.set("y", t2); + expect(byEntries.get("x")).toBeNull(); + expect(bySize.get("y")).toBeNull(); + expect(t1Disposals).toBe(0); + expect(t2Disposals).toBe(0); + + t1.dispose(); + t2.dispose(); + expect(t1Disposals).toBe(1); + expect(t2Disposals).toBe(1); + }); + it("rejects invalid cache limits", () => { expect(() => new FeatureLRUCache({ max_entries: -1 })).toThrow("max_entries"); expect(() => new FeatureLRUCache({ max_entries: 1.25 })).toThrow("max_entries"); diff --git a/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js b/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js index 06f8cec16..9f227a9b3 100644 --- a/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js +++ b/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js @@ -1,4 +1,4 @@ -import { pipeline, AutomaticSpeechRecognitionPipeline } from "../../src/transformers.js"; +import { pipeline, AutomaticSpeechRecognitionPipeline, Tensor } from "../../src/transformers.js"; import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; @@ -226,6 +226,130 @@ export default () => { "finite audio samples", ); }); + + it("disposes processor tensors after Nemo transcription when feature cache is disabled", async () => { + let disposeCalls = 0; + const model = { + config: { model_type: "nemo-conformer-tdt" }, + async 
transcribe() { + return { text: "ok" }; + }, + async dispose() {}, + }; + const processor = Object.assign(async () => { + const input_features = new Tensor("float32", new Float32Array([0, 0]), [1, 1, 2]); + const attention_mask = new Tensor("int64", BigInt64Array.from([1n]), [1, 1]); + const trackDispose = (tensor) => { + const originalDispose = tensor.dispose.bind(tensor); + tensor.dispose = () => { + disposeCalls += 1; + originalDispose(); + }; + }; + trackDispose(input_features); + trackDispose(attention_mask); + return { input_features, attention_mask }; + }, { + feature_extractor: { config: { sampling_rate: 16000 } }, + }); + const pipe = new AutomaticSpeechRecognitionPipeline({ + task: PIPELINE_ID, + model, + tokenizer: {}, + processor, + }); + + const output = await pipe(new Float32Array(16000), { return_timestamps: false }); + expect(output).toEqual({ text: "ok" }); + expect(disposeCalls).toBe(2); + }); + + it("keeps processor tensors alive when Nemo feature cache owns tensor lifetimes", async () => { + let disposeCalls = 0; + let lastInputs = null; + const model = { + config: { model_type: "nemo-conformer-tdt" }, + async transcribe() { + return { text: "ok" }; + }, + async dispose() {}, + }; + const processor = Object.assign(async () => { + const input_features = new Tensor("float32", new Float32Array([0, 0]), [1, 1, 2]); + const attention_mask = new Tensor("int64", BigInt64Array.from([1n]), [1, 1]); + const trackDispose = (tensor) => { + const originalDispose = tensor.dispose.bind(tensor); + tensor.dispose = () => { + disposeCalls += 1; + originalDispose(); + }; + }; + trackDispose(input_features); + trackDispose(attention_mask); + lastInputs = { input_features, attention_mask }; + return lastInputs; + }, { + feature_extractor: { + config: { sampling_rate: 16000 }, + feature_cache: { max_entries: 2, max_size_mb: 8 }, + }, + }); + const pipe = new AutomaticSpeechRecognitionPipeline({ + task: PIPELINE_ID, + model, + tokenizer: {}, + processor, + }); + + try { + const output = await pipe(new Float32Array(16000), { return_timestamps: false }); + expect(output).toEqual({ text: "ok" }); + expect(disposeCalls).toBe(0); + } finally { + lastInputs?.input_features.dispose(); + lastInputs?.attention_mask.dispose(); + } + }); + + it("disposes processor tensors when Nemo feature cache limits disable caching", async () => { + let disposeCalls = 0; + const model = { + config: { model_type: "nemo-conformer-tdt" }, + async transcribe() { + return { text: "ok" }; + }, + async dispose() {}, + }; + const processor = Object.assign(async () => { + const input_features = new Tensor("float32", new Float32Array([0, 0]), [1, 1, 2]); + const attention_mask = new Tensor("int64", BigInt64Array.from([1n]), [1, 1]); + const trackDispose = (tensor) => { + const originalDispose = tensor.dispose.bind(tensor); + tensor.dispose = () => { + disposeCalls += 1; + originalDispose(); + }; + }; + trackDispose(input_features); + trackDispose(attention_mask); + return { input_features, attention_mask }; + }, { + feature_extractor: { + config: { sampling_rate: 16000 }, + feature_cache: { max_entries: 0, max_size_mb: 8 }, + }, + }); + const pipe = new AutomaticSpeechRecognitionPipeline({ + task: PIPELINE_ID, + model, + tokenizer: {}, + processor, + }); + + const output = await pipe(new Float32Array(16000), { return_timestamps: false }); + expect(output).toEqual({ text: "ok" }); + expect(disposeCalls).toBe(2); + }); }); }); }; From d7476a6d5c29716268bebb956e28063e626eac63 Mon Sep 17 00:00:00 2001 From: ysdede Date: Fri, 6 Mar 
2026 00:06:11 +0300 Subject: [PATCH 26/40] fix(transformers): resolve Nemo TDT typegen regressions - widen confidenceFromLogits input type to Tensor data arrays - narrow feature_cache access with explicit typed cast in ASR pipeline --- .../models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js | 2 +- .../src/pipelines/automatic-speech-recognition.js | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js index cfe0cfc6b..b79b57cae 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js @@ -57,7 +57,7 @@ function roundTs(value) { } /** - * @param {Float32Array|number[]} logits + * @param {import('../../utils/tensor.js').Tensor['data']} logits * @param {number} tokenId * @param {number} vocabSize * @returns {{ confidence: number, logProb: number }} diff --git a/packages/transformers/src/pipelines/automatic-speech-recognition.js b/packages/transformers/src/pipelines/automatic-speech-recognition.js index 4717ca279..b4c468a4c 100644 --- a/packages/transformers/src/pipelines/automatic-speech-recognition.js +++ b/packages/transformers/src/pipelines/automatic-speech-recognition.js @@ -363,7 +363,9 @@ export class AutomaticSpeechRecognitionPipeline this._validateNemoAudio(preparedAudios[i], i); } - const featureCache = this.processor.feature_extractor?.feature_cache; + const featureCache = /** @type {{ max_entries: number, max_size_mb: number }|null|undefined} */ ( + /** @type {any} */ (this.processor.feature_extractor)?.feature_cache + ); const cacheOwnsTensors = !!( featureCache && featureCache.max_entries > 0 && From 0989f7abf31a5e5e322c335c1d11e5e4cdd5365f Mon Sep 17 00:00:00 2001 From: ysdede Date: Fri, 6 Mar 2026 00:16:52 +0300 Subject: [PATCH 27/40] fix(nemo-tdt): address PR11 cache and vocab review feedback Checklist (bot comment IDs): - [x] 2892287484: handle array-returning tokenizer vocab in _resolveVocabSize. - [x] 2892322884: avoid disposing when re-setting the same object for an existing cache key. - [x] 2892322906: skip caching oversized values to prevent insert-then-dispose of caller-owned tensors. - [x] 2892322910: guard byteLength type in estimateSizeBytes. Added regression tests for array vocab sizing, same-object set behavior, oversized value skipping, and non-numeric byteLength handling. 
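For reference, a minimal sketch of the set() semantics these fixes converge on (illustrative only; keys and byte budgets are invented): re-setting the same object merely refreshes recency without disposing it, and values larger than the byte budget are never inserted, so caller-owned tensors are not disposed behind the caller's back.

    // Sketch only, assuming the FeatureLRUCache/Tensor APIs from this patch series.
    import { Tensor } from '../../utils/tensor.js';
    import { FeatureLRUCache } from './transducer_cache.js';

    const cache = new FeatureLRUCache({ max_entries: 4, max_size_mb: 4 });
    const t = new Tensor('float32', new Float32Array([1, 2, 3]), [1, 3]);
    cache.set('x', t);
    cache.set('x', t);                        // same object: recency refresh, no dispose
    console.assert(cache.get('x') === t);
    cache.clear();                            // cache-owned entry is disposed here

    const tiny = new FeatureLRUCache({ max_entries: 4, max_size_mb: 0.000001 });
    const big = new Tensor('float32', new Float32Array(1024), [1, 1024]);
    tiny.set('oversized', big);               // exceeds the byte budget: skipped, not disposed
    console.assert(tiny.get('oversized') === null);
    big.dispose();                            // caller keeps ownership and disposes explicitly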
Validation: - pnpm test -- tests/models.test.js --filter nemo_conformer_tdt - pnpm test -- tests/pipelines.test.js --filter automatic_speech_recognition --- .../modeling_nemo_conformer_tdt.js | 4 ++ .../nemo_conformer_tdt/transducer_cache.js | 29 ++++++++- .../test_modeling_nemo_conformer_tdt.js | 64 +++++++++++++++++++ 3 files changed, 94 insertions(+), 3 deletions(-) diff --git a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js index b79b57cae..2ae145058 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js @@ -488,6 +488,10 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { if (vocab.size > 0) { return vocab.size; } + } else if (Array.isArray(vocab)) { + if (vocab.length > 0) { + return vocab.length; + } } else if (vocab && typeof vocab === 'object') { const size = Object.keys(vocab).length; if (size > 0) { diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js index 1b82f71f1..02fee3f28 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js @@ -71,14 +71,36 @@ export class FeatureLRUCache { return; } + const max_bytes = this.max_size_mb * 1024 * 1024; const existing = this.cache.get(key); + if (existing?.value === value) { + // Refresh recency for unchanged value without invalidating caller-owned references. + this.cache.delete(key); + if (existing.size_bytes <= max_bytes) { + this.cache.set(key, existing); + } else { + this.current_size_bytes -= existing.size_bytes; + } + return; + } + + const size_bytes = estimateSizeBytes(value); + if (size_bytes > max_bytes) { + // Cannot fit in cache: keep caller ownership and skip caching. 
+ if (existing) { + disposeCachedValue(existing.value); + this.current_size_bytes -= existing.size_bytes; + this.cache.delete(key); + } + return; + } + if (existing) { disposeCachedValue(existing.value); this.current_size_bytes -= existing.size_bytes; this.cache.delete(key); } - const size_bytes = estimateSizeBytes(value); this.cache.set(key, { value, size_bytes }); this.current_size_bytes += size_bytes; this._evict(); @@ -177,8 +199,9 @@ function estimateSizeBytes(value) { } return bytes; } - if (value?.byteLength) { - return value.byteLength; + const byteLength = value?.byteLength; + if (typeof byteLength === 'number' && Number.isFinite(byteLength) && byteLength >= 0) { + return byteLength; } return 0; } diff --git a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js index 63d88cb07..01e977bf4 100644 --- a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js +++ b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js @@ -200,6 +200,25 @@ export default () => { MAX_TEST_EXECUTION_TIME, ); + it("resolves vocab size from array tokenizers when config vocab_size is not set", () => { + const configWithoutVocab = { + ...BASE_CONFIG, + "transformers.js_config": { + ...BASE_CONFIG["transformers.js_config"], + transducer: { + ...BASE_CONFIG["transformers.js_config"].transducer, + vocab_size: undefined, + }, + }, + }; + const model = new MockNemoConformerForTDT(configWithoutVocab, BASE_SESSIONS, []); + expect( + model._resolveVocabSize({ + get_vocab: () => ["", "hello", "world"], + }), + ).toBe(3); + }); + it( "greedily decodes scripted token and duration logits", async () => { @@ -774,6 +793,25 @@ export default () => { } }); + it("does not dispose when re-setting the same value object for an existing key", () => { + const cache = new FeatureLRUCache({ max_entries: 4, max_size_mb: 4 }); + const tensor = new Tensor("float32", new Float32Array([1, 2, 3]), [1, 3]); + let disposeCalls = 0; + const originalDispose = tensor.dispose.bind(tensor); + tensor.dispose = () => { + disposeCalls += 1; + originalDispose(); + }; + + cache.set("x", tensor); + cache.set("x", tensor); + expect(cache.get("x")).toBe(tensor); + expect(disposeCalls).toBe(0); + + cache.clear(); + expect(disposeCalls).toBe(1); + }); + it("disposes tensors on eviction and clear without double-disposing shared refs", () => { const cache = new FeatureLRUCache({ max_entries: 1, max_size_mb: 4 }); const originalDispose = Tensor.prototype.dispose; @@ -837,6 +875,32 @@ export default () => { expect(t2Disposals).toBe(1); }); + it("skips caching oversized values without disposing caller-owned tensors", () => { + const cache = new FeatureLRUCache({ max_entries: 4, max_size_mb: 0.000001 }); + const tensor = new Tensor("float32", new Float32Array([1, 2]), [1, 2]); + let disposeCalls = 0; + const originalDispose = tensor.dispose.bind(tensor); + tensor.dispose = () => { + disposeCalls += 1; + originalDispose(); + }; + + cache.set("big", tensor); + expect(cache.get("big")).toBeNull(); + expect(disposeCalls).toBe(0); + + tensor.dispose(); + expect(disposeCalls).toBe(1); + }); + + it("ignores non-numeric byteLength values in size estimation", () => { + const cache = new FeatureLRUCache({ max_entries: 4, max_size_mb: 4 }); + cache.set("x", { byteLength: "invalid" }); + expect(cache.stats().entries).toBe(1); + expect(cache.stats().size_mb).toBe(0); + 
cache.clear(); + }); + it("rejects invalid cache limits", () => { expect(() => new FeatureLRUCache({ max_entries: -1 })).toThrow("max_entries"); expect(() => new FeatureLRUCache({ max_entries: 1.25 })).toThrow("max_entries"); From ee819a1c824d496434bf7dd3a0c15547e18fb624 Mon Sep 17 00:00:00 2001 From: ysdede Date: Fri, 6 Mar 2026 01:28:49 +0300 Subject: [PATCH 28/40] fix(nemo-tdt): add supports() for ASR model class selection --- .../models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js index 2ae145058..15a8c2a57 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js @@ -243,6 +243,10 @@ export class NemoConformerTDTPreTrainedModel extends PreTrainedModel { this.transducer = resolveTransducerConfig(config, sessions); } + static supports(model_type) { + return model_type === NEMO_CONFORMER_TDT_MODEL_TYPE; + } + /** * Load Nemo Conformer TDT sessions using v4 canonical ONNX filenames. * @type {typeof PreTrainedModel.from_pretrained} From b44f7f3ff1b6308cb57e084a10edc4b7dfc32bfd Mon Sep 17 00:00:00 2001 From: ysdede Date: Fri, 6 Mar 2026 01:44:32 +0300 Subject: [PATCH 29/40] fix(model-registry): include processor files for text-to-audio pipelines --- packages/transformers/package.json | 2 ++ .../src/utils/model_registry/get_pipeline_files.js | 6 +++++- packages/transformers/tests/utils/cache.test.js | 11 +++++++++++ 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/packages/transformers/package.json b/packages/transformers/package.json index c00268823..031efea78 100644 --- a/packages/transformers/package.json +++ b/packages/transformers/package.json @@ -28,6 +28,8 @@ "dev": "node scripts/dev.mjs", "build": "node scripts/build.mjs && pnpm typegen", "test": "node --experimental-vm-modules --expose-gc node_modules/jest/bin/jest.js --verbose --logHeapUsage", + "test:models": "node --experimental-vm-modules --expose-gc node_modules/jest/bin/jest.js --verbose --logHeapUsage tests/models.test.js --runInBand", + "test:nemo-tdt": "node --experimental-vm-modules --expose-gc node_modules/jest/bin/jest.js --verbose --logHeapUsage tests/models.test.js --runInBand --testNamePattern nemo_conformer_tdt", "readme": "python ./docs/scripts/build_readme.py", "docs-api": "node ./docs/scripts/generate.js", "docs-build": "doc-builder build transformers.js ./docs/source/ --not_python_module --build_dir ./docs/build/", diff --git a/packages/transformers/src/utils/model_registry/get_pipeline_files.js b/packages/transformers/src/utils/model_registry/get_pipeline_files.js index cff073b7d..bcac4cf56 100644 --- a/packages/transformers/src/utils/model_registry/get_pipeline_files.js +++ b/packages/transformers/src/utils/model_registry/get_pipeline_files.js @@ -32,9 +32,13 @@ export async function get_pipeline_files(task, modelId, options = {}) { // - 'text' tasks: always check tokenizer, skip processor (text models rarely have one) // - 'audio'/'image' tasks: skip tokenizer, always check processor // - 'multimodal' tasks: check both + // + // NOTE: + // `text-to-audio` may load `AutoModelForTextToSpectrogram` models (e.g., SpeechT5), + // which require processor files. Keep processor detection enabled for this task. 
const { type } = taskConfig; const include_tokenizer = type !== 'audio' && type !== 'image'; - const include_processor = type !== 'text'; + const include_processor = type !== 'text' || task === 'text-to-audio'; return get_files(modelId, { ...options, diff --git a/packages/transformers/tests/utils/cache.test.js b/packages/transformers/tests/utils/cache.test.js index 36f8880a7..1370d9896 100644 --- a/packages/transformers/tests/utils/cache.test.js +++ b/packages/transformers/tests/utils/cache.test.js @@ -6,6 +6,7 @@ import { MAX_TEST_EXECUTION_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; const LLAMA_MODEL_ID = "hf-internal-testing/tiny-random-LlamaForCausalLM"; const BERT_MODEL_ID = "hf-internal-testing/tiny-random-BertModel"; const VIT_MODEL_ID = "hf-internal-testing/tiny-random-vit"; +const SPEECHT5_MODEL_ID = "Xenova/speecht5_tts"; // Dedicated model IDs for cache clearing tests to avoid interference with other parallel tests. // These must NOT be used in any other test file. @@ -201,6 +202,16 @@ describe("Cache", () => { }, MAX_TEST_EXECUTION_TIME, ); + + it( + "should include processor files for text-to-audio when model provides them", + async () => { + const files = await ModelRegistry.get_pipeline_files("text-to-audio", SPEECHT5_MODEL_ID, DEFAULT_MODEL_OPTIONS); + expect(files).toContain("preprocessor_config.json"); + expect(files).toContain("tokenizer.json"); + }, + MAX_TEST_EXECUTION_TIME, + ); }); describe("is_cached", () => { From bfa97e6ec750bb193d2306898bd31fd05b91392b Mon Sep 17 00:00:00 2001 From: ysdede Date: Fri, 6 Mar 2026 01:54:44 +0300 Subject: [PATCH 30/40] Revert "fix(model-registry): include processor files for text-to-audio pipelines" This reverts commit b44f7f3ff1b6308cb57e084a10edc4b7dfc32bfd. --- packages/transformers/package.json | 2 -- .../src/utils/model_registry/get_pipeline_files.js | 6 +----- packages/transformers/tests/utils/cache.test.js | 11 ----------- 3 files changed, 1 insertion(+), 18 deletions(-) diff --git a/packages/transformers/package.json b/packages/transformers/package.json index 031efea78..c00268823 100644 --- a/packages/transformers/package.json +++ b/packages/transformers/package.json @@ -28,8 +28,6 @@ "dev": "node scripts/dev.mjs", "build": "node scripts/build.mjs && pnpm typegen", "test": "node --experimental-vm-modules --expose-gc node_modules/jest/bin/jest.js --verbose --logHeapUsage", - "test:models": "node --experimental-vm-modules --expose-gc node_modules/jest/bin/jest.js --verbose --logHeapUsage tests/models.test.js --runInBand", - "test:nemo-tdt": "node --experimental-vm-modules --expose-gc node_modules/jest/bin/jest.js --verbose --logHeapUsage tests/models.test.js --runInBand --testNamePattern nemo_conformer_tdt", "readme": "python ./docs/scripts/build_readme.py", "docs-api": "node ./docs/scripts/generate.js", "docs-build": "doc-builder build transformers.js ./docs/source/ --not_python_module --build_dir ./docs/build/", diff --git a/packages/transformers/src/utils/model_registry/get_pipeline_files.js b/packages/transformers/src/utils/model_registry/get_pipeline_files.js index bcac4cf56..cff073b7d 100644 --- a/packages/transformers/src/utils/model_registry/get_pipeline_files.js +++ b/packages/transformers/src/utils/model_registry/get_pipeline_files.js @@ -32,13 +32,9 @@ export async function get_pipeline_files(task, modelId, options = {}) { // - 'text' tasks: always check tokenizer, skip processor (text models rarely have one) // - 'audio'/'image' tasks: skip tokenizer, always check processor // - 'multimodal' tasks: check both - 
// - // NOTE: - // `text-to-audio` may load `AutoModelForTextToSpectrogram` models (e.g., SpeechT5), - // which require processor files. Keep processor detection enabled for this task. const { type } = taskConfig; const include_tokenizer = type !== 'audio' && type !== 'image'; - const include_processor = type !== 'text' || task === 'text-to-audio'; + const include_processor = type !== 'text'; return get_files(modelId, { ...options, diff --git a/packages/transformers/tests/utils/cache.test.js b/packages/transformers/tests/utils/cache.test.js index 1370d9896..36f8880a7 100644 --- a/packages/transformers/tests/utils/cache.test.js +++ b/packages/transformers/tests/utils/cache.test.js @@ -6,7 +6,6 @@ import { MAX_TEST_EXECUTION_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; const LLAMA_MODEL_ID = "hf-internal-testing/tiny-random-LlamaForCausalLM"; const BERT_MODEL_ID = "hf-internal-testing/tiny-random-BertModel"; const VIT_MODEL_ID = "hf-internal-testing/tiny-random-vit"; -const SPEECHT5_MODEL_ID = "Xenova/speecht5_tts"; // Dedicated model IDs for cache clearing tests to avoid interference with other parallel tests. // These must NOT be used in any other test file. @@ -202,16 +201,6 @@ describe("Cache", () => { }, MAX_TEST_EXECUTION_TIME, ); - - it( - "should include processor files for text-to-audio when model provides them", - async () => { - const files = await ModelRegistry.get_pipeline_files("text-to-audio", SPEECHT5_MODEL_ID, DEFAULT_MODEL_OPTIONS); - expect(files).toContain("preprocessor_config.json"); - expect(files).toContain("tokenizer.json"); - }, - MAX_TEST_EXECUTION_TIME, - ); }); describe("is_cached", () => { From a85dff25ba04e7b223dc89367e9e3da3a6424774 Mon Sep 17 00:00:00 2001 From: ysdede Date: Fri, 6 Mar 2026 02:43:07 +0300 Subject: [PATCH 31/40] fix(nemo-tdt): address PR #12 reviewer feedback --- .../feature_extraction_nemo_conformer_tdt.js | 2 +- .../modeling_nemo_conformer_tdt.js | 27 +++++++++++++------ 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js index 1f569365b..3ecea708a 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js @@ -149,7 +149,7 @@ export class NemoConformerTDTFeatureExtractor extends FeatureExtractor { const extracted = await this._extract(audio); this.feature_cache.set(key, extracted); - return extracted; + return { ...extracted }; } return await this._extract(audio); diff --git a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js index 15a8c2a57..64a6372eb 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js @@ -460,6 +460,11 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { } else { length = inputFeatures.dims[1]; } + if (!Number.isInteger(length) || length < 0) { + throw new Error( + `Nemo Conformer TDT expected a non-negative integer encoder length, got: ${length}.`, + ); + } const lengthTensor = new Tensor('int64', BigInt64Array.from([BigInt(length)]), [1]); disposables.push(lengthTensor); feeds[name] = lengthTensor; @@ -713,10 
+718,13 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { const logitsData = logits.data; if (logitsData.length < vocabSize) { logits.dispose(); - this._disposeDecoderState({ - state1: outputState1, - state2: outputState2, - }); + this._disposeDecoderState( + { + state1: outputState1, + state2: outputState2, + }, + decoderState, + ); throw new Error( `Nemo Conformer TDT decoder output is too small (${logitsData.length}) for vocab_size=${vocabSize}.`, ); @@ -726,10 +734,13 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { const hasDurationLogits = logitsData.length > durationStart; if (this.transducer.duration_start_index != null && !hasDurationLogits) { logits.dispose(); - this._disposeDecoderState({ - state1: outputState1, - state2: outputState2, - }); + this._disposeDecoderState( + { + state1: outputState1, + state2: outputState2, + }, + decoderState, + ); throw new Error( `Nemo Conformer TDT decoder output is missing duration logits: expected values beyond index ${durationStart - 1}, got length=${logitsData.length}.`, ); From 8dfccddc4dd30aa6fb66108f35382c74a25dbf65 Mon Sep 17 00:00:00 2001 From: ysdede Date: Sun, 8 Mar 2026 00:45:06 +0300 Subject: [PATCH 32/40] feat(nemo-tdt): align asr pipeline outputs and long-audio handling Align the Nemo ASR pipeline with the shared task contract by returning text-only results by default and chunk-based timestamps for segment and word modes. Add automatic long-audio windowing, decoded-text-driven word reconstruction, and model-local helpers for window merge and chunk assembly. Also add regression coverage for numeric/punctuation word boundaries, windowed merge behavior, and auto-windowed long-form pipeline decoding. --- .../pipeline_nemo_conformer_tdt.js | 167 ++++++++++ .../transducer_segment_offsets.js | 87 ++++++ .../nemo_conformer_tdt/transducer_text.js | 155 +--------- .../transducer_window_merge.js | 179 +++++++++++ .../transducer_word_offsets.js | 216 +++++++++++++ .../utils_nemo_conformer_tdt.js | 4 + .../pipelines/automatic-speech-recognition.js | 96 +----- .../test_modeling_nemo_conformer_tdt.js | 66 ++++ ..._pipelines_automatic_speech_recognition.js | 292 ++++++++++++++++-- 9 files changed, 1007 insertions(+), 255 deletions(-) create mode 100644 packages/transformers/src/models/nemo_conformer_tdt/pipeline_nemo_conformer_tdt.js create mode 100644 packages/transformers/src/models/nemo_conformer_tdt/transducer_segment_offsets.js create mode 100644 packages/transformers/src/models/nemo_conformer_tdt/transducer_window_merge.js create mode 100644 packages/transformers/src/models/nemo_conformer_tdt/transducer_word_offsets.js diff --git a/packages/transformers/src/models/nemo_conformer_tdt/pipeline_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/pipeline_nemo_conformer_tdt.js new file mode 100644 index 000000000..8762fc1d1 --- /dev/null +++ b/packages/transformers/src/models/nemo_conformer_tdt/pipeline_nemo_conformer_tdt.js @@ -0,0 +1,167 @@ +import { Tensor } from '../../utils/tensor.js'; +import { + buildWordChunks, + buildNemoSegmentChunks, + buildNemoWindowSpecs, + mergeNemoWindowResults, +} from './utils_nemo_conformer_tdt.js'; + +const NEMO_AUTO_WINDOW_THRESHOLD_S = 180; +const NEMO_AUTO_CHUNK_LENGTH_S = 90; +const NEMO_AUTO_STRIDE_LENGTH_S = 10; + +function validateNemoAudio(audio, index) { + if (!(audio instanceof Float32Array || audio instanceof Float64Array)) { + throw new TypeError( + `Nemo Conformer TDT pipeline expected audio at index ${index} to be 
Float32Array or Float64Array.`, + ); + } + if (audio.length === 0) { + throw new Error(`Nemo Conformer TDT pipeline expected non-empty audio at index ${index}.`); + } + for (let i = 0; i < audio.length; ++i) { + if (!Number.isFinite(audio[i])) { + throw new Error( + `Nemo Conformer TDT pipeline expected finite audio samples; found ${audio[i]} at index ${index}:${i}.`, + ); + } + } +} + +/** + * Run the ASR pipeline adapter for Nemo Conformer TDT models. + * Keeps the public contract task-shaped while delegating rich outputs to `model.transcribe()`. + * + * @param {{ + * model: any, + * processor: any, + * tokenizer: any, + * audio: Float32Array|Float64Array|Array, + * kwargs: Record, + * prepareAudios: (audio: any[], sampling_rate: number) => Promise<(Float32Array|Float64Array)[]>, + * }} options + */ +export async function runNemoConformerTDTPipeline({ + model, + processor, + tokenizer, + audio, + kwargs, + prepareAudios, +}) { + if (typeof model?.transcribe !== 'function') { + throw new Error('Nemo Conformer TDT model does not expose a `transcribe` method.'); + } + if (!processor) { + throw new Error('Nemo Conformer TDT pipeline requires a processor.'); + } + if (!tokenizer) { + throw new Error('Nemo Conformer TDT pipeline requires a tokenizer.'); + } + if (!processor.feature_extractor?.config?.sampling_rate) { + throw new Error( + 'Nemo Conformer TDT pipeline requires `processor.feature_extractor.config.sampling_rate` to prepare audio.', + ); + } + + const return_timestamps = kwargs.return_timestamps ?? false; + const wantWordTimestamps = return_timestamps === 'word'; + const wantTimestampChunks = return_timestamps === true || wantWordTimestamps; + const requested_chunk_length_s = kwargs.chunk_length_s ?? 0; + const requested_stride_length_s = kwargs.stride_length_s ?? null; + + const single = !Array.isArray(audio); + const batchedAudio = single ? [audio] : audio; + const sampling_rate = processor.feature_extractor.config.sampling_rate; + const preparedAudios = await prepareAudios(batchedAudio, sampling_rate); + for (let i = 0; i < preparedAudios.length; ++i) { + validateNemoAudio(preparedAudios[i], i); + } + + const featureCache = /** @type {{ max_entries: number, max_size_mb: number }|null|undefined} */ ( + /** @type {any} */ (processor.feature_extractor)?.feature_cache + ); + const cacheOwnsTensors = !!( + featureCache && + featureCache.max_entries > 0 && + featureCache.max_size_mb > 0 + ); + + const runNemoTranscribe = async (windowAudio, decodeOptions) => { + const inputs = await processor(windowAudio); + try { + return await model.transcribe(inputs, decodeOptions); + } finally { + if (!cacheOwnsTensors) { + const seen = new Set(); + for (const value of Object.values(inputs ?? {})) { + if (value instanceof Tensor && !seen.has(value)) { + value.dispose(); + seen.add(value); + } + } + } + } + }; + + const toReturn = []; + for (const aud of preparedAudios) { + const audio_duration_s = aud.length / sampling_rate; + const autoWindowing = requested_chunk_length_s <= 0 && audio_duration_s > NEMO_AUTO_WINDOW_THRESHOLD_S; + const chunk_length_s = + requested_chunk_length_s > 0 + ? requested_chunk_length_s + : autoWindowing + ? NEMO_AUTO_CHUNK_LENGTH_S + : 0; + const stride_length_s = + requested_chunk_length_s > 0 + ? requested_stride_length_s + : autoWindowing + ? 
NEMO_AUTO_STRIDE_LENGTH_S + : null; + + if (chunk_length_s > 0) { + const windows = buildNemoWindowSpecs(aud, sampling_rate, chunk_length_s, stride_length_s); + const windowResults = []; + for (const window of windows) { + const output = await runNemoTranscribe(window.audio, { + tokenizer, + return_timestamps: true, + return_words: true, + return_tokens: true, + return_metrics: false, + timeOffset: window.start_s, + }); + windowResults.push({ window, output }); + } + + const merged = mergeNemoWindowResults(tokenizer, windowResults); + const result = { text: merged.text || windowResults.map((x) => x.output.text ?? '').join(' ').trim() }; + if (wantWordTimestamps) { + result.chunks = buildWordChunks(merged.words); + } else if (wantTimestampChunks) { + result.chunks = buildNemoSegmentChunks(merged.words, merged.utterance_timestamp, result.text); + } + toReturn.push(result); + continue; + } + + const output = await runNemoTranscribe(aud, { + tokenizer, + return_timestamps: wantTimestampChunks, + return_words: wantTimestampChunks, + return_metrics: false, + }); + + const result = { text: output.text ?? '' }; + if (wantWordTimestamps) { + result.chunks = buildWordChunks(output.words ?? []); + } else if (wantTimestampChunks) { + result.chunks = buildNemoSegmentChunks(output.words ?? [], output.utterance_timestamp ?? null, result.text); + } + toReturn.push(result); + } + + return single ? toReturn[0] : toReturn; +} diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_segment_offsets.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_segment_offsets.js new file mode 100644 index 000000000..0cb3e01a8 --- /dev/null +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_segment_offsets.js @@ -0,0 +1,87 @@ +const NEMO_SEGMENT_BREAK_REGEX = /[.!?;:]["')\]]*$/; +const NEMO_MAX_WORD_GAP_S = 0.8; + +/** + * @param {Array<{ text: string, start_time: number, end_time: number }>} words + * @returns {string} + */ +export function joinTimedWords(words) { + let text = ''; + for (const word of words) { + const part = word.text ?? 
''; + if (!part) continue; + if (!text) { + text = part; + } else if (/^[,.;:!?)}\]]+$/.test(part)) { + text += part; + } else { + text += ` ${part}`; + } + } + return text; +} + +/** + * @param {Array<{ text: string, start_time: number, end_time: number }>} words + * @returns {Array<{ text: string, timestamp: [number, number] }>} + */ +export function buildWordChunks(words) { + return words.map((word) => ({ + text: word.text, + timestamp: [word.start_time, word.end_time], + })); +} + +/** + * @param {Array<{ text: string, start_time: number, end_time: number }>} words + * @returns {string} + */ +export function buildSegmentText(words) { + return joinTimedWords(words); +} + +/** + * @param {Array<{ text: string, start_time: number, end_time: number }>} words + * @param {[number, number] | null} utterance_timestamp + * @param {string} text + * @returns {Array<{ text: string, timestamp: [number, number] }>} + */ +export function buildNemoSegmentChunks(words, utterance_timestamp = null, text = '') { + if (!Array.isArray(words) || words.length === 0) { + if (utterance_timestamp) { + return [{ text, timestamp: utterance_timestamp }]; + } + return []; + } + + /** @type {Array<{ text: string, timestamp: [number, number] }>} */ + const chunks = []; + /** @type {typeof words} */ + let current = []; + for (const word of words) { + const prev = current.at(-1); + if (prev) { + const gap_s = Math.max(0, word.start_time - prev.end_time); + const shouldBreak = + NEMO_SEGMENT_BREAK_REGEX.test(prev.text) || + gap_s > NEMO_MAX_WORD_GAP_S; + if (shouldBreak) { + chunks.push({ + text: buildSegmentText(current), + timestamp: [current[0].start_time, current[current.length - 1].end_time], + }); + current = []; + } + } + current.push(word); + } + + if (current.length > 0) { + chunks.push({ + text: buildSegmentText(current), + timestamp: [current[0].start_time, current[current.length - 1].end_time], + }); + } + + return chunks; +} diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js index d12a27553..274a22962 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js @@ -1,81 +1,4 @@ -/** - * Cache tokenizer id->token maps for stable and fast boundary detection. - * @type {WeakMap>} - */ -const TOKEN_ID_TO_TEXT_CACHE = new WeakMap(); - -/** - * @param {any} tokenizer - * @returns {Map} - */ -function getIdToTokenMap(tokenizer) { - let cached = TOKEN_ID_TO_TEXT_CACHE.get(tokenizer); - if (cached) return cached; - - cached = new Map(); - if (tokenizer?.get_vocab) { - const vocab = tokenizer.get_vocab(); - // get_vocab() may return a Map or a plain Object depending on the tokenizer backend. - const entries = vocab instanceof Map ? vocab.entries() : Object.entries(vocab); - for (const [token, id] of entries) { - if (Number.isInteger(id)) { - cached.set(id, token); - } - } - } - TOKEN_ID_TO_TEXT_CACHE.set(tokenizer, cached); - return cached; -} - -/** - * Resolve per-token text and word boundary metadata in a tokenizer-agnostic way. - * Uses raw vocab token (if available) for boundary markers, and decoded token text for display. - * @param {any} tokenizer - * @param {number} id - * @returns {{ raw: string, clean: string, startsNewWord: boolean }} - */ -function resolveTokenPiece(tokenizer, id) { - const rawToken = getIdToTokenMap(tokenizer).get(id) ?? 
''; - const decoded = tokenizer.decode([id], { - skip_special_tokens: true, - clean_up_tokenization_spaces: false, - }); - - // SentencePiece/BPE boundary markers used by common tokenizers. - const startsWithBoundaryMarker = /^(?:▁|Ġ)+/.test(rawToken); - const startsWithWhitespace = /^\s+/.test(decoded); - const startsNewWord = startsWithBoundaryMarker || startsWithWhitespace; - - // Human readable token text. - let clean = decoded.replace(/^\s+/, ''); - if (!clean) { - clean = rawToken.replace(/^(?:▁|Ġ|Ċ)+/, '').replace(/^ +/, ''); - } - - return { raw: rawToken || decoded, clean, startsNewWord }; -} - -/** - * @param {Array<{ text: string, start_time: number, end_time: number, confidence?: number }>} words - * @param {{ text: string, start: number, end: number, confs: number[] } | null} current - */ -function finalizeAndPushWord(words, current) { - if (!current) return; - - const text = current.text.trim(); - if (!text) return; - - /** @type {{ text: string, start_time: number, end_time: number, confidence?: number }} */ - const word = { - text, - start_time: current.start, - end_time: current.end, - }; - if (current.confs.length > 0) { - word.confidence = Math.round((current.confs.reduce((a, b) => a + b, 0) / current.confs.length) * 1e6) / 1e6; - } - words.push(word); -} +import { buildTransducerWordOffsets } from './transducer_word_offsets.js'; /** * Decode token ids into final transcription text. @@ -103,78 +26,6 @@ export function decodeTransducerText(tokenizer, token_ids) { * }} */ export function buildTransducerDetailedOutputs(tokenizer, token_ids, token_timestamps, token_confidences = null) { - if (!tokenizer || token_ids.length === 0 || token_timestamps.length === 0) { - return { words: [], tokens: [], word_confidences: null, word_avg: null }; - } - if (token_ids.length !== token_timestamps.length) { - throw new Error( - `buildTransducerDetailedOutputs expects equal lengths for token_ids (${token_ids.length}) and token_timestamps (${token_timestamps.length}).`, - ); - } - if (token_confidences && token_confidences.length !== token_ids.length) { - throw new Error( - `buildTransducerDetailedOutputs expects token_confidences length (${token_confidences.length}) to match token_ids length (${token_ids.length}).`, - ); - } - - /** @type {Array<{ id: number, token: string, raw_token: string, is_word_start: boolean, start_time: number, end_time: number, confidence?: number }>} */ - const tokens = []; - /** @type {Array<{ text: string, start_time: number, end_time: number, confidence?: number }>} */ - const words = []; - - /** @type {{ text: string, start: number, end: number, confs: number[] } | null} */ - let current = null; - - for (let i = 0; i < token_ids.length; ++i) { - const id = token_ids[i]; - const ts = token_timestamps[i]; - const piece = resolveTokenPiece(tokenizer, id); - const raw = piece.raw; - const startsNewWord = piece.startsNewWord; - const clean = piece.clean; - if (!clean) continue; - - const tok = { - id, - token: clean, - raw_token: raw, - is_word_start: startsNewWord, - start_time: ts[0], - end_time: ts[1], - }; - const conf = token_confidences?.[i]; - if (conf != null && Number.isFinite(conf)) { - tok.confidence = Math.round(conf * 1e6) / 1e6; - } - tokens.push(tok); - - if (!current || startsNewWord) { - finalizeAndPushWord(words, current); - current = { - text: clean, - start: ts[0], - end: ts[1], - confs: conf != null && Number.isFinite(conf) ? 
[conf] : [], - }; - } else { - current.text += clean; - current.end = ts[1]; - if (conf != null && Number.isFinite(conf)) { - current.confs.push(conf); - } - } - } - - finalizeAndPushWord(words, current); - - const word_confidences = words.some((x) => x.confidence != null) ? words.map((x) => x.confidence ?? null) : null; - let word_avg = null; - if (word_confidences) { - const validConfidences = word_confidences.filter((x) => x != null); - if (validConfidences.length > 0) { - word_avg = Math.round((validConfidences.reduce((a, b) => a + b, 0) / validConfidences.length) * 1e6) / 1e6; - } - } - - return { words, tokens, word_confidences, word_avg }; + const fullText = decodeTransducerText(tokenizer, token_ids); + return buildTransducerWordOffsets(tokenizer, token_ids, token_timestamps, token_confidences, fullText); } diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_window_merge.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_window_merge.js new file mode 100644 index 000000000..3eb1f2609 --- /dev/null +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_window_merge.js @@ -0,0 +1,179 @@ +import { decodeTransducerText } from './transducer_text.js'; +import { joinTimedWords } from './transducer_segment_offsets.js'; + +/** + * @param {Float32Array|Float64Array} audio + * @param {number} sampling_rate + * @param {number} chunk_length_s + * @param {number | null} stride_length_s + * @returns {Array<{audio: Float32Array|Float64Array, start_s: number, end_s: number, left_stride_s: number, right_stride_s: number}>} + */ +export function buildNemoWindowSpecs(audio, sampling_rate, chunk_length_s, stride_length_s) { + if (!(chunk_length_s > 0)) { + return [ + { + audio, + start_s: 0, + end_s: audio.length / sampling_rate, + left_stride_s: 0, + right_stride_s: 0, + }, + ]; + } + + if (stride_length_s === null) { + stride_length_s = chunk_length_s / 6; + } else if (!(stride_length_s >= 0)) { + throw Error('`stride_length_s` must be non-negative.'); + } + if (chunk_length_s <= 2 * stride_length_s) { + throw Error('`chunk_length_s` must be larger than `2 * stride_length_s` for Nemo windowed decoding.'); + } + + const window = Math.floor(sampling_rate * chunk_length_s); + const stride = Math.floor(sampling_rate * stride_length_s); + const jump = window - 2 * stride; + + /** @type {Array<{audio: Float32Array|Float64Array, start_s: number, end_s: number, left_stride_s: number, right_stride_s: number}>} */ + const windows = []; + let offset = 0; + while (true) { + const offset_end = offset + window; + const subarr = audio.subarray(offset, offset_end); + const is_first = offset === 0; + const is_last = offset_end >= audio.length; + windows.push({ + audio: subarr, + start_s: offset / sampling_rate, + end_s: (offset + subarr.length) / sampling_rate, + left_stride_s: is_first ? 0 : stride / sampling_rate, + right_stride_s: is_last ? 
0 : stride / sampling_rate, + }); + if (is_last) break; + offset += jump; + } + + return windows; +} + +function shouldKeepTimedItem(start_time, end_time, keep_start_s, keep_end_s, is_first_window, is_last_window) { + const midpoint = (start_time + end_time) / 2; + if (!is_first_window && midpoint < keep_start_s) { + return false; + } + if (!is_last_window && midpoint >= keep_end_s) { + return false; + } + return true; +} + +function dedupeMergedWords(words) { + /** @type {typeof words} */ + const merged = []; + for (const word of words) { + const prev = merged.at(-1); + if ( + prev && + prev.text === word.text && + word.start_time < prev.end_time + ) { + const prevDuration = prev.end_time - prev.start_time; + const nextDuration = word.end_time - word.start_time; + if (nextDuration > prevDuration) { + merged[merged.length - 1] = word; + } + continue; + } + merged.push(word); + } + return merged; +} + +function dedupeMergedTokens(tokens) { + /** @type {typeof tokens} */ + const merged = []; + for (const token of tokens) { + const prev = merged.at(-1); + if ( + prev && + prev.id === token.id && + prev.raw_token === token.raw_token && + token.start_time < prev.end_time + ) { + const prevDuration = prev.end_time - prev.start_time; + const nextDuration = token.end_time - token.start_time; + if (nextDuration > prevDuration) { + merged[merged.length - 1] = token; + } + continue; + } + merged.push(token); + } + return merged; +} + +/** + * @param {any} tokenizer + * @param {Array<{ window: { start_s: number, end_s: number, left_stride_s: number, right_stride_s: number }, output: { words?: any[], tokens?: any[] } }>} windowResults + * @returns {{ text: string, tokens: any[], words: any[], utterance_timestamp: [number, number] | null }} + */ +export function mergeNemoWindowResults(tokenizer, windowResults) { + /** @type {Array<{ id: number, token: string, raw_token: string, is_word_start: boolean, start_time: number, end_time: number, confidence?: number }>} */ + const mergedTokens = []; + /** @type {Array<{ text: string, start_time: number, end_time: number, confidence?: number }>} */ + const mergedWords = []; + + for (const { window, output } of windowResults) { + const keep_start_s = window.start_s + window.left_stride_s; + const keep_end_s = window.end_s - window.right_stride_s; + const is_first_window = window.left_stride_s === 0; + const is_last_window = window.right_stride_s === 0; + + for (const token of output.tokens ?? []) { + if ( + shouldKeepTimedItem( + token.start_time, + token.end_time, + keep_start_s, + keep_end_s, + is_first_window, + is_last_window, + ) + ) { + mergedTokens.push(token); + } + } + + for (const word of output.words ?? []) { + if ( + shouldKeepTimedItem( + word.start_time, + word.end_time, + keep_start_s, + keep_end_s, + is_first_window, + is_last_window, + ) + ) { + mergedWords.push(word); + } + } + } + + const tokens = dedupeMergedTokens(mergedTokens); + const words = dedupeMergedWords(mergedWords); + const text = + words.length > 0 + ? joinTimedWords(words) + : tokens.length > 0 && typeof tokenizer?.decode === 'function' + ? decodeTransducerText(tokenizer, tokens.map((token) => token.id)) + : ''; + const utterance_timestamp = + words.length > 0 + ? /** @type {[number, number]} */ ([words[0].start_time, words[words.length - 1].end_time]) + : tokens.length > 0 + ? 
/** @type {[number, number]} */ ([tokens[0].start_time, tokens[tokens.length - 1].end_time]) + : null; + + return { text, tokens, words, utterance_timestamp }; +} diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_word_offsets.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_word_offsets.js new file mode 100644 index 000000000..5046a71a5 --- /dev/null +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_word_offsets.js @@ -0,0 +1,216 @@ +/** + * Cache tokenizer id->token maps for stable and fast boundary detection. + * @type {WeakMap>} + */ +const TOKEN_ID_TO_TEXT_CACHE = new WeakMap(); + +/** + * @param {any} tokenizer + * @returns {Map} + */ +function getIdToTokenMap(tokenizer) { + let cached = TOKEN_ID_TO_TEXT_CACHE.get(tokenizer); + if (cached) return cached; + + cached = new Map(); + if (tokenizer?.get_vocab) { + const vocab = tokenizer.get_vocab(); + const entries = vocab instanceof Map ? vocab.entries() : Object.entries(vocab); + for (const [token, id] of entries) { + if (Number.isInteger(id)) { + cached.set(id, token); + } + } + } + TOKEN_ID_TO_TEXT_CACHE.set(tokenizer, cached); + return cached; +} + +/** + * Resolve per-token text and word boundary metadata in a tokenizer-agnostic way. + * @param {any} tokenizer + * @param {number} id + * @returns {{ raw: string, clean: string, startsNewWord: boolean }} + */ +function resolveTokenPiece(tokenizer, id) { + const rawToken = getIdToTokenMap(tokenizer).get(id) ?? ''; + const decoded = tokenizer.decode([id], { + skip_special_tokens: true, + clean_up_tokenization_spaces: false, + }); + + const startsWithBoundaryMarker = /^(?:▁|Ġ)+/.test(rawToken); + const startsWithWhitespace = /^\s+/.test(decoded); + const startsNewWord = startsWithBoundaryMarker || startsWithWhitespace; + + let clean = decoded.replace(/^\s+/, ''); + if (!clean) { + clean = rawToken.replace(/^(?:▁|Ġ|Ċ)+/, '').replace(/^ +/, ''); + } + + return { raw: rawToken || decoded, clean, startsNewWord }; +} + +/** + * @param {string} fullText + * @param {number} cursor + * @param {string} tokenText + * @returns {{ cursor: number, text: string, skippedWhitespace: boolean }} + */ +function consumeAlignedTokenText(fullText, cursor, tokenText) { + let skippedWhitespace = false; + while (cursor < fullText.length && /\s/.test(fullText[cursor])) { + skippedWhitespace = true; + cursor += 1; + } + + if (!tokenText) { + return { cursor, text: '', skippedWhitespace }; + } + + if (fullText.startsWith(tokenText, cursor)) { + return { + cursor: cursor + tokenText.length, + text: fullText.slice(cursor, cursor + tokenText.length), + skippedWhitespace, + }; + } + + const next = fullText.indexOf(tokenText, cursor); + if (next !== -1 && /^\s*$/.test(fullText.slice(cursor, next))) { + return { + cursor: next + tokenText.length, + text: fullText.slice(next, next + tokenText.length), + skippedWhitespace: skippedWhitespace || next > cursor, + }; + } + + return { + cursor: cursor + tokenText.length, + text: tokenText, + skippedWhitespace, + }; +} + +/** + * @param {Array<{ text: string, start_time: number, end_time: number, confidence?: number }>} words + * @param {{ text: string, start: number, end: number, confs: number[] } | null} current + */ +function finalizeAndPushWord(words, current) { + if (!current) return; + + const text = current.text.trim(); + if (!text) return; + + /** @type {{ text: string, start_time: number, end_time: number, confidence?: number }} */ + const word = { + text, + start_time: current.start, + end_time: 
current.end, + }; + if (current.confs.length > 0) { + word.confidence = Math.round((current.confs.reduce((a, b) => a + b, 0) / current.confs.length) * 1e6) / 1e6; + } + words.push(word); +} + +/** + * @param {any} tokenizer + * @param {number[]} token_ids + * @param {[number, number][]} token_timestamps + * @param {number[] | null} token_confidences + * @param {string} fullText + * @returns {{ + * words: Array<{ text: string, start_time: number, end_time: number, confidence?: number }>, + * tokens: Array<{ id: number, token: string, raw_token: string, is_word_start: boolean, start_time: number, end_time: number, confidence?: number }>, + * word_confidences: (number | null)[] | null, + * word_avg: number | null, + * }} + */ +export function buildTransducerWordOffsets( + tokenizer, + token_ids, + token_timestamps, + token_confidences = null, + fullText = '', +) { + if (!tokenizer || token_ids.length === 0 || token_timestamps.length === 0) { + return { words: [], tokens: [], word_confidences: null, word_avg: null }; + } + if (token_ids.length !== token_timestamps.length) { + throw new Error( + `buildTransducerWordOffsets expects equal lengths for token_ids (${token_ids.length}) and token_timestamps (${token_timestamps.length}).`, + ); + } + if (token_confidences && token_confidences.length !== token_ids.length) { + throw new Error( + `buildTransducerWordOffsets expects token_confidences length (${token_confidences.length}) to match token_ids length (${token_ids.length}).`, + ); + } + + /** @type {Array<{ id: number, token: string, raw_token: string, is_word_start: boolean, start_time: number, end_time: number, confidence?: number }>} */ + const tokens = []; + /** @type {Array<{ text: string, start_time: number, end_time: number, confidence?: number }>} */ + const words = []; + let textCursor = 0; + + /** @type {{ text: string, start: number, end: number, confs: number[] } | null} */ + let current = null; + + for (let i = 0; i < token_ids.length; ++i) { + const id = token_ids[i]; + const ts = token_timestamps[i]; + const piece = resolveTokenPiece(tokenizer, id); + const raw = piece.raw; + const clean = piece.clean; + if (!clean) continue; + + const aligned = consumeAlignedTokenText(fullText, textCursor, clean); + textCursor = aligned.cursor; + const tokenText = aligned.text || clean; + const startsNewWord = !current || aligned.skippedWhitespace || piece.startsNewWord; + + const tok = { + id, + token: tokenText, + raw_token: raw, + is_word_start: startsNewWord, + start_time: ts[0], + end_time: ts[1], + }; + const conf = token_confidences?.[i]; + if (conf != null && Number.isFinite(conf)) { + tok.confidence = Math.round(conf * 1e6) / 1e6; + } + tokens.push(tok); + + if (!current || startsNewWord) { + finalizeAndPushWord(words, current); + current = { + text: tokenText, + start: ts[0], + end: ts[1], + confs: conf != null && Number.isFinite(conf) ? [conf] : [], + }; + } else { + current.text += tokenText; + current.end = ts[1]; + if (conf != null && Number.isFinite(conf)) { + current.confs.push(conf); + } + } + } + + finalizeAndPushWord(words, current); + + const word_confidences = words.some((x) => x.confidence != null) ? words.map((x) => x.confidence ?? 
null) : null; + let word_avg = null; + if (word_confidences) { + const validConfidences = word_confidences.filter((x) => x != null); + if (validConfidences.length > 0) { + word_avg = Math.round((validConfidences.reduce((a, b) => a + b, 0) / validConfidences.length) * 1e6) / 1e6; + } + } + + return { words, tokens, word_confidences, word_avg }; +} diff --git a/packages/transformers/src/models/nemo_conformer_tdt/utils_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/utils_nemo_conformer_tdt.js index 935336b1e..fdf5e7e67 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/utils_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/utils_nemo_conformer_tdt.js @@ -2,3 +2,7 @@ export { createAudioCacheKey, FeatureLRUCache } from './transducer_cache.js'; export { computeTemporalDeltas } from './transducer_deltas.js'; export { decodeTransducerText, buildTransducerDetailedOutputs } from './transducer_text.js'; +export { buildTransducerWordOffsets } from './transducer_word_offsets.js'; +export { joinTimedWords, buildWordChunks, buildSegmentText, buildNemoSegmentChunks } from './transducer_segment_offsets.js'; +export { buildNemoWindowSpecs, mergeNemoWindowResults } from './transducer_window_merge.js'; +export { runNemoConformerTDTPipeline } from './pipeline_nemo_conformer_tdt.js'; diff --git a/packages/transformers/src/pipelines/automatic-speech-recognition.js b/packages/transformers/src/pipelines/automatic-speech-recognition.js index b4c468a4c..6415f5857 100644 --- a/packages/transformers/src/pipelines/automatic-speech-recognition.js +++ b/packages/transformers/src/pipelines/automatic-speech-recognition.js @@ -3,6 +3,9 @@ import { Pipeline, prepareAudios } from './_base.js'; import { Tensor } from '../utils/tensor.js'; import { max, round } from '../utils/maths.js'; import { logger } from '../utils/logger.js'; +import { + runNemoConformerTDTPipeline, +} from '../models/nemo_conformer_tdt/utils_nemo_conformer_tdt.js'; /** * @typedef {import('./_base.js').TextAudioPipelineConstructorArgs} TextAudioPipelineConstructorArgs @@ -140,24 +143,6 @@ export class AutomaticSpeechRecognitionPipeline Pipeline ) { - _validateNemoAudio(audio, index) { - if (!(audio instanceof Float32Array || audio instanceof Float64Array)) { - throw new TypeError( - `Nemo Conformer TDT pipeline expected audio at index ${index} to be Float32Array or Float64Array.`, - ); - } - if (audio.length === 0) { - throw new Error(`Nemo Conformer TDT pipeline expected non-empty audio at index ${index}.`); - } - for (let i = 0; i < audio.length; ++i) { - if (!Number.isFinite(audio[i])) { - throw new Error( - `Nemo Conformer TDT pipeline expected finite audio samples; found ${audio[i]} at index ${index}:${i}.`, - ); - } - } - } - async _call(audio, kwargs = {}) { switch (this.model.config.model_type) { case 'whisper': @@ -323,74 +308,19 @@ export class AutomaticSpeechRecognitionPipeline /** * Nemo Conformer TDT ASR pipeline. * - * Delegates to model.transcribe() and returns its output directly. - * Use `return_timestamps: true` on the pipeline call to get utterance-level data. - * This pipeline always requests metrics, and enables word details when - * timestamps are requested. - * For token-level and debug controls, call `model.transcribe()` directly with - * extended options. + * Keeps the pipeline surface aligned with the shared ASR task contract: + * `{ text }` by default and `{ text, chunks }` when timestamps are requested. 
+ * Rich Nemo-specific outputs remain available on direct `model.transcribe()`. */ async _call_nemo_conformer_tdt(audio, kwargs) { - if (typeof (/** @type {any} */ (this.model).transcribe) !== 'function') { - throw new Error('Nemo Conformer TDT model does not expose a `transcribe` method.'); - } - if (!this.processor) { - throw new Error('Nemo Conformer TDT pipeline requires a processor.'); - } - if (!this.tokenizer) { - throw new Error('Nemo Conformer TDT pipeline requires a tokenizer.'); - } - if (!this.processor.feature_extractor?.config?.sampling_rate) { - throw new Error( - 'Nemo Conformer TDT pipeline requires `processor.feature_extractor.config.sampling_rate` to prepare audio.', - ); - } - - const return_timestamps = !!(kwargs.return_timestamps); - - const decodeOptions = { + return runNemoConformerTDTPipeline({ + model: this.model, + processor: this.processor, tokenizer: this.tokenizer, - return_timestamps, - return_words: return_timestamps, - return_metrics: true, - }; - - const single = !Array.isArray(audio); - const batchedAudio = single ? [audio] : audio; - const sampling_rate = this.processor.feature_extractor.config.sampling_rate; - const preparedAudios = await prepareAudios(batchedAudio, sampling_rate); - for (let i = 0; i < preparedAudios.length; ++i) { - this._validateNemoAudio(preparedAudios[i], i); - } - - const featureCache = /** @type {{ max_entries: number, max_size_mb: number }|null|undefined} */ ( - /** @type {any} */ (this.processor.feature_extractor)?.feature_cache - ); - const cacheOwnsTensors = !!( - featureCache && - featureCache.max_entries > 0 && - featureCache.max_size_mb > 0 - ); - const toReturn = []; - for (const aud of preparedAudios) { - const inputs = await this.processor(aud); - try { - const output = await /** @type {any} */ (this.model).transcribe(inputs, decodeOptions); - toReturn.push(output); - } finally { - if (!cacheOwnsTensors) { - const seen = new Set(); - for (const value of Object.values(inputs ?? {})) { - if (value instanceof Tensor && !seen.has(value)) { - value.dispose(); - seen.add(value); - } - } - } - } - } - - return single ? 
toReturn[0] : toReturn; + audio, + kwargs, + prepareAudios, + }); } async _call_moonshine(audio, kwargs) { diff --git a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js index 01e977bf4..c52df3667 100644 --- a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js +++ b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js @@ -1,6 +1,7 @@ import { NemoConformerForTDT, Tensor } from "../../../src/transformers.js"; import { createAudioCacheKey, FeatureLRUCache } from "../../../src/models/nemo_conformer_tdt/transducer_cache.js"; import { computeTemporalDeltas } from "../../../src/models/nemo_conformer_tdt/transducer_deltas.js"; +import { buildTransducerDetailedOutputs } from "../../../src/models/nemo_conformer_tdt/transducer_text.js"; import { MODEL_TYPE_MAPPING, MODEL_TYPES } from "../../../src/models/modeling_utils.js"; import { get_model_files } from "../../../src/utils/model_registry/get_model_files.js"; @@ -645,6 +646,71 @@ export default () => { }); describe("Nemo Conformer TDT utilities", () => { + it("keeps word boundaries from the final decoded text for numeric and punctuation tokens", () => { + const rawById = { + 1: "▁score", + 2: ".", + 3: "48", + 4: "-", + 5: "year", + 6: "-", + 7: "old", + 8: "▁with", + 9: "0", + 10: ".", + 11: "5", + }; + const tokenizer = { + get_vocab() { + return rawById; + }, + decode(ids) { + if (ids.length === 1) { + return rawById[ids[0]].replace(/^▁/, ""); + } + return "score. 48-year-old with 0.5"; + }, + }; + + const output = buildTransducerDetailedOutputs( + tokenizer, + [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], + [ + [0.0, 0.3], + [0.3, 0.4], + [0.5, 0.8], + [0.8, 0.85], + [0.85, 1.05], + [1.05, 1.1], + [1.1, 1.3], + [1.4, 1.7], + [1.8, 1.9], + [1.9, 1.95], + [1.95, 2.05], + ], + ); + + expect(output.words.map((x) => x.text)).toEqual([ + "score.", + "48-year-old", + "with", + "0.5", + ]); + expect(output.tokens.map((x) => x.token)).toEqual([ + "score", + ".", + "48", + "-", + "year", + "-", + "old", + "with", + "0", + ".", + "5", + ]); + }); + it( "computes delta and delta-delta features", async () => { diff --git a/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js b/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js index 9f227a9b3..5a377ece1 100644 --- a/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js +++ b/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js @@ -156,7 +156,17 @@ export default () => { const processor = Object.assign(async () => ({ input_features: {} }), { feature_extractor: { config: { sampling_rate: 16000 } }, }); - const tokenizer = {}; + const tokenizer = { + decode(ids) { + const pieces = { + 1: "hello", + 2: "world", + 3: "again", + 4: "today", + }; + return ids.map((id) => pieces[id] ?? 
"").filter(Boolean).join(" "); + }, + }; return { pipe: new AutomaticSpeechRecognitionPipeline({ @@ -169,57 +179,299 @@ export default () => { }; }; - it("returns text and metrics when timestamps disabled", async () => { + it("returns text when timestamps disabled", async () => { const { pipe, calls } = makeUnitPipe(); const output = await pipe(new Float32Array(16000), { return_timestamps: false }); - expect(output).toEqual({ text: "hello world", metrics: { total_ms: 42, rtf: 0.01 } }); + expect(output).toEqual({ text: "hello world" }); expect(calls).toHaveLength(1); expect(calls[0]).toMatchObject({ return_timestamps: false, return_words: false, - return_metrics: true, + return_metrics: false, }); }); - it("returns full output with words when return_timestamps is true", async () => { + it("returns timestamped chunks when return_timestamps is true", async () => { const { pipe, calls } = makeUnitPipe(); const output = await pipe(new Float32Array(16000), { return_timestamps: true }); - expect(output).toMatchObject({ + expect(output).toEqual({ text: "hello world", - utterance_timestamp: [0, 0.08], - utterance_confidence: 0.95, - words: [ - { text: "hello", start_time: 0, end_time: 0.04 }, - { text: "world", start_time: 0.04, end_time: 0.08 }, + chunks: [ + { text: "hello world", timestamp: [0, 0.08] }, ], - confidence_scores: { token_avg: 0.95, word_avg: 0.94 }, - metrics: { total_ms: 42, rtf: 0.01 }, }); expect(calls).toHaveLength(1); expect(calls[0]).toMatchObject({ return_timestamps: true, return_words: true, - return_metrics: true, + return_metrics: false, }); }); - it("treats return_timestamps 'word' as truthy (same as true)", async () => { + it("returns word chunks when return_timestamps is 'word'", async () => { const { pipe, calls } = makeUnitPipe(); const output = await pipe(new Float32Array(16000), { return_timestamps: "word" }); - expect(output).toMatchObject({ + expect(output).toEqual({ text: "hello world", - utterance_timestamp: [0, 0.08], - words: expect.any(Array), - metrics: expect.any(Object), + chunks: [ + { text: "hello", timestamp: [0, 0.04] }, + { text: "world", timestamp: [0.04, 0.08] }, + ], }); expect(calls).toHaveLength(1); expect(calls[0]).toMatchObject({ return_timestamps: true, return_words: true, - return_metrics: true, + return_metrics: false, }); }); + it("merges overlapping windows when Nemo chunking is enabled", async () => { + const calls = []; + const model = { + config: { model_type: "nemo-conformer-tdt" }, + async transcribe(_inputs, options) { + calls.push(options); + if (options.timeOffset === 0) { + return { + text: "hello world again", + words: [ + { text: "hello", start_time: 0, end_time: 0.5 }, + { text: "world", start_time: 0.5, end_time: 1.1 }, + { text: "again", start_time: 1.2, end_time: 1.8 }, + ], + tokens: [ + { id: 1, token: "hello", raw_token: "hello", is_word_start: true, start_time: 0, end_time: 0.5 }, + { id: 2, token: "world", raw_token: "world", is_word_start: true, start_time: 0.5, end_time: 1.1 }, + { id: 3, token: "again", raw_token: "again", is_word_start: true, start_time: 1.2, end_time: 1.8 }, + ], + }; + } + return { + text: "again today", + words: [ + { text: "again", start_time: 1.2, end_time: 1.8 }, + { text: "today", start_time: 1.8, end_time: 2.4 }, + ], + tokens: [ + { id: 3, token: "again", raw_token: "again", is_word_start: true, start_time: 1.2, end_time: 1.8 }, + { id: 4, token: "today", raw_token: "today", is_word_start: true, start_time: 1.8, end_time: 2.4 }, + ], + }; + }, + async dispose() {}, + }; + const processor 
= Object.assign(async () => ({ input_features: {} }), { + feature_extractor: { config: { sampling_rate: 16000 } }, + }); + const tokenizer = { + decode(ids) { + const pieces = { + 1: "hello", + 2: "world", + 3: "again", + 4: "today", + }; + return ids.map((id) => pieces[id] ?? "").filter(Boolean).join(" "); + }, + }; + const pipe = new AutomaticSpeechRecognitionPipeline({ + task: PIPELINE_ID, + model, + tokenizer, + processor, + }); + + const output = await pipe(new Float32Array(3 * 16000), { + return_timestamps: "word", + chunk_length_s: 2, + stride_length_s: 0.5, + }); + + expect(output).toEqual({ + text: "hello world again today", + chunks: [ + { text: "hello", timestamp: [0, 0.5] }, + { text: "world", timestamp: [0.5, 1.1] }, + { text: "again", timestamp: [1.2, 1.8] }, + { text: "today", timestamp: [1.8, 2.4] }, + ], + }); + expect(calls).toHaveLength(2); + expect(calls[0]).toMatchObject({ + return_timestamps: true, + return_words: true, + return_tokens: true, + return_metrics: false, + timeOffset: 0, + }); + expect(calls[1]).toMatchObject({ + return_timestamps: true, + return_words: true, + return_tokens: true, + return_metrics: false, + timeOffset: 1, + }); + }); + + it("reconstructs windowed Nemo text from merged words when token decode drops spaces", async () => { + const model = { + config: { model_type: "nemo-conformer-tdt" }, + async transcribe(_inputs, options) { + if (options.timeOffset === 0) { + return { + text: "score. 48-year-old", + words: [ + { text: "score.", start_time: 0, end_time: 0.4 }, + { text: "48-year-old", start_time: 0.5, end_time: 1.3 }, + ], + tokens: [ + { id: 1, token: "score", raw_token: "▁score", is_word_start: true, start_time: 0, end_time: 0.3 }, + { id: 2, token: ".", raw_token: ".", is_word_start: false, start_time: 0.3, end_time: 0.4 }, + { id: 3, token: "48", raw_token: "48", is_word_start: false, start_time: 0.5, end_time: 0.8 }, + { id: 4, token: "-", raw_token: "-", is_word_start: false, start_time: 0.8, end_time: 0.85 }, + { id: 5, token: "year", raw_token: "year", is_word_start: false, start_time: 0.85, end_time: 1.05 }, + { id: 4, token: "-", raw_token: "-", is_word_start: false, start_time: 1.05, end_time: 1.1 }, + { id: 6, token: "old", raw_token: "old", is_word_start: false, start_time: 1.1, end_time: 1.3 }, + ], + }; + } + return { + text: "with 0.5", + words: [ + { text: "with", start_time: 1.4, end_time: 1.7 }, + { text: "0.5", start_time: 1.8, end_time: 2.05 }, + ], + tokens: [ + { id: 7, token: "with", raw_token: "▁with", is_word_start: true, start_time: 1.4, end_time: 1.7 }, + { id: 8, token: "0", raw_token: "0", is_word_start: false, start_time: 1.8, end_time: 1.9 }, + { id: 2, token: ".", raw_token: ".", is_word_start: false, start_time: 1.9, end_time: 1.95 }, + { id: 9, token: "5", raw_token: "5", is_word_start: false, start_time: 1.95, end_time: 2.05 }, + ], + }; + }, + async dispose() {}, + }; + const processor = Object.assign(async () => ({ input_features: {} }), { + feature_extractor: { config: { sampling_rate: 16000 } }, + }); + const tokenizer = { + decode(ids) { + const pieces = { + 1: "score", + 2: ".", + 3: "48", + 4: "-", + 5: "year", + 6: "old", + 7: "with", + 8: "0", + 9: "5", + }; + return ids.map((id) => pieces[id] ?? 
"").join(""); + }, + }; + const pipe = new AutomaticSpeechRecognitionPipeline({ + task: PIPELINE_ID, + model, + tokenizer, + processor, + }); + + const output = await pipe(new Float32Array(3 * 16000), { + return_timestamps: "word", + chunk_length_s: 2, + stride_length_s: 0.5, + }); + + expect(output.text).toBe("score. 48-year-old with 0.5"); + expect(output.chunks).toEqual([ + { text: "score.", timestamp: [0, 0.4] }, + { text: "48-year-old", timestamp: [0.5, 1.3] }, + { text: "with", timestamp: [1.4, 1.7] }, + { text: "0.5", timestamp: [1.8, 2.05] }, + ]); + }); + + it("auto-window long Nemo audio with 90s chunks and 10s stride", async () => { + const calls = []; + const wordsByOffset = new Map([ + [0, { id: 1, text: "alpha", start: 0, end: 1 }], + [70, { id: 2, text: "beta", start: 85, end: 86 }], + [140, { id: 3, text: "gamma", start: 155, end: 156 }], + [210, { id: 4, text: "delta", start: 225, end: 226 }], + ]); + const model = { + config: { model_type: "nemo-conformer-tdt" }, + async transcribe(_inputs, options) { + calls.push(options); + const item = wordsByOffset.get(options.timeOffset); + if (!item) { + throw new Error(`Unexpected timeOffset ${options.timeOffset}`); + } + return { + text: item.text, + words: [ + { text: item.text, start_time: item.start, end_time: item.end }, + ], + tokens: [ + { + id: item.id, + token: item.text, + raw_token: item.text, + is_word_start: true, + start_time: item.start, + end_time: item.end, + }, + ], + }; + }, + async dispose() {}, + }; + const processor = Object.assign(async () => ({ input_features: {} }), { + feature_extractor: { config: { sampling_rate: 16000 } }, + }); + const tokenizer = { + decode(ids) { + const pieces = { + 1: "alpha", + 2: "beta", + 3: "gamma", + 4: "delta", + }; + return ids.map((id) => pieces[id] ?? "").filter(Boolean).join(" "); + }, + }; + const pipe = new AutomaticSpeechRecognitionPipeline({ + task: PIPELINE_ID, + model, + tokenizer, + processor, + }); + + const output = await pipe(new Float32Array(300 * 16000), { return_timestamps: "word" }); + + expect(output).toEqual({ + text: "alpha beta gamma delta", + chunks: [ + { text: "alpha", timestamp: [0, 1] }, + { text: "beta", timestamp: [85, 86] }, + { text: "gamma", timestamp: [155, 156] }, + { text: "delta", timestamp: [225, 226] }, + ], + }); + expect(calls).toHaveLength(4); + expect(calls.map((x) => x.timeOffset)).toEqual([0, 70, 140, 210]); + for (const call of calls) { + expect(call).toMatchObject({ + return_timestamps: true, + return_words: true, + return_tokens: true, + return_metrics: false, + }); + } + }); + it("rejects non-finite audio samples before Nemo decoding", async () => { const { pipe } = makeUnitPipe(); await expect(pipe(Float32Array.from([0, Number.NaN, 0]), { return_timestamps: false })).rejects.toThrow( From 816f581180012e3ce017876da0b150d02046d2a4 Mon Sep 17 00:00:00 2001 From: ysdede Date: Sun, 8 Mar 2026 00:58:29 +0300 Subject: [PATCH 33/40] chore(tests): drop unrelated parakeet feature extractor coverage Remove the standalone parakeet feature extractor test from this branch. It exercises an existing parakeet_ctc path that is outside the scope of Conformer TDT integration and makes the PR look broader than it is. 
--- .../test_feature_extraction_parakeet.js | 49 ------------------- 1 file changed, 49 deletions(-) delete mode 100644 packages/transformers/tests/models/parakeet/test_feature_extraction_parakeet.js diff --git a/packages/transformers/tests/models/parakeet/test_feature_extraction_parakeet.js b/packages/transformers/tests/models/parakeet/test_feature_extraction_parakeet.js deleted file mode 100644 index 82ece82a9..000000000 --- a/packages/transformers/tests/models/parakeet/test_feature_extraction_parakeet.js +++ /dev/null @@ -1,49 +0,0 @@ -import { ParakeetFeatureExtractor } from "../../../src/transformers.js"; - -import { MAX_TEST_EXECUTION_TIME } from "../../init.js"; - -export default () => { - describe("ParakeetFeatureExtractor", () => { - const config = { - feature_size: 80, - sampling_rate: 16000, - n_fft: 512, - win_length: 400, - hop_length: 160, - preemphasis: 0.97, - }; - - /** @type {ParakeetFeatureExtractor} */ - let feature_extractor; - beforeAll(() => { - feature_extractor = new ParakeetFeatureExtractor(config); - }); - - it( - "extracts features and mask from synthetic audio", - async () => { - const duration_s = 1.0; - const total = Math.floor(config.sampling_rate * duration_s); - const audio = Float32Array.from({ length: total }, (_, i) => Math.sin((2 * Math.PI * 220 * i) / config.sampling_rate)); - - const { input_features, attention_mask } = await feature_extractor(audio); - try { - expect(input_features.dims[0]).toBe(1); - expect(input_features.dims[2]).toBe(config.feature_size); - expect(attention_mask.dims).toEqual([1, input_features.dims[1]]); - - const validFrames = attention_mask.data.reduce((acc, x) => acc + Number(x), 0); - expect(validFrames).toBeGreaterThan(0); - expect(validFrames).toBeLessThanOrEqual(input_features.dims[1]); - - const preview = Array.from(input_features.data.slice(0, 256)); - expect(preview.every(Number.isFinite)).toBe(true); - } finally { - input_features.dispose(); - attention_mask.dispose(); - } - }, - MAX_TEST_EXECUTION_TIME, - ); - }); -}; From f59ba068954fbabb0d34158e0758b6b3de08b838 Mon Sep 17 00:00:00 2001 From: ysdede Date: Sun, 8 Mar 2026 19:47:55 +0300 Subject: [PATCH 34/40] feat(nemo-conformer-tdt): add sentence-based ASR pipeline chunking Use conservative sentence boundaries for pipeline timestamps and long-audio cursoring in the NeMo Conformer TDT pipeline. This keeps the HF-style pipeline contract while replacing the old fixed-window merge path with sentence-driven retranscription. Also remove dead NeMo window-merge helpers, delete the obsolete compatibility barrel, and extend the model and pipeline tests around cache handling, timestamps, and long-audio behavior. 
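
A minimal usage sketch of the resulting pipeline surface (illustrative only:
the package entry point and model id below are placeholders, not artifacts
shipped with this change; any checkpoint whose config reports model_type
"nemo-conformer-tdt" is the intended target):

    import { pipeline } from '@huggingface/transformers';

    const asr = await pipeline('automatic-speech-recognition', 'placeholder/nemo-conformer-tdt-onnx');
    const audio = new Float32Array(16000 * 5); // 16 kHz mono PCM samples

    // Default call keeps the shared ASR task contract: `{ text }` only.
    const plain = await asr(audio);

    // Timestamps add `chunks`; sentence-based windowing kicks in for long
    // audio automatically, or when `chunk_length_s` is passed explicitly.
    const detailed = await asr(audio, { return_timestamps: 'word', chunk_length_s: 90 });
    // detailed.chunks -> [{ text, timestamp: [start_s, end_s] }, ...]
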
--- .../feature_extraction_nemo_conformer_tdt.js | 18 +- .../modeling_nemo_conformer_tdt.js | 176 +++--- .../pipeline_nemo_conformer_tdt.js | 281 +++++++-- .../nemo_conformer_tdt/transducer_cache.js | 10 +- .../transducer_segment_offsets.js | 159 +++-- .../nemo_conformer_tdt/transducer_text.js | 7 +- .../transducer_window_merge.js | 173 +----- .../transducer_word_offsets.js | 40 +- .../utils_nemo_conformer_tdt.js | 8 - .../pipelines/automatic-speech-recognition.js | 2 +- ...t_feature_extraction_nemo_conformer_tdt.js | 84 +++ .../test_modeling_nemo_conformer_tdt.js | 101 +++- ..._pipelines_automatic_speech_recognition.js | 571 ++++++++++++++---- 13 files changed, 1122 insertions(+), 508 deletions(-) delete mode 100644 packages/transformers/src/models/nemo_conformer_tdt/utils_nemo_conformer_tdt.js diff --git a/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js index 3ecea708a..7df9afba6 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js @@ -6,6 +6,16 @@ import { FeatureLRUCache, createAudioCacheKey } from './transducer_cache.js'; import { computeTemporalDeltas } from './transducer_deltas.js'; const EPSILON = 1e-5; +export const NEMO_FEATURE_OUTPUT_OWNERSHIP = Symbol('NemoConformerTDTFeatureOutputOwnership'); + +function tagNemoFeatureOutputOwnership(value, cacheOwnsTensors) { + Object.defineProperty(value, NEMO_FEATURE_OUTPUT_OWNERSHIP, { + value: cacheOwnsTensors, + enumerable: false, + configurable: true, + }); + return value; +} /** * Feature extractor for Nemo Conformer TDT models. @@ -144,15 +154,15 @@ export class NemoConformerTDTFeatureExtractor extends FeatureExtractor { const key = `${createAudioCacheKey(audio, this.config.sampling_rate)}:${this.delta_order}:${this.delta_window}:${this.delta_concatenate}`; const cached = this.feature_cache.get(key); if (cached) { - return { ...cached }; + return tagNemoFeatureOutputOwnership({ ...cached }, true); } const extracted = await this._extract(audio); - this.feature_cache.set(key, extracted); - return { ...extracted }; + const cacheOwnsTensors = this.feature_cache.set(key, extracted); + return tagNemoFeatureOutputOwnership({ ...extracted }, cacheOwnsTensors); } - return await this._extract(audio); + return tagNemoFeatureOutputOwnership(await this._extract(audio), false); } async _extract(audio) { diff --git a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js index 64a6372eb..0b129e5be 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js @@ -536,54 +536,66 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { /** * Transcribe model-ready features using TDT decoding. 
* - * - `return_timestamps: false` → `{ text, is_final }` (+ metrics if `return_metrics`) - * - `return_timestamps: true` → adds `utterance_confidence`, `utterance_timestamp`, `confidence_scores` - * - `return_words: true` (requires `return_timestamps`) → adds `words` list - * - `return_tokens: true` (requires `return_timestamps`) → adds `tokens` list - * - `return_metrics` is independent and can be combined with either level. + * - `returnTimestamps: false` → `{ text, isFinal }` (+ metrics if `returnMetrics`) + * - `returnTimestamps: true` → adds `utteranceTimestamp` and grouped `confidence` + * - `returnWords: true` (requires `returnTimestamps`) → adds `words` list + * - `returnTokens: true` (requires `returnTimestamps`) → adds `tokens` list + * - `returnMetrics` is independent and can be combined with either level. * - Debug flags (`returnFrameConfidences`, `returnFrameIndices`, `returnLogProbs`, `returnTdtSteps`) are independent. + * - Legacy snake_case aliases (`return_timestamps`, `return_words`, `return_tokens`, `return_metrics`) are accepted. * * @param {Object} model_inputs Processor outputs (must include `input_features`). * @param {Object} [decode_options] * @param {any} [decode_options.tokenizer] Tokenizer for text reconstruction and word boundaries. - * @param {boolean} [decode_options.return_timestamps=true] Include utterance-level timestamps and confidence averages. - * @param {boolean} [decode_options.return_words=false] Include word-level list (requires return_timestamps). - * @param {boolean} [decode_options.return_tokens=false] Include token-level list (requires return_timestamps). - * @param {boolean} [decode_options.return_metrics=false] Include encoding/decoding timing metrics. - * @param {boolean} [decode_options.returnFrameConfidences=false] Include per-frame confidence scores in confidence_scores. + * @param {boolean} [decode_options.returnTimestamps=true] Include utterance-level timestamps and confidence aggregates. + * @param {boolean} [decode_options.return_timestamps] Legacy alias for `returnTimestamps`. + * @param {boolean} [decode_options.returnWords=false] Include word-level list (requires `returnTimestamps`). + * @param {boolean} [decode_options.return_words] Legacy alias for `returnWords`. + * @param {boolean} [decode_options.returnTokens=false] Include token-level list (requires `returnTimestamps`). + * @param {boolean} [decode_options.return_tokens] Legacy alias for `returnTokens`. + * @param {boolean} [decode_options.returnMetrics=false] Include encoding/decoding timing metrics. + * @param {boolean} [decode_options.return_metrics] Legacy alias for `returnMetrics`. + * @param {boolean} [decode_options.returnFrameConfidences=false] Include per-frame confidence scores in `confidence`. * @param {boolean} [decode_options.returnFrameIndices=false] Include per-token encoder frame indices. * @param {boolean} [decode_options.returnLogProbs=false] Include per-token log probabilities. * @param {boolean} [decode_options.returnTdtSteps=false] Include raw TDT duration steps. * @param {number} [decode_options.timeOffset=0] Offset added to all timestamps (seconds). 
* @returns {Promise<{ * text: string, - * is_final: boolean, - * utterance_confidence?: number, - * utterance_timestamp?: [number, number], - * words?: Array<{ text: string, start_time: number, end_time: number, confidence?: number }>, - * tokens?: Array<{ id: number, token: string, raw_token: string, is_word_start: boolean, start_time: number, end_time: number, confidence?: number }>, - * confidence_scores?: { token_avg: number|null, word_avg: number|null, frame: number[]|null, frame_avg: number|null, overall_log_prob: number|null }, - * metrics?: { preprocess_ms: number, encode_ms: number, decode_ms: number, tokenize_ms: number, total_ms: number, rtf: number, rtf_x: number }, - * frameIndices?: number[] | null, - * logProbs?: number[] | null, - * tdtSteps?: number[] | null, + * isFinal: boolean, + * utteranceTimestamp?: [number, number], + * words?: Array<{ text: string, startTime: number, endTime: number, confidence?: number }>, + * tokens?: Array<{ id: number, token: string, rawToken: string, isWordStart: boolean, startTime: number, endTime: number, confidence?: number }>, + * confidence?: { utterance?: number|null, wordAverage?: number|null, frames?: number[]|null, frameAverage?: number|null, averageLogProb?: number|null }, + * metrics?: { preprocessMs: number, encodeMs: number, decodeMs: number, tokenizeMs: number, totalMs: number, rtf: number, rtfX: number }, + * debug?: { frameIndices?: number[] | null, logProbs?: number[] | null, tdtSteps?: number[] | null }, * }>} */ async transcribe( model_inputs, - { + decode_options = {}, + ) { + const { tokenizer = null, - return_timestamps = true, - return_words = false, - return_tokens = false, - return_metrics = false, + returnTimestamps: returnTimestampsOption, + return_timestamps: legacyReturnTimestamps, + returnWords: returnWordsOption, + return_words: legacyReturnWords, + returnTokens: returnTokensOption, + return_tokens: legacyReturnTokens, + returnMetrics: returnMetricsOption, + return_metrics: legacyReturnMetrics, returnFrameConfidences = false, returnFrameIndices = false, returnLogProbs = false, returnTdtSteps = false, timeOffset = 0, - } = {}, - ) { + } = decode_options; + const returnTimestamps = returnTimestampsOption ?? legacyReturnTimestamps ?? true; + const returnWords = returnWordsOption ?? legacyReturnWords ?? false; + const returnTokens = returnTokensOption ?? legacyReturnTokens ?? false; + const returnMetrics = returnMetricsOption ?? legacyReturnMetrics ?? false; + if (!Number.isFinite(timeOffset)) { throw new Error('NemoConformerForTDT.transcribe expected `timeOffset` to be a finite number.'); } @@ -613,7 +625,7 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { const blankId = this.transducer.blank_token_id; const maxSymbolsPerStep = this.transducer.max_symbols_per_step; - const needConfidences = !!return_timestamps; + const needConfidences = !!returnTimestamps; /** @type {number[]} */ const tokenIds = []; @@ -825,44 +837,45 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { const tokenizeStart = nowMs(); const text = decodeTransducerText(tokenizer, tokenIds); - const needDetailed = return_timestamps && (return_words || return_tokens); + const needDetailed = returnTimestamps && (returnWords || returnTokens); const detailed = needDetailed ? 
buildTransducerDetailedOutputs(tokenizer, tokenIds, tokenTimestamps, tokenConfidences) : null; const tokenizeMs = nowMs() - tokenizeStart; /** @type {any} */ - const result = { text, is_final: true }; - - if (return_timestamps) { - result.utterance_confidence = - tokenConfidences && tokenConfidences.length > 0 - ? roundMetric(tokenConfidences.reduce((a, b) => a + b, 0) / tokenConfidences.length, 6) - : null; - - result.utterance_timestamp = - tokenTimestamps.length > 0 - ? /** @type {[number, number]} */ ([ - tokenTimestamps[0][0], - tokenTimestamps[tokenTimestamps.length - 1][1], - ]) - : /** @type {[number, number]} */ ([ - roundTs(timeOffset), - roundTs(frameCount * frameTime + timeOffset), - ]); + const result = { text, isFinal: true }; + const utteranceConfidence = + tokenConfidences && tokenConfidences.length > 0 + ? roundMetric(tokenConfidences.reduce((a, b) => a + b, 0) / tokenConfidences.length, 6) + : null; + const utteranceTimestamp = + tokenTimestamps.length > 0 + ? /** @type {[number, number]} */ ([ + tokenTimestamps[0][0], + tokenTimestamps[tokenTimestamps.length - 1][1], + ]) + : /** @type {[number, number]} */ ([ + roundTs(timeOffset), + roundTs(frameCount * frameTime + timeOffset), + ]); + const averageLogProb = + logProbs && logProbs.length > 0 + ? roundMetric(logProbs.reduce((a, b) => a + b, 0) / logProbs.length, 6) + : null; + + if (returnTimestamps) { + result.utteranceTimestamp = utteranceTimestamp; if (detailed) { - if (return_words) result.words = detailed.words; - if (return_tokens) result.tokens = detailed.tokens; + if (returnWords) result.words = detailed.words; + if (returnTokens) result.tokens = detailed.tokens; } - result.confidence_scores = { - token_avg: result.utterance_confidence, - word_avg: detailed?.word_avg != null ? roundMetric(detailed.word_avg, 6) : null, - overall_log_prob: - logProbs && logProbs.length > 0 - ? roundMetric(logProbs.reduce((a, b) => a + b, 0) / logProbs.length, 6) - : null, + result.confidence = { + utterance: utteranceConfidence, + wordAverage: detailed?.wordAverage != null ? roundMetric(detailed.wordAverage, 6) : null, + averageLogProb, }; } @@ -872,40 +885,51 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { for (const { sum, count } of frameConfidenceStats.values()) { frameConfidences.push(sum / count); } - if (!result.confidence_scores) { - result.confidence_scores = {}; - } - result.confidence_scores.frame = frameConfidences; - result.confidence_scores.frame_avg = roundMetric( - frameConfidences.reduce((a, b) => a + b, 0) / frameConfidences.length, - 6, - ); + result.confidence = { + ...(result.confidence ?? {}), + frames: frameConfidences, + frameAverage: roundMetric( + frameConfidences.reduce((a, b) => a + b, 0) / frameConfidences.length, + 6, + ), + }; + } + + if (!returnTimestamps && averageLogProb != null) { + result.confidence = { + ...(result.confidence ?? {}), + averageLogProb, + }; } + const debug = {}; if (returnFrameIndices) { - result.frameIndices = frameIndices; + debug.frameIndices = frameIndices; } if (returnLogProbs) { - result.logProbs = logProbs; + debug.logProbs = logProbs; } if (returnTdtSteps) { - result.tdtSteps = tdtSteps; + debug.tdtSteps = tdtSteps; + } + if (Object.keys(debug).length > 0) { + result.debug = debug; } - if (return_metrics) { + if (returnMetrics) { const totalMs = nowMs() - totalStart; - const utteranceDuration = result.utterance_timestamp - ? 
Math.max(result.utterance_timestamp[1] - result.utterance_timestamp[0], 1e-8) + const utteranceDuration = result.utteranceTimestamp + ? Math.max(result.utteranceTimestamp[1] - result.utteranceTimestamp[0], 1e-8) : Math.max(frameCount * frameTime, 1e-8); const rtf = totalMs / 1000 / utteranceDuration; result.metrics = { - preprocess_ms: 0.0, - encode_ms: roundMetric(encodeMs, 2), - decode_ms: roundMetric(decodeMs, 2), - tokenize_ms: roundMetric(tokenizeMs, 2), - total_ms: roundMetric(totalMs, 2), + preprocessMs: 0.0, + encodeMs: roundMetric(encodeMs, 2), + decodeMs: roundMetric(decodeMs, 2), + tokenizeMs: roundMetric(tokenizeMs, 2), + totalMs: roundMetric(totalMs, 2), rtf: roundMetric(rtf, 4), - rtf_x: roundMetric(1 / Math.max(rtf, 1e-8), 2), + rtfX: roundMetric(1 / Math.max(rtf, 1e-8), 2), }; } diff --git a/packages/transformers/src/models/nemo_conformer_tdt/pipeline_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/pipeline_nemo_conformer_tdt.js index 8762fc1d1..a2b00abb4 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/pipeline_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/pipeline_nemo_conformer_tdt.js @@ -1,14 +1,25 @@ import { Tensor } from '../../utils/tensor.js'; +import { NEMO_FEATURE_OUTPUT_OWNERSHIP } from './feature_extraction_nemo_conformer_tdt.js'; import { buildWordChunks, buildNemoSegmentChunks, - buildNemoWindowSpecs, - mergeNemoWindowResults, -} from './utils_nemo_conformer_tdt.js'; + joinTimedWords, + partitionNemoWordsIntoSegments, +} from './transducer_segment_offsets.js'; +import { + dedupeMergedWords, +} from './transducer_window_merge.js'; const NEMO_AUTO_WINDOW_THRESHOLD_S = 180; +const NEMO_MIN_CHUNK_LENGTH_S = 20; +const NEMO_MAX_CHUNK_LENGTH_S = 180; const NEMO_AUTO_CHUNK_LENGTH_S = 90; -const NEMO_AUTO_STRIDE_LENGTH_S = 10; +const NEMO_AUTO_WINDOW_FALLBACK_OVERLAP_S = 10; +const NEMO_AUTO_WINDOW_EPSILON_S = 1e-6; +const NEMO_SEGMENT_DEDUP_TOLERANCE_S = 0.15; +const NEMO_CURSOR_MIN_ADVANCE_S = 1.0; +const NEMO_CURSOR_GAP_THRESHOLD_S = 0.2; +const NEMO_CURSOR_SNAP_WINDOW_S = 0.5; function validateNemoAudio(audio, index) { if (!(audio instanceof Float32Array || audio instanceof Float64Array)) { @@ -28,6 +39,204 @@ function validateNemoAudio(audio, index) { } } +function disposeNemoPipelineInputs(inputs) { + const seen = new Set(); + for (const value of Object.values(inputs ?? {})) { + if (value instanceof Tensor && !seen.has(value)) { + value.dispose(); + seen.add(value); + } + } +} + +function normalizeNemoChunkLengthS(value) { + const num = Number(value); + if (!Number.isFinite(num) || num <= 0) { + return 0; + } + return Math.max(NEMO_MIN_CHUNK_LENGTH_S, Math.min(NEMO_MAX_CHUNK_LENGTH_S, num)); +} + +function flattenNemoSegmentWords(segments) { + return segments.flatMap((segment) => segment.words); +} + +function mergePendingAndCurrentNemoWords(pendingWords, currentWords) { + const normalizedPendingWords = Array.isArray(pendingWords) ? pendingWords : []; + const normalizedCurrentWords = Array.isArray(currentWords) ? 
currentWords : []; + + if (normalizedPendingWords.length === 0) { + return dedupeMergedWords(normalizedCurrentWords); + } + if (normalizedCurrentWords.length === 0) { + return dedupeMergedWords(normalizedPendingWords); + } + + const pendingStart = normalizedPendingWords[0].startTime; + const currentStart = normalizedCurrentWords[0].startTime; + if (currentStart <= pendingStart + NEMO_AUTO_WINDOW_EPSILON_S) { + return dedupeMergedWords(normalizedCurrentWords); + } + + return dedupeMergedWords([...normalizedPendingWords, ...normalizedCurrentWords]); +} + +function normalizeNemoSegmentText(text) { + return String(text ?? '') + .normalize('NFKC') + .replace(/[“”]/g, '"') + .replace(/[‘’]/g, '\'') + .replace(/\s+/g, ' ') + .trim() + .toLowerCase(); +} + +function isDuplicateFinalizedNemoSegment(finalizedSegments, segment) { + const normalized = normalizeNemoSegmentText(segment.text); + if (!normalized) { + return false; + } + + return finalizedSegments.some((candidate) => + normalizeNemoSegmentText(candidate.text) === normalized && + Math.abs(candidate.timestamp[1] - segment.timestamp[1]) < NEMO_SEGMENT_DEDUP_TOLERANCE_S, + ); +} + +function appendFinalizedNemoSegment(finalizedSegments, segment) { + if (isDuplicateFinalizedNemoSegment(finalizedSegments, segment)) { + return; + } + finalizedSegments.push(segment); +} + +function relocateNemoCursorToNearbyGap(target_s, words) { + let best = target_s; + let bestDist = NEMO_CURSOR_SNAP_WINDOW_S + 1; + + for (let i = 0; i < words.length - 1; ++i) { + const current = words[i]; + const next = words[i + 1]; + const gapStart = current.endTime; + const gapEnd = next.startTime; + const gap = gapEnd - gapStart; + if (gap < NEMO_CURSOR_GAP_THRESHOLD_S) { + continue; + } + + for (const candidate of [gapStart, gapEnd]) { + if (candidate + NEMO_AUTO_WINDOW_EPSILON_S < target_s) { + continue; + } + const dist = candidate - target_s; + if (dist <= NEMO_CURSOR_SNAP_WINDOW_S && dist < bestDist) { + best = candidate; + bestDist = dist; + } + } + } + + return best; +} + +async function runNemoAutoSentenceWindowing({ + audio, + sampling_rate, + chunk_length_s, + tokenizer, + runNemoTranscribe, +}) { + const audio_duration_s = audio.length / sampling_rate; + const fallback_overlap_s = Math.min(NEMO_AUTO_WINDOW_FALLBACK_OVERLAP_S, Math.max(0, chunk_length_s - 1)); + const fallback_advance_s = Math.max(1, chunk_length_s - fallback_overlap_s); + const maxWindows = Math.max(4, Math.ceil(audio_duration_s / fallback_advance_s) * 4); + + /** @type {Array<{ words: Array<{ text: string, startTime: number, endTime: number, confidence?: number }>, text: string, timestamp: [number, number] }>} */ + const finalizedSegments = []; + /** @type {Array<{ text: string, startTime: number, endTime: number, confidence?: number }>} */ + let pendingWords = []; + let lastTextFallback = ''; + let start_s = 0; + let shouldMergePending = false; + + for (let windowIndex = 0; windowIndex < maxWindows && start_s < audio_duration_s - NEMO_AUTO_WINDOW_EPSILON_S; ++windowIndex) { + const end_s = Math.min(audio_duration_s, start_s + chunk_length_s); + const start_sample = Math.max(0, Math.min(audio.length - 1, Math.floor(start_s * sampling_rate))); + const end_sample = Math.max(start_sample + 1, Math.min(audio.length, Math.ceil(end_s * sampling_rate))); + const windowAudio = audio.subarray(start_sample, end_sample); + const is_last_window = end_s >= audio_duration_s - NEMO_AUTO_WINDOW_EPSILON_S; + + const output = await runNemoTranscribe(windowAudio, { + tokenizer, + returnTimestamps: true, + 
returnWords: true, + returnMetrics: false, + timeOffset: start_s, + }); + lastTextFallback = output.text ?? lastTextFallback; + + const currentWords = Array.isArray(output.words) ? output.words : []; + const windowWords = shouldMergePending + ? mergePendingAndCurrentNemoWords(pendingWords, currentWords) + : dedupeMergedWords(currentWords); + const segments = partitionNemoWordsIntoSegments(windowWords); + + if (is_last_window) { + for (const segment of segments) { + appendFinalizedNemoSegment(finalizedSegments, segment); + } + pendingWords = []; + break; + } + + if (segments.length > 1) { + const pendingSegment = segments[segments.length - 1]; + const pendingStart_s = pendingSegment.timestamp[0]; + if (pendingStart_s >= start_s + NEMO_CURSOR_MIN_ADVANCE_S - NEMO_AUTO_WINDOW_EPSILON_S) { + const readySegments = segments.slice(0, -1); + for (const segment of readySegments) { + appendFinalizedNemoSegment(finalizedSegments, segment); + } + + pendingWords = dedupeMergedWords(pendingSegment.words); + shouldMergePending = false; + + const next_start_s = Math.min( + audio_duration_s, + relocateNemoCursorToNearbyGap(pendingStart_s, windowWords), + ); + if (next_start_s > start_s + NEMO_AUTO_WINDOW_EPSILON_S) { + start_s = next_start_s; + continue; + } + } + } + + pendingWords = windowWords; + shouldMergePending = true; + + const fallback_start_s = Math.min(audio_duration_s, start_s + fallback_advance_s); + if (fallback_start_s <= start_s + NEMO_AUTO_WINDOW_EPSILON_S) { + break; + } + start_s = fallback_start_s; + } + + const words = dedupeMergedWords([...flattenNemoSegmentWords(finalizedSegments), ...pendingWords]); + const text = words.length > 0 ? joinTimedWords(words) : String(lastTextFallback ?? '').trim(); + const utteranceTimestamp = + words.length > 0 + ? /** @type {[number, number]} */ ([words[0].startTime, words[words.length - 1].endTime]) + : null; + + return { + text, + words, + utteranceTimestamp, + chunks: buildNemoSegmentChunks(words, utteranceTimestamp, text), + }; +} + /** * Run the ASR pipeline adapter for Nemo Conformer TDT models. * Keeps the public contract task-shaped while delegating rich outputs to `model.transcribe()`. @@ -67,8 +276,7 @@ export async function runNemoConformerTDTPipeline({ const return_timestamps = kwargs.return_timestamps ?? false; const wantWordTimestamps = return_timestamps === 'word'; const wantTimestampChunks = return_timestamps === true || wantWordTimestamps; - const requested_chunk_length_s = kwargs.chunk_length_s ?? 0; - const requested_stride_length_s = kwargs.stride_length_s ?? null; + const requested_chunk_length_s = normalizeNemoChunkLengthS(kwargs.chunk_length_s ?? 0); const single = !Array.isArray(audio); const batchedAudio = single ? [audio] : audio; @@ -78,28 +286,14 @@ export async function runNemoConformerTDTPipeline({ validateNemoAudio(preparedAudios[i], i); } - const featureCache = /** @type {{ max_entries: number, max_size_mb: number }|null|undefined} */ ( - /** @type {any} */ (processor.feature_extractor)?.feature_cache - ); - const cacheOwnsTensors = !!( - featureCache && - featureCache.max_entries > 0 && - featureCache.max_size_mb > 0 - ); - const runNemoTranscribe = async (windowAudio, decodeOptions) => { const inputs = await processor(windowAudio); + const cacheOwnsTensors = inputs?.[NEMO_FEATURE_OUTPUT_OWNERSHIP] === true; try { return await model.transcribe(inputs, decodeOptions); } finally { if (!cacheOwnsTensors) { - const seen = new Set(); - for (const value of Object.values(inputs ?? 
{})) { - if (value instanceof Tensor && !seen.has(value)) { - value.dispose(); - seen.add(value); - } - } + disposeNemoPipelineInputs(inputs); } } }; @@ -114,34 +308,21 @@ export async function runNemoConformerTDTPipeline({ : autoWindowing ? NEMO_AUTO_CHUNK_LENGTH_S : 0; - const stride_length_s = - requested_chunk_length_s > 0 - ? requested_stride_length_s - : autoWindowing - ? NEMO_AUTO_STRIDE_LENGTH_S - : null; - - if (chunk_length_s > 0) { - const windows = buildNemoWindowSpecs(aud, sampling_rate, chunk_length_s, stride_length_s); - const windowResults = []; - for (const window of windows) { - const output = await runNemoTranscribe(window.audio, { - tokenizer, - return_timestamps: true, - return_words: true, - return_tokens: true, - return_metrics: false, - timeOffset: window.start_s, - }); - windowResults.push({ window, output }); - } + const useSentenceWindowing = chunk_length_s > 0; - const merged = mergeNemoWindowResults(tokenizer, windowResults); - const result = { text: merged.text || windowResults.map((x) => x.output.text ?? '').join(' ').trim() }; + if (useSentenceWindowing) { + const merged = await runNemoAutoSentenceWindowing({ + audio: aud, + sampling_rate, + chunk_length_s, + tokenizer, + runNemoTranscribe, + }); + const result = { text: merged.text }; if (wantWordTimestamps) { result.chunks = buildWordChunks(merged.words); } else if (wantTimestampChunks) { - result.chunks = buildNemoSegmentChunks(merged.words, merged.utterance_timestamp, result.text); + result.chunks = merged.chunks; } toReturn.push(result); continue; @@ -149,16 +330,16 @@ export async function runNemoConformerTDTPipeline({ const output = await runNemoTranscribe(aud, { tokenizer, - return_timestamps: wantTimestampChunks, - return_words: wantTimestampChunks, - return_metrics: false, + returnTimestamps: wantTimestampChunks, + returnWords: wantTimestampChunks, + returnMetrics: false, }); const result = { text: output.text ?? '' }; if (wantWordTimestamps) { result.chunks = buildWordChunks(output.words ?? []); } else if (wantTimestampChunks) { - result.chunks = buildNemoSegmentChunks(output.words ?? [], output.utterance_timestamp ?? null, result.text); + result.chunks = buildNemoSegmentChunks(output.words ?? [], output.utteranceTimestamp ?? null, result.text); } toReturn.push(result); } diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js index 02fee3f28..a9979356f 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js @@ -60,7 +60,7 @@ export class FeatureLRUCache { /** * @param {string} key * @param {any} value - * @returns {void} + * @returns {boolean} Whether the cache retained ownership of the supplied value. */ set(key, value) { // Explicit no-cache mode: keep caller ownership of current values. 
@@ -68,7 +68,7 @@ export class FeatureLRUCache { if (this.cache.size > 0) { this.clear(); } - return; + return false; } const max_bytes = this.max_size_mb * 1024 * 1024; @@ -78,10 +78,11 @@ export class FeatureLRUCache { this.cache.delete(key); if (existing.size_bytes <= max_bytes) { this.cache.set(key, existing); + return true; } else { this.current_size_bytes -= existing.size_bytes; + return false; } - return; } const size_bytes = estimateSizeBytes(value); @@ -92,7 +93,7 @@ export class FeatureLRUCache { this.current_size_bytes -= existing.size_bytes; this.cache.delete(key); } - return; + return false; } if (existing) { @@ -104,6 +105,7 @@ export class FeatureLRUCache { this.cache.set(key, { value, size_bytes }); this.current_size_bytes += size_bytes; this._evict(); + return this.cache.get(key)?.value === value; } clear() { diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_segment_offsets.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_segment_offsets.js index 0cb3e01a8..632b35d77 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/transducer_segment_offsets.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_segment_offsets.js @@ -1,8 +1,28 @@ -const NEMO_SEGMENT_BREAK_REGEX = /[.!?;:]["')\]]*$/; -const NEMO_MAX_WORD_GAP_S = 0.8; +const NEMO_STRONG_SENTENCE_END_REGEX = /[!?…](?:["')\]]+)?$/u; +const NEMO_PERIOD_SENTENCE_END_REGEX = /\.(?:["')\]]+)?$/u; +const NEMO_TRAILING_CLOSERS_REGEX = /["')\]]+$/gu; +const NEMO_LEADING_OPENERS_REGEX = /^[("'“‘\[{]+/u; +const NEMO_DOTTED_ACRONYM_REGEX = /^(?:[A-Z]\.){2,}$/; +const NEMO_SINGLE_LETTER_ENUM_REGEX = /^[A-Z]\.$/; +const NEMO_ROMAN_ENUM_REGEX = /^(?:[IVXLCDM]+)\.$/i; +const NEMO_NUMERIC_ENUM_REGEX = /^\d+\.$/; +const NEMO_FALLBACK_SEGMENT_GAP_S = 3.0; +const NEMO_NON_BREAKING_PERIOD_WORDS = new Set([ + 'mr.', + 'mrs.', + 'ms.', + 'dr.', + 'prof.', + 'sr.', + 'jr.', + 'vs.', + 'etc.', + 'e.g.', + 'i.e.', +]); /** - * @param {Array<{ text: string, start_time: number, end_time: number }>} words + * @param {Array<{ text: string, startTime: number, endTime: number }>} words * @returns {string} */ export function joinTimedWords(words) { @@ -22,66 +42,137 @@ export function joinTimedWords(words) { } /** - * @param {Array<{ text: string, start_time: number, end_time: number }>} words + * @param {Array<{ text: string, startTime: number, endTime: number }>} words * @returns {Array<{ text: string, timestamp: [number, number] }>} */ export function buildWordChunks(words) { return words.map((word) => ({ text: word.text, - timestamp: [word.start_time, word.end_time], + timestamp: [word.startTime, word.endTime], })); } /** - * @param {Array<{ text: string, start_time: number, end_time: number }>} words + * @param {Array<{ text: string, startTime: number, endTime: number }>} words * @returns {string} */ export function buildSegmentText(words) { return joinTimedWords(words); } +function stripTrailingClosers(text) { + return String(text ?? '').replace(NEMO_TRAILING_CLOSERS_REGEX, ''); +} + +function looksLikeSentenceStart(text) { + const cleaned = String(text ?? '').replace(NEMO_LEADING_OPENERS_REGEX, ''); + return /^[A-Z]/.test(cleaned); +} + /** - * @param {Array<{ text: string, start_time: number, end_time: number }>} words - * @param {[number, number] | null} utterance_timestamp - * @param {string} text - * @returns {Array<{ text: string, timestamp: [number, number] }>} + * Conservative sentence-boundary heuristic for ASR word timestamps. 
+ * Favors under-segmentation over mid-sentence false positives. + * + * @param {{ text: string }} currentWord + * @param {{ text: string } | null} nextWord + * @param {number} gap_s + * @returns {boolean} */ -export function buildNemoSegmentChunks(words, utterance_timestamp = null, text = '') { +export function shouldEndSentenceAfterWord(currentWord, nextWord, gap_s = 0) { + if (!nextWord) { + return false; + } + + if (gap_s >= NEMO_FALLBACK_SEGMENT_GAP_S) { + return true; + } + + const currentText = String(currentWord?.text ?? ''); + if (!currentText) { + return false; + } + + if (NEMO_STRONG_SENTENCE_END_REGEX.test(currentText)) { + return true; + } + + if (!NEMO_PERIOD_SENTENCE_END_REGEX.test(currentText)) { + return false; + } + + const stripped = stripTrailingClosers(currentText); + const lowered = stripped.toLowerCase(); + if ( + NEMO_NON_BREAKING_PERIOD_WORDS.has(lowered) || + NEMO_DOTTED_ACRONYM_REGEX.test(stripped) || + NEMO_SINGLE_LETTER_ENUM_REGEX.test(stripped) || + NEMO_ROMAN_ENUM_REGEX.test(stripped) || + NEMO_NUMERIC_ENUM_REGEX.test(stripped) + ) { + return false; + } + + return looksLikeSentenceStart(nextWord.text); +} + +/** + * Partition timed words into conservative sentence-like segments. + * + * @param {Array<{ text: string, startTime: number, endTime: number }>} words + * @returns {Array<{ words: Array<{ text: string, startTime: number, endTime: number }>, text: string, timestamp: [number, number] }>} + */ +export function partitionNemoWordsIntoSegments(words) { if (!Array.isArray(words) || words.length === 0) { - if (utterance_timestamp) { - return [{ text, timestamp: utterance_timestamp }]; - } return []; } - /** @type {Array<{ text: string, timestamp: [number, number] }>} */ - const chunks = []; + /** @type {Array<{ words: Array<{ text: string, startTime: number, endTime: number }>, text: string, timestamp: [number, number] }>} */ + const segments = []; /** @type {typeof words} */ let current = []; - for (const word of words) { - const prev = current.at(-1); - if (prev) { - const gap_s = Math.max(0, word.start_time - prev.end_time); - const shouldBreak = - NEMO_SEGMENT_BREAK_REGEX.test(prev.text) || - gap_s > NEMO_MAX_WORD_GAP_S; - if (shouldBreak) { - chunks.push({ - text: buildSegmentText(current), - timestamp: [current[0].start_time, current[current.length - 1].end_time], - }); - current = []; - } - } + for (let i = 0; i < words.length; ++i) { + const word = words[i]; current.push(word); + + const nextWord = words[i + 1] ?? null; + const gap_s = nextWord ? 
Math.max(0, nextWord.startTime - word.endTime) : 0; + if (shouldEndSentenceAfterWord(word, nextWord, gap_s)) { + segments.push({ + words: current, + text: buildSegmentText(current), + timestamp: [current[0].startTime, current[current.length - 1].endTime], + }); + current = []; + } } if (current.length > 0) { - chunks.push({ + segments.push({ + words: current, text: buildSegmentText(current), - timestamp: [current[0].start_time, current[current.length - 1].end_time], + timestamp: [current[0].startTime, current[current.length - 1].endTime], }); } - return chunks; + return segments; +} + +/** + * @param {Array<{ text: string, startTime: number, endTime: number }>} words + * @param {[number, number] | null} utteranceTimestamp + * @param {string} text + * @returns {Array<{ text: string, timestamp: [number, number] }>} + */ +export function buildNemoSegmentChunks(words, utteranceTimestamp = null, text = '') { + if (!Array.isArray(words) || words.length === 0) { + if (utteranceTimestamp) { + return [{ text, timestamp: utteranceTimestamp }]; + } + return []; + } + + return partitionNemoWordsIntoSegments(words).map((segment) => ({ + text: segment.text, + timestamp: segment.timestamp, + })); } diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js index 274a22962..f155b9962 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js @@ -19,10 +19,9 @@ export function decodeTransducerText(tokenizer, token_ids) { * @param {[number, number][]} token_timestamps * @param {number[] | null} token_confidences * @returns {{ - * words: Array<{ text: string, start_time: number, end_time: number, confidence?: number }>, - * tokens: Array<{ id: number, token: string, raw_token: string, is_word_start: boolean, start_time: number, end_time: number, confidence?: number }>, - * word_confidences: (number | null)[] | null, - * word_avg: number | null, + * words: Array<{ text: string, startTime: number, endTime: number, confidence?: number }>, + * tokens: Array<{ id: number, token: string, rawToken: string, isWordStart: boolean, startTime: number, endTime: number, confidence?: number }>, + * wordAverage: number | null, * }} */ export function buildTransducerDetailedOutputs(tokenizer, token_ids, token_timestamps, token_confidences = null) { diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_window_merge.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_window_merge.js index 3eb1f2609..0ea9fd750 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/transducer_window_merge.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_window_merge.js @@ -1,84 +1,24 @@ -import { decodeTransducerText } from './transducer_text.js'; -import { joinTimedWords } from './transducer_segment_offsets.js'; - -/** - * @param {Float32Array|Float64Array} audio - * @param {number} sampling_rate - * @param {number} chunk_length_s - * @param {number | null} stride_length_s - * @returns {Array<{audio: Float32Array|Float64Array, start_s: number, end_s: number, left_stride_s: number, right_stride_s: number}>} - */ -export function buildNemoWindowSpecs(audio, sampling_rate, chunk_length_s, stride_length_s) { - if (!(chunk_length_s > 0)) { - return [ - { - audio, - start_s: 0, - end_s: audio.length / sampling_rate, - left_stride_s: 0, - right_stride_s: 0, - }, - ]; - } - - if 
(stride_length_s === null) { - stride_length_s = chunk_length_s / 6; - } else if (!(stride_length_s >= 0)) { - throw Error('`stride_length_s` must be non-negative.'); - } - if (chunk_length_s <= 2 * stride_length_s) { - throw Error('`chunk_length_s` must be larger than `2 * stride_length_s` for Nemo windowed decoding.'); - } - - const window = Math.floor(sampling_rate * chunk_length_s); - const stride = Math.floor(sampling_rate * stride_length_s); - const jump = window - 2 * stride; - - /** @type {Array<{audio: Float32Array|Float64Array, start_s: number, end_s: number, left_stride_s: number, right_stride_s: number}>} */ - const windows = []; - let offset = 0; - while (true) { - const offset_end = offset + window; - const subarr = audio.subarray(offset, offset_end); - const is_first = offset === 0; - const is_last = offset_end >= audio.length; - windows.push({ - audio: subarr, - start_s: offset / sampling_rate, - end_s: (offset + subarr.length) / sampling_rate, - left_stride_s: is_first ? 0 : stride / sampling_rate, - right_stride_s: is_last ? 0 : stride / sampling_rate, - }); - if (is_last) break; - offset += jump; - } - - return windows; +function normalizeMergedWordText(text) { + return String(text ?? '') + .normalize('NFKC') + .toLowerCase() + .replace(/^[("'“‘\[{]+/g, '') + .replace(/[.,!?;:)"'”’\]}]+$/g, '') + .trim(); } -function shouldKeepTimedItem(start_time, end_time, keep_start_s, keep_end_s, is_first_window, is_last_window) { - const midpoint = (start_time + end_time) / 2; - if (!is_first_window && midpoint < keep_start_s) { - return false; - } - if (!is_last_window && midpoint >= keep_end_s) { - return false; - } - return true; -} - -function dedupeMergedWords(words) { +export function dedupeMergedWords(words) { /** @type {typeof words} */ const merged = []; for (const word of words) { const prev = merged.at(-1); if ( prev && - prev.text === word.text && - word.start_time < prev.end_time + normalizeMergedWordText(prev.text) === normalizeMergedWordText(word.text) && + word.startTime < prev.endTime ) { - const prevDuration = prev.end_time - prev.start_time; - const nextDuration = word.end_time - word.start_time; + const prevDuration = prev.endTime - prev.startTime; + const nextDuration = word.endTime - word.startTime; if (nextDuration > prevDuration) { merged[merged.length - 1] = word; } @@ -88,92 +28,3 @@ function dedupeMergedWords(words) { } return merged; } - -function dedupeMergedTokens(tokens) { - /** @type {typeof tokens} */ - const merged = []; - for (const token of tokens) { - const prev = merged.at(-1); - if ( - prev && - prev.id === token.id && - prev.raw_token === token.raw_token && - token.start_time < prev.end_time - ) { - const prevDuration = prev.end_time - prev.start_time; - const nextDuration = token.end_time - token.start_time; - if (nextDuration > prevDuration) { - merged[merged.length - 1] = token; - } - continue; - } - merged.push(token); - } - return merged; -} - -/** - * @param {any} tokenizer - * @param {Array<{ window: { start_s: number, end_s: number, left_stride_s: number, right_stride_s: number }, output: { words?: any[], tokens?: any[] } }>} windowResults - * @returns {{ text: string, tokens: any[], words: any[], utterance_timestamp: [number, number] | null }} - */ -export function mergeNemoWindowResults(tokenizer, windowResults) { - /** @type {Array<{ id: number, token: string, raw_token: string, is_word_start: boolean, start_time: number, end_time: number, confidence?: number }>} */ - const mergedTokens = []; - /** @type {Array<{ text: string, 
start_time: number, end_time: number, confidence?: number }>} */ - const mergedWords = []; - - for (const { window, output } of windowResults) { - const keep_start_s = window.start_s + window.left_stride_s; - const keep_end_s = window.end_s - window.right_stride_s; - const is_first_window = window.left_stride_s === 0; - const is_last_window = window.right_stride_s === 0; - - for (const token of output.tokens ?? []) { - if ( - shouldKeepTimedItem( - token.start_time, - token.end_time, - keep_start_s, - keep_end_s, - is_first_window, - is_last_window, - ) - ) { - mergedTokens.push(token); - } - } - - for (const word of output.words ?? []) { - if ( - shouldKeepTimedItem( - word.start_time, - word.end_time, - keep_start_s, - keep_end_s, - is_first_window, - is_last_window, - ) - ) { - mergedWords.push(word); - } - } - } - - const tokens = dedupeMergedTokens(mergedTokens); - const words = dedupeMergedWords(mergedWords); - const text = - words.length > 0 - ? joinTimedWords(words) - : tokens.length > 0 && typeof tokenizer?.decode === 'function' - ? decodeTransducerText(tokenizer, tokens.map((token) => token.id)) - : ''; - const utterance_timestamp = - words.length > 0 - ? /** @type {[number, number]} */ ([words[0].start_time, words[words.length - 1].end_time]) - : tokens.length > 0 - ? /** @type {[number, number]} */ ([tokens[0].start_time, tokens[tokens.length - 1].end_time]) - : null; - - return { text, tokens, words, utterance_timestamp }; -} diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_word_offsets.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_word_offsets.js index 5046a71a5..9422b3270 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/transducer_word_offsets.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_word_offsets.js @@ -93,7 +93,7 @@ function consumeAlignedTokenText(fullText, cursor, tokenText) { } /** - * @param {Array<{ text: string, start_time: number, end_time: number, confidence?: number }>} words + * @param {Array<{ text: string, startTime: number, endTime: number, confidence?: number }>} words * @param {{ text: string, start: number, end: number, confs: number[] } | null} current */ function finalizeAndPushWord(words, current) { @@ -102,11 +102,11 @@ function finalizeAndPushWord(words, current) { const text = current.text.trim(); if (!text) return; - /** @type {{ text: string, start_time: number, end_time: number, confidence?: number }} */ + /** @type {{ text: string, startTime: number, endTime: number, confidence?: number }} */ const word = { text, - start_time: current.start, - end_time: current.end, + startTime: current.start, + endTime: current.end, }; if (current.confs.length > 0) { word.confidence = Math.round((current.confs.reduce((a, b) => a + b, 0) / current.confs.length) * 1e6) / 1e6; @@ -121,10 +121,9 @@ function finalizeAndPushWord(words, current) { * @param {number[] | null} token_confidences * @param {string} fullText * @returns {{ - * words: Array<{ text: string, start_time: number, end_time: number, confidence?: number }>, - * tokens: Array<{ id: number, token: string, raw_token: string, is_word_start: boolean, start_time: number, end_time: number, confidence?: number }>, - * word_confidences: (number | null)[] | null, - * word_avg: number | null, + * words: Array<{ text: string, startTime: number, endTime: number, confidence?: number }>, + * tokens: Array<{ id: number, token: string, rawToken: string, isWordStart: boolean, startTime: number, endTime: number, confidence?: 
number }>, + * wordAverage: number | null, * }} */ export function buildTransducerWordOffsets( @@ -135,7 +134,7 @@ export function buildTransducerWordOffsets( fullText = '', ) { if (!tokenizer || token_ids.length === 0 || token_timestamps.length === 0) { - return { words: [], tokens: [], word_confidences: null, word_avg: null }; + return { words: [], tokens: [], wordAverage: null }; } if (token_ids.length !== token_timestamps.length) { throw new Error( @@ -148,9 +147,9 @@ export function buildTransducerWordOffsets( ); } - /** @type {Array<{ id: number, token: string, raw_token: string, is_word_start: boolean, start_time: number, end_time: number, confidence?: number }>} */ + /** @type {Array<{ id: number, token: string, rawToken: string, isWordStart: boolean, startTime: number, endTime: number, confidence?: number }>} */ const tokens = []; - /** @type {Array<{ text: string, start_time: number, end_time: number, confidence?: number }>} */ + /** @type {Array<{ text: string, startTime: number, endTime: number, confidence?: number }>} */ const words = []; let textCursor = 0; @@ -173,10 +172,10 @@ export function buildTransducerWordOffsets( const tok = { id, token: tokenText, - raw_token: raw, - is_word_start: startsNewWord, - start_time: ts[0], - end_time: ts[1], + rawToken: raw, + isWordStart: startsNewWord, + startTime: ts[0], + endTime: ts[1], }; const conf = token_confidences?.[i]; if (conf != null && Number.isFinite(conf)) { @@ -203,14 +202,13 @@ export function buildTransducerWordOffsets( finalizeAndPushWord(words, current); - const word_confidences = words.some((x) => x.confidence != null) ? words.map((x) => x.confidence ?? null) : null; - let word_avg = null; - if (word_confidences) { - const validConfidences = word_confidences.filter((x) => x != null); + let wordAverage = null; + if (words.some((x) => x.confidence != null)) { + const validConfidences = words.map((x) => x.confidence).filter((x) => x != null); if (validConfidences.length > 0) { - word_avg = Math.round((validConfidences.reduce((a, b) => a + b, 0) / validConfidences.length) * 1e6) / 1e6; + wordAverage = Math.round((validConfidences.reduce((a, b) => a + b, 0) / validConfidences.length) * 1e6) / 1e6; } } - return { words, tokens, word_confidences, word_avg }; + return { words, tokens, wordAverage }; } diff --git a/packages/transformers/src/models/nemo_conformer_tdt/utils_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/utils_nemo_conformer_tdt.js deleted file mode 100644 index fdf5e7e67..000000000 --- a/packages/transformers/src/models/nemo_conformer_tdt/utils_nemo_conformer_tdt.js +++ /dev/null @@ -1,8 +0,0 @@ -// Backwards-compatible barrel for older internal imports. 
-export { createAudioCacheKey, FeatureLRUCache } from './transducer_cache.js'; -export { computeTemporalDeltas } from './transducer_deltas.js'; -export { decodeTransducerText, buildTransducerDetailedOutputs } from './transducer_text.js'; -export { buildTransducerWordOffsets } from './transducer_word_offsets.js'; -export { joinTimedWords, buildWordChunks, buildSegmentText, buildNemoSegmentChunks } from './transducer_segment_offsets.js'; -export { buildNemoWindowSpecs, mergeNemoWindowResults } from './transducer_window_merge.js'; -export { runNemoConformerTDTPipeline } from './pipeline_nemo_conformer_tdt.js'; diff --git a/packages/transformers/src/pipelines/automatic-speech-recognition.js b/packages/transformers/src/pipelines/automatic-speech-recognition.js index 6415f5857..a81b779c4 100644 --- a/packages/transformers/src/pipelines/automatic-speech-recognition.js +++ b/packages/transformers/src/pipelines/automatic-speech-recognition.js @@ -5,7 +5,7 @@ import { max, round } from '../utils/maths.js'; import { logger } from '../utils/logger.js'; import { runNemoConformerTDTPipeline, -} from '../models/nemo_conformer_tdt/utils_nemo_conformer_tdt.js'; +} from '../models/nemo_conformer_tdt/pipeline_nemo_conformer_tdt.js'; /** * @typedef {import('./_base.js').TextAudioPipelineConstructorArgs} TextAudioPipelineConstructorArgs diff --git a/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js b/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js index 74f60d8d0..77489d0e1 100644 --- a/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js +++ b/packages/transformers/tests/models/nemo_conformer_tdt/test_feature_extraction_nemo_conformer_tdt.js @@ -1,4 +1,5 @@ import { NemoConformerTDTFeatureExtractor, Tensor } from "../../../src/transformers.js"; +import { NEMO_FEATURE_OUTPUT_OWNERSHIP } from "../../../src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js"; import { MAX_TEST_EXECUTION_TIME } from "../../init.js"; @@ -156,6 +157,89 @@ export default () => { MAX_TEST_EXECUTION_TIME, ); + it( + "marks uncached outputs as caller-owned", + async () => { + const extractor = new NemoConformerTDTFeatureExtractor({ ...base, feature_size: 80 }); + const outputs = await extractor(audio); + try { + expect(outputs[NEMO_FEATURE_OUTPUT_OWNERSHIP]).toBe(false); + } finally { + outputs.input_features.dispose(); + outputs.attention_mask.dispose(); + } + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "marks cached outputs as cache-owned", + async () => { + const extractor = new NemoConformerTDTFeatureExtractor({ + ...base, + feature_size: 80, + use_feature_cache: true, + feature_cache_max_entries: 8, + feature_cache_max_size_mb: 8, + }); + try { + const first = await extractor(audio); + const second = await extractor(audio); + + expect(first[NEMO_FEATURE_OUTPUT_OWNERSHIP]).toBe(true); + expect(second[NEMO_FEATURE_OUTPUT_OWNERSHIP]).toBe(true); + expect(first.input_features).toBe(second.input_features); + } finally { + extractor.clear_cache(); + } + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "marks skipped-cache outputs as caller-owned", + async () => { + const extractor = new NemoConformerTDTFeatureExtractor({ + ...base, + feature_size: 80, + use_feature_cache: true, + feature_cache_max_entries: 0, + feature_cache_max_size_mb: 8, + }); + const outputs = await extractor(audio); + try { + expect(outputs[NEMO_FEATURE_OUTPUT_OWNERSHIP]).toBe(false); + 
expect(extractor.get_cache_stats().entries).toBe(0); + } finally { + outputs.input_features.dispose(); + outputs.attention_mask.dispose(); + } + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "marks oversized-cache outputs as caller-owned", + async () => { + const extractor = new NemoConformerTDTFeatureExtractor({ + ...base, + feature_size: 80, + use_feature_cache: true, + feature_cache_max_entries: 8, + feature_cache_max_size_mb: 0.000001, + }); + const outputs = await extractor(audio); + try { + expect(outputs[NEMO_FEATURE_OUTPUT_OWNERSHIP]).toBe(false); + expect(extractor.get_cache_stats().entries).toBe(0); + } finally { + outputs.input_features.dispose(); + outputs.attention_mask.dispose(); + } + }, + MAX_TEST_EXECUTION_TIME, + ); + it( "uses feature cache when enabled for non-concatenated delta outputs", async () => { diff --git a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js index c52df3667..b0faa6b81 100644 --- a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js +++ b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js @@ -1,6 +1,11 @@ import { NemoConformerForTDT, Tensor } from "../../../src/transformers.js"; import { createAudioCacheKey, FeatureLRUCache } from "../../../src/models/nemo_conformer_tdt/transducer_cache.js"; import { computeTemporalDeltas } from "../../../src/models/nemo_conformer_tdt/transducer_deltas.js"; +import { + buildNemoSegmentChunks, + partitionNemoWordsIntoSegments, + shouldEndSentenceAfterWord, +} from "../../../src/models/nemo_conformer_tdt/transducer_segment_offsets.js"; import { buildTransducerDetailedOutputs } from "../../../src/models/nemo_conformer_tdt/transducer_text.js"; import { MODEL_TYPE_MAPPING, MODEL_TYPES } from "../../../src/models/modeling_utils.js"; import { get_model_files } from "../../../src/utils/model_registry/get_model_files.js"; @@ -251,15 +256,17 @@ export default () => { const output = await model.transcribe(inputs, { tokenizer, - return_timestamps: true, - return_words: true, - return_tokens: true, + returnTimestamps: true, + returnWords: true, + returnTokens: true, }); expect(output.text).toBe("hello world"); - expect(output.utterance_timestamp).toEqual([0, 0.12]); - expect(output.words).toEqual([expect.objectContaining({ text: "hello", start_time: 0, end_time: 0.04 }), expect.objectContaining({ text: "world", start_time: 0.04, end_time: 0.12 })]); - expect(output.tokens).toEqual([expect.objectContaining({ id: 1, start_time: 0, end_time: 0.04 }), expect.objectContaining({ id: 2, start_time: 0.04, end_time: 0.12 })]); + expect(output.isFinal).toBe(true); + expect(output.utteranceTimestamp).toEqual([0, 0.12]); + expect(output.words).toEqual([expect.objectContaining({ text: "hello", startTime: 0, endTime: 0.04 }), expect.objectContaining({ text: "world", startTime: 0.04, endTime: 0.12 })]); + expect(output.tokens).toEqual([expect.objectContaining({ id: 1, startTime: 0, endTime: 0.04 }), expect.objectContaining({ id: 2, startTime: 0.04, endTime: 0.12 })]); + expect(output.confidence).toEqual(expect.objectContaining({ utterance: expect.any(Number), wordAverage: expect.any(Number), averageLogProb: expect.any(Number) })); }, MAX_TEST_EXECUTION_TIME, ); @@ -285,13 +292,13 @@ export default () => { const output = await model.transcribe(inputs, { tokenizer, - return_timestamps: true, - return_tokens: true, + returnTimestamps: true, 
+ returnTokens: true, }); expect(output.tokens).toHaveLength(1); - expect(output.tokens[0]).toEqual(expect.objectContaining({ start_time: 0, end_time: 0.12 })); - expect(output.utterance_timestamp).toEqual([0, 0.12]); + expect(output.tokens[0]).toEqual(expect.objectContaining({ startTime: 0, endTime: 0.12 })); + expect(output.utteranceTimestamp).toEqual([0, 0.12]); }, MAX_TEST_EXECUTION_TIME, ); @@ -313,14 +320,14 @@ export default () => { }; const output = await model.transcribe(inputs, { - return_timestamps: false, + returnTimestamps: false, returnFrameConfidences: true, }); - expect(output.confidence_scores.frame).toHaveLength(2); - expect(output.confidence_scores.frame[0]).toBeCloseTo(0.9579343795, 6); - expect(output.confidence_scores.frame_avg).toBeCloseTo( - (output.confidence_scores.frame[0] + output.confidence_scores.frame[1]) / 2, + expect(output.confidence.frames).toHaveLength(2); + expect(output.confidence.frames[0]).toBeCloseTo(0.9579343795, 6); + expect(output.confidence.frameAverage).toBeCloseTo( + (output.confidence.frames[0] + output.confidence.frames[1]) / 2, 6, ); }, @@ -338,7 +345,7 @@ export default () => { await expect( model.transcribe(inputs, { tokenizer: { decode: () => "" }, - return_timestamps: true, + returnTimestamps: true, timeOffset: Number.NaN, }), ).rejects.toThrow("timeOffset"); @@ -361,7 +368,7 @@ export default () => { await expect( model.transcribe(inputs, { tokenizer: { decode: () => "" }, - return_timestamps: false, + returnTimestamps: false, }), ).rejects.toThrow("missing duration logits"); }, @@ -637,7 +644,7 @@ export default () => { input_features: new Tensor("float32", new Float32Array([0, 0]), [1, 1, 2]), }; - const output = await model.transcribe(inputs, { return_timestamps: false }); + const output = await model.transcribe(inputs, { returnTimestamps: false }); expect(output).toEqual(expect.objectContaining({ text: "" })); expect(model.auxDisposals).toBe(1); }, @@ -646,6 +653,52 @@ export default () => { }); describe("Nemo Conformer TDT utilities", () => { + it("uses conservative sentence boundaries for punctuation, abbreviations, and long silences", () => { + expect(shouldEndSentenceAfterWord({ text: "Hello." }, { text: "World" }, 0)).toBe(true); + expect(shouldEndSentenceAfterWord({ text: "U.S." }, { text: "Report" }, 0)).toBe(false); + expect(shouldEndSentenceAfterWord({ text: "3." }, { text: "Title" }, 0)).toBe(false); + expect(shouldEndSentenceAfterWord({ text: "I." }, { text: "Overview" }, 0)).toBe(false); + expect(shouldEndSentenceAfterWord({ text: "Dr." }, { text: "Brown" }, 0)).toBe(false); + expect(shouldEndSentenceAfterWord({ text: "wait" }, { text: "Next" }, 3.2)).toBe(true); + }); + + it("partitions timed words into conservative sentence-like chunks", () => { + const words = [ + { text: "Hello.", startTime: 0, endTime: 0.4 }, + { text: "World", startTime: 0.5, endTime: 0.8 }, + { text: "again.", startTime: 0.8, endTime: 1.1 }, + { text: "U.S.", startTime: 1.2, endTime: 1.5 }, + { text: "Report", startTime: 1.6, endTime: 2.0 }, + { text: "update.", startTime: 2.0, endTime: 2.4 }, + { text: "pause", startTime: 6.0, endTime: 6.3 }, + { text: "Next", startTime: 9.5, endTime: 9.8 }, + { text: "sentence.", startTime: 9.8, endTime: 10.2 }, + ]; + + const segments = partitionNemoWordsIntoSegments(words); + expect(segments.map((x) => x.text)).toEqual([ + "Hello.", + "World again.", + "U.S. 
Report update.", + "pause", + "Next sentence.", + ]); + expect(segments.map((x) => x.timestamp)).toEqual([ + [0, 0.4], + [0.5, 1.1], + [1.2, 2.4], + [6.0, 6.3], + [9.5, 10.2], + ]); + expect(buildNemoSegmentChunks(words)).toEqual([ + { text: "Hello.", timestamp: [0, 0.4] }, + { text: "World again.", timestamp: [0.5, 1.1] }, + { text: "U.S. Report update.", timestamp: [1.2, 2.4] }, + { text: "pause", timestamp: [6.0, 6.3] }, + { text: "Next sentence.", timestamp: [9.5, 10.2] }, + ]); + }); + it("keeps word boundaries from the final decoded text for numeric and punctuation tokens", () => { const rawById = { 1: "▁score", @@ -827,11 +880,11 @@ export default () => { "evicts least-recently-used entries when full", async () => { const cache = new FeatureLRUCache({ max_entries: 2, max_size_mb: 4 }); - cache.set("a", new Tensor("float32", new Float32Array([1, 2, 3]), [1, 3])); - cache.set("b", new Tensor("float32", new Float32Array([4, 5, 6]), [1, 3])); + expect(cache.set("a", new Tensor("float32", new Float32Array([1, 2, 3]), [1, 3]))).toBe(true); + expect(cache.set("b", new Tensor("float32", new Float32Array([4, 5, 6]), [1, 3]))).toBe(true); expect(cache.get("a")).not.toBeNull(); - cache.set("c", new Tensor("float32", new Float32Array([7, 8, 9]), [1, 3])); + expect(cache.set("c", new Tensor("float32", new Float32Array([7, 8, 9]), [1, 3]))).toBe(true); // `b` should be evicted because `a` was recently accessed. expect(cache.get("b")).toBeNull(); expect(cache.get("a")).not.toBeNull(); @@ -928,8 +981,8 @@ export default () => { t2Dispose(); }; - byEntries.set("x", t1); - bySize.set("y", t2); + expect(byEntries.set("x", t1)).toBe(false); + expect(bySize.set("y", t2)).toBe(false); expect(byEntries.get("x")).toBeNull(); expect(bySize.get("y")).toBeNull(); expect(t1Disposals).toBe(0); @@ -951,7 +1004,7 @@ export default () => { originalDispose(); }; - cache.set("big", tensor); + expect(cache.set("big", tensor)).toBe(false); expect(cache.get("big")).toBeNull(); expect(disposeCalls).toBe(0); diff --git a/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js b/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js index 5a377ece1..f4d398371 100644 --- a/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js +++ b/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js @@ -1,4 +1,5 @@ import { pipeline, AutomaticSpeechRecognitionPipeline, Tensor } from "../../src/transformers.js"; +import { NEMO_FEATURE_OUTPUT_OWNERSHIP } from "../../src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js"; import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; @@ -127,6 +128,15 @@ export default () => { }); describe("nemo-conformer-tdt (unit)", () => { + const withNemoTensorOwnership = (value, cacheOwnsTensors) => { + Object.defineProperty(value, NEMO_FEATURE_OUTPUT_OWNERSHIP, { + value: cacheOwnsTensors, + enumerable: false, + configurable: true, + }); + return value; + }; + const makeUnitPipe = (modelType = "nemo-conformer-tdt") => { const calls = []; const model = { @@ -134,19 +144,18 @@ export default () => { async transcribe(_inputs, options) { calls.push(options); const result = { text: "hello world" }; - if (options.return_timestamps) { - result.utterance_timestamp = [0, 0.08]; - result.utterance_confidence = 0.95; - result.confidence_scores = { token_avg: 0.95, word_avg: 0.94, overall_log_prob: -0.05 }; - if 
(options.return_words) { + if (options.returnTimestamps) { + result.utteranceTimestamp = [0, 0.08]; + result.confidence = { utterance: 0.95, wordAverage: 0.94, averageLogProb: -0.05 }; + if (options.returnWords) { result.words = [ - { text: "hello", start_time: 0, end_time: 0.04, confidence: 0.96 }, - { text: "world", start_time: 0.04, end_time: 0.08, confidence: 0.93 }, + { text: "hello", startTime: 0, endTime: 0.04, confidence: 0.96 }, + { text: "world", startTime: 0.04, endTime: 0.08, confidence: 0.93 }, ]; } } - if (options.return_metrics) { - result.metrics = { total_ms: 42, rtf: 0.01 }; + if (options.returnMetrics) { + result.metrics = { totalMs: 42, rtf: 0.01, rtfX: 100 }; } return result; }, @@ -185,9 +194,9 @@ export default () => { expect(output).toEqual({ text: "hello world" }); expect(calls).toHaveLength(1); expect(calls[0]).toMatchObject({ - return_timestamps: false, - return_words: false, - return_metrics: false, + returnTimestamps: false, + returnWords: false, + returnMetrics: false, }); }); @@ -202,9 +211,9 @@ export default () => { }); expect(calls).toHaveLength(1); expect(calls[0]).toMatchObject({ - return_timestamps: true, - return_words: true, - return_metrics: false, + returnTimestamps: true, + returnWords: true, + returnMetrics: false, }); }); @@ -220,43 +229,93 @@ export default () => { }); expect(calls).toHaveLength(1); expect(calls[0]).toMatchObject({ - return_timestamps: true, - return_words: true, - return_metrics: false, + returnTimestamps: true, + returnWords: true, + returnMetrics: false, + }); + }); + + it("builds conservative sentence chunks from Nemo word timestamps", async () => { + const model = { + config: { model_type: "nemo-conformer-tdt" }, + async transcribe() { + return { + text: "Hello. World again. U.S. Report update.", + utteranceTimestamp: [0, 2.4], + words: [ + { text: "Hello.", startTime: 0, endTime: 0.4 }, + { text: "World", startTime: 0.5, endTime: 0.8 }, + { text: "again.", startTime: 0.8, endTime: 1.1 }, + { text: "U.S.", startTime: 1.2, endTime: 1.5 }, + { text: "Report", startTime: 1.6, endTime: 2.0 }, + { text: "update.", startTime: 2.0, endTime: 2.4 }, + ], + }; + }, + async dispose() {}, + }; + const processor = Object.assign(async () => ({ input_features: {} }), { + feature_extractor: { config: { sampling_rate: 16000 } }, + }); + const pipe = new AutomaticSpeechRecognitionPipeline({ + task: PIPELINE_ID, + model, + tokenizer: {}, + processor, + }); + + const output = await pipe(new Float32Array(16000), { return_timestamps: true }); + expect(output).toEqual({ + text: "Hello. World again. U.S. Report update.", + chunks: [ + { text: "Hello.", timestamp: [0, 0.4] }, + { text: "World again.", timestamp: [0.5, 1.1] }, + { text: "U.S. Report update.", timestamp: [1.2, 2.4] }, + ], }); }); - it("merges overlapping windows when Nemo chunking is enabled", async () => { + it("uses explicit chunk_length_s as a bounded sentence window size override", async () => { const calls = []; + const outputsByOffset = new Map([ + [0, { + text: "Alpha. Beta. Carry", + words: [ + { text: "Alpha.", startTime: 0, endTime: 1 }, + { text: "Beta.", startTime: 17, endTime: 18 }, + { text: "Carry", startTime: 19.95, endTime: 20 }, + ], + }], + [19.95, { + text: "Carry on. Gamma", + words: [ + { text: "Carry", startTime: 19.95, endTime: 20 }, + { text: "on.", startTime: 20, endTime: 20.5 }, + { text: "Gamma", startTime: 37.9, endTime: 38 }, + ], + }], + [37.9, { + text: "Gamma. Tail resumes. 
Omega.", + words: [ + { text: "Gamma.", startTime: 37.9, endTime: 39 }, + { text: "Tail", startTime: 39.2, endTime: 39.6 }, + { text: "resumes.", startTime: 39.6, endTime: 40.1 }, + { text: "Omega.", startTime: 40.1, endTime: 40.45 }, + ], + }], + ]); const model = { config: { model_type: "nemo-conformer-tdt" }, async transcribe(_inputs, options) { calls.push(options); - if (options.timeOffset === 0) { - return { - text: "hello world again", - words: [ - { text: "hello", start_time: 0, end_time: 0.5 }, - { text: "world", start_time: 0.5, end_time: 1.1 }, - { text: "again", start_time: 1.2, end_time: 1.8 }, - ], - tokens: [ - { id: 1, token: "hello", raw_token: "hello", is_word_start: true, start_time: 0, end_time: 0.5 }, - { id: 2, token: "world", raw_token: "world", is_word_start: true, start_time: 0.5, end_time: 1.1 }, - { id: 3, token: "again", raw_token: "again", is_word_start: true, start_time: 1.2, end_time: 1.8 }, - ], - }; + const item = outputsByOffset.get(options.timeOffset); + if (!item) { + throw new Error(`Unexpected timeOffset ${options.timeOffset}`); } return { - text: "again today", - words: [ - { text: "again", start_time: 1.2, end_time: 1.8 }, - { text: "today", start_time: 1.8, end_time: 2.4 }, - ], - tokens: [ - { id: 3, token: "again", raw_token: "again", is_word_start: true, start_time: 1.2, end_time: 1.8 }, - { id: 4, token: "today", raw_token: "today", is_word_start: true, start_time: 1.8, end_time: 2.4 }, - ], + text: item.text, + utteranceTimestamp: [item.words[0].startTime, item.words[item.words.length - 1].endTime], + words: item.words, }; }, async dispose() {}, @@ -282,71 +341,228 @@ export default () => { processor, }); - const output = await pipe(new Float32Array(3 * 16000), { + const output = await pipe(new Float32Array(40.5 * 16000), { return_timestamps: "word", chunk_length_s: 2, - stride_length_s: 0.5, }); expect(output).toEqual({ - text: "hello world again today", + text: "Alpha. Beta. Carry on. Gamma. Tail resumes. Omega.", chunks: [ - { text: "hello", timestamp: [0, 0.5] }, - { text: "world", timestamp: [0.5, 1.1] }, - { text: "again", timestamp: [1.2, 1.8] }, - { text: "today", timestamp: [1.8, 2.4] }, + { text: "Alpha.", timestamp: [0, 1] }, + { text: "Beta.", timestamp: [17, 18] }, + { text: "Carry", timestamp: [19.95, 20] }, + { text: "on.", timestamp: [20, 20.5] }, + { text: "Gamma.", timestamp: [37.9, 39] }, + { text: "Tail", timestamp: [39.2, 39.6] }, + { text: "resumes.", timestamp: [39.6, 40.1] }, + { text: "Omega.", timestamp: [40.1, 40.45] }, ], }); - expect(calls).toHaveLength(2); + expect(calls).toHaveLength(3); + expect(calls.map((x) => x.timeOffset)).toEqual([0, 19.95, 37.9]); expect(calls[0]).toMatchObject({ - return_timestamps: true, - return_words: true, - return_tokens: true, - return_metrics: false, + returnTimestamps: true, + returnWords: true, + returnMetrics: false, timeOffset: 0, }); expect(calls[1]).toMatchObject({ + returnTimestamps: true, + returnWords: true, + returnMetrics: false, + timeOffset: 19.95, + }); + expect(calls[2]).toMatchObject({ + returnTimestamps: true, + returnWords: true, + returnMetrics: false, + timeOffset: 37.9, + }); + }); + + it("replaces boundary-truncated sentences with the longer retranscribed sentence", async () => { + const calls = []; + const outputsByOffset = new Map([ + [0, { + text: "Alpha. Beta. 
It won't run away, and it won't come to life.", + words: [ + { text: "Alpha.", startTime: 0, endTime: 1 }, + { text: "Beta.", startTime: 11, endTime: 12 }, + { text: "It", startTime: 17.2, endTime: 17.5 }, + { text: "won't", startTime: 17.5, endTime: 17.9 }, + { text: "run", startTime: 17.9, endTime: 18.2 }, + { text: "away,", startTime: 18.2, endTime: 18.6 }, + { text: "and", startTime: 18.6, endTime: 18.8 }, + { text: "it", startTime: 18.8, endTime: 19.0 }, + { text: "won't", startTime: 19.0, endTime: 19.3 }, + { text: "come", startTime: 19.3, endTime: 19.5 }, + { text: "to", startTime: 19.5, endTime: 19.65 }, + { text: "life.", startTime: 19.65, endTime: 19.8 }, + ], + }], + [17.2, { + text: "It won't run away, and it won't come to life until someone finds it. Omega.", + words: [ + { text: "It", startTime: 17.2, endTime: 17.5 }, + { text: "won't", startTime: 17.5, endTime: 17.9 }, + { text: "run", startTime: 17.9, endTime: 18.2 }, + { text: "away,", startTime: 18.2, endTime: 18.6 }, + { text: "and", startTime: 18.6, endTime: 18.8 }, + { text: "it", startTime: 18.8, endTime: 19.0 }, + { text: "won't", startTime: 19.0, endTime: 19.3 }, + { text: "come", startTime: 19.3, endTime: 19.5 }, + { text: "to", startTime: 19.5, endTime: 19.65 }, + { text: "life", startTime: 19.65, endTime: 19.95 }, + { text: "until", startTime: 19.95, endTime: 20.4 }, + { text: "someone", startTime: 20.4, endTime: 21.0 }, + { text: "finds", startTime: 21.0, endTime: 21.5 }, + { text: "it.", startTime: 21.5, endTime: 22.0 }, + { text: "Omega.", startTime: 28, endTime: 29 }, + ], + }], + ]); + const model = { + config: { model_type: "nemo-conformer-tdt" }, + async transcribe(_inputs, options) { + calls.push(options); + const item = outputsByOffset.get(options.timeOffset); + if (!item) { + throw new Error(`Unexpected timeOffset ${options.timeOffset}`); + } + return { + text: item.text, + utteranceTimestamp: [item.words[0].startTime, item.words[item.words.length - 1].endTime], + words: item.words, + }; + }, + async dispose() {}, + }; + const processor = Object.assign(async () => ({ input_features: {} }), { + feature_extractor: { config: { sampling_rate: 16000 } }, + }); + const pipe = new AutomaticSpeechRecognitionPipeline({ + task: PIPELINE_ID, + model, + tokenizer: {}, + processor, + }); + + const output = await pipe(new Float32Array(Math.ceil(31 * 16000)), { return_timestamps: true, - return_words: true, - return_tokens: true, - return_metrics: false, - timeOffset: 1, + chunk_length_s: 20, + }); + + expect(output).toEqual({ + text: "Alpha. Beta. It won't run away, and it won't come to life until someone finds it. Omega.", + chunks: [ + { text: "Alpha.", timestamp: [0, 1] }, + { text: "Beta.", timestamp: [11, 12] }, + { text: "It won't run away, and it won't come to life until someone finds it.", timestamp: [17.2, 22] }, + { text: "Omega.", timestamp: [28, 29] }, + ], + }); + expect(calls.map((x) => x.timeOffset)).toEqual([0, 17.2]); + }); + + it("retranscribes the dropped last sentence from its start without stale carry", async () => { + const calls = []; + const outputsByOffset = new Map([ + [0, { + text: "Alpha. The pressure gauge mark. 
He watched as the fruit", + words: [ + { text: "Alpha.", startTime: 0, endTime: 1 }, + { text: "The", startTime: 16.8, endTime: 17.0 }, + { text: "pressure", startTime: 17.0, endTime: 17.4 }, + { text: "gauge", startTime: 17.4, endTime: 17.76 }, + { text: "mark.", startTime: 17.76, endTime: 18.56 }, + { text: "He", startTime: 18.56, endTime: 18.72 }, + { text: "watched", startTime: 18.72, endTime: 18.96 }, + { text: "as", startTime: 18.96, endTime: 19.04 }, + { text: "the", startTime: 19.04, endTime: 19.2 }, + { text: "fruit", startTime: 19.2, endTime: 19.36 }, + ], + }], + [18.56, { + text: "He watched as the fluid.", + words: [ + { text: "He", startTime: 18.56, endTime: 18.72 }, + { text: "watched", startTime: 18.72, endTime: 19.12 }, + { text: "as", startTime: 19.12, endTime: 19.28 }, + { text: "the", startTime: 19.28, endTime: 19.36 }, + { text: "fluid.", startTime: 19.36, endTime: 20 }, + ], + }], + ]); + const model = { + config: { model_type: "nemo-conformer-tdt" }, + async transcribe(_inputs, options) { + calls.push(options); + const item = outputsByOffset.get(options.timeOffset); + if (!item) { + throw new Error(`Unexpected timeOffset ${options.timeOffset}`); + } + return { + text: item.text, + utteranceTimestamp: [item.words[0].startTime, item.words[item.words.length - 1].endTime], + words: item.words, + }; + }, + async dispose() {}, + }; + const processor = Object.assign(async () => ({ input_features: {} }), { + feature_extractor: { config: { sampling_rate: 16000 } }, + }); + const pipe = new AutomaticSpeechRecognitionPipeline({ + task: PIPELINE_ID, + model, + tokenizer: {}, + processor, + }); + + const output = await pipe(new Float32Array(Math.ceil(21 * 16000)), { + return_timestamps: "word", + chunk_length_s: 20, }); + + expect(output).toEqual({ + text: "Alpha. The pressure gauge mark. He watched as the fluid.", + chunks: [ + { text: "Alpha.", timestamp: [0, 1] }, + { text: "The", timestamp: [16.8, 17] }, + { text: "pressure", timestamp: [17, 17.4] }, + { text: "gauge", timestamp: [17.4, 17.76] }, + { text: "mark.", timestamp: [17.76, 18.56] }, + { text: "He", timestamp: [18.56, 18.72] }, + { text: "watched", timestamp: [18.72, 19.12] }, + { text: "as", timestamp: [19.12, 19.28] }, + { text: "the", timestamp: [19.28, 19.36] }, + { text: "fluid.", timestamp: [19.36, 20] }, + ], + }); + expect(calls.map((x) => x.timeOffset)).toEqual([0, 18.56]); }); it("reconstructs windowed Nemo text from merged words when token decode drops spaces", async () => { + const calls = []; const model = { config: { model_type: "nemo-conformer-tdt" }, async transcribe(_inputs, options) { + calls.push(options); if (options.timeOffset === 0) { return { text: "score. 
48-year-old", words: [ - { text: "score.", start_time: 0, end_time: 0.4 }, - { text: "48-year-old", start_time: 0.5, end_time: 1.3 }, - ], - tokens: [ - { id: 1, token: "score", raw_token: "▁score", is_word_start: true, start_time: 0, end_time: 0.3 }, - { id: 2, token: ".", raw_token: ".", is_word_start: false, start_time: 0.3, end_time: 0.4 }, - { id: 3, token: "48", raw_token: "48", is_word_start: false, start_time: 0.5, end_time: 0.8 }, - { id: 4, token: "-", raw_token: "-", is_word_start: false, start_time: 0.8, end_time: 0.85 }, - { id: 5, token: "year", raw_token: "year", is_word_start: false, start_time: 0.85, end_time: 1.05 }, - { id: 4, token: "-", raw_token: "-", is_word_start: false, start_time: 1.05, end_time: 1.1 }, - { id: 6, token: "old", raw_token: "old", is_word_start: false, start_time: 1.1, end_time: 1.3 }, + { text: "score.", startTime: 0, endTime: 0.4 }, + { text: "48-year-old", startTime: 0.5, endTime: 1.3 }, ], }; } return { text: "with 0.5", words: [ - { text: "with", start_time: 1.4, end_time: 1.7 }, - { text: "0.5", start_time: 1.8, end_time: 2.05 }, - ], - tokens: [ - { id: 7, token: "with", raw_token: "▁with", is_word_start: true, start_time: 1.4, end_time: 1.7 }, - { id: 8, token: "0", raw_token: "0", is_word_start: false, start_time: 1.8, end_time: 1.9 }, - { id: 2, token: ".", raw_token: ".", is_word_start: false, start_time: 1.9, end_time: 1.95 }, - { id: 9, token: "5", raw_token: "5", is_word_start: false, start_time: 1.95, end_time: 2.05 }, + { text: "with", startTime: 1.4, endTime: 1.7 }, + { text: "0.5", startTime: 1.8, endTime: 2.05 }, ], }; }, @@ -378,10 +594,9 @@ export default () => { processor, }); - const output = await pipe(new Float32Array(3 * 16000), { + const output = await pipe(new Float32Array(Math.ceil(20.1 * 16000)), { return_timestamps: "word", - chunk_length_s: 2, - stride_length_s: 0.5, + chunk_length_s: 20, }); expect(output.text).toBe("score. 48-year-old with 0.5"); @@ -391,39 +606,62 @@ export default () => { { text: "with", timestamp: [1.4, 1.7] }, { text: "0.5", timestamp: [1.8, 2.05] }, ]); + expect(calls.map((x) => x.timeOffset)).toEqual([0, 10]); }); - it("auto-window long Nemo audio with 90s chunks and 10s stride", async () => { + it("auto-windows long Nemo audio with 90s sentence windows", async () => { const calls = []; - const wordsByOffset = new Map([ - [0, { id: 1, text: "alpha", start: 0, end: 1 }], - [70, { id: 2, text: "beta", start: 85, end: 86 }], - [140, { id: 3, text: "gamma", start: 155, end: 156 }], - [210, { id: 4, text: "delta", start: 225, end: 226 }], + const outputsByOffset = new Map([ + [0, { + text: "Alpha. Beta. Gamma. Carry", + words: [ + { text: "Alpha.", startTime: 0, endTime: 1 }, + { text: "Beta.", startTime: 30, endTime: 31 }, + { text: "Gamma.", startTime: 69, endTime: 70 }, + { text: "Carry", startTime: 84, endTime: 85 }, + ], + }], + [84, { + text: "Carry on. Delta. Epsilon. Tail", + words: [ + { text: "Carry", startTime: 84, endTime: 85 }, + { text: "on.", startTime: 86, endTime: 87 }, + { text: "Delta.", startTime: 110, endTime: 111 }, + { text: "Epsilon.", startTime: 139, endTime: 140 }, + { text: "Tail", startTime: 154, endTime: 155 }, + ], + }], + [154, { + text: "Tail resumes. Zeta. Eta. 
Final", + words: [ + { text: "Tail", startTime: 154, endTime: 155 }, + { text: "resumes.", startTime: 156, endTime: 157 }, + { text: "Zeta.", startTime: 180, endTime: 181 }, + { text: "Eta.", startTime: 209, endTime: 210 }, + { text: "Final", startTime: 224, endTime: 225 }, + ], + }], + [224, { + text: "Final line. Omega.", + words: [ + { text: "Final", startTime: 224, endTime: 225 }, + { text: "line.", startTime: 226, endTime: 227 }, + { text: "Omega.", startTime: 250, endTime: 251 }, + ], + }], ]); const model = { config: { model_type: "nemo-conformer-tdt" }, async transcribe(_inputs, options) { calls.push(options); - const item = wordsByOffset.get(options.timeOffset); + const item = outputsByOffset.get(options.timeOffset); if (!item) { throw new Error(`Unexpected timeOffset ${options.timeOffset}`); } return { text: item.text, - words: [ - { text: item.text, start_time: item.start, end_time: item.end }, - ], - tokens: [ - { - id: item.id, - token: item.text, - raw_token: item.text, - is_word_start: true, - start_time: item.start, - end_time: item.end, - }, - ], + utteranceTimestamp: [item.words[0].startTime, item.words[item.words.length - 1].endTime], + words: item.words, }; }, async dispose() {}, @@ -452,26 +690,123 @@ export default () => { const output = await pipe(new Float32Array(300 * 16000), { return_timestamps: "word" }); expect(output).toEqual({ - text: "alpha beta gamma delta", + text: "Alpha. Beta. Gamma. Carry on. Delta. Epsilon. Tail resumes. Zeta. Eta. Final line. Omega.", chunks: [ - { text: "alpha", timestamp: [0, 1] }, - { text: "beta", timestamp: [85, 86] }, - { text: "gamma", timestamp: [155, 156] }, - { text: "delta", timestamp: [225, 226] }, + { text: "Alpha.", timestamp: [0, 1] }, + { text: "Beta.", timestamp: [30, 31] }, + { text: "Gamma.", timestamp: [69, 70] }, + { text: "Carry", timestamp: [84, 85] }, + { text: "on.", timestamp: [86, 87] }, + { text: "Delta.", timestamp: [110, 111] }, + { text: "Epsilon.", timestamp: [139, 140] }, + { text: "Tail", timestamp: [154, 155] }, + { text: "resumes.", timestamp: [156, 157] }, + { text: "Zeta.", timestamp: [180, 181] }, + { text: "Eta.", timestamp: [209, 210] }, + { text: "Final", timestamp: [224, 225] }, + { text: "line.", timestamp: [226, 227] }, + { text: "Omega.", timestamp: [250, 251] }, ], }); expect(calls).toHaveLength(4); - expect(calls.map((x) => x.timeOffset)).toEqual([0, 70, 140, 210]); + expect(calls.map((x) => x.timeOffset)).toEqual([0, 84, 154, 224]); for (const call of calls) { expect(call).toMatchObject({ - return_timestamps: true, - return_words: true, - return_tokens: true, - return_metrics: false, + returnTimestamps: true, + returnWords: true, + returnMetrics: false, }); } }); + it("returns sentence chunks for auto-windowed long Nemo audio", async () => { + const calls = []; + const outputsByOffset = new Map([ + [0, { + text: "Alpha. Beta. Gamma. Carry", + words: [ + { text: "Alpha.", startTime: 0, endTime: 1 }, + { text: "Beta.", startTime: 30, endTime: 31 }, + { text: "Gamma.", startTime: 69, endTime: 70 }, + { text: "Carry", startTime: 84, endTime: 85 }, + ], + }], + [84, { + text: "Carry on. Delta. Epsilon. Tail", + words: [ + { text: "Carry", startTime: 84, endTime: 85 }, + { text: "on.", startTime: 86, endTime: 87 }, + { text: "Delta.", startTime: 110, endTime: 111 }, + { text: "Epsilon.", startTime: 139, endTime: 140 }, + { text: "Tail", startTime: 154, endTime: 155 }, + ], + }], + [154, { + text: "Tail resumes. Zeta. Eta. 
Final", + words: [ + { text: "Tail", startTime: 154, endTime: 155 }, + { text: "resumes.", startTime: 156, endTime: 157 }, + { text: "Zeta.", startTime: 180, endTime: 181 }, + { text: "Eta.", startTime: 209, endTime: 210 }, + { text: "Final", startTime: 224, endTime: 225 }, + ], + }], + [224, { + text: "Final line. Omega.", + words: [ + { text: "Final", startTime: 224, endTime: 225 }, + { text: "line.", startTime: 226, endTime: 227 }, + { text: "Omega.", startTime: 250, endTime: 251 }, + ], + }], + ]); + const model = { + config: { model_type: "nemo-conformer-tdt" }, + async transcribe(_inputs, options) { + calls.push(options); + const item = outputsByOffset.get(options.timeOffset); + if (!item) { + throw new Error(`Unexpected timeOffset ${options.timeOffset}`); + } + return { + text: item.text, + utteranceTimestamp: [item.words[0].startTime, item.words[item.words.length - 1].endTime], + words: item.words, + }; + }, + async dispose() {}, + }; + const processor = Object.assign(async () => ({ input_features: {} }), { + feature_extractor: { config: { sampling_rate: 16000 } }, + }); + const pipe = new AutomaticSpeechRecognitionPipeline({ + task: PIPELINE_ID, + model, + tokenizer: {}, + processor, + }); + + const output = await pipe(new Float32Array(300 * 16000), { return_timestamps: true }); + + expect(output).toEqual({ + text: "Alpha. Beta. Gamma. Carry on. Delta. Epsilon. Tail resumes. Zeta. Eta. Final line. Omega.", + chunks: [ + { text: "Alpha.", timestamp: [0, 1] }, + { text: "Beta.", timestamp: [30, 31] }, + { text: "Gamma.", timestamp: [69, 70] }, + { text: "Carry on.", timestamp: [84, 87] }, + { text: "Delta.", timestamp: [110, 111] }, + { text: "Epsilon.", timestamp: [139, 140] }, + { text: "Tail resumes.", timestamp: [154, 157] }, + { text: "Zeta.", timestamp: [180, 181] }, + { text: "Eta.", timestamp: [209, 210] }, + { text: "Final line.", timestamp: [224, 227] }, + { text: "Omega.", timestamp: [250, 251] }, + ], + }); + expect(calls.map((x) => x.timeOffset)).toEqual([0, 84, 154, 224]); + }); + it("rejects non-finite audio samples before Nemo decoding", async () => { const { pipe } = makeUnitPipe(); await expect(pipe(Float32Array.from([0, Number.NaN, 0]), { return_timestamps: false })).rejects.toThrow( @@ -500,7 +835,7 @@ export default () => { }; trackDispose(input_features); trackDispose(attention_mask); - return { input_features, attention_mask }; + return withNemoTensorOwnership({ input_features, attention_mask }, false); }, { feature_extractor: { config: { sampling_rate: 16000 } }, }); @@ -538,13 +873,10 @@ export default () => { }; trackDispose(input_features); trackDispose(attention_mask); - lastInputs = { input_features, attention_mask }; + lastInputs = withNemoTensorOwnership({ input_features, attention_mask }, true); return lastInputs; }, { - feature_extractor: { - config: { sampling_rate: 16000 }, - feature_cache: { max_entries: 2, max_size_mb: 8 }, - }, + feature_extractor: { config: { sampling_rate: 16000 } }, }); const pipe = new AutomaticSpeechRecognitionPipeline({ task: PIPELINE_ID, @@ -584,12 +916,9 @@ export default () => { }; trackDispose(input_features); trackDispose(attention_mask); - return { input_features, attention_mask }; + return withNemoTensorOwnership({ input_features, attention_mask }, false); }, { - feature_extractor: { - config: { sampling_rate: 16000 }, - feature_cache: { max_entries: 0, max_size_mb: 8 }, - }, + feature_extractor: { config: { sampling_rate: 16000 } }, }); const pipe = new AutomaticSpeechRecognitionPipeline({ task: PIPELINE_ID, From 
00b3d9346b34d081cc8636751aa3ccdf1ca334cc Mon Sep 17 00:00:00 2001
From: ysdede
Date: Sun, 8 Mar 2026 23:49:45 +0300
Subject: [PATCH 35/40] fix(nemo): scope ASR tests and address review fixes

Keep the shared ASR pipeline suite focused on the public Nemo contract and
move adapter-specific windowing, retranscription, cache-ownership, and
disposal coverage into a dedicated Nemo pipeline test file. Narrow the
source diff by removing explanatory Nemo comments and reverting unrelated
upstream-only tweaks, while also fixing the review findings around cursor
snap-forward merging, tokenizer vocab-shape handling, empty timestamp
validation, and cache borrow/release semantics for active inference.

Verification:
- node --experimental-vm-modules --expose-gc node_modules/jest/bin/jest.js --config jest.config.mjs --runInBand tests/models.test.js -t "nemo_conformer_tdt"
- node --experimental-vm-modules --expose-gc node_modules/jest/bin/jest.js --config jest.config.mjs --runInBand tests/pipelines.test.js -t "Nemo Conformer TDT pipeline adapter|Automatic Speech Recognition"
---
 .../transformers/src/models/modeling_utils.js |   5 +-
 .../feature_extraction_nemo_conformer_tdt.js  |  24 +-
 .../modeling_nemo_conformer_tdt.js            |   6 +-
 .../pipeline_nemo_conformer_tdt.js            |  58 +-
 .../nemo_conformer_tdt/transducer_cache.js    |  87 +-
 .../transducer_word_offsets.js                |  29 +-
 packages/transformers/src/models/registry.js  |   2 -
 .../pipelines/automatic-speech-recognition.js |   7 -
 .../test_modeling_nemo_conformer_tdt.js       | 181 +++--
 ..._pipelines_automatic_speech_recognition.js | 765 +-----------------
 .../test_pipelines_nemo_conformer_tdt.js      | 732 +++++++++++++++++
 11 files changed, 1020 insertions(+), 876 deletions(-)
 create mode 100644 packages/transformers/tests/pipelines/test_pipelines_nemo_conformer_tdt.js

diff --git a/packages/transformers/src/models/modeling_utils.js b/packages/transformers/src/models/modeling_utils.js
index cc6194134..475de6be0 100644
--- a/packages/transformers/src/models/modeling_utils.js
+++ b/packages/transformers/src/models/modeling_utils.js
@@ -877,9 +877,8 @@ export class PreTrainedModel extends Callable {
     if (input_name in model_inputs) {
       if (inputs) {
         throw new Error(
-          '`inputs` was passed alongside ' +
-            `\`${input_name}\` which is not allowed. ` +
-            `Make sure to either pass \`inputs\` or \`${input_name}\`=...`,
+          `\`inputs\` was passed alongside \`${input_name}\` which is not allowed. ` +
+            `Make sure to either pass \`inputs\` or \`${input_name}\`=...`,
         );
       }
     } else {
diff --git a/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js
index 7df9afba6..bca3bc021 100644
--- a/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js
+++ b/packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js
@@ -7,13 +7,21 @@ import { computeTemporalDeltas } from './transducer_deltas.js';
 const EPSILON = 1e-5;
 
 export const NEMO_FEATURE_OUTPUT_OWNERSHIP = Symbol('NemoConformerTDTFeatureOutputOwnership');
+export const NEMO_FEATURE_OUTPUT_RELEASE = Symbol('NemoConformerTDTFeatureOutputRelease');
 
-function tagNemoFeatureOutputOwnership(value, cacheOwnsTensors) {
+function tagNemoFeatureOutputOwnership(value, cacheOwnsTensors, release = null) {
   Object.defineProperty(value, NEMO_FEATURE_OUTPUT_OWNERSHIP, {
     value: cacheOwnsTensors,
     enumerable: false,
     configurable: true,
   });
+  if (release) {
+    Object.defineProperty(value, NEMO_FEATURE_OUTPUT_RELEASE, {
+      value: release,
+      enumerable: false,
+      configurable: true,
+    });
+  }
   return value;
 }
 
@@ -152,14 +160,22 @@ export class NemoConformerTDTFeatureExtractor extends FeatureExtractor {
     if (this.feature_cache) {
       const key = `${createAudioCacheKey(audio, this.config.sampling_rate)}:${this.delta_order}:${this.delta_window}:${this.delta_concatenate}`;
-      const cached = this.feature_cache.get(key);
+      const cached = this.feature_cache.acquire(key);
       if (cached) {
-        return tagNemoFeatureOutputOwnership({ ...cached }, true);
+        return tagNemoFeatureOutputOwnership({ ...cached.value }, true, cached.release);
       }
 
       const extracted = await this._extract(audio);
       const cacheOwnsTensors = this.feature_cache.set(key, extracted);
-      return tagNemoFeatureOutputOwnership({ ...extracted }, cacheOwnsTensors);
+      if (!cacheOwnsTensors) {
+        return tagNemoFeatureOutputOwnership({ ...extracted }, false);
+      }
+
+      const borrowed = this.feature_cache.acquire(key);
+      if (!borrowed) {
+        return tagNemoFeatureOutputOwnership({ ...extracted }, false);
+      }
+      return tagNemoFeatureOutputOwnership({ ...borrowed.value }, true, borrowed.release);
     }
 
     return tagNemoFeatureOutputOwnership(await this._extract(audio), false);
 
diff --git a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js
index 0b129e5be..3f664d57a 100644
--- a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js
+++ b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js
@@ -945,10 +945,8 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel {
   }
 }
 
-// Register with ModelRegistry so get_model_files / progress_callback enumerate
-// the correct ONNX files: encoder_model + decoder_model_merged.
-MODEL_TYPE_MAPPING.set('nemo-conformer-tdt', MODEL_TYPES.NemoConformerTDT); // model_type key -MODEL_TYPE_MAPPING.set('NemoConformerForTDT', MODEL_TYPES.NemoConformerTDT); // architecture key +MODEL_TYPE_MAPPING.set('nemo-conformer-tdt', MODEL_TYPES.NemoConformerTDT); +MODEL_TYPE_MAPPING.set('NemoConformerForTDT', MODEL_TYPES.NemoConformerTDT); MODEL_NAME_TO_CLASS_MAPPING.set('NemoConformerTDTPreTrainedModel', NemoConformerTDTPreTrainedModel); MODEL_NAME_TO_CLASS_MAPPING.set('NemoConformerForTDT', NemoConformerForTDT); MODEL_CLASS_TO_NAME_MAPPING.set(NemoConformerTDTPreTrainedModel, 'NemoConformerTDTPreTrainedModel'); diff --git a/packages/transformers/src/models/nemo_conformer_tdt/pipeline_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/pipeline_nemo_conformer_tdt.js index a2b00abb4..83b758b7b 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/pipeline_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/pipeline_nemo_conformer_tdt.js @@ -1,14 +1,12 @@ import { Tensor } from '../../utils/tensor.js'; -import { NEMO_FEATURE_OUTPUT_OWNERSHIP } from './feature_extraction_nemo_conformer_tdt.js'; +import { NEMO_FEATURE_OUTPUT_OWNERSHIP, NEMO_FEATURE_OUTPUT_RELEASE } from './feature_extraction_nemo_conformer_tdt.js'; import { buildWordChunks, buildNemoSegmentChunks, joinTimedWords, partitionNemoWordsIntoSegments, } from './transducer_segment_offsets.js'; -import { - dedupeMergedWords, -} from './transducer_window_merge.js'; +import { dedupeMergedWords } from './transducer_window_merge.js'; const NEMO_AUTO_WINDOW_THRESHOLD_S = 180; const NEMO_MIN_CHUNK_LENGTH_S = 20; @@ -49,6 +47,13 @@ function disposeNemoPipelineInputs(inputs) { } } +function releaseNemoPipelineInputs(inputs) { + const release = inputs?.[NEMO_FEATURE_OUTPUT_RELEASE]; + if (typeof release === 'function') { + release(); + } +} + function normalizeNemoChunkLengthS(value) { const num = Number(value); if (!Number.isFinite(num) || num <= 0) { @@ -85,7 +90,7 @@ function normalizeNemoSegmentText(text) { return String(text ?? 
'') .normalize('NFKC') .replace(/[“”]/g, '"') - .replace(/[‘’]/g, '\'') + .replace(/[‘’]/g, "'") .replace(/\s+/g, ' ') .trim() .toLowerCase(); @@ -97,9 +102,10 @@ function isDuplicateFinalizedNemoSegment(finalizedSegments, segment) { return false; } - return finalizedSegments.some((candidate) => - normalizeNemoSegmentText(candidate.text) === normalized && - Math.abs(candidate.timestamp[1] - segment.timestamp[1]) < NEMO_SEGMENT_DEDUP_TOLERANCE_S, + return finalizedSegments.some( + (candidate) => + normalizeNemoSegmentText(candidate.text) === normalized && + Math.abs(candidate.timestamp[1] - segment.timestamp[1]) < NEMO_SEGMENT_DEDUP_TOLERANCE_S, ); } @@ -139,13 +145,7 @@ function relocateNemoCursorToNearbyGap(target_s, words) { return best; } -async function runNemoAutoSentenceWindowing({ - audio, - sampling_rate, - chunk_length_s, - tokenizer, - runNemoTranscribe, -}) { +async function runNemoAutoSentenceWindowing({ audio, sampling_rate, chunk_length_s, tokenizer, runNemoTranscribe }) { const audio_duration_s = audio.length / sampling_rate; const fallback_overlap_s = Math.min(NEMO_AUTO_WINDOW_FALLBACK_OVERLAP_S, Math.max(0, chunk_length_s - 1)); const fallback_advance_s = Math.max(1, chunk_length_s - fallback_overlap_s); @@ -159,7 +159,11 @@ async function runNemoAutoSentenceWindowing({ let start_s = 0; let shouldMergePending = false; - for (let windowIndex = 0; windowIndex < maxWindows && start_s < audio_duration_s - NEMO_AUTO_WINDOW_EPSILON_S; ++windowIndex) { + for ( + let windowIndex = 0; + windowIndex < maxWindows && start_s < audio_duration_s - NEMO_AUTO_WINDOW_EPSILON_S; + ++windowIndex + ) { const end_s = Math.min(audio_duration_s, start_s + chunk_length_s); const start_sample = Math.max(0, Math.min(audio.length - 1, Math.floor(start_s * sampling_rate))); const end_sample = Math.max(start_sample + 1, Math.min(audio.length, Math.ceil(end_s * sampling_rate))); @@ -199,12 +203,11 @@ async function runNemoAutoSentenceWindowing({ } pendingWords = dedupeMergedWords(pendingSegment.words); - shouldMergePending = false; - const next_start_s = Math.min( audio_duration_s, relocateNemoCursorToNearbyGap(pendingStart_s, windowWords), ); + shouldMergePending = next_start_s > pendingStart_s + NEMO_AUTO_WINDOW_EPSILON_S; if (next_start_s > start_s + NEMO_AUTO_WINDOW_EPSILON_S) { start_s = next_start_s; continue; @@ -250,14 +253,7 @@ async function runNemoAutoSentenceWindowing({ * prepareAudios: (audio: any[], sampling_rate: number) => Promise<(Float32Array|Float64Array)[]>, * }} options */ -export async function runNemoConformerTDTPipeline({ - model, - processor, - tokenizer, - audio, - kwargs, - prepareAudios, -}) { +export async function runNemoConformerTDTPipeline({ model, processor, tokenizer, audio, kwargs, prepareAudios }) { if (typeof model?.transcribe !== 'function') { throw new Error('Nemo Conformer TDT model does not expose a `transcribe` method.'); } @@ -292,7 +288,9 @@ export async function runNemoConformerTDTPipeline({ try { return await model.transcribe(inputs, decodeOptions); } finally { - if (!cacheOwnsTensors) { + if (cacheOwnsTensors) { + releaseNemoPipelineInputs(inputs); + } else { disposeNemoPipelineInputs(inputs); } } @@ -303,11 +301,7 @@ export async function runNemoConformerTDTPipeline({ const audio_duration_s = aud.length / sampling_rate; const autoWindowing = requested_chunk_length_s <= 0 && audio_duration_s > NEMO_AUTO_WINDOW_THRESHOLD_S; const chunk_length_s = - requested_chunk_length_s > 0 - ? requested_chunk_length_s - : autoWindowing - ? 
NEMO_AUTO_CHUNK_LENGTH_S - : 0; + requested_chunk_length_s > 0 ? requested_chunk_length_s : autoWindowing ? NEMO_AUTO_CHUNK_LENGTH_S : 0; const useSentenceWindowing = chunk_length_s > 0; if (useSentenceWindowing) { diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js index a9979356f..d7b8c6939 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js @@ -50,13 +50,31 @@ export class FeatureLRUCache { * @returns {any|null} */ get(key) { - const entry = this.cache.get(key); + const entry = this._touch(key); if (!entry) return null; - this.cache.delete(key); - this.cache.set(key, entry); return entry.value; } + /** + * @param {string} key + * @returns {{ value: any, release: () => void } | null} + */ + acquire(key) { + const entry = this._touch(key); + if (!entry) return null; + + entry.borrowers += 1; + let released = false; + return { + value: entry.value, + release: () => { + if (released) return; + released = true; + this._releaseEntry(entry); + }, + }; + } + /** * @param {string} key * @param {any} value @@ -75,12 +93,12 @@ export class FeatureLRUCache { const existing = this.cache.get(key); if (existing?.value === value) { // Refresh recency for unchanged value without invalidating caller-owned references. - this.cache.delete(key); if (existing.size_bytes <= max_bytes) { + this.cache.delete(key); this.cache.set(key, existing); return true; } else { - this.current_size_bytes -= existing.size_bytes; + this._deleteEntry(key, existing); return false; } } @@ -89,31 +107,30 @@ export class FeatureLRUCache { if (size_bytes > max_bytes) { // Cannot fit in cache: keep caller ownership and skip caching. if (existing) { - disposeCachedValue(existing.value); - this.current_size_bytes -= existing.size_bytes; - this.cache.delete(key); + this._deleteEntry(key, existing); } return false; } if (existing) { - disposeCachedValue(existing.value); - this.current_size_bytes -= existing.size_bytes; - this.cache.delete(key); + this._deleteEntry(key, existing); } - this.cache.set(key, { value, size_bytes }); + this.cache.set(key, { + value, + size_bytes, + borrowers: 0, + pendingDispose: false, + }); this.current_size_bytes += size_bytes; this._evict(); return this.cache.get(key)?.value === value; } clear() { - for (const { value } of this.cache.values()) { - disposeCachedValue(value); + for (const [key, entry] of Array.from(this.cache.entries())) { + this._deleteEntry(key, entry); } - this.cache.clear(); - this.current_size_bytes = 0; } stats() { @@ -131,9 +148,41 @@ export class FeatureLRUCache { const oldest_key = this.cache.keys().next().value; if (oldest_key === undefined) break; const oldest = this.cache.get(oldest_key); - this.cache.delete(oldest_key); - disposeCachedValue(oldest?.value); - this.current_size_bytes -= oldest?.size_bytes ?? 
0; + if (!oldest) break; + this._deleteEntry(oldest_key, oldest); + } + } + + _touch(key) { + const entry = this.cache.get(key); + if (!entry) return null; + this.cache.delete(key); + this.cache.set(key, entry); + return entry; + } + + _deleteEntry(key, entry) { + const current = this.cache.get(key); + if (current !== entry) { + return; + } + + this.cache.delete(key); + this.current_size_bytes -= entry.size_bytes; + if (entry.borrowers > 0) { + entry.pendingDispose = true; + } else { + disposeCachedValue(entry.value); + } + } + + _releaseEntry(entry) { + if (entry.borrowers > 0) { + entry.borrowers -= 1; + } + if (entry.borrowers === 0 && entry.pendingDispose) { + entry.pendingDispose = false; + disposeCachedValue(entry.value); } } } diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_word_offsets.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_word_offsets.js index 9422b3270..d1b5d8d1a 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/transducer_word_offsets.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_word_offsets.js @@ -15,10 +15,23 @@ function getIdToTokenMap(tokenizer) { cached = new Map(); if (tokenizer?.get_vocab) { const vocab = tokenizer.get_vocab(); - const entries = vocab instanceof Map ? vocab.entries() : Object.entries(vocab); - for (const [token, id] of entries) { - if (Number.isInteger(id)) { - cached.set(id, token); + if (Array.isArray(vocab)) { + for (let id = 0; id < vocab.length; ++id) { + if (typeof vocab[id] === 'string') { + cached.set(id, vocab[id]); + } + } + } else if (vocab instanceof Map) { + for (const [token, id] of vocab.entries()) { + if (Number.isInteger(id)) { + cached.set(id, token); + } + } + } else if (vocab && typeof vocab === 'object') { + for (const [token, id] of Object.entries(vocab)) { + if (Number.isInteger(id)) { + cached.set(id, token); + } } } } @@ -133,7 +146,7 @@ export function buildTransducerWordOffsets( token_confidences = null, fullText = '', ) { - if (!tokenizer || token_ids.length === 0 || token_timestamps.length === 0) { + if (!tokenizer) { return { words: [], tokens: [], wordAverage: null }; } if (token_ids.length !== token_timestamps.length) { @@ -146,6 +159,9 @@ export function buildTransducerWordOffsets( `buildTransducerWordOffsets expects token_confidences length (${token_confidences.length}) to match token_ids length (${token_ids.length}).`, ); } + if (token_ids.length === 0) { + return { words: [], tokens: [], wordAverage: null }; + } /** @type {Array<{ id: number, token: string, rawToken: string, isWordStart: boolean, startTime: number, endTime: number, confidence?: number }>} */ const tokens = []; @@ -206,7 +222,8 @@ export function buildTransducerWordOffsets( if (words.some((x) => x.confidence != null)) { const validConfidences = words.map((x) => x.confidence).filter((x) => x != null); if (validConfidences.length > 0) { - wordAverage = Math.round((validConfidences.reduce((a, b) => a + b, 0) / validConfidences.length) * 1e6) / 1e6; + wordAverage = + Math.round((validConfidences.reduce((a, b) => a + b, 0) / validConfidences.length) * 1e6) / 1e6; } } diff --git a/packages/transformers/src/models/registry.js b/packages/transformers/src/models/registry.js index fc80c7707..a5a7d4582 100644 --- a/packages/transformers/src/models/registry.js +++ b/packages/transformers/src/models/registry.js @@ -581,8 +581,6 @@ const CUSTOM_MAPPING = [ ], ['SupertonicForConditionalGeneration', ALL_MODEL_FILES.SupertonicForConditionalGeneration, 
MODEL_TYPES.Supertonic], ['ChatterboxModel', ALL_MODEL_FILES.ChatterboxModel, MODEL_TYPES.Chatterbox], - // Keep AutoModel lookup in MODEL_MAPPING_NAMES_ENCODER_ONLY while forcing the - // correct runtime model type for two-artifact Nemo Conformer TDT loading. ['NemoConformerForTDT', ALL_MODEL_FILES.NemoConformerForTDT, MODEL_TYPES.NemoConformerTDT], ]; for (const [name, model, type] of CUSTOM_MAPPING) { diff --git a/packages/transformers/src/pipelines/automatic-speech-recognition.js b/packages/transformers/src/pipelines/automatic-speech-recognition.js index a81b779c4..addd4ee10 100644 --- a/packages/transformers/src/pipelines/automatic-speech-recognition.js +++ b/packages/transformers/src/pipelines/automatic-speech-recognition.js @@ -305,13 +305,6 @@ export class AutomaticSpeechRecognitionPipeline return single ? toReturn[0] : toReturn; } - /** - * Nemo Conformer TDT ASR pipeline. - * - * Keeps the pipeline surface aligned with the shared ASR task contract: - * `{ text }` by default and `{ text, chunks }` when timestamps are requested. - * Rich Nemo-specific outputs remain available on direct `model.transcribe()`. - */ async _call_nemo_conformer_tdt(audio, kwargs) { return runNemoConformerTDTPipeline({ model: this.model, diff --git a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js index b0faa6b81..01f8f3d98 100644 --- a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js +++ b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js @@ -1,12 +1,9 @@ import { NemoConformerForTDT, Tensor } from "../../../src/transformers.js"; import { createAudioCacheKey, FeatureLRUCache } from "../../../src/models/nemo_conformer_tdt/transducer_cache.js"; import { computeTemporalDeltas } from "../../../src/models/nemo_conformer_tdt/transducer_deltas.js"; -import { - buildNemoSegmentChunks, - partitionNemoWordsIntoSegments, - shouldEndSentenceAfterWord, -} from "../../../src/models/nemo_conformer_tdt/transducer_segment_offsets.js"; +import { buildNemoSegmentChunks, partitionNemoWordsIntoSegments, shouldEndSentenceAfterWord } from "../../../src/models/nemo_conformer_tdt/transducer_segment_offsets.js"; import { buildTransducerDetailedOutputs } from "../../../src/models/nemo_conformer_tdt/transducer_text.js"; +import { buildTransducerWordOffsets } from "../../../src/models/nemo_conformer_tdt/transducer_word_offsets.js"; import { MODEL_TYPE_MAPPING, MODEL_TYPES } from "../../../src/models/modeling_utils.js"; import { get_model_files } from "../../../src/utils/model_registry/get_model_files.js"; @@ -138,7 +135,12 @@ export default () => { model.transcribe(inputs, { tokenizer: { decode: () => "", - get_vocab: () => new Map([["a", 0], ["b", 1], ["c", 2]]), + get_vocab: () => + new Map([ + ["a", 0], + ["b", 1], + ["c", 2], + ]), }, }), ).rejects.toThrow("blank_token_id"); @@ -168,7 +170,12 @@ export default () => { model.transcribe(inputs, { tokenizer: { decode: () => "", - get_vocab: () => new Map([["a", 0], ["b", 1], ["c", 2]]), + get_vocab: () => + new Map([ + ["a", 0], + ["b", 1], + ["c", 2], + ]), }, }), ).rejects.toThrow("duration_start_index"); @@ -326,10 +333,7 @@ export default () => { expect(output.confidence.frames).toHaveLength(2); expect(output.confidence.frames[0]).toBeCloseTo(0.9579343795, 6); - expect(output.confidence.frameAverage).toBeCloseTo( - (output.confidence.frames[0] + 
output.confidence.frames[1]) / 2, - 6, - ); + expect(output.confidence.frameAverage).toBeCloseTo((output.confidence.frames[0] + output.confidence.frames[1]) / 2, 6); }, MAX_TEST_EXECUTION_TIME, ); @@ -464,9 +468,7 @@ export default () => { input_features: new Tensor("float32", new Float32Array([0, 0, 0, 0, 0, 0]), [1, 3, 2]), }; - await expect(model.transcribe(inputs, { tokenizer: { decode: () => "" } })).rejects.toThrow( - 'encoder output "encoder_out" was not returned', - ); + await expect(model.transcribe(inputs, { tokenizer: { decode: () => "" } })).rejects.toThrow('encoder output "encoder_out" was not returned'); }, MAX_TEST_EXECUTION_TIME, ); @@ -496,9 +498,7 @@ export default () => { input_features: new Tensor("float32", new Float32Array([0, 0, 0, 0, 0, 0]), [1, 3, 2]), }; - await expect(model.transcribe(inputs, { tokenizer: { decode: () => "" } })).rejects.toThrow( - 'decoder output "outputs" was not returned', - ); + await expect(model.transcribe(inputs, { tokenizer: { decode: () => "" } })).rejects.toThrow('decoder output "outputs" was not returned'); }, MAX_TEST_EXECUTION_TIME, ); @@ -525,9 +525,7 @@ export default () => { input_features: new Tensor("float32", new Float32Array([0, 0, 0, 0, 0, 0]), [1, 3, 2]), }; - await expect(model.transcribe(inputs, { tokenizer: { decode: () => "" } })).rejects.toThrow( - 'decoder state outputs "output_states_1" and "output_states_2" were not returned', - ); + await expect(model.transcribe(inputs, { tokenizer: { decode: () => "" } })).rejects.toThrow('decoder state outputs "output_states_1" and "output_states_2" were not returned'); }, MAX_TEST_EXECUTION_TIME, ); @@ -676,13 +674,7 @@ export default () => { ]; const segments = partitionNemoWordsIntoSegments(words); - expect(segments.map((x) => x.text)).toEqual([ - "Hello.", - "World again.", - "U.S. Report update.", - "pause", - "Next sentence.", - ]); + expect(segments.map((x) => x.text)).toEqual(["Hello.", "World again.", "U.S. Report update.", "pause", "Next sentence."]); expect(segments.map((x) => x.timestamp)).toEqual([ [0, 0.4], [0.5, 1.1], @@ -743,25 +735,84 @@ export default () => { ], ); - expect(output.words.map((x) => x.text)).toEqual([ - "score.", - "48-year-old", - "with", - "0.5", - ]); - expect(output.tokens.map((x) => x.token)).toEqual([ - "score", - ".", - "48", - "-", - "year", - "-", - "old", - "with", - "0", - ".", - "5", - ]); + expect(output.words.map((x) => x.text)).toEqual(["score.", "48-year-old", "with", "0.5"]); + expect(output.tokens.map((x) => x.token)).toEqual(["score", ".", "48", "-", "year", "-", "old", "with", "0", ".", "5"]); + }); + + it("builds word offsets from array-backed tokenizer vocabularies", () => { + const vocab = ["", "▁hello", "▁world"]; + const tokenizer = { + get_vocab() { + return vocab; + }, + decode(ids) { + const pieces = ids.map((id) => vocab[id] ?? 
"").join(""); + return pieces.replace(/▁/g, "").trim(); + }, + }; + + const output = buildTransducerWordOffsets( + tokenizer, + [1, 2], + [ + [0.0, 0.3], + [0.3, 0.6], + ], + null, + "hello world", + ); + + expect(output.words.map((x) => x.text)).toEqual(["hello", "world"]); + expect(output.tokens.map((x) => x.rawToken)).toEqual(["▁hello", "▁world"]); + expect(output.tokens.map((x) => x.isWordStart)).toEqual([true, true]); + }); + + it("falls back to decoded token text when tokenizer vocab metadata is unavailable", () => { + const token_ids = [1, 2]; + const timestamps = [ + [0.0, 0.3], + [0.3, 0.6], + ]; + + const fromNull = buildTransducerWordOffsets( + { + get_vocab: () => null, + decode(ids) { + return ids[0] === 1 ? " hello" : "world"; + }, + }, + token_ids, + timestamps, + null, + "hello world", + ); + const fromPrimitive = buildTransducerWordOffsets( + { + get_vocab: () => 42, + decode(ids) { + return ids[0] === 1 ? " hello" : "world"; + }, + }, + token_ids, + timestamps, + null, + "hello world", + ); + + expect(fromNull.words.map((x) => x.text)).toEqual(["hello", "world"]); + expect(fromPrimitive.words.map((x) => x.text)).toEqual(["hello", "world"]); + }); + + it("rejects mismatched empty timestamp inputs for word offsets", () => { + expect(() => + buildTransducerWordOffsets( + { + decode: () => "hello", + }, + [1], + [], + ), + ).toThrow("equal lengths"); }); it( @@ -855,11 +906,7 @@ export default () => { decoder_model_merged: "q4", }, }); - expect(files).toEqual([ - "config.json", - "onnx/encoder_model_fp16.onnx", - "onnx/decoder_model_merged_q4.onnx", - ]); + expect(files).toEqual(["config.json", "onnx/encoder_model_fp16.onnx", "onnx/decoder_model_merged_q4.onnx"]); }); it( @@ -962,6 +1009,38 @@ export default () => { } }); + it("defers disposal for borrowed cache entries until they are released", () => { + const cache = new FeatureLRUCache({ max_entries: 1, max_size_mb: 4 }); + const tensorA = new Tensor("float32", new Float32Array([1, 2, 3]), [1, 3]); + const tensorB = new Tensor("float32", new Float32Array([4, 5, 6]), [1, 3]); + let disposeCalls = 0; + const track = (tensor) => { + const originalDispose = tensor.dispose.bind(tensor); + tensor.dispose = () => { + disposeCalls += 1; + originalDispose(); + }; + }; + track(tensorA); + track(tensorB); + + cache.set("a", tensorA); + const borrowedA = cache.acquire("a"); + expect(borrowedA?.value).toBe(tensorA); + + cache.set("b", tensorB); + expect(disposeCalls).toBe(0); + borrowedA?.release(); + expect(disposeCalls).toBe(1); + + const borrowedB = cache.acquire("b"); + expect(borrowedB?.value).toBe(tensorB); + cache.clear(); + expect(disposeCalls).toBe(1); + borrowedB?.release(); + expect(disposeCalls).toBe(2); + }); + it("treats zero cache limits as explicit no-cache mode without disposing inserted values", () => { const byEntries = new FeatureLRUCache({ max_entries: 0, max_size_mb: 4 }); const bySize = new FeatureLRUCache({ max_entries: 4, max_size_mb: 0 }); diff --git a/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js b/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js index f4d398371..cbaa96636 100644 --- a/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js +++ b/packages/transformers/tests/pipelines/test_pipelines_automatic_speech_recognition.js @@ -1,5 +1,4 @@ -import { pipeline, AutomaticSpeechRecognitionPipeline, Tensor } from "../../src/transformers.js"; -import { NEMO_FEATURE_OUTPUT_OWNERSHIP } from 
"../../src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js"; +import { pipeline, AutomaticSpeechRecognitionPipeline } from "../../src/transformers.js"; import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; @@ -97,7 +96,6 @@ export default () => { const model_id = "Xenova/tiny-random-Wav2Vec2ForCTC-ONNX"; const SAMPLING_RATE = 16000; const audios = [new Float32Array(SAMPLING_RATE).fill(0), Float32Array.from({ length: SAMPLING_RATE }, (_, i) => i / 16000)]; - const long_audios = [new Float32Array(SAMPLING_RATE * 60).fill(0), Float32Array.from({ length: SAMPLING_RATE * 60 }, (_, i) => (i % 1000) / 1000)]; const max_new_tokens = 5; /** @type {AutomaticSpeechRecognitionPipeline} */ @@ -127,71 +125,45 @@ export default () => { }, MAX_MODEL_DISPOSE_TIME); }); - describe("nemo-conformer-tdt (unit)", () => { - const withNemoTensorOwnership = (value, cacheOwnsTensors) => { - Object.defineProperty(value, NEMO_FEATURE_OUTPUT_OWNERSHIP, { - value: cacheOwnsTensors, - enumerable: false, - configurable: true, - }); - return value; - }; - - const makeUnitPipe = (modelType = "nemo-conformer-tdt") => { + describe("nemo-conformer-tdt", () => { + const makeUnitPipe = () => { const calls = []; const model = { - config: { model_type: modelType }, + config: { model_type: "nemo-conformer-tdt" }, async transcribe(_inputs, options) { calls.push(options); const result = { text: "hello world" }; if (options.returnTimestamps) { result.utteranceTimestamp = [0, 0.08]; - result.confidence = { utterance: 0.95, wordAverage: 0.94, averageLogProb: -0.05 }; - if (options.returnWords) { - result.words = [ - { text: "hello", startTime: 0, endTime: 0.04, confidence: 0.96 }, - { text: "world", startTime: 0.04, endTime: 0.08, confidence: 0.93 }, - ]; - } - } - if (options.returnMetrics) { - result.metrics = { totalMs: 42, rtf: 0.01, rtfX: 100 }; + result.words = [ + { text: "hello", startTime: 0, endTime: 0.04 }, + { text: "world", startTime: 0.04, endTime: 0.08 }, + ]; } return result; }, async dispose() {}, }; - const processor = Object.assign(async () => ({ input_features: {} }), { feature_extractor: { config: { sampling_rate: 16000 } }, }); - const tokenizer = { - decode(ids) { - const pieces = { - 1: "hello", - 2: "world", - 3: "again", - 4: "today", - }; - return ids.map((id) => pieces[id] ?? 
"").filter(Boolean).join(" "); - }, - }; return { pipe: new AutomaticSpeechRecognitionPipeline({ task: PIPELINE_ID, model, - tokenizer, + tokenizer: {}, processor, }), calls, }; }; - it("returns text when timestamps disabled", async () => { + it("returns text when timestamps are disabled", async () => { const { pipe, calls } = makeUnitPipe(); - const output = await pipe(new Float32Array(16000), { return_timestamps: false }); - expect(output).toEqual({ text: "hello world" }); + await expect(pipe(new Float32Array(16000), { return_timestamps: false })).resolves.toEqual({ + text: "hello world", + }); expect(calls).toHaveLength(1); expect(calls[0]).toMatchObject({ returnTimestamps: false, @@ -200,16 +172,12 @@ export default () => { }); }); - it("returns timestamped chunks when return_timestamps is true", async () => { + it("returns sentence chunks when return_timestamps is true", async () => { const { pipe, calls } = makeUnitPipe(); - const output = await pipe(new Float32Array(16000), { return_timestamps: true }); - expect(output).toEqual({ + await expect(pipe(new Float32Array(16000), { return_timestamps: true })).resolves.toEqual({ text: "hello world", - chunks: [ - { text: "hello world", timestamp: [0, 0.08] }, - ], + chunks: [{ text: "hello world", timestamp: [0, 0.08] }], }); - expect(calls).toHaveLength(1); expect(calls[0]).toMatchObject({ returnTimestamps: true, returnWords: true, @@ -219,718 +187,19 @@ export default () => { it("returns word chunks when return_timestamps is 'word'", async () => { const { pipe, calls } = makeUnitPipe(); - const output = await pipe(new Float32Array(16000), { return_timestamps: "word" }); - expect(output).toEqual({ + await expect(pipe(new Float32Array(16000), { return_timestamps: "word" })).resolves.toEqual({ text: "hello world", chunks: [ { text: "hello", timestamp: [0, 0.04] }, { text: "world", timestamp: [0.04, 0.08] }, ], }); - expect(calls).toHaveLength(1); expect(calls[0]).toMatchObject({ returnTimestamps: true, returnWords: true, returnMetrics: false, }); }); - - it("builds conservative sentence chunks from Nemo word timestamps", async () => { - const model = { - config: { model_type: "nemo-conformer-tdt" }, - async transcribe() { - return { - text: "Hello. World again. U.S. Report update.", - utteranceTimestamp: [0, 2.4], - words: [ - { text: "Hello.", startTime: 0, endTime: 0.4 }, - { text: "World", startTime: 0.5, endTime: 0.8 }, - { text: "again.", startTime: 0.8, endTime: 1.1 }, - { text: "U.S.", startTime: 1.2, endTime: 1.5 }, - { text: "Report", startTime: 1.6, endTime: 2.0 }, - { text: "update.", startTime: 2.0, endTime: 2.4 }, - ], - }; - }, - async dispose() {}, - }; - const processor = Object.assign(async () => ({ input_features: {} }), { - feature_extractor: { config: { sampling_rate: 16000 } }, - }); - const pipe = new AutomaticSpeechRecognitionPipeline({ - task: PIPELINE_ID, - model, - tokenizer: {}, - processor, - }); - - const output = await pipe(new Float32Array(16000), { return_timestamps: true }); - expect(output).toEqual({ - text: "Hello. World again. U.S. Report update.", - chunks: [ - { text: "Hello.", timestamp: [0, 0.4] }, - { text: "World again.", timestamp: [0.5, 1.1] }, - { text: "U.S. Report update.", timestamp: [1.2, 2.4] }, - ], - }); - }); - - it("uses explicit chunk_length_s as a bounded sentence window size override", async () => { - const calls = []; - const outputsByOffset = new Map([ - [0, { - text: "Alpha. Beta. 
Carry", - words: [ - { text: "Alpha.", startTime: 0, endTime: 1 }, - { text: "Beta.", startTime: 17, endTime: 18 }, - { text: "Carry", startTime: 19.95, endTime: 20 }, - ], - }], - [19.95, { - text: "Carry on. Gamma", - words: [ - { text: "Carry", startTime: 19.95, endTime: 20 }, - { text: "on.", startTime: 20, endTime: 20.5 }, - { text: "Gamma", startTime: 37.9, endTime: 38 }, - ], - }], - [37.9, { - text: "Gamma. Tail resumes. Omega.", - words: [ - { text: "Gamma.", startTime: 37.9, endTime: 39 }, - { text: "Tail", startTime: 39.2, endTime: 39.6 }, - { text: "resumes.", startTime: 39.6, endTime: 40.1 }, - { text: "Omega.", startTime: 40.1, endTime: 40.45 }, - ], - }], - ]); - const model = { - config: { model_type: "nemo-conformer-tdt" }, - async transcribe(_inputs, options) { - calls.push(options); - const item = outputsByOffset.get(options.timeOffset); - if (!item) { - throw new Error(`Unexpected timeOffset ${options.timeOffset}`); - } - return { - text: item.text, - utteranceTimestamp: [item.words[0].startTime, item.words[item.words.length - 1].endTime], - words: item.words, - }; - }, - async dispose() {}, - }; - const processor = Object.assign(async () => ({ input_features: {} }), { - feature_extractor: { config: { sampling_rate: 16000 } }, - }); - const tokenizer = { - decode(ids) { - const pieces = { - 1: "hello", - 2: "world", - 3: "again", - 4: "today", - }; - return ids.map((id) => pieces[id] ?? "").filter(Boolean).join(" "); - }, - }; - const pipe = new AutomaticSpeechRecognitionPipeline({ - task: PIPELINE_ID, - model, - tokenizer, - processor, - }); - - const output = await pipe(new Float32Array(40.5 * 16000), { - return_timestamps: "word", - chunk_length_s: 2, - }); - - expect(output).toEqual({ - text: "Alpha. Beta. Carry on. Gamma. Tail resumes. Omega.", - chunks: [ - { text: "Alpha.", timestamp: [0, 1] }, - { text: "Beta.", timestamp: [17, 18] }, - { text: "Carry", timestamp: [19.95, 20] }, - { text: "on.", timestamp: [20, 20.5] }, - { text: "Gamma.", timestamp: [37.9, 39] }, - { text: "Tail", timestamp: [39.2, 39.6] }, - { text: "resumes.", timestamp: [39.6, 40.1] }, - { text: "Omega.", timestamp: [40.1, 40.45] }, - ], - }); - expect(calls).toHaveLength(3); - expect(calls.map((x) => x.timeOffset)).toEqual([0, 19.95, 37.9]); - expect(calls[0]).toMatchObject({ - returnTimestamps: true, - returnWords: true, - returnMetrics: false, - timeOffset: 0, - }); - expect(calls[1]).toMatchObject({ - returnTimestamps: true, - returnWords: true, - returnMetrics: false, - timeOffset: 19.95, - }); - expect(calls[2]).toMatchObject({ - returnTimestamps: true, - returnWords: true, - returnMetrics: false, - timeOffset: 37.9, - }); - }); - - it("replaces boundary-truncated sentences with the longer retranscribed sentence", async () => { - const calls = []; - const outputsByOffset = new Map([ - [0, { - text: "Alpha. Beta. 
It won't run away, and it won't come to life.", - words: [ - { text: "Alpha.", startTime: 0, endTime: 1 }, - { text: "Beta.", startTime: 11, endTime: 12 }, - { text: "It", startTime: 17.2, endTime: 17.5 }, - { text: "won't", startTime: 17.5, endTime: 17.9 }, - { text: "run", startTime: 17.9, endTime: 18.2 }, - { text: "away,", startTime: 18.2, endTime: 18.6 }, - { text: "and", startTime: 18.6, endTime: 18.8 }, - { text: "it", startTime: 18.8, endTime: 19.0 }, - { text: "won't", startTime: 19.0, endTime: 19.3 }, - { text: "come", startTime: 19.3, endTime: 19.5 }, - { text: "to", startTime: 19.5, endTime: 19.65 }, - { text: "life.", startTime: 19.65, endTime: 19.8 }, - ], - }], - [17.2, { - text: "It won't run away, and it won't come to life until someone finds it. Omega.", - words: [ - { text: "It", startTime: 17.2, endTime: 17.5 }, - { text: "won't", startTime: 17.5, endTime: 17.9 }, - { text: "run", startTime: 17.9, endTime: 18.2 }, - { text: "away,", startTime: 18.2, endTime: 18.6 }, - { text: "and", startTime: 18.6, endTime: 18.8 }, - { text: "it", startTime: 18.8, endTime: 19.0 }, - { text: "won't", startTime: 19.0, endTime: 19.3 }, - { text: "come", startTime: 19.3, endTime: 19.5 }, - { text: "to", startTime: 19.5, endTime: 19.65 }, - { text: "life", startTime: 19.65, endTime: 19.95 }, - { text: "until", startTime: 19.95, endTime: 20.4 }, - { text: "someone", startTime: 20.4, endTime: 21.0 }, - { text: "finds", startTime: 21.0, endTime: 21.5 }, - { text: "it.", startTime: 21.5, endTime: 22.0 }, - { text: "Omega.", startTime: 28, endTime: 29 }, - ], - }], - ]); - const model = { - config: { model_type: "nemo-conformer-tdt" }, - async transcribe(_inputs, options) { - calls.push(options); - const item = outputsByOffset.get(options.timeOffset); - if (!item) { - throw new Error(`Unexpected timeOffset ${options.timeOffset}`); - } - return { - text: item.text, - utteranceTimestamp: [item.words[0].startTime, item.words[item.words.length - 1].endTime], - words: item.words, - }; - }, - async dispose() {}, - }; - const processor = Object.assign(async () => ({ input_features: {} }), { - feature_extractor: { config: { sampling_rate: 16000 } }, - }); - const pipe = new AutomaticSpeechRecognitionPipeline({ - task: PIPELINE_ID, - model, - tokenizer: {}, - processor, - }); - - const output = await pipe(new Float32Array(Math.ceil(31 * 16000)), { - return_timestamps: true, - chunk_length_s: 20, - }); - - expect(output).toEqual({ - text: "Alpha. Beta. It won't run away, and it won't come to life until someone finds it. Omega.", - chunks: [ - { text: "Alpha.", timestamp: [0, 1] }, - { text: "Beta.", timestamp: [11, 12] }, - { text: "It won't run away, and it won't come to life until someone finds it.", timestamp: [17.2, 22] }, - { text: "Omega.", timestamp: [28, 29] }, - ], - }); - expect(calls.map((x) => x.timeOffset)).toEqual([0, 17.2]); - }); - - it("retranscribes the dropped last sentence from its start without stale carry", async () => { - const calls = []; - const outputsByOffset = new Map([ - [0, { - text: "Alpha. The pressure gauge mark. 
He watched as the fruit", - words: [ - { text: "Alpha.", startTime: 0, endTime: 1 }, - { text: "The", startTime: 16.8, endTime: 17.0 }, - { text: "pressure", startTime: 17.0, endTime: 17.4 }, - { text: "gauge", startTime: 17.4, endTime: 17.76 }, - { text: "mark.", startTime: 17.76, endTime: 18.56 }, - { text: "He", startTime: 18.56, endTime: 18.72 }, - { text: "watched", startTime: 18.72, endTime: 18.96 }, - { text: "as", startTime: 18.96, endTime: 19.04 }, - { text: "the", startTime: 19.04, endTime: 19.2 }, - { text: "fruit", startTime: 19.2, endTime: 19.36 }, - ], - }], - [18.56, { - text: "He watched as the fluid.", - words: [ - { text: "He", startTime: 18.56, endTime: 18.72 }, - { text: "watched", startTime: 18.72, endTime: 19.12 }, - { text: "as", startTime: 19.12, endTime: 19.28 }, - { text: "the", startTime: 19.28, endTime: 19.36 }, - { text: "fluid.", startTime: 19.36, endTime: 20 }, - ], - }], - ]); - const model = { - config: { model_type: "nemo-conformer-tdt" }, - async transcribe(_inputs, options) { - calls.push(options); - const item = outputsByOffset.get(options.timeOffset); - if (!item) { - throw new Error(`Unexpected timeOffset ${options.timeOffset}`); - } - return { - text: item.text, - utteranceTimestamp: [item.words[0].startTime, item.words[item.words.length - 1].endTime], - words: item.words, - }; - }, - async dispose() {}, - }; - const processor = Object.assign(async () => ({ input_features: {} }), { - feature_extractor: { config: { sampling_rate: 16000 } }, - }); - const pipe = new AutomaticSpeechRecognitionPipeline({ - task: PIPELINE_ID, - model, - tokenizer: {}, - processor, - }); - - const output = await pipe(new Float32Array(Math.ceil(21 * 16000)), { - return_timestamps: "word", - chunk_length_s: 20, - }); - - expect(output).toEqual({ - text: "Alpha. The pressure gauge mark. He watched as the fluid.", - chunks: [ - { text: "Alpha.", timestamp: [0, 1] }, - { text: "The", timestamp: [16.8, 17] }, - { text: "pressure", timestamp: [17, 17.4] }, - { text: "gauge", timestamp: [17.4, 17.76] }, - { text: "mark.", timestamp: [17.76, 18.56] }, - { text: "He", timestamp: [18.56, 18.72] }, - { text: "watched", timestamp: [18.72, 19.12] }, - { text: "as", timestamp: [19.12, 19.28] }, - { text: "the", timestamp: [19.28, 19.36] }, - { text: "fluid.", timestamp: [19.36, 20] }, - ], - }); - expect(calls.map((x) => x.timeOffset)).toEqual([0, 18.56]); - }); - - it("reconstructs windowed Nemo text from merged words when token decode drops spaces", async () => { - const calls = []; - const model = { - config: { model_type: "nemo-conformer-tdt" }, - async transcribe(_inputs, options) { - calls.push(options); - if (options.timeOffset === 0) { - return { - text: "score. 48-year-old", - words: [ - { text: "score.", startTime: 0, endTime: 0.4 }, - { text: "48-year-old", startTime: 0.5, endTime: 1.3 }, - ], - }; - } - return { - text: "with 0.5", - words: [ - { text: "with", startTime: 1.4, endTime: 1.7 }, - { text: "0.5", startTime: 1.8, endTime: 2.05 }, - ], - }; - }, - async dispose() {}, - }; - const processor = Object.assign(async () => ({ input_features: {} }), { - feature_extractor: { config: { sampling_rate: 16000 } }, - }); - const tokenizer = { - decode(ids) { - const pieces = { - 1: "score", - 2: ".", - 3: "48", - 4: "-", - 5: "year", - 6: "old", - 7: "with", - 8: "0", - 9: "5", - }; - return ids.map((id) => pieces[id] ?? 
"").join(""); - }, - }; - const pipe = new AutomaticSpeechRecognitionPipeline({ - task: PIPELINE_ID, - model, - tokenizer, - processor, - }); - - const output = await pipe(new Float32Array(Math.ceil(20.1 * 16000)), { - return_timestamps: "word", - chunk_length_s: 20, - }); - - expect(output.text).toBe("score. 48-year-old with 0.5"); - expect(output.chunks).toEqual([ - { text: "score.", timestamp: [0, 0.4] }, - { text: "48-year-old", timestamp: [0.5, 1.3] }, - { text: "with", timestamp: [1.4, 1.7] }, - { text: "0.5", timestamp: [1.8, 2.05] }, - ]); - expect(calls.map((x) => x.timeOffset)).toEqual([0, 10]); - }); - - it("auto-windows long Nemo audio with 90s sentence windows", async () => { - const calls = []; - const outputsByOffset = new Map([ - [0, { - text: "Alpha. Beta. Gamma. Carry", - words: [ - { text: "Alpha.", startTime: 0, endTime: 1 }, - { text: "Beta.", startTime: 30, endTime: 31 }, - { text: "Gamma.", startTime: 69, endTime: 70 }, - { text: "Carry", startTime: 84, endTime: 85 }, - ], - }], - [84, { - text: "Carry on. Delta. Epsilon. Tail", - words: [ - { text: "Carry", startTime: 84, endTime: 85 }, - { text: "on.", startTime: 86, endTime: 87 }, - { text: "Delta.", startTime: 110, endTime: 111 }, - { text: "Epsilon.", startTime: 139, endTime: 140 }, - { text: "Tail", startTime: 154, endTime: 155 }, - ], - }], - [154, { - text: "Tail resumes. Zeta. Eta. Final", - words: [ - { text: "Tail", startTime: 154, endTime: 155 }, - { text: "resumes.", startTime: 156, endTime: 157 }, - { text: "Zeta.", startTime: 180, endTime: 181 }, - { text: "Eta.", startTime: 209, endTime: 210 }, - { text: "Final", startTime: 224, endTime: 225 }, - ], - }], - [224, { - text: "Final line. Omega.", - words: [ - { text: "Final", startTime: 224, endTime: 225 }, - { text: "line.", startTime: 226, endTime: 227 }, - { text: "Omega.", startTime: 250, endTime: 251 }, - ], - }], - ]); - const model = { - config: { model_type: "nemo-conformer-tdt" }, - async transcribe(_inputs, options) { - calls.push(options); - const item = outputsByOffset.get(options.timeOffset); - if (!item) { - throw new Error(`Unexpected timeOffset ${options.timeOffset}`); - } - return { - text: item.text, - utteranceTimestamp: [item.words[0].startTime, item.words[item.words.length - 1].endTime], - words: item.words, - }; - }, - async dispose() {}, - }; - const processor = Object.assign(async () => ({ input_features: {} }), { - feature_extractor: { config: { sampling_rate: 16000 } }, - }); - const tokenizer = { - decode(ids) { - const pieces = { - 1: "alpha", - 2: "beta", - 3: "gamma", - 4: "delta", - }; - return ids.map((id) => pieces[id] ?? "").filter(Boolean).join(" "); - }, - }; - const pipe = new AutomaticSpeechRecognitionPipeline({ - task: PIPELINE_ID, - model, - tokenizer, - processor, - }); - - const output = await pipe(new Float32Array(300 * 16000), { return_timestamps: "word" }); - - expect(output).toEqual({ - text: "Alpha. Beta. Gamma. Carry on. Delta. Epsilon. Tail resumes. Zeta. Eta. Final line. 
Omega.", - chunks: [ - { text: "Alpha.", timestamp: [0, 1] }, - { text: "Beta.", timestamp: [30, 31] }, - { text: "Gamma.", timestamp: [69, 70] }, - { text: "Carry", timestamp: [84, 85] }, - { text: "on.", timestamp: [86, 87] }, - { text: "Delta.", timestamp: [110, 111] }, - { text: "Epsilon.", timestamp: [139, 140] }, - { text: "Tail", timestamp: [154, 155] }, - { text: "resumes.", timestamp: [156, 157] }, - { text: "Zeta.", timestamp: [180, 181] }, - { text: "Eta.", timestamp: [209, 210] }, - { text: "Final", timestamp: [224, 225] }, - { text: "line.", timestamp: [226, 227] }, - { text: "Omega.", timestamp: [250, 251] }, - ], - }); - expect(calls).toHaveLength(4); - expect(calls.map((x) => x.timeOffset)).toEqual([0, 84, 154, 224]); - for (const call of calls) { - expect(call).toMatchObject({ - returnTimestamps: true, - returnWords: true, - returnMetrics: false, - }); - } - }); - - it("returns sentence chunks for auto-windowed long Nemo audio", async () => { - const calls = []; - const outputsByOffset = new Map([ - [0, { - text: "Alpha. Beta. Gamma. Carry", - words: [ - { text: "Alpha.", startTime: 0, endTime: 1 }, - { text: "Beta.", startTime: 30, endTime: 31 }, - { text: "Gamma.", startTime: 69, endTime: 70 }, - { text: "Carry", startTime: 84, endTime: 85 }, - ], - }], - [84, { - text: "Carry on. Delta. Epsilon. Tail", - words: [ - { text: "Carry", startTime: 84, endTime: 85 }, - { text: "on.", startTime: 86, endTime: 87 }, - { text: "Delta.", startTime: 110, endTime: 111 }, - { text: "Epsilon.", startTime: 139, endTime: 140 }, - { text: "Tail", startTime: 154, endTime: 155 }, - ], - }], - [154, { - text: "Tail resumes. Zeta. Eta. Final", - words: [ - { text: "Tail", startTime: 154, endTime: 155 }, - { text: "resumes.", startTime: 156, endTime: 157 }, - { text: "Zeta.", startTime: 180, endTime: 181 }, - { text: "Eta.", startTime: 209, endTime: 210 }, - { text: "Final", startTime: 224, endTime: 225 }, - ], - }], - [224, { - text: "Final line. Omega.", - words: [ - { text: "Final", startTime: 224, endTime: 225 }, - { text: "line.", startTime: 226, endTime: 227 }, - { text: "Omega.", startTime: 250, endTime: 251 }, - ], - }], - ]); - const model = { - config: { model_type: "nemo-conformer-tdt" }, - async transcribe(_inputs, options) { - calls.push(options); - const item = outputsByOffset.get(options.timeOffset); - if (!item) { - throw new Error(`Unexpected timeOffset ${options.timeOffset}`); - } - return { - text: item.text, - utteranceTimestamp: [item.words[0].startTime, item.words[item.words.length - 1].endTime], - words: item.words, - }; - }, - async dispose() {}, - }; - const processor = Object.assign(async () => ({ input_features: {} }), { - feature_extractor: { config: { sampling_rate: 16000 } }, - }); - const pipe = new AutomaticSpeechRecognitionPipeline({ - task: PIPELINE_ID, - model, - tokenizer: {}, - processor, - }); - - const output = await pipe(new Float32Array(300 * 16000), { return_timestamps: true }); - - expect(output).toEqual({ - text: "Alpha. Beta. Gamma. Carry on. Delta. Epsilon. Tail resumes. Zeta. Eta. Final line. 
Omega.", - chunks: [ - { text: "Alpha.", timestamp: [0, 1] }, - { text: "Beta.", timestamp: [30, 31] }, - { text: "Gamma.", timestamp: [69, 70] }, - { text: "Carry on.", timestamp: [84, 87] }, - { text: "Delta.", timestamp: [110, 111] }, - { text: "Epsilon.", timestamp: [139, 140] }, - { text: "Tail resumes.", timestamp: [154, 157] }, - { text: "Zeta.", timestamp: [180, 181] }, - { text: "Eta.", timestamp: [209, 210] }, - { text: "Final line.", timestamp: [224, 227] }, - { text: "Omega.", timestamp: [250, 251] }, - ], - }); - expect(calls.map((x) => x.timeOffset)).toEqual([0, 84, 154, 224]); - }); - - it("rejects non-finite audio samples before Nemo decoding", async () => { - const { pipe } = makeUnitPipe(); - await expect(pipe(Float32Array.from([0, Number.NaN, 0]), { return_timestamps: false })).rejects.toThrow( - "finite audio samples", - ); - }); - - it("disposes processor tensors after Nemo transcription when feature cache is disabled", async () => { - let disposeCalls = 0; - const model = { - config: { model_type: "nemo-conformer-tdt" }, - async transcribe() { - return { text: "ok" }; - }, - async dispose() {}, - }; - const processor = Object.assign(async () => { - const input_features = new Tensor("float32", new Float32Array([0, 0]), [1, 1, 2]); - const attention_mask = new Tensor("int64", BigInt64Array.from([1n]), [1, 1]); - const trackDispose = (tensor) => { - const originalDispose = tensor.dispose.bind(tensor); - tensor.dispose = () => { - disposeCalls += 1; - originalDispose(); - }; - }; - trackDispose(input_features); - trackDispose(attention_mask); - return withNemoTensorOwnership({ input_features, attention_mask }, false); - }, { - feature_extractor: { config: { sampling_rate: 16000 } }, - }); - const pipe = new AutomaticSpeechRecognitionPipeline({ - task: PIPELINE_ID, - model, - tokenizer: {}, - processor, - }); - - const output = await pipe(new Float32Array(16000), { return_timestamps: false }); - expect(output).toEqual({ text: "ok" }); - expect(disposeCalls).toBe(2); - }); - - it("keeps processor tensors alive when Nemo feature cache owns tensor lifetimes", async () => { - let disposeCalls = 0; - let lastInputs = null; - const model = { - config: { model_type: "nemo-conformer-tdt" }, - async transcribe() { - return { text: "ok" }; - }, - async dispose() {}, - }; - const processor = Object.assign(async () => { - const input_features = new Tensor("float32", new Float32Array([0, 0]), [1, 1, 2]); - const attention_mask = new Tensor("int64", BigInt64Array.from([1n]), [1, 1]); - const trackDispose = (tensor) => { - const originalDispose = tensor.dispose.bind(tensor); - tensor.dispose = () => { - disposeCalls += 1; - originalDispose(); - }; - }; - trackDispose(input_features); - trackDispose(attention_mask); - lastInputs = withNemoTensorOwnership({ input_features, attention_mask }, true); - return lastInputs; - }, { - feature_extractor: { config: { sampling_rate: 16000 } }, - }); - const pipe = new AutomaticSpeechRecognitionPipeline({ - task: PIPELINE_ID, - model, - tokenizer: {}, - processor, - }); - - try { - const output = await pipe(new Float32Array(16000), { return_timestamps: false }); - expect(output).toEqual({ text: "ok" }); - expect(disposeCalls).toBe(0); - } finally { - lastInputs?.input_features.dispose(); - lastInputs?.attention_mask.dispose(); - } - }); - - it("disposes processor tensors when Nemo feature cache limits disable caching", async () => { - let disposeCalls = 0; - const model = { - config: { model_type: "nemo-conformer-tdt" }, - async transcribe() { - 
return { text: "ok" }; - }, - async dispose() {}, - }; - const processor = Object.assign(async () => { - const input_features = new Tensor("float32", new Float32Array([0, 0]), [1, 1, 2]); - const attention_mask = new Tensor("int64", BigInt64Array.from([1n]), [1, 1]); - const trackDispose = (tensor) => { - const originalDispose = tensor.dispose.bind(tensor); - tensor.dispose = () => { - disposeCalls += 1; - originalDispose(); - }; - }; - trackDispose(input_features); - trackDispose(attention_mask); - return withNemoTensorOwnership({ input_features, attention_mask }, false); - }, { - feature_extractor: { config: { sampling_rate: 16000 } }, - }); - const pipe = new AutomaticSpeechRecognitionPipeline({ - task: PIPELINE_ID, - model, - tokenizer: {}, - processor, - }); - - const output = await pipe(new Float32Array(16000), { return_timestamps: false }); - expect(output).toEqual({ text: "ok" }); - expect(disposeCalls).toBe(2); - }); }); }); }; diff --git a/packages/transformers/tests/pipelines/test_pipelines_nemo_conformer_tdt.js b/packages/transformers/tests/pipelines/test_pipelines_nemo_conformer_tdt.js new file mode 100644 index 000000000..517051971 --- /dev/null +++ b/packages/transformers/tests/pipelines/test_pipelines_nemo_conformer_tdt.js @@ -0,0 +1,732 @@ +import { Tensor } from "../../src/transformers.js"; +import { NEMO_FEATURE_OUTPUT_OWNERSHIP, NEMO_FEATURE_OUTPUT_RELEASE } from "../../src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js"; +import { runNemoConformerTDTPipeline } from "../../src/models/nemo_conformer_tdt/pipeline_nemo_conformer_tdt.js"; + +const SAMPLING_RATE = 16000; + +const makeProcessor = (impl = async () => ({ input_features: {} })) => + Object.assign(impl, { + feature_extractor: { config: { sampling_rate: SAMPLING_RATE } }, + }); + +const makeTokenizer = () => ({ + decode(ids) { + const pieces = { + 1: "hello", + 2: "world", + 3: "again", + 4: "today", + }; + return ids + .map((id) => pieces[id] ?? "") + .filter(Boolean) + .join(" "); + }, +}); + +const prepareAudios = async (audios) => audios; + +const runPipeline = ({ model, audio = new Float32Array(SAMPLING_RATE), kwargs = {}, tokenizer = makeTokenizer(), processor = makeProcessor() }) => + runNemoConformerTDTPipeline({ + model, + processor, + tokenizer, + audio, + kwargs, + prepareAudios, + }); + +const withNemoTensorOwnership = (value, cacheOwnsTensors, release = null) => { + Object.defineProperty(value, NEMO_FEATURE_OUTPUT_OWNERSHIP, { + value: cacheOwnsTensors, + enumerable: false, + configurable: true, + }); + if (release) { + Object.defineProperty(value, NEMO_FEATURE_OUTPUT_RELEASE, { + value: release, + enumerable: false, + configurable: true, + }); + } + return value; +}; + +export default () => { + describe("Nemo Conformer TDT pipeline adapter", () => { + it("builds conservative sentence chunks from Nemo word timestamps", async () => { + const model = { + async transcribe() { + return { + text: "Hello. World again. U.S. Report update.", + utteranceTimestamp: [0, 2.4], + words: [ + { text: "Hello.", startTime: 0, endTime: 0.4 }, + { text: "World", startTime: 0.5, endTime: 0.8 }, + { text: "again.", startTime: 0.8, endTime: 1.1 }, + { text: "U.S.", startTime: 1.2, endTime: 1.5 }, + { text: "Report", startTime: 1.6, endTime: 2.0 }, + { text: "update.", startTime: 2.0, endTime: 2.4 }, + ], + }; + }, + }; + + await expect(runPipeline({ model, kwargs: { return_timestamps: true } })).resolves.toEqual({ + text: "Hello. World again. U.S. 
Report update.", + chunks: [ + { text: "Hello.", timestamp: [0, 0.4] }, + { text: "World again.", timestamp: [0.5, 1.1] }, + { text: "U.S. Report update.", timestamp: [1.2, 2.4] }, + ], + }); + }); + + it("uses explicit chunk_length_s as a bounded sentence window size override", async () => { + const calls = []; + const outputsByOffset = new Map([ + [ + 0, + { + text: "Alpha. Beta. Carry", + words: [ + { text: "Alpha.", startTime: 0, endTime: 1 }, + { text: "Beta.", startTime: 17, endTime: 18 }, + { text: "Carry", startTime: 19.95, endTime: 20 }, + ], + }, + ], + [ + 19.95, + { + text: "Carry on. Gamma", + words: [ + { text: "Carry", startTime: 19.95, endTime: 20 }, + { text: "on.", startTime: 20, endTime: 20.5 }, + { text: "Gamma", startTime: 37.9, endTime: 38 }, + ], + }, + ], + [ + 37.9, + { + text: "Gamma. Tail resumes. Omega.", + words: [ + { text: "Gamma.", startTime: 37.9, endTime: 39 }, + { text: "Tail", startTime: 39.2, endTime: 39.6 }, + { text: "resumes.", startTime: 39.6, endTime: 40.1 }, + { text: "Omega.", startTime: 40.1, endTime: 40.45 }, + ], + }, + ], + ]); + const model = { + async transcribe(_inputs, options) { + calls.push(options); + const item = outputsByOffset.get(options.timeOffset); + if (!item) { + throw new Error(`Unexpected timeOffset ${options.timeOffset}`); + } + return { + text: item.text, + utteranceTimestamp: [item.words[0].startTime, item.words[item.words.length - 1].endTime], + words: item.words, + }; + }, + }; + + await expect( + runPipeline({ + model, + audio: new Float32Array(40.5 * SAMPLING_RATE), + kwargs: { return_timestamps: "word", chunk_length_s: 2 }, + }), + ).resolves.toEqual({ + text: "Alpha. Beta. Carry on. Gamma. Tail resumes. Omega.", + chunks: [ + { text: "Alpha.", timestamp: [0, 1] }, + { text: "Beta.", timestamp: [17, 18] }, + { text: "Carry", timestamp: [19.95, 20] }, + { text: "on.", timestamp: [20, 20.5] }, + { text: "Gamma.", timestamp: [37.9, 39] }, + { text: "Tail", timestamp: [39.2, 39.6] }, + { text: "resumes.", timestamp: [39.6, 40.1] }, + { text: "Omega.", timestamp: [40.1, 40.45] }, + ], + }); + expect(calls.map((x) => x.timeOffset)).toEqual([0, 19.95, 37.9]); + expect(calls[0]).toMatchObject({ returnTimestamps: true, returnWords: true, returnMetrics: false, timeOffset: 0 }); + expect(calls[1]).toMatchObject({ returnTimestamps: true, returnWords: true, returnMetrics: false, timeOffset: 19.95 }); + expect(calls[2]).toMatchObject({ returnTimestamps: true, returnWords: true, returnMetrics: false, timeOffset: 37.9 }); + }); + + it("replaces boundary-truncated sentences with the longer retranscribed sentence", async () => { + const calls = []; + const outputsByOffset = new Map([ + [ + 0, + { + text: "Alpha. Beta. It won't run away, and it won't come to life.", + words: [ + { text: "Alpha.", startTime: 0, endTime: 1 }, + { text: "Beta.", startTime: 11, endTime: 12 }, + { text: "It", startTime: 17.2, endTime: 17.5 }, + { text: "won't", startTime: 17.5, endTime: 17.9 }, + { text: "run", startTime: 17.9, endTime: 18.2 }, + { text: "away,", startTime: 18.2, endTime: 18.6 }, + { text: "and", startTime: 18.6, endTime: 18.8 }, + { text: "it", startTime: 18.8, endTime: 19.0 }, + { text: "won't", startTime: 19.0, endTime: 19.3 }, + { text: "come", startTime: 19.3, endTime: 19.5 }, + { text: "to", startTime: 19.5, endTime: 19.65 }, + { text: "life.", startTime: 19.65, endTime: 19.8 }, + ], + }, + ], + [ + 17.2, + { + text: "It won't run away, and it won't come to life until someone finds it. 
Omega.", + words: [ + { text: "It", startTime: 17.2, endTime: 17.5 }, + { text: "won't", startTime: 17.5, endTime: 17.9 }, + { text: "run", startTime: 17.9, endTime: 18.2 }, + { text: "away,", startTime: 18.2, endTime: 18.6 }, + { text: "and", startTime: 18.6, endTime: 18.8 }, + { text: "it", startTime: 18.8, endTime: 19.0 }, + { text: "won't", startTime: 19.0, endTime: 19.3 }, + { text: "come", startTime: 19.3, endTime: 19.5 }, + { text: "to", startTime: 19.5, endTime: 19.65 }, + { text: "life", startTime: 19.65, endTime: 19.95 }, + { text: "until", startTime: 19.95, endTime: 20.4 }, + { text: "someone", startTime: 20.4, endTime: 21.0 }, + { text: "finds", startTime: 21.0, endTime: 21.5 }, + { text: "it.", startTime: 21.5, endTime: 22.0 }, + { text: "Omega.", startTime: 28, endTime: 29 }, + ], + }, + ], + ]); + const model = { + async transcribe(_inputs, options) { + calls.push(options); + const item = outputsByOffset.get(options.timeOffset); + if (!item) { + throw new Error(`Unexpected timeOffset ${options.timeOffset}`); + } + return { + text: item.text, + utteranceTimestamp: [item.words[0].startTime, item.words[item.words.length - 1].endTime], + words: item.words, + }; + }, + }; + + await expect( + runPipeline({ + model, + audio: new Float32Array(Math.ceil(31 * SAMPLING_RATE)), + kwargs: { return_timestamps: true, chunk_length_s: 20 }, + }), + ).resolves.toEqual({ + text: "Alpha. Beta. It won't run away, and it won't come to life until someone finds it. Omega.", + chunks: [ + { text: "Alpha.", timestamp: [0, 1] }, + { text: "Beta.", timestamp: [11, 12] }, + { text: "It won't run away, and it won't come to life until someone finds it.", timestamp: [17.2, 22] }, + { text: "Omega.", timestamp: [28, 29] }, + ], + }); + expect(calls.map((x) => x.timeOffset)).toEqual([0, 17.2]); + }); + + it("retranscribes the dropped last sentence from its start without stale carry", async () => { + const calls = []; + const outputsByOffset = new Map([ + [ + 0, + { + text: "Alpha. The pressure gauge mark. He watched as the fruit", + words: [ + { text: "Alpha.", startTime: 0, endTime: 1 }, + { text: "The", startTime: 16.8, endTime: 17.0 }, + { text: "pressure", startTime: 17.0, endTime: 17.4 }, + { text: "gauge", startTime: 17.4, endTime: 17.76 }, + { text: "mark.", startTime: 17.76, endTime: 18.56 }, + { text: "He", startTime: 18.56, endTime: 18.72 }, + { text: "watched", startTime: 18.72, endTime: 18.96 }, + { text: "as", startTime: 18.96, endTime: 19.04 }, + { text: "the", startTime: 19.04, endTime: 19.2 }, + { text: "fruit", startTime: 19.2, endTime: 19.36 }, + ], + }, + ], + [ + 18.56, + { + text: "He watched as the fluid.", + words: [ + { text: "He", startTime: 18.56, endTime: 18.72 }, + { text: "watched", startTime: 18.72, endTime: 19.12 }, + { text: "as", startTime: 19.12, endTime: 19.28 }, + { text: "the", startTime: 19.28, endTime: 19.36 }, + { text: "fluid.", startTime: 19.36, endTime: 20 }, + ], + }, + ], + ]); + const model = { + async transcribe(_inputs, options) { + calls.push(options); + const item = outputsByOffset.get(options.timeOffset); + if (!item) { + throw new Error(`Unexpected timeOffset ${options.timeOffset}`); + } + return { + text: item.text, + utteranceTimestamp: [item.words[0].startTime, item.words[item.words.length - 1].endTime], + words: item.words, + }; + }, + }; + + await expect( + runPipeline({ + model, + audio: new Float32Array(Math.ceil(21 * SAMPLING_RATE)), + kwargs: { return_timestamps: "word", chunk_length_s: 20 }, + }), + ).resolves.toEqual({ + text: "Alpha. 
The pressure gauge mark. He watched as the fluid.", + chunks: [ + { text: "Alpha.", timestamp: [0, 1] }, + { text: "The", timestamp: [16.8, 17] }, + { text: "pressure", timestamp: [17, 17.4] }, + { text: "gauge", timestamp: [17.4, 17.76] }, + { text: "mark.", timestamp: [17.76, 18.56] }, + { text: "He", timestamp: [18.56, 18.72] }, + { text: "watched", timestamp: [18.72, 19.12] }, + { text: "as", timestamp: [19.12, 19.28] }, + { text: "the", timestamp: [19.28, 19.36] }, + { text: "fluid.", timestamp: [19.36, 20] }, + ], + }); + expect(calls.map((x) => x.timeOffset)).toEqual([0, 18.56]); + }); + + it("preserves the pending prefix when cursor snapping restarts inside the last sentence", async () => { + const calls = []; + const outputsByOffset = new Map([ + [ + 0, + { + text: "Alpha. Carry on", + words: [ + { text: "Alpha.", startTime: 0, endTime: 19.65 }, + { text: "Carry", startTime: 19.7, endTime: 20.0 }, + { text: "on.", startTime: 20.5, endTime: 20.8 }, + ], + }, + ], + [ + 20, + { + text: "on. Gamma.", + words: [ + { text: "on.", startTime: 20.5, endTime: 20.8 }, + { text: "Gamma.", startTime: 28, endTime: 29 }, + ], + }, + ], + ]); + const model = { + async transcribe(_inputs, options) { + calls.push(options); + const item = outputsByOffset.get(options.timeOffset); + if (!item) { + throw new Error(`Unexpected timeOffset ${options.timeOffset}`); + } + return { + text: item.text, + utteranceTimestamp: [item.words[0].startTime, item.words[item.words.length - 1].endTime], + words: item.words, + }; + }, + }; + + await expect( + runPipeline({ + model, + audio: new Float32Array(Math.ceil(31 * SAMPLING_RATE)), + kwargs: { return_timestamps: true, chunk_length_s: 20 }, + }), + ).resolves.toEqual({ + text: "Alpha. Carry on. Gamma.", + chunks: [ + { text: "Alpha.", timestamp: [0, 19.65] }, + { text: "Carry on.", timestamp: [19.7, 20.8] }, + { text: "Gamma.", timestamp: [28, 29] }, + ], + }); + expect(calls.map((x) => x.timeOffset)).toEqual([0, 20]); + }); + + it("reconstructs windowed Nemo text from merged words when token decode drops spaces", async () => { + const calls = []; + const model = { + async transcribe(_inputs, options) { + calls.push(options); + if (options.timeOffset === 0) { + return { + text: "score. 48-year-old", + words: [ + { text: "score.", startTime: 0, endTime: 0.4 }, + { text: "48-year-old", startTime: 0.5, endTime: 1.3 }, + ], + }; + } + return { + text: "with 0.5", + words: [ + { text: "with", startTime: 1.4, endTime: 1.7 }, + { text: "0.5", startTime: 1.8, endTime: 2.05 }, + ], + }; + }, + }; + const tokenizer = { + decode(ids) { + const pieces = { + 1: "score", + 2: ".", + 3: "48", + 4: "-", + 5: "year", + 6: "old", + 7: "with", + 8: "0", + 9: "5", + }; + return ids.map((id) => pieces[id] ?? "").join(""); + }, + }; + + const output = await runPipeline({ + model, + tokenizer, + audio: new Float32Array(Math.ceil(20.1 * SAMPLING_RATE)), + kwargs: { return_timestamps: "word", chunk_length_s: 20 }, + }); + + expect(output.text).toBe("score. 48-year-old with 0.5"); + expect(output.chunks).toEqual([ + { text: "score.", timestamp: [0, 0.4] }, + { text: "48-year-old", timestamp: [0.5, 1.3] }, + { text: "with", timestamp: [1.4, 1.7] }, + { text: "0.5", timestamp: [1.8, 2.05] }, + ]); + expect(calls.map((x) => x.timeOffset)).toEqual([0, 10]); + }); + + it("auto-windows long Nemo audio with 90s sentence windows", async () => { + const calls = []; + const outputsByOffset = new Map([ + [ + 0, + { + text: "Alpha. Beta. Gamma. 
Carry", + words: [ + { text: "Alpha.", startTime: 0, endTime: 1 }, + { text: "Beta.", startTime: 30, endTime: 31 }, + { text: "Gamma.", startTime: 69, endTime: 70 }, + { text: "Carry", startTime: 84, endTime: 85 }, + ], + }, + ], + [ + 84, + { + text: "Carry on. Delta. Epsilon. Tail", + words: [ + { text: "Carry", startTime: 84, endTime: 85 }, + { text: "on.", startTime: 86, endTime: 87 }, + { text: "Delta.", startTime: 110, endTime: 111 }, + { text: "Epsilon.", startTime: 139, endTime: 140 }, + { text: "Tail", startTime: 154, endTime: 155 }, + ], + }, + ], + [ + 154, + { + text: "Tail resumes. Zeta. Eta. Final", + words: [ + { text: "Tail", startTime: 154, endTime: 155 }, + { text: "resumes.", startTime: 156, endTime: 157 }, + { text: "Zeta.", startTime: 180, endTime: 181 }, + { text: "Eta.", startTime: 209, endTime: 210 }, + { text: "Final", startTime: 224, endTime: 225 }, + ], + }, + ], + [ + 224, + { + text: "Final line. Omega.", + words: [ + { text: "Final", startTime: 224, endTime: 225 }, + { text: "line.", startTime: 226, endTime: 227 }, + { text: "Omega.", startTime: 250, endTime: 251 }, + ], + }, + ], + ]); + const model = { + async transcribe(_inputs, options) { + calls.push(options); + const item = outputsByOffset.get(options.timeOffset); + if (!item) { + throw new Error(`Unexpected timeOffset ${options.timeOffset}`); + } + return { + text: item.text, + utteranceTimestamp: [item.words[0].startTime, item.words[item.words.length - 1].endTime], + words: item.words, + }; + }, + }; + + await expect( + runPipeline({ + model, + audio: new Float32Array(300 * SAMPLING_RATE), + kwargs: { return_timestamps: "word" }, + }), + ).resolves.toEqual({ + text: "Alpha. Beta. Gamma. Carry on. Delta. Epsilon. Tail resumes. Zeta. Eta. Final line. Omega.", + chunks: [ + { text: "Alpha.", timestamp: [0, 1] }, + { text: "Beta.", timestamp: [30, 31] }, + { text: "Gamma.", timestamp: [69, 70] }, + { text: "Carry", timestamp: [84, 85] }, + { text: "on.", timestamp: [86, 87] }, + { text: "Delta.", timestamp: [110, 111] }, + { text: "Epsilon.", timestamp: [139, 140] }, + { text: "Tail", timestamp: [154, 155] }, + { text: "resumes.", timestamp: [156, 157] }, + { text: "Zeta.", timestamp: [180, 181] }, + { text: "Eta.", timestamp: [209, 210] }, + { text: "Final", timestamp: [224, 225] }, + { text: "line.", timestamp: [226, 227] }, + { text: "Omega.", timestamp: [250, 251] }, + ], + }); + expect(calls).toHaveLength(4); + expect(calls.map((x) => x.timeOffset)).toEqual([0, 84, 154, 224]); + for (const call of calls) { + expect(call).toMatchObject({ + returnTimestamps: true, + returnWords: true, + returnMetrics: false, + }); + } + }); + + it("returns sentence chunks for auto-windowed long Nemo audio", async () => { + const calls = []; + const outputsByOffset = new Map([ + [ + 0, + { + text: "Alpha. Beta. Gamma. Carry", + words: [ + { text: "Alpha.", startTime: 0, endTime: 1 }, + { text: "Beta.", startTime: 30, endTime: 31 }, + { text: "Gamma.", startTime: 69, endTime: 70 }, + { text: "Carry", startTime: 84, endTime: 85 }, + ], + }, + ], + [ + 84, + { + text: "Carry on. Delta. Epsilon. Tail", + words: [ + { text: "Carry", startTime: 84, endTime: 85 }, + { text: "on.", startTime: 86, endTime: 87 }, + { text: "Delta.", startTime: 110, endTime: 111 }, + { text: "Epsilon.", startTime: 139, endTime: 140 }, + { text: "Tail", startTime: 154, endTime: 155 }, + ], + }, + ], + [ + 154, + { + text: "Tail resumes. Zeta. Eta. 
Final", + words: [ + { text: "Tail", startTime: 154, endTime: 155 }, + { text: "resumes.", startTime: 156, endTime: 157 }, + { text: "Zeta.", startTime: 180, endTime: 181 }, + { text: "Eta.", startTime: 209, endTime: 210 }, + { text: "Final", startTime: 224, endTime: 225 }, + ], + }, + ], + [ + 224, + { + text: "Final line. Omega.", + words: [ + { text: "Final", startTime: 224, endTime: 225 }, + { text: "line.", startTime: 226, endTime: 227 }, + { text: "Omega.", startTime: 250, endTime: 251 }, + ], + }, + ], + ]); + const model = { + async transcribe(_inputs, options) { + calls.push(options); + const item = outputsByOffset.get(options.timeOffset); + if (!item) { + throw new Error(`Unexpected timeOffset ${options.timeOffset}`); + } + return { + text: item.text, + utteranceTimestamp: [item.words[0].startTime, item.words[item.words.length - 1].endTime], + words: item.words, + }; + }, + }; + + await expect( + runPipeline({ + model, + audio: new Float32Array(300 * SAMPLING_RATE), + kwargs: { return_timestamps: true }, + }), + ).resolves.toEqual({ + text: "Alpha. Beta. Gamma. Carry on. Delta. Epsilon. Tail resumes. Zeta. Eta. Final line. Omega.", + chunks: [ + { text: "Alpha.", timestamp: [0, 1] }, + { text: "Beta.", timestamp: [30, 31] }, + { text: "Gamma.", timestamp: [69, 70] }, + { text: "Carry on.", timestamp: [84, 87] }, + { text: "Delta.", timestamp: [110, 111] }, + { text: "Epsilon.", timestamp: [139, 140] }, + { text: "Tail resumes.", timestamp: [154, 157] }, + { text: "Zeta.", timestamp: [180, 181] }, + { text: "Eta.", timestamp: [209, 210] }, + { text: "Final line.", timestamp: [224, 227] }, + { text: "Omega.", timestamp: [250, 251] }, + ], + }); + expect(calls.map((x) => x.timeOffset)).toEqual([0, 84, 154, 224]); + }); + + it("rejects non-finite audio samples before Nemo decoding", async () => { + const model = { + async transcribe() { + return { text: "hello world" }; + }, + }; + await expect( + runPipeline({ + model, + audio: Float32Array.from([0, Number.NaN, 0]), + kwargs: { return_timestamps: false }, + }), + ).rejects.toThrow("finite audio samples"); + }); + + it("disposes processor tensors after Nemo transcription when feature cache is disabled", async () => { + let disposeCalls = 0; + const model = { + async transcribe() { + return { text: "ok" }; + }, + }; + const processor = makeProcessor(async () => { + const input_features = new Tensor("float32", new Float32Array([0, 0]), [1, 1, 2]); + const attention_mask = new Tensor("int64", BigInt64Array.from([1n]), [1, 1]); + const trackDispose = (tensor) => { + const originalDispose = tensor.dispose.bind(tensor); + tensor.dispose = () => { + disposeCalls += 1; + originalDispose(); + }; + }; + trackDispose(input_features); + trackDispose(attention_mask); + return withNemoTensorOwnership({ input_features, attention_mask }, false); + }); + + await expect(runPipeline({ model, processor })).resolves.toEqual({ text: "ok" }); + expect(disposeCalls).toBe(2); + }); + + it("keeps processor tensors alive when Nemo feature cache owns tensor lifetimes", async () => { + let disposeCalls = 0; + let releaseCalls = 0; + let lastInputs = null; + const model = { + async transcribe() { + return { text: "ok" }; + }, + }; + const processor = makeProcessor(async () => { + const input_features = new Tensor("float32", new Float32Array([0, 0]), [1, 1, 2]); + const attention_mask = new Tensor("int64", BigInt64Array.from([1n]), [1, 1]); + const trackDispose = (tensor) => { + const originalDispose = tensor.dispose.bind(tensor); + tensor.dispose = () => { + 
disposeCalls += 1; + originalDispose(); + }; + }; + trackDispose(input_features); + trackDispose(attention_mask); + lastInputs = withNemoTensorOwnership({ input_features, attention_mask }, true, () => { + releaseCalls += 1; + }); + return lastInputs; + }); + + try { + await expect(runPipeline({ model, processor })).resolves.toEqual({ text: "ok" }); + expect(disposeCalls).toBe(0); + expect(releaseCalls).toBe(1); + } finally { + lastInputs?.input_features.dispose(); + lastInputs?.attention_mask.dispose(); + } + }); + + it("disposes processor tensors when Nemo feature cache limits disable caching", async () => { + let disposeCalls = 0; + const model = { + async transcribe() { + return { text: "ok" }; + }, + }; + const processor = makeProcessor(async () => { + const input_features = new Tensor("float32", new Float32Array([0, 0]), [1, 1, 2]); + const attention_mask = new Tensor("int64", BigInt64Array.from([1n]), [1, 1]); + const trackDispose = (tensor) => { + const originalDispose = tensor.dispose.bind(tensor); + tensor.dispose = () => { + disposeCalls += 1; + originalDispose(); + }; + }; + trackDispose(input_features); + trackDispose(attention_mask); + return withNemoTensorOwnership({ input_features, attention_mask }, false); + }); + + await expect(runPipeline({ model, processor })).resolves.toEqual({ text: "ok" }); + expect(disposeCalls).toBe(2); + }); + }); +}; From 07118c38d80796d0bd07a447b1c8aef2c200bf31 Mon Sep 17 00:00:00 2001 From: ysdede Date: Mon, 9 Mar 2026 00:38:13 +0300 Subject: [PATCH 36/40] fix(nemo-tdt): address follow-up review threads Apply the remaining valid Nemo Conformer TDT review fixes without widening the shared ASR pipeline surface. - honor encoder_input_layout for canonical input_features feeds - keep borrowed cache entries counted until they are actually released - reject tokenizer-less non-empty word-offset reconstruction - raise the auto-window budget to match the minimum guaranteed cursor advance - add focused model and pipeline regressions for each fix Verified with: - node --experimental-vm-modules --expose-gc node_modules/jest/bin/jest.js --config jest.config.mjs --runInBand tests/models.test.js -t "nemo_conformer_tdt" - node --experimental-vm-modules --expose-gc node_modules/jest/bin/jest.js --config jest.config.mjs --runInBand tests/pipelines.test.js -t "Nemo Conformer TDT pipeline adapter|Automatic Speech Recognition" --- .../modeling_nemo_conformer_tdt.js | 60 +++++++--------- .../pipeline_nemo_conformer_tdt.js | 5 +- .../nemo_conformer_tdt/transducer_cache.js | 3 +- .../transducer_word_offsets.js | 6 +- .../test_modeling_nemo_conformer_tdt.js | 69 +++++++++++++++++++ .../test_pipelines_nemo_conformer_tdt.js | 61 ++++++++++++++++ 6 files changed, 165 insertions(+), 39 deletions(-) diff --git a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js index 3f664d57a..c59f40927 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js @@ -420,30 +420,33 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { } const missingInputs = []; - for (const name of encoderSession.inputNames) { - if (model_inputs[name] instanceof Tensor) { - feeds[name] = model_inputs[name]; - continue; + let preparedEncoderInput = null; + const getPreparedEncoderInput = () => { + if (preparedEncoderInput) { + return 
preparedEncoderInput; } - if (name === 'input_features') { - feeds[name] = inputFeatures; + const layout = this.transducer.encoder_input_layout; + if (layout === 'BTF') { + preparedEncoderInput = inputFeatures; + } else if (layout === 'BFT') { + preparedEncoderInput = inputFeatures.transpose(0, 2, 1); + disposables.push(preparedEncoderInput); + } else { + throw new Error( + `Unsupported encoder input layout "${layout}". Use 'BTF' or 'BFT' in transformers.js_config.transducer.`, + ); + } + return preparedEncoderInput; + }; + for (const name of encoderSession.inputNames) { + if (name === 'input_features' || name === 'audio_signal') { + feeds[name] = getPreparedEncoderInput(); continue; } - if (name === 'audio_signal') { - const layout = this.transducer.encoder_input_layout; - if (layout === 'BTF') { - feeds[name] = inputFeatures; - } else if (layout === 'BFT') { - const transposed = inputFeatures.transpose(0, 2, 1); - disposables.push(transposed); - feeds[name] = transposed; - } else { - throw new Error( - `Unsupported encoder input layout "${layout}". Use 'BTF' or 'BFT' in transformers.js_config.transducer.`, - ); - } + if (model_inputs[name] instanceof Tensor) { + feeds[name] = model_inputs[name]; continue; } @@ -516,9 +519,7 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { _validateRuntimeConfig(vocabSize) { if (!Number.isInteger(vocabSize) || vocabSize <= 0) { - throw new Error( - `Invalid Nemo Conformer TDT config: vocab_size=${vocabSize} must be a positive integer.`, - ); + throw new Error(`Invalid Nemo Conformer TDT config: vocab_size=${vocabSize} must be a positive integer.`); } if (this.transducer.blank_token_id >= vocabSize) { throw new Error( @@ -571,10 +572,7 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { * debug?: { frameIndices?: number[] | null, logProbs?: number[] | null, tdtSteps?: number[] | null }, * }>} */ - async transcribe( - model_inputs, - decode_options = {}, - ) { + async transcribe(model_inputs, decode_options = {}) { const { tokenizer = null, returnTimestamps: returnTimestampsOption, @@ -855,10 +853,7 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { tokenTimestamps[0][0], tokenTimestamps[tokenTimestamps.length - 1][1], ]) - : /** @type {[number, number]} */ ([ - roundTs(timeOffset), - roundTs(frameCount * frameTime + timeOffset), - ]); + : /** @type {[number, number]} */ ([roundTs(timeOffset), roundTs(frameCount * frameTime + timeOffset)]); const averageLogProb = logProbs && logProbs.length > 0 ? roundMetric(logProbs.reduce((a, b) => a + b, 0) / logProbs.length, 6) @@ -888,10 +883,7 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { result.confidence = { ...(result.confidence ?? 
{}), frames: frameConfidences, - frameAverage: roundMetric( - frameConfidences.reduce((a, b) => a + b, 0) / frameConfidences.length, - 6, - ), + frameAverage: roundMetric(frameConfidences.reduce((a, b) => a + b, 0) / frameConfidences.length, 6), }; } diff --git a/packages/transformers/src/models/nemo_conformer_tdt/pipeline_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/pipeline_nemo_conformer_tdt.js index 83b758b7b..f28fd5611 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/pipeline_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/pipeline_nemo_conformer_tdt.js @@ -149,7 +149,10 @@ async function runNemoAutoSentenceWindowing({ audio, sampling_rate, chunk_length const audio_duration_s = audio.length / sampling_rate; const fallback_overlap_s = Math.min(NEMO_AUTO_WINDOW_FALLBACK_OVERLAP_S, Math.max(0, chunk_length_s - 1)); const fallback_advance_s = Math.max(1, chunk_length_s - fallback_overlap_s); - const maxWindows = Math.max(4, Math.ceil(audio_duration_s / fallback_advance_s) * 4); + const maxWindows = Math.max( + 4, + Math.ceil(Math.max(0, audio_duration_s - chunk_length_s) / NEMO_CURSOR_MIN_ADVANCE_S) + 2, + ); /** @type {Array<{ words: Array<{ text: string, startTime: number, endTime: number, confidence?: number }>, text: string, timestamp: [number, number] }>} */ const finalizedSegments = []; diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js index d7b8c6939..6c0cfab1f 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js @@ -168,10 +168,10 @@ export class FeatureLRUCache { } this.cache.delete(key); - this.current_size_bytes -= entry.size_bytes; if (entry.borrowers > 0) { entry.pendingDispose = true; } else { + this.current_size_bytes -= entry.size_bytes; disposeCachedValue(entry.value); } } @@ -182,6 +182,7 @@ export class FeatureLRUCache { } if (entry.borrowers === 0 && entry.pendingDispose) { entry.pendingDispose = false; + this.current_size_bytes -= entry.size_bytes; disposeCachedValue(entry.value); } } diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_word_offsets.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_word_offsets.js index d1b5d8d1a..24a11423b 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/transducer_word_offsets.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_word_offsets.js @@ -146,9 +146,6 @@ export function buildTransducerWordOffsets( token_confidences = null, fullText = '', ) { - if (!tokenizer) { - return { words: [], tokens: [], wordAverage: null }; - } if (token_ids.length !== token_timestamps.length) { throw new Error( `buildTransducerWordOffsets expects equal lengths for token_ids (${token_ids.length}) and token_timestamps (${token_timestamps.length}).`, @@ -162,6 +159,9 @@ export function buildTransducerWordOffsets( if (token_ids.length === 0) { return { words: [], tokens: [], wordAverage: null }; } + if (!tokenizer) { + throw new Error('buildTransducerWordOffsets requires a tokenizer for non-empty token_ids.'); + } /** @type {Array<{ id: number, token: string, rawToken: string, isWordStart: boolean, startTime: number, endTime: number, confidence?: number }>} */ const tokens = []; diff --git a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js 
b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js index 01f8f3d98..c24fa0fd7 100644 --- a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js +++ b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js @@ -412,6 +412,35 @@ export default () => { expect(() => new NemoConformerForTDT(invalidConfig, BASE_SESSIONS, {})).toThrow("encoder_input_layout"); }); + it("applies encoder_input_layout to canonical input_features feeds", () => { + const config = { + ...BASE_CONFIG, + "transformers.js_config": { + ...BASE_CONFIG["transformers.js_config"], + transducer: { + ...BASE_CONFIG["transformers.js_config"].transducer, + encoder_input_layout: "BFT", + }, + }, + }; + const model = new NemoConformerForTDT(config, BASE_SESSIONS, {}); + const input_features = new Tensor("float32", new Float32Array([1, 2, 3, 4, 5, 6]), [1, 3, 2]); + + const { feeds, disposables } = model._buildEncoderFeeds({ input_features }); + + try { + expect(disposables).toHaveLength(1); + expect(feeds.input_features).not.toBe(input_features); + expect(feeds.input_features.dims).toEqual([1, 2, 3]); + expect(Array.from(feeds.input_features.data)).toEqual([1, 3, 5, 2, 4, 6]); + } finally { + for (const tensor of disposables) { + tensor.dispose(); + } + input_features.dispose(); + } + }); + it("rejects invalid encoder_frame_layout at construction time", () => { const invalidConfig = { ...BASE_CONFIG, @@ -815,6 +844,10 @@ export default () => { ).toThrow("equal lengths"); }); + it("requires a tokenizer for non-empty word offsets", () => { + expect(() => buildTransducerWordOffsets(null, [1], [[0.0, 0.3]], null, "hello")).toThrow("requires a tokenizer"); + }); + it( "computes delta and delta-delta features", async () => { @@ -1041,6 +1074,42 @@ export default () => { expect(disposeCalls).toBe(2); }); + it("keeps borrowed entry bytes counted until release", () => { + const cache = new FeatureLRUCache({ max_entries: 4, max_size_mb: 0.00002 }); + const tensorA = new Tensor("float32", new Float32Array([1, 2, 3]), [1, 3]); + const tensorB = new Tensor("float32", new Float32Array([4, 5, 6]), [1, 3]); + + let tensorADisposals = 0; + const disposeA = tensorA.dispose.bind(tensorA); + tensorA.dispose = () => { + tensorADisposals += 1; + disposeA(); + }; + + let tensorBDisposals = 0; + const disposeB = tensorB.dispose.bind(tensorB); + tensorB.dispose = () => { + tensorBDisposals += 1; + disposeB(); + }; + + expect(cache.set("a", tensorA)).toBe(true); + const borrowedA = cache.acquire("a"); + expect(borrowedA?.value).toBe(tensorA); + + expect(cache.set("b", tensorB)).toBe(false); + expect(cache.get("a")).toBeNull(); + expect(cache.get("b")).toBeNull(); + expect(cache.stats().entries).toBe(0); + expect(cache.stats().size_mb).toBeGreaterThan(0); + expect(tensorADisposals).toBe(0); + expect(tensorBDisposals).toBe(1); + + borrowedA?.release(); + expect(cache.stats().size_mb).toBe(0); + expect(tensorADisposals).toBe(1); + }); + it("treats zero cache limits as explicit no-cache mode without disposing inserted values", () => { const byEntries = new FeatureLRUCache({ max_entries: 0, max_size_mb: 4 }); const bySize = new FeatureLRUCache({ max_entries: 4, max_size_mb: 0 }); diff --git a/packages/transformers/tests/pipelines/test_pipelines_nemo_conformer_tdt.js b/packages/transformers/tests/pipelines/test_pipelines_nemo_conformer_tdt.js index 517051971..2ef8f3986 100644 --- a/packages/transformers/tests/pipelines/test_pipelines_nemo_conformer_tdt.js 
+++ b/packages/transformers/tests/pipelines/test_pipelines_nemo_conformer_tdt.js @@ -532,6 +532,67 @@ export default () => { } }); + it("does not truncate long audio when sentence cursor advances one second at a time", async () => { + const calls = []; + const expectedChunks = Array.from({ length: 13 }, (_, index) => ({ + text: `Alpha${index}.`, + timestamp: [index, index + 0.2], + })).concat([{ text: "Omega.", timestamp: [180, 180.5] }]); + const expectedText = expectedChunks.map((chunk) => chunk.text).join(" "); + + const model = { + async transcribe(_inputs, options) { + calls.push(options); + + if (Number.isInteger(options.timeOffset) && options.timeOffset >= 0 && options.timeOffset < 12) { + const offset = options.timeOffset; + return { + text: `Alpha${offset}. Carry`, + utteranceTimestamp: [offset, offset + 1.2], + words: [ + { text: `Alpha${offset}.`, startTime: offset, endTime: offset + 0.2 }, + { text: "Carry", startTime: offset + 1, endTime: offset + 1.2 }, + ], + }; + } + + if (options.timeOffset === 12) { + return { + text: "Alpha12. Omega.", + utteranceTimestamp: [12, 180.5], + words: [ + { text: "Alpha12.", startTime: 12, endTime: 12.2 }, + { text: "Omega.", startTime: 180, endTime: 180.5 }, + ], + }; + } + + if (options.timeOffset === 180) { + return { + text: "Omega.", + utteranceTimestamp: [180, 180.5], + words: [{ text: "Omega.", startTime: 180, endTime: 180.5 }], + }; + } + + throw new Error(`Unexpected timeOffset ${options.timeOffset}`); + }, + }; + + await expect( + runPipeline({ + model, + audio: new Float32Array(181 * SAMPLING_RATE), + kwargs: { return_timestamps: true }, + }), + ).resolves.toEqual({ + text: expectedText, + chunks: expectedChunks, + }); + + expect(calls.map((x) => x.timeOffset)).toEqual([...Array.from({ length: 13 }, (_, index) => index), 180]); + }); + it("returns sentence chunks for auto-windowed long Nemo audio", async () => { const calls = []; const outputsByOffset = new Map([ From 341df3d707f621e0e9cf7b5ab2fadf48de15b379 Mon Sep 17 00:00:00 2001 From: ysdede Date: Mon, 9 Mar 2026 01:15:32 +0300 Subject: [PATCH 37/40] chore(asr): restore upstream cast spacing Restore the original cast spacing in the unrelated moonshine path so the Nemo PR does not carry an extra formatting-only diff in automatic-speech-recognition.js. --- .../transformers/src/pipelines/automatic-speech-recognition.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/transformers/src/pipelines/automatic-speech-recognition.js b/packages/transformers/src/pipelines/automatic-speech-recognition.js index addd4ee10..1548bfc87 100644 --- a/packages/transformers/src/pipelines/automatic-speech-recognition.js +++ b/packages/transformers/src/pipelines/automatic-speech-recognition.js @@ -331,7 +331,7 @@ export class AutomaticSpeechRecognitionPipeline const max_new_tokens = Math.floor(aud.length / sampling_rate) * 6; const outputs = await this.model.generate({ max_new_tokens, ...kwargs, ...inputs }); - const text = this.processor.batch_decode(/** @type {Tensor} */(outputs), { skip_special_tokens: true })[0]; + const text = this.processor.batch_decode(/** @type {Tensor} */ (outputs), { skip_special_tokens: true })[0]; toReturn.push({ text }); } return single ? 
toReturn[0] : toReturn; From 29f2baaf879f3571626448fd6ecb452060f7c404 Mon Sep 17 00:00:00 2001 From: ysdede Date: Mon, 9 Mar 2026 01:34:13 +0300 Subject: [PATCH 38/40] fix(nemo-tdt): handle sparse vocab and merge dedupe Resolve sparse tokenizer vocab fallback by deriving the runtime size from the maximum token id instead of counting entries. This keeps decoder sizing correct when tokenizer ids are non-contiguous. Tighten merged-word dedupe so punctuation-only overlaps are only collapsed when their raw normalized text also matches, which avoids dropping distinct punctuation tokens across window boundaries. Add focused Nemo model regressions and verify with: - node --experimental-vm-modules --expose-gc node_modules/jest/bin/jest.js --config jest.config.mjs --runInBand tests/models.test.js -t "nemo_conformer_tdt" - node --experimental-vm-modules --expose-gc node_modules/jest/bin/jest.js --config jest.config.mjs --runInBand tests/pipelines.test.js -t "Nemo Conformer TDT pipeline adapter|Automatic Speech Recognition" --- .../modeling_nemo_conformer_tdt.js | 23 +++++++++--- .../transducer_window_merge.js | 15 +++++++- .../test_modeling_nemo_conformer_tdt.js | 37 +++++++++++++++++++ 3 files changed, 69 insertions(+), 6 deletions(-) diff --git a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js index c59f40927..b32332583 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js @@ -497,17 +497,30 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { if (tokenizer?.get_vocab) { const vocab = tokenizer.get_vocab(); if (vocab instanceof Map) { - if (vocab.size > 0) { - return vocab.size; + let maxId = -1; + for (const id of vocab.values()) { + const numericId = Number(id); + if (Number.isInteger(numericId) && numericId >= 0) { + maxId = Math.max(maxId, numericId); + } + } + if (maxId >= 0) { + return maxId + 1; } } else if (Array.isArray(vocab)) { if (vocab.length > 0) { return vocab.length; } } else if (vocab && typeof vocab === 'object') { - const size = Object.keys(vocab).length; - if (size > 0) { - return size; + let maxId = -1; + for (const id of Object.values(vocab)) { + const numericId = Number(id); + if (Number.isInteger(numericId) && numericId >= 0) { + maxId = Math.max(maxId, numericId); + } + } + if (maxId >= 0) { + return maxId + 1; } } } diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_window_merge.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_window_merge.js index 0ea9fd750..a1cc3a15c 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/transducer_window_merge.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_window_merge.js @@ -7,14 +7,27 @@ function normalizeMergedWordText(text) { .trim(); } +function normalizeRawMergedWordText(text) { + return String(text ?? 
'') + .normalize('NFKC') + .toLowerCase() + .trim(); +} + export function dedupeMergedWords(words) { /** @type {typeof words} */ const merged = []; for (const word of words) { const prev = merged.at(-1); + const prevText = normalizeMergedWordText(prev?.text); + const wordText = normalizeMergedWordText(word.text); if ( prev && - normalizeMergedWordText(prev.text) === normalizeMergedWordText(word.text) && + prevText === wordText && + ( + prevText.length > 0 || + normalizeRawMergedWordText(prev.text) === normalizeRawMergedWordText(word.text) + ) && word.startTime < prev.endTime ) { const prevDuration = prev.endTime - prev.startTime; diff --git a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js index c24fa0fd7..121ad5b79 100644 --- a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js +++ b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js @@ -4,6 +4,7 @@ import { computeTemporalDeltas } from "../../../src/models/nemo_conformer_tdt/tr import { buildNemoSegmentChunks, partitionNemoWordsIntoSegments, shouldEndSentenceAfterWord } from "../../../src/models/nemo_conformer_tdt/transducer_segment_offsets.js"; import { buildTransducerDetailedOutputs } from "../../../src/models/nemo_conformer_tdt/transducer_text.js"; import { buildTransducerWordOffsets } from "../../../src/models/nemo_conformer_tdt/transducer_word_offsets.js"; +import { dedupeMergedWords } from "../../../src/models/nemo_conformer_tdt/transducer_window_merge.js"; import { MODEL_TYPE_MAPPING, MODEL_TYPES } from "../../../src/models/modeling_utils.js"; import { get_model_files } from "../../../src/utils/model_registry/get_model_files.js"; @@ -232,6 +233,29 @@ export default () => { ).toBe(3); }); + it("resolves vocab size from the maximum sparse tokenizer id when config vocab_size is not set", () => { + const configWithoutVocab = { + ...BASE_CONFIG, + "transformers.js_config": { + ...BASE_CONFIG["transformers.js_config"], + transducer: { + ...BASE_CONFIG["transformers.js_config"].transducer, + vocab_size: undefined, + }, + }, + }; + const model = new MockNemoConformerForTDT(configWithoutVocab, BASE_SESSIONS, []); + expect( + model._resolveVocabSize({ + get_vocab: () => ({ + "": 0, + hello: 2, + world: 7, + }), + }), + ).toBe(8); + }); + it( "greedily decodes scripted token and duration logits", async () => { @@ -768,6 +792,19 @@ export default () => { expect(output.tokens.map((x) => x.token)).toEqual(["score", ".", "48", "-", "year", "-", "old", "with", "0", ".", "5"]); }); + it("does not collapse distinct overlapping punctuation-only tokens during merge dedupe", () => { + expect( + dedupeMergedWords([ + { text: ".", startTime: 1.0, endTime: 1.3 }, + { text: "?", startTime: 1.2, endTime: 1.5 }, + { text: "?", startTime: 1.2, endTime: 1.6 }, + ]), + ).toEqual([ + { text: ".", startTime: 1.0, endTime: 1.3 }, + { text: "?", startTime: 1.2, endTime: 1.6 }, + ]); + }); + it("builds word offsets from array-backed tokenizer vocabularies", () => { const vocab = ["", "▁hello", "▁world"]; const tokenizer = { From 39e5cb1d3f82ee4d06e720f461f3386331fda7dd Mon Sep 17 00:00:00 2001 From: ysdede Date: Mon, 9 Mar 2026 01:49:37 +0300 Subject: [PATCH 39/40] fix(nemo-tdt): merge split domain suffix words Treat likely domain suffixes as continuations when tokenizer decoding inserts whitespace after a trailing period, so sequences like `LibriVox. 
org.` reconstruct as `LibriVox.org.` in detailed word offsets. Add a focused regression covering the split `.org` token pattern and verify with: - node --experimental-vm-modules --expose-gc node_modules/jest/bin/jest.js --config jest.config.mjs --runInBand tests/models.test.js -t "nemo_conformer_tdt" - node --experimental-vm-modules --expose-gc node_modules/jest/bin/jest.js --config jest.config.mjs --runInBand tests/pipelines.test.js -t "Nemo Conformer TDT pipeline adapter|Automatic Speech Recognition" --- .../transducer_word_offsets.js | 71 ++++++++++++++++++- .../test_modeling_nemo_conformer_tdt.js | 46 ++++++++++++ 2 files changed, 115 insertions(+), 2 deletions(-) diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_word_offsets.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_word_offsets.js index 24a11423b..1c8b456ef 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/transducer_word_offsets.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_word_offsets.js @@ -3,6 +3,31 @@ * @type {WeakMap>} */ const TOKEN_ID_TO_TEXT_CACHE = new WeakMap(); +const LIKELY_DOMAIN_SUFFIXES = new Set([ + 'ai', + 'app', + 'au', + 'biz', + 'ca', + 'cn', + 'co', + 'com', + 'de', + 'dev', + 'edu', + 'fr', + 'gov', + 'info', + 'io', + 'jp', + 'me', + 'mil', + 'net', + 'org', + 'tv', + 'uk', + 'us', +]); /** * @param {any} tokenizer @@ -105,6 +130,46 @@ function consumeAlignedTokenText(fullText, cursor, tokenText) { }; } +/** + * @param {Array<{ raw: string, clean: string, startsNewWord: boolean }>} pieces + * @param {number} startIndex + * @returns {string} + */ +function collectUpcomingWordText(pieces, startIndex) { + let text = ''; + for (let i = startIndex; i < pieces.length; ++i) { + if (i > startIndex && pieces[i].startsNewWord) { + break; + } + text += pieces[i].clean; + } + return text; +} + +/** + * @param {string} text + * @returns {boolean} + */ +function isLikelyDomainSuffix(text) { + const normalized = String(text ?? 
'') + .toLowerCase() + .replace(/[.,!?;:]+$/g, ''); + return LIKELY_DOMAIN_SUFFIXES.has(normalized); +} + +/** + * @param {{ text: string, start: number, end: number, confs: number[] } | null} current + * @param {Array<{ raw: string, clean: string, startsNewWord: boolean }>} pieces + * @param {number} index + * @returns {boolean} + */ +function shouldMergeDomainSuffixWord(current, pieces, index) { + if (!current || !/[A-Za-z0-9-]\.$/.test(current.text.trim())) { + return false; + } + return isLikelyDomainSuffix(collectUpcomingWordText(pieces, index)); +} + /** * @param {Array<{ text: string, startTime: number, endTime: number, confidence?: number }>} words * @param {{ text: string, start: number, end: number, confs: number[] } | null} current @@ -168,6 +233,7 @@ export function buildTransducerWordOffsets( /** @type {Array<{ text: string, startTime: number, endTime: number, confidence?: number }>} */ const words = []; let textCursor = 0; + const pieces = token_ids.map((id) => resolveTokenPiece(tokenizer, id)); /** @type {{ text: string, start: number, end: number, confs: number[] } | null} */ let current = null; @@ -175,7 +241,7 @@ export function buildTransducerWordOffsets( for (let i = 0; i < token_ids.length; ++i) { const id = token_ids[i]; const ts = token_timestamps[i]; - const piece = resolveTokenPiece(tokenizer, id); + const piece = pieces[i]; const raw = piece.raw; const clean = piece.clean; if (!clean) continue; @@ -183,7 +249,8 @@ export function buildTransducerWordOffsets( const aligned = consumeAlignedTokenText(fullText, textCursor, clean); textCursor = aligned.cursor; const tokenText = aligned.text || clean; - const startsNewWord = !current || aligned.skippedWhitespace || piece.startsNewWord; + const mergeDomainSuffix = shouldMergeDomainSuffixWord(current, pieces, i); + const startsNewWord = !current || (!mergeDomainSuffix && (aligned.skippedWhitespace || piece.startsNewWord)); const tok = { id, diff --git a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js index 121ad5b79..c2ba5c6b5 100644 --- a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js +++ b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js @@ -792,6 +792,52 @@ export default () => { expect(output.tokens.map((x) => x.token)).toEqual(["score", ".", "48", "-", "year", "-", "old", "with", "0", ".", "5"]); }); + it("merges domain suffixes that tokenizer decoding splits after a period", () => { + const rawById = { + 1: "▁L", + 2: "ib", + 3: "ri", + 4: "V", + 5: "o", + 6: "x", + 7: ".", + 8: "▁or", + 9: "g", + 10: ".", + }; + const tokenizer = { + get_vocab() { + return rawById; + }, + decode(ids) { + if (ids.length === 1) { + return rawById[ids[0]].replace(/^▁/, ""); + } + return "LibriVox. 
org."; + }, + }; + + const output = buildTransducerDetailedOutputs( + tokenizer, + [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + [ + [10.88, 10.96], + [10.96, 11.12], + [11.12, 11.28], + [11.28, 11.44], + [11.44, 11.6], + [11.6, 11.76], + [11.76, 11.84], + [12.0, 12.08], + [12.08, 12.16], + [12.16, 12.24], + ], + ); + + expect(output.words.map((x) => x.text)).toEqual(["LibriVox.org."]); + expect(output.tokens.map((x) => x.isWordStart)).toEqual([true, false, false, false, false, false, false, false, false, false]); + }); + it("does not collapse distinct overlapping punctuation-only tokens during merge dedupe", () => { expect( dedupeMergedWords([ From 495bab59275cc4651ab429acf97d6020b01c38f8 Mon Sep 17 00:00:00 2001 From: ysdede Date: Mon, 9 Mar 2026 01:56:50 +0300 Subject: [PATCH 40/40] Revert "fix(nemo-tdt): merge split domain suffix words" This reverts commit 39e5cb1d3f82ee4d06e720f461f3386331fda7dd. --- .../transducer_word_offsets.js | 71 +------------------ .../test_modeling_nemo_conformer_tdt.js | 46 ------------ 2 files changed, 2 insertions(+), 115 deletions(-) diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_word_offsets.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_word_offsets.js index 1c8b456ef..24a11423b 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/transducer_word_offsets.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_word_offsets.js @@ -3,31 +3,6 @@ * @type {WeakMap>} */ const TOKEN_ID_TO_TEXT_CACHE = new WeakMap(); -const LIKELY_DOMAIN_SUFFIXES = new Set([ - 'ai', - 'app', - 'au', - 'biz', - 'ca', - 'cn', - 'co', - 'com', - 'de', - 'dev', - 'edu', - 'fr', - 'gov', - 'info', - 'io', - 'jp', - 'me', - 'mil', - 'net', - 'org', - 'tv', - 'uk', - 'us', -]); /** * @param {any} tokenizer @@ -130,46 +105,6 @@ function consumeAlignedTokenText(fullText, cursor, tokenText) { }; } -/** - * @param {Array<{ raw: string, clean: string, startsNewWord: boolean }>} pieces - * @param {number} startIndex - * @returns {string} - */ -function collectUpcomingWordText(pieces, startIndex) { - let text = ''; - for (let i = startIndex; i < pieces.length; ++i) { - if (i > startIndex && pieces[i].startsNewWord) { - break; - } - text += pieces[i].clean; - } - return text; -} - -/** - * @param {string} text - * @returns {boolean} - */ -function isLikelyDomainSuffix(text) { - const normalized = String(text ?? 
'') - .toLowerCase() - .replace(/[.,!?;:]+$/g, ''); - return LIKELY_DOMAIN_SUFFIXES.has(normalized); -} - -/** - * @param {{ text: string, start: number, end: number, confs: number[] } | null} current - * @param {Array<{ raw: string, clean: string, startsNewWord: boolean }>} pieces - * @param {number} index - * @returns {boolean} - */ -function shouldMergeDomainSuffixWord(current, pieces, index) { - if (!current || !/[A-Za-z0-9-]\.$/.test(current.text.trim())) { - return false; - } - return isLikelyDomainSuffix(collectUpcomingWordText(pieces, index)); -} - /** * @param {Array<{ text: string, startTime: number, endTime: number, confidence?: number }>} words * @param {{ text: string, start: number, end: number, confs: number[] } | null} current @@ -233,7 +168,6 @@ export function buildTransducerWordOffsets( /** @type {Array<{ text: string, startTime: number, endTime: number, confidence?: number }>} */ const words = []; let textCursor = 0; - const pieces = token_ids.map((id) => resolveTokenPiece(tokenizer, id)); /** @type {{ text: string, start: number, end: number, confs: number[] } | null} */ let current = null; @@ -241,7 +175,7 @@ export function buildTransducerWordOffsets( for (let i = 0; i < token_ids.length; ++i) { const id = token_ids[i]; const ts = token_timestamps[i]; - const piece = pieces[i]; + const piece = resolveTokenPiece(tokenizer, id); const raw = piece.raw; const clean = piece.clean; if (!clean) continue; @@ -249,8 +183,7 @@ export function buildTransducerWordOffsets( const aligned = consumeAlignedTokenText(fullText, textCursor, clean); textCursor = aligned.cursor; const tokenText = aligned.text || clean; - const mergeDomainSuffix = shouldMergeDomainSuffixWord(current, pieces, i); - const startsNewWord = !current || (!mergeDomainSuffix && (aligned.skippedWhitespace || piece.startsNewWord)); + const startsNewWord = !current || aligned.skippedWhitespace || piece.startsNewWord; const tok = { id, diff --git a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js index c2ba5c6b5..121ad5b79 100644 --- a/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js +++ b/packages/transformers/tests/models/nemo_conformer_tdt/test_modeling_nemo_conformer_tdt.js @@ -792,52 +792,6 @@ export default () => { expect(output.tokens.map((x) => x.token)).toEqual(["score", ".", "48", "-", "year", "-", "old", "with", "0", ".", "5"]); }); - it("merges domain suffixes that tokenizer decoding splits after a period", () => { - const rawById = { - 1: "▁L", - 2: "ib", - 3: "ri", - 4: "V", - 5: "o", - 6: "x", - 7: ".", - 8: "▁or", - 9: "g", - 10: ".", - }; - const tokenizer = { - get_vocab() { - return rawById; - }, - decode(ids) { - if (ids.length === 1) { - return rawById[ids[0]].replace(/^▁/, ""); - } - return "LibriVox. 
org."; - }, - }; - - const output = buildTransducerDetailedOutputs( - tokenizer, - [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], - [ - [10.88, 10.96], - [10.96, 11.12], - [11.12, 11.28], - [11.28, 11.44], - [11.44, 11.6], - [11.6, 11.76], - [11.76, 11.84], - [12.0, 12.08], - [12.08, 12.16], - [12.16, 12.24], - ], - ); - - expect(output.words.map((x) => x.text)).toEqual(["LibriVox.org."]); - expect(output.tokens.map((x) => x.isWordStart)).toEqual([true, false, false, false, false, false, false, false, false, false]); - }); - it("does not collapse distinct overlapping punctuation-only tokens during merge dedupe", () => { expect( dedupeMergedWords([