From cd5d80110e45a34f2f805c425b5083f53cf8b33d Mon Sep 17 00:00:00 2001 From: Henrik Levring Date: Thu, 5 Mar 2026 22:33:02 +0800 Subject: [PATCH 1/2] Add NeMo Conformer RNNT support for character-level tokenizers Some RNNT models (e.g. Danish parakeet-rnnt-110m) use character-level SentencePiece tokenizers that lack word-start markers and require int32 encoder length inputs. This commit adds the necessary support. modeling_nemo_conformer_tdt.js: - Add configurable encoder_length_dtype (int32/int64) to transducer config, defaulting to int64 for backward compatibility - Fix JSDoc type for confidenceFromLogits logits parameter transducer_text.js: - Add fallback word-boundary detection for character-level tokenizers that emit tokens without word-start markers, enabling correct word-level timestamps and confidences package.json: - Bump onnxruntime-node from 1.24.2 to 1.25.0-dev.20260228 to align with onnxruntime-web 1.25.0-dev and resolve onnxruntime-common peer dependency conflicts --- packages/transformers/package.json | 2 +- .../modeling_nemo_conformer_tdt.js | 13 +++++- .../nemo_conformer_tdt/transducer_text.js | 43 +++++++++++++++++++ 3 files changed, 55 insertions(+), 3 deletions(-) diff --git a/packages/transformers/package.json b/packages/transformers/package.json index 8b56e46b5..21c2e4458 100644 --- a/packages/transformers/package.json +++ b/packages/transformers/package.json @@ -57,7 +57,7 @@ "dependencies": { "@huggingface/jinja": "^0.5.5", "@huggingface/tokenizers": "^0.1.2", - "onnxruntime-node": "1.24.2", + "onnxruntime-node": "1.25.0-dev.20260228-6e72d31970", "onnxruntime-web": "1.25.0-dev.20260221-b2a6e69e82", "sharp": "^0.34.5" }, diff --git a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js index b29359799..c26b08e75 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js @@ -57,7 +57,7 @@ function roundTs(value) { } /** - * @param {Float32Array|number[]} logits + * @param {import('../../utils/tensor.js').DataArray} logits * @param {number} tokenId * @param {number} vocabSize * @returns {{ confidence: number, logProb: number }} @@ -175,6 +175,7 @@ function resolveTransducerConfig(config, sessions) { const frameShiftS = transducerConfig.frame_shift_s ?? 0.01; const blankTokenId = transducerConfig.blank_token_id ?? 0; const encoderOutputLayout = transducerConfig.encoder_output_layout; + const encoderLengthDType = transducerConfig.encoder_length_dtype ?? 'int64'; const decoderTokenDType = transducerConfig.decoder_token_dtype ?? 'int32'; const decoderTokenLengthDType = transducerConfig.decoder_token_length_dtype ?? 'int32'; @@ -195,6 +196,11 @@ function resolveTransducerConfig(config, sessions) { if (encoderOutputLayout !== 'BDT' && encoderOutputLayout !== 'BTD') { throw new Error('Invalid `transformers.js_config.transducer.encoder_output_layout`: expected "BDT" or "BTD".'); } + if (!['int32', 'int64'].includes(encoderLengthDType)) { + throw new Error( + 'Invalid `transformers.js_config.transducer.encoder_length_dtype`: expected "int32" or "int64".', + ); + } if (!['int32', 'int64'].includes(decoderTokenDType)) { throw new Error( 'Invalid `transformers.js_config.transducer.decoder_token_dtype`: expected "int32" or "int64".', @@ -216,6 +222,7 @@ function resolveTransducerConfig(config, sessions) { encoder_input_layout: transducerConfig.encoder_input_layout ?? 'BTF', encoder_output_layout: encoderOutputLayout, encoder_frame_layout: transducerConfig.encoder_frame_layout ?? 'BD1', + encoder_length_dtype: encoderLengthDType, decoder_token_dtype: decoderTokenDType, decoder_token_length_dtype: decoderTokenLengthDType, decoder: { @@ -448,7 +455,9 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { } else { length = inputFeatures.dims[1]; } - const lengthTensor = new Tensor('int64', BigInt64Array.from([BigInt(length)]), [1]); + const lengthTensor = this.transducer.encoder_length_dtype === 'int64' + ? new Tensor('int64', BigInt64Array.from([BigInt(length)]), [1]) + : new Tensor('int32', new Int32Array([length]), [1]); disposables.push(lengthTensor); feeds[name] = lengthTensor; continue; diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js index d12a27553..e52ae19cb 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js @@ -167,6 +167,49 @@ export function buildTransducerDetailedOutputs(tokenizer, token_ids, token_times finalizeAndPushWord(words, current); + // Fallback for character-level tokenizers (e.g., Danish RNNT) where individual + // tokens lack ▁/Ġ word-start markers. Decode the full sequence to get the text + // with proper spaces, then use those spaces to detect word boundaries. + if (words.length <= 1 && tokens.length > 1) { + const fullDecoded = tokenizer + .decode(token_ids, { skip_special_tokens: true, clean_up_tokenization_spaces: false }) + .trimStart(); + + words.length = 0; + current = null; + let pos = 0; + + for (let j = 0; j < tokens.length; j++) { + let foundSpace = false; + while (pos < fullDecoded.length && /\s/.test(fullDecoded[pos])) { + foundSpace = true; + pos++; + } + + const startsNewWord = j === 0 || foundSpace; + tokens[j].is_word_start = startsNewWord; + pos += tokens[j].token.length; + + if (!current || startsNewWord) { + finalizeAndPushWord(words, current); + current = { + text: tokens[j].token, + start: tokens[j].start_time, + end: tokens[j].end_time, + confs: tokens[j].confidence != null ? [tokens[j].confidence] : [], + }; + } else { + current.text += tokens[j].token; + current.end = tokens[j].end_time; + if (tokens[j].confidence != null) { + current.confs.push(tokens[j].confidence); + } + } + } + + finalizeAndPushWord(words, current); + } + const word_confidences = words.some((x) => x.confidence != null) ? words.map((x) => x.confidence ?? null) : null; let word_avg = null; if (word_confidences) { From a801d84c9dba399b291cf6b89eb6b1ed7abdd027 Mon Sep 17 00:00:00 2001 From: Henrik Levring Date: Fri, 6 Mar 2026 14:22:52 +0800 Subject: [PATCH 2/2] fix: guard pos against exceeding decoded text length Guard pos against exceeding decoded text length in word-boundary fallback --- .../src/models/nemo_conformer_tdt/transducer_text.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js index e52ae19cb..d5cdb716b 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js @@ -188,7 +188,7 @@ export function buildTransducerDetailedOutputs(tokenizer, token_ids, token_times const startsNewWord = j === 0 || foundSpace; tokens[j].is_word_start = startsNewWord; - pos += tokens[j].token.length; + pos = Math.min(pos + tokens[j].token.length, fullDecoded.length); if (!current || startsNewWord) { finalizeAndPushWord(words, current);