diff --git a/packages/transformers/package.json b/packages/transformers/package.json index 8b56e46b5..21c2e4458 100644 --- a/packages/transformers/package.json +++ b/packages/transformers/package.json @@ -57,7 +57,7 @@ "dependencies": { "@huggingface/jinja": "^0.5.5", "@huggingface/tokenizers": "^0.1.2", - "onnxruntime-node": "1.24.2", + "onnxruntime-node": "1.25.0-dev.20260228-6e72d31970", "onnxruntime-web": "1.25.0-dev.20260221-b2a6e69e82", "sharp": "^0.34.5" }, diff --git a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js index b29359799..c26b08e75 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js @@ -57,7 +57,7 @@ function roundTs(value) { } /** - * @param {Float32Array|number[]} logits + * @param {import('../../utils/tensor.js').DataArray} logits * @param {number} tokenId * @param {number} vocabSize * @returns {{ confidence: number, logProb: number }} @@ -175,6 +175,7 @@ function resolveTransducerConfig(config, sessions) { const frameShiftS = transducerConfig.frame_shift_s ?? 0.01; const blankTokenId = transducerConfig.blank_token_id ?? 0; const encoderOutputLayout = transducerConfig.encoder_output_layout; + const encoderLengthDType = transducerConfig.encoder_length_dtype ?? 'int64'; const decoderTokenDType = transducerConfig.decoder_token_dtype ?? 'int32'; const decoderTokenLengthDType = transducerConfig.decoder_token_length_dtype ?? 'int32'; @@ -195,6 +196,11 @@ function resolveTransducerConfig(config, sessions) { if (encoderOutputLayout !== 'BDT' && encoderOutputLayout !== 'BTD') { throw new Error('Invalid `transformers.js_config.transducer.encoder_output_layout`: expected "BDT" or "BTD".'); } + if (!['int32', 'int64'].includes(encoderLengthDType)) { + throw new Error( + 'Invalid `transformers.js_config.transducer.encoder_length_dtype`: expected "int32" or "int64".', + ); + } if (!['int32', 'int64'].includes(decoderTokenDType)) { throw new Error( 'Invalid `transformers.js_config.transducer.decoder_token_dtype`: expected "int32" or "int64".', @@ -216,6 +222,7 @@ function resolveTransducerConfig(config, sessions) { encoder_input_layout: transducerConfig.encoder_input_layout ?? 'BTF', encoder_output_layout: encoderOutputLayout, encoder_frame_layout: transducerConfig.encoder_frame_layout ?? 'BD1', + encoder_length_dtype: encoderLengthDType, decoder_token_dtype: decoderTokenDType, decoder_token_length_dtype: decoderTokenLengthDType, decoder: { @@ -448,7 +455,9 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel { } else { length = inputFeatures.dims[1]; } - const lengthTensor = new Tensor('int64', BigInt64Array.from([BigInt(length)]), [1]); + const lengthTensor = this.transducer.encoder_length_dtype === 'int64' + ? new Tensor('int64', BigInt64Array.from([BigInt(length)]), [1]) + : new Tensor('int32', new Int32Array([length]), [1]); disposables.push(lengthTensor); feeds[name] = lengthTensor; continue; diff --git a/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js b/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js index d12a27553..d5cdb716b 100644 --- a/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js +++ b/packages/transformers/src/models/nemo_conformer_tdt/transducer_text.js @@ -167,6 +167,49 @@ export function buildTransducerDetailedOutputs(tokenizer, token_ids, token_times finalizeAndPushWord(words, current); + // Fallback for character-level tokenizers (e.g., Danish RNNT) where individual + // tokens lack ▁/Ġ word-start markers. Decode the full sequence to get the text + // with proper spaces, then use those spaces to detect word boundaries. + if (words.length <= 1 && tokens.length > 1) { + const fullDecoded = tokenizer + .decode(token_ids, { skip_special_tokens: true, clean_up_tokenization_spaces: false }) + .trimStart(); + + words.length = 0; + current = null; + let pos = 0; + + for (let j = 0; j < tokens.length; j++) { + let foundSpace = false; + while (pos < fullDecoded.length && /\s/.test(fullDecoded[pos])) { + foundSpace = true; + pos++; + } + + const startsNewWord = j === 0 || foundSpace; + tokens[j].is_word_start = startsNewWord; + pos = Math.min(pos + tokens[j].token.length, fullDecoded.length); + + if (!current || startsNewWord) { + finalizeAndPushWord(words, current); + current = { + text: tokens[j].token, + start: tokens[j].start_time, + end: tokens[j].end_time, + confs: tokens[j].confidence != null ? [tokens[j].confidence] : [], + }; + } else { + current.text += tokens[j].token; + current.end = tokens[j].end_time; + if (tokens[j].confidence != null) { + current.confs.push(tokens[j].confidence); + } + } + } + + finalizeAndPushWord(words, current); + } + const word_confidences = words.some((x) => x.confidence != null) ? words.map((x) => x.confidence ?? null) : null; let word_avg = null; if (word_confidences) {