Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion packages/transformers/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@
"dependencies": {
"@huggingface/jinja": "^0.5.5",
"@huggingface/tokenizers": "^0.1.2",
"onnxruntime-node": "1.24.2",
"onnxruntime-node": "1.25.0-dev.20260228-6e72d31970",
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

WARNING: Dependency on dev version

Updating onnxruntime-node from the stable 1.24.2 release to 1.25.0-dev.20260228-6e72d31970 introduces a dependency on a pre-release (dev) build. This could:

  • Introduce instability in production environments
  • Cause compatibility issues with existing ONNX models
  • Make debugging harder due to unstable APIs

Consider pinning to a stable release version instead.

"onnxruntime-web": "1.25.0-dev.20260221-b2a6e69e82",
"sharp": "^0.34.5"
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ function roundTs(value) {
}

/**
* @param {Float32Array|number[]} logits
* @param {import('../../utils/tensor.js').DataArray} logits
* @param {number} tokenId
* @param {number} vocabSize
* @returns {{ confidence: number, logProb: number }}
Expand Down Expand Up @@ -175,6 +175,7 @@ function resolveTransducerConfig(config, sessions) {
const frameShiftS = transducerConfig.frame_shift_s ?? 0.01;
const blankTokenId = transducerConfig.blank_token_id ?? 0;
const encoderOutputLayout = transducerConfig.encoder_output_layout;
const encoderLengthDType = transducerConfig.encoder_length_dtype ?? 'int64';
const decoderTokenDType = transducerConfig.decoder_token_dtype ?? 'int32';
const decoderTokenLengthDType = transducerConfig.decoder_token_length_dtype ?? 'int32';

Expand All @@ -195,6 +196,11 @@ function resolveTransducerConfig(config, sessions) {
if (encoderOutputLayout !== 'BDT' && encoderOutputLayout !== 'BTD') {
throw new Error('Invalid `transformers.js_config.transducer.encoder_output_layout`: expected "BDT" or "BTD".');
}
if (!['int32', 'int64'].includes(encoderLengthDType)) {
throw new Error(
'Invalid `transformers.js_config.transducer.encoder_length_dtype`: expected "int32" or "int64".',
);
}
if (!['int32', 'int64'].includes(decoderTokenDType)) {
throw new Error(
'Invalid `transformers.js_config.transducer.decoder_token_dtype`: expected "int32" or "int64".',
Expand All @@ -216,6 +222,7 @@ function resolveTransducerConfig(config, sessions) {
encoder_input_layout: transducerConfig.encoder_input_layout ?? 'BTF',
encoder_output_layout: encoderOutputLayout,
encoder_frame_layout: transducerConfig.encoder_frame_layout ?? 'BD1',
encoder_length_dtype: encoderLengthDType,
decoder_token_dtype: decoderTokenDType,
decoder_token_length_dtype: decoderTokenLengthDType,
decoder: {
Expand Down Expand Up @@ -448,7 +455,9 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel {
} else {
length = inputFeatures.dims[1];
}
const lengthTensor = new Tensor('int64', BigInt64Array.from([BigInt(length)]), [1]);
const lengthTensor = this.transducer.encoder_length_dtype === 'int64'
? new Tensor('int64', BigInt64Array.from([BigInt(length)]), [1])
: new Tensor('int32', new Int32Array([length]), [1]);
disposables.push(lengthTensor);
feeds[name] = lengthTensor;
continue;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,49 @@ export function buildTransducerDetailedOutputs(tokenizer, token_ids, token_times

finalizeAndPushWord(words, current);

// Fallback for character-level tokenizers (e.g., Danish RNNT) where individual
// tokens lack ▁/Ġ word-start markers. Decode the full sequence to get the text
// with proper spaces, then use those spaces to detect word boundaries.
if (words.length <= 1 && tokens.length > 1) {
const fullDecoded = tokenizer
.decode(token_ids, { skip_special_tokens: true, clean_up_tokenization_spaces: false })
.trimStart();

words.length = 0;
current = null;
let pos = 0;

for (let j = 0; j < tokens.length; j++) {
let foundSpace = false;
while (pos < fullDecoded.length && /\s/.test(fullDecoded[pos])) {
foundSpace = true;
pos++;
}

const startsNewWord = j === 0 || foundSpace;
tokens[j].is_word_start = startsNewWord;
pos = Math.min(pos + tokens[j].token.length, fullDecoded.length);

if (!current || startsNewWord) {
finalizeAndPushWord(words, current);
current = {
text: tokens[j].token,
start: tokens[j].start_time,
end: tokens[j].end_time,
confs: tokens[j].confidence != null ? [tokens[j].confidence] : [],
};
} else {
current.text += tokens[j].token;
current.end = tokens[j].end_time;
if (tokens[j].confidence != null) {
current.confs.push(tokens[j].confidence);
}
}
}

finalizeAndPushWord(words, current);
}
Comment on lines +173 to +211
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

This new fallback block introduces significant code duplication. The logic for building words by creating or extending the current word object (lines 193-207) is nearly identical to the logic in the preceding loop (lines 151-165).

To improve maintainability and avoid redundancy, I recommend extracting this duplicated logic into a separate helper function. The helper could take the tokens array and build the words array from each token's is_word_start flag. You could then call it once for the initial word construction and again within this fallback block after updating the is_word_start flags.


const word_confidences = words.some((x) => x.confidence != null) ? words.map((x) => x.confidence ?? null) : null;
let word_avg = null;
if (word_confidences) {
Expand Down