Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions crates/inference/src/tokenizer/sentencepiece.rs
Original file line number Diff line number Diff line change
Expand Up @@ -429,6 +429,21 @@ impl SentencePieceTokenizer {
}
}

// HF Metaspace semantics: a space-piece (▁ or ' ') represents the space
// *before* a word. Trailing whitespace in the input has no following word,
// so no space-piece should be emitted for it. Strip any trailing space
// character(s) added during the loop.
if prev_was_space && self.inner.escape_whitespaces {
while out.ends_with(META_SPACE) {
let trim_len = META_SPACE.len_utf8();
out.truncate(out.len() - trim_len);
}
} else if prev_was_space && !self.inner.escape_whitespaces {
while out.ends_with(' ') {
out.pop();
}
}

out
}

Expand Down Expand Up @@ -896,4 +911,31 @@ mod tests {
assert_eq!(parse_byte_piece("<0x41>"), Some(0x41));
assert_eq!(parse_byte_piece("not-a-byte"), None);
}

// HF Metaspace semantics: trailing whitespace in input must not produce a
// trailing space-piece token. These unit tests cover the normalize() fix.

#[test]
fn test_normalize_strips_trailing_metaspace() {
    // Inputs that end in whitespace must normalize without a trailing ▁:
    // the first case ends a multi-word input with a space, the second has a
    // single word followed by a single trailing space.
    let sp = synthetic_tokenizer();
    let cases = [("hello world ", "▁hello▁world"), ("hello ", "▁hello")];
    for (raw, expected) in cases {
        assert_eq!(sp.normalize(raw), expected);
    }
}

#[test]
fn test_normalize_leading_whitespace_no_duplicate_prefix() {
    // A leading space must not yield a second ▁ in front of the first word:
    // the expected output starts with exactly one ▁, the same as it would
    // for input without the leading space.
    let sp = synthetic_tokenizer();
    let normalized = sp.normalize(" hello world");
    assert_eq!(normalized, "▁hello▁world");
}

#[test]
fn test_normalize_trailing_and_leading_whitespace() {
    // Whitespace on both ends at once: the trailing space is dropped and the
    // leading space does not add an extra ▁.
    let padded_input = " hello world ";
    let expected = "▁hello▁world";

    let sp = synthetic_tokenizer();
    assert_eq!(sp.normalize(padded_input), expected);
}
}
11 changes: 11 additions & 0 deletions crates/inference/tests/audit_tokenizer_parity.rs
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,17 @@ fn multilingual_e5_small_sentencepiece_parity() {
13, 2,
],
},
// Whitespace regression cases — HF ref collected with AutoTokenizer
// (transformers==4.x) on 2026-05-25. These cover the trailing-space bug
// where lattice used to emit an extra ▁ (token 6) before EOS.
Case {
input: " leading whitespace and multiple spaces ",
expected: &[0, 105207, 35011, 65421, 136, 48716, 32628, 7, 2],
},
Case {
input: "trailing space ",
expected: &[0, 141037, 214, 32628, 2],
},
],
);
}
Expand Down