diff --git a/crates/embed/tests/tokenizer_parity_e2e.rs b/crates/embed/tests/tokenizer_parity_e2e.rs new file mode 100644 index 00000000..c0909303 --- /dev/null +++ b/crates/embed/tests/tokenizer_parity_e2e.rs @@ -0,0 +1,258 @@ +//! End-to-end tokenizer parity test for the embedding service path (P0-E1). +//! +//! Confirms that the tokenizer used by the embedding service produces the same +//! token IDs as the HF reference tokenizer. This is the "tokenizer fixes flow +//! through to the embed layer" gate: if this passes, the fixes shipped in +//! impl-tokenizer-fixes (SentencePiece BOS/EOS, Qwen BPE EOS, AddedToken +//! longest-match) are correctly wired through the `BertModel` load path that +//! `NativeEmbeddingService` uses internally. +//! +//! Tests operate at the **tokenized-ID level** so no model weight files are +//! required. Each test loads the tokenizer from the HF snapshot directory +//! (the same path `BertModel::from_pretrained` resolves to), calls +//! `tokenize()`, and asserts the IDs match the golden HF reference values +//! from `audit_tokenizer_parity.rs`. +//! +//! Tests are skipped (not failed) when the HF snapshot is absent, matching +//! the convention used by `audit_tokenizer_parity.rs`. +//! +//! Run: +//! cargo test -p lattice-embed --test tokenizer_parity_e2e + +use lattice_inference::{Tokenizer, load_tokenizer}; +use std::path::PathBuf; + +// --------------------------------------------------------------------------- +// HF snapshot path helpers — match audit_tokenizer_parity.rs +// --------------------------------------------------------------------------- + +fn hf_cache() -> PathBuf { + let home = std::env::var("HOME").expect("HOME not set"); + PathBuf::from(home).join(".cache/huggingface/hub") +} + +fn bge_dir() -> PathBuf { + hf_cache() + .join("models--BAAI--bge-small-en-v1.5") + .join("snapshots") + .join("5c38ec7c405ec4b44b94cc5a9bb96e735b38267a") +} + +fn e5_dir() -> PathBuf { + hf_cache() + .join("models--intfloat--multilingual-e5-small") + .join("snapshots") + .join("614241f622f53c4eeff9890bdc4f31cfecc418b3") +} + +fn qwen_dir() -> PathBuf { + hf_cache() + .join("models--Qwen--Qwen3-Embedding-0.6B") + .join("snapshots") + .join("97b0c614be4d77ee51c0cef4e5f07c00f9eb65b3") +} + +// --------------------------------------------------------------------------- +// Test helper +// --------------------------------------------------------------------------- + +struct Case { + input: &'static str, + expected: &'static [u32], +} + +/// Load a tokenizer from `dir` via the same code path `BertModel::from_pretrained` +/// uses, then assert token IDs match HF reference values. +fn check_embed_service_tokenizer_parity(label: &str, tok: &dyn Tokenizer, cases: &[Case]) { + let mut failures = 0; + for (i, c) in cases.iter().enumerate() { + let out = tok.tokenize(c.input); + let actual = &out.input_ids[..out.real_length]; + if actual != c.expected { + failures += 1; + eprintln!( + "PARITY FAIL [{label}] case {}: input={:?}\n hf_expected: {:?}\n lattice_got: {:?}", + i + 1, + c.input, + c.expected, + actual, + ); + } + } + assert_eq!( + failures, + 0, + "[{label}] {failures}/{} tokenizer-parity cases failed — see stderr", + cases.len() + ); +} + +// --------------------------------------------------------------------------- +// BGE WordPiece — confirms AddedToken longest-match fix (P1-T3) flows through +// --------------------------------------------------------------------------- + +/// P0-E1: BGE WordPiece tokenizer used by NativeEmbeddingService produces +/// correct IDs, including AddedToken special tokens as whole units. +/// +/// Reference IDs match `bge_small_en_v15_wordpiece_parity` in +/// `audit_tokenizer_parity.rs`. Both tests must agree; if either fails the +/// embed path and the raw tokenizer path have diverged. +#[test] +fn embed_service_bge_tokenizer_parity() { + let dir = bge_dir(); + if !dir.exists() { + eprintln!( + "SKIP embed_service_bge_tokenizer_parity: {} not found", + dir.display() + ); + return; + } + + let tok = load_tokenizer(&dir).expect("load bge tokenizer"); + check_embed_service_tokenizer_parity( + "embed-path / bge-small-en-v1.5 (WordPiece)", + tok.as_ref(), + &[ + Case { + input: "Hello, world!", + expected: &[101, 7592, 1010, 2088, 999, 102], + }, + Case { + input: "", + expected: &[101, 102], + }, + // P1-T3 fix: [CLS] and [SEP] in input text matched as AddedToken units. + Case { + input: "[CLS] special tokens [SEP]", + expected: &[101, 101, 2569, 19204, 2015, 102, 102], + }, + Case { + input: "This is a longer sentence with multiple tokens to verify sequential encoding.", + expected: &[ + 101, 2023, 2003, 1037, 2936, 6251, 2007, 3674, 19204, 2015, 2000, 20410, 25582, + 17181, 1012, 102, + ], + }, + Case { + input: "https://example.com/path?query=value", + expected: &[ + 101, 16770, 1024, 1013, 1013, 2742, 1012, 4012, 1013, 4130, 1029, 23032, 1027, + 3643, 102, + ], + }, + ], + ); +} + +// --------------------------------------------------------------------------- +// E5 SentencePiece — confirms BOS/EOS injection (P0-T1) flows through +// --------------------------------------------------------------------------- + +/// P0-E1: E5 SentencePiece tokenizer used by NativeEmbeddingService wraps every +/// sequence with BOS id=0 and EOS id=2 (P0-T1 fix). +/// +/// Before the fix: 0/9 cases passed (BOS/EOS absent). +/// After the fix: 9/9 cases pass. These cases are a subset of those in +/// `multilingual_e5_small_sentencepiece_parity` in `audit_tokenizer_parity.rs`. +#[test] +fn embed_service_e5_tokenizer_parity() { + let dir = e5_dir(); + if !dir.exists() { + eprintln!( + "SKIP embed_service_e5_tokenizer_parity: {} not found", + dir.display() + ); + return; + } + + let tok = load_tokenizer(&dir).expect("load e5 tokenizer"); + check_embed_service_tokenizer_parity( + "embed-path / multilingual-e5-small (SentencePiece)", + tok.as_ref(), + &[ + // BOS (0) + EOS (2) framing on normal text. + Case { + input: "Hello, world!", + expected: &[0, 35378, 4, 8999, 38, 2], + }, + // Empty input: BOS + EOS only. + Case { + input: "", + expected: &[0, 2], + }, + Case { + input: " leading space", + expected: &[0, 105207, 32628, 2], + }, + Case { + input: "This is a longer sentence with multiple tokens to verify sequential encoding.", + expected: &[ + 0, 3293, 83, 10, 51713, 149357, 678, 48716, 47, 84694, 47, 493, 40383, 243228, + 289, 22, 587, 6238, 5, 2, + ], + }, + Case { + input: "https://example.com/path?query=value", + expected: &[ + 0, 3975, 696, 3355, 11, 33209, 5, 277, 64, 128405, 32, 944, 1294, 1369, 27494, + 13, 2, + ], + }, + ], + ); +} + +// --------------------------------------------------------------------------- +// Qwen BPE — confirms EOS injection (P0-T2) flows through +// --------------------------------------------------------------------------- + +/// P0-E1: Qwen BPE tokenizer used by NativeEmbeddingService appends EOS +/// id=151643 (P0-T2 fix). +/// +/// Before the fix: 0/10 cases passed (EOS absent). +/// After the fix: 10/10 cases pass. These cases are a subset of those in +/// `qwen3_embedding_0_6b_bpe_parity` in `audit_tokenizer_parity.rs`. +#[test] +fn embed_service_qwen_tokenizer_parity() { + let dir = qwen_dir(); + if !dir.exists() { + eprintln!( + "SKIP embed_service_qwen_tokenizer_parity: {} not found", + dir.display() + ); + return; + } + + let tok = load_tokenizer(&dir).expect("load qwen tokenizer"); + check_embed_service_tokenizer_parity( + "embed-path / qwen3-embedding-0.6b (BPE)", + tok.as_ref(), + &[ + // EOS (151643) appended. + Case { + input: "Hello, world!", + expected: &[9707, 11, 1879, 0, 151643], + }, + // Empty input: just EOS. + Case { + input: "", + expected: &[151643], + }, + Case { + input: " leading space", + expected: &[6388, 3550, 151643], + }, + Case { + input: "This is a longer sentence with multiple tokens to verify sequential encoding.", + expected: &[ + 1986, 374, 264, 5021, 11652, 448, 5248, 11211, 311, 10146, 51000, 11170, 13, + 151643, + ], + }, + Case { + input: "https://example.com/path?query=value", + expected: &[2428, 1110, 8687, 905, 50976, 30, 1631, 46538, 151643], + }, + ], + ); +}