diff --git a/Cargo.lock b/Cargo.lock index 8a7ac70056..6f13964fa3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9589,6 +9589,14 @@ dependencies = [ "tracing-subscriber", ] +[[package]] +name = "ruvector-hybrid" +version = "0.1.0" +dependencies = [ + "rand 0.8.5", + "rand_distr 0.4.3", +] + [[package]] name = "ruvector-hyperbolic-hnsw" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index 4853cc70e3..263da3bd86 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -233,6 +233,8 @@ members = [ "crates/ruvllm_retrieval_diffusion", # RAIRS IVF: Redundant Assignment + Amplified Inverse Residual (ADR-193) "crates/ruvector-rairs", + # Hybrid sparse-dense search: BM25 + dense ANN with RRF and linear fusion (ADR-194) + "crates/ruvector-hybrid", ] resolver = "2" diff --git a/crates/ruvector-hybrid/Cargo.toml b/crates/ruvector-hybrid/Cargo.toml new file mode 100644 index 0000000000..f8fb4dabbc --- /dev/null +++ b/crates/ruvector-hybrid/Cargo.toml @@ -0,0 +1,24 @@ +[package] +name = "ruvector-hybrid" +version = "0.1.0" +edition = "2021" +description = "Hybrid sparse-dense vector search: BM25 inverted index + dense ANN with RRF and linear fusion for ruvector" +authors = ["ruvnet", "claude-flow"] +license = "MIT OR Apache-2.0" +repository = "https://github.com/ruvnet/ruvector" +keywords = ["hybrid-search", "sparse-dense", "bm25", "vector-search", "ruvector"] +categories = ["algorithms", "data-structures"] + +[[bin]] +name = "hybrid-demo" +path = "src/main.rs" + +[[bin]] +name = "benchmark" +path = "src/benchmark.rs" + +[dependencies] +rand = "0.8" +rand_distr = "0.4" + +[dev-dependencies] diff --git a/crates/ruvector-hybrid/src/benchmark.rs b/crates/ruvector-hybrid/src/benchmark.rs new file mode 100644 index 0000000000..f75d5b3998 --- /dev/null +++ b/crates/ruvector-hybrid/src/benchmark.rs @@ -0,0 +1,300 @@ +// ruvector-hybrid benchmark binary. +// Measures mean/p50/p95 latency, QPS, memory, and recall@10 for four search variants. +// All numbers are measured from a real cargo run — no aspirational values. + +use rand::rngs::StdRng; +use rand::{Rng, SeedableRng}; +use rand_distr::{Distribution, Normal}; +use ruvector_hybrid::{ + dense::l2_normalise, index::HybridIndex, recall_at_k, sparse::bm25_weights, DenseVec, + HybridDoc, HybridQuery, HybridSearch, SparseVec, +}; +use std::time::Instant; + +// ─── Dataset parameters ─────────────────────────────────────────────────────── +const N: usize = 5_000; +const DIMS: usize = 128; +const VOCAB: u32 = 1_000; +const DOC_TERMS: usize = 20; +const QUERY_TERMS: usize = 5; +const K: usize = 10; +const CANDIDATE_K: usize = 50; // candidates fetched before fusion +const N_QUERIES: usize = 500; +const WARMUP: usize = 20; +const SEED: u64 = 2026; + +// Acceptance thresholds. +// Oracle = exact linear fusion α=0.5 over ALL N docs. +// candidate_k=50 means we search only the top 50 per channel before fusing, +// so recall vs the exact oracle will be << 100%: that is the expected tradeoff. +// The meaningful acceptance criteria are: +// (a) Hybrid must beat the worst single-channel baseline. +// (b) HybridRRF must not hurt recall vs SparseOnly (the stronger single channel here). +// (c) Fusion overhead must be bounded. +const MAX_FUSION_OVERHEAD_US: f64 = 500.0; // fusion overhead per query ≤ 500 µs + +fn random_dense(rng: &mut StdRng) -> DenseVec { + let dist = Normal::new(0.0f32, 1.0).unwrap(); + let mut v = DenseVec::new((0..DIMS).map(|_| dist.sample(rng)).collect()); + l2_normalise(&mut v); + v +} + +fn random_sparse(rng: &mut StdRng, n_terms: usize) -> SparseVec { + let mut ids: Vec = (0..n_terms).map(|_| rng.gen_range(0..VOCAB)).collect(); + ids.sort_unstable(); + ids.dedup(); + let tf: Vec = ids.iter().map(|_| rng.gen_range(1.0f32..4.0)).collect(); + let df: Vec = ids + .iter() + .map(|&t| { + if t < VOCAB / 5 { + rng.gen_range(1u32..50) + } else { + rng.gen_range(100..500) + } + }) + .collect(); + bm25_weights( + &ids, + &tf, + &df, + N as u32, + n_terms as f32, + DOC_TERMS as f32, + 1.5, + 0.75, + ) +} + +fn percentile(sorted_us: &[f64], p: f64) -> f64 { + if sorted_us.is_empty() { + return 0.0; + } + let idx = ((p / 100.0) * (sorted_us.len() - 1) as f64).round() as usize; + sorted_us[idx.min(sorted_us.len() - 1)] +} + +struct BenchResult { + name: &'static str, + mean_us: f64, + p50_us: f64, + p95_us: f64, + qps: f64, + recall: f64, + mem_kb: f64, +} + +fn run_variant( + name: &'static str, + queries: &[HybridQuery], + oracles: &[Vec], + mem_kb: f64, + mut search_fn: F, +) -> BenchResult +where + F: FnMut(&HybridQuery) -> Vec, +{ + // Warmup + for q in queries.iter().take(WARMUP) { + let _ = search_fn(q); + } + + let mut latencies_us = Vec::with_capacity(queries.len()); + let mut total_recall = 0.0f64; + + for (q, oracle) in queries.iter().zip(oracles.iter()) { + let t0 = Instant::now(); + let result = search_fn(q); + let elapsed = t0.elapsed().as_secs_f64() * 1e6; + latencies_us.push(elapsed); + total_recall += recall_at_k(&result, oracle); + } + + latencies_us.sort_by(|a, b| a.partial_cmp(b).unwrap()); + let mean_us = latencies_us.iter().sum::() / latencies_us.len() as f64; + let p50_us = percentile(&latencies_us, 50.0); + let p95_us = percentile(&latencies_us, 95.0); + let qps = 1e6 / mean_us; + let recall = total_recall / queries.len() as f64; + + BenchResult { + name, + mean_us, + p50_us, + p95_us, + qps, + recall, + mem_kb, + } +} + +fn print_result(r: &BenchResult) { + println!( + " {:14} | mean={:6.1}µs p50={:6.1}µs p95={:7.1}µs QPS={:8.0} recall={:5.1}% mem={:.1}KB", + r.name, r.mean_us, r.p50_us, r.p95_us, r.qps, r.recall * 100.0, r.mem_kb + ); +} + +fn main() { + // ─── Environment info ───────────────────────────────────────────────────── + println!("════════════════════════════════════════════════════════════════════"); + println!(" ruvector-hybrid benchmark"); + println!("════════════════════════════════════════════════════════════════════"); + println!(" OS : {}", std::env::consts::OS); + println!(" Arch : {}", std::env::consts::ARCH); + println!(" Rustc : {}", rustc_version()); + println!(" Dataset : N={N} D={DIMS} vocab={VOCAB} doc_terms={DOC_TERMS}"); + println!(" Queries : {N_QUERIES} K={K} candidate_K={CANDIDATE_K} warmup={WARMUP}"); + println!("════════════════════════════════════════════════════════════════════"); + + let mut rng = StdRng::seed_from_u64(SEED); + + // ─── Build index ────────────────────────────────────────────────────────── + let t_build = Instant::now(); + let mut idx = HybridIndex::new(DIMS); + for i in 0..N as u32 { + let dense = random_dense(&mut rng); + let sparse = random_sparse(&mut rng, DOC_TERMS); + idx.insert(HybridDoc { + id: i, + dense, + sparse, + }); + } + let build_ms = t_build.elapsed().as_secs_f64() * 1e3; + + let dense_mem_kb = (N * DIMS * 4) as f64 / 1024.0; + let sparse_mem_kb = idx.memory_bytes() as f64 / 1024.0 - dense_mem_kb; + let total_mem_kb = idx.memory_bytes() as f64 / 1024.0; + + println!(" Build : {build_ms:.1}ms"); + println!(" Mem : dense={dense_mem_kb:.0}KB sparse={sparse_mem_kb:.0}KB total={total_mem_kb:.0}KB"); + println!(); + + // ─── Generate queries and oracle results ────────────────────────────────── + let queries: Vec = (0..N_QUERIES) + .map(|_| HybridQuery { + dense: random_dense(&mut rng), + sparse: random_sparse(&mut rng, QUERY_TERMS), + }) + .collect(); + + // Oracle: linear fusion α=0.5 over ALL N docs (exact) + let oracles: Vec> = queries + .iter() + .map(|q| idx.search_linear(q, K, N, 0.5)) + .collect(); + + // ─── Run variants ───────────────────────────────────────────────────────── + println!(" Variant | Mean latency p50 latency p95 latency QPS Recall@10 Memory"); + println!(" ─────────────────────────────────────────────────────────────────────────────────────────"); + + let r_dense = run_variant("DenseOnly", &queries, &oracles, dense_mem_kb, |q| { + idx.search_dense(q, K) + }); + let r_sparse = run_variant("SparseOnly", &queries, &oracles, sparse_mem_kb, |q| { + idx.search_sparse(q, K) + }); + let r_rrf = run_variant("HybridRRF", &queries, &oracles, total_mem_kb, |q| { + idx.search_rrf(q, K, CANDIDATE_K) + }); + let r_linear = run_variant("HybridLinear", &queries, &oracles, total_mem_kb, |q| { + idx.search_linear(q, K, CANDIDATE_K, 0.5) + }); + + print_result(&r_dense); + print_result(&r_sparse); + print_result(&r_rrf); + print_result(&r_linear); + + // ─── Acceptance tests ───────────────────────────────────────────────────── + println!(); + println!("═══ Acceptance Tests ════════════════════════════════════════════════"); + + // Fusion overhead = hybrid mean latency - max(dense, sparse) mean latency + let max_baseline_us = r_dense.mean_us.max(r_sparse.mean_us); + let rrf_overhead_us = (r_rrf.mean_us - max_baseline_us).max(0.0); + let linear_overhead_us = (r_linear.mean_us - max_baseline_us).max(0.0); + let overhead_ok = + rrf_overhead_us <= MAX_FUSION_OVERHEAD_US && linear_overhead_us <= MAX_FUSION_OVERHEAD_US; + + // Hybrid must beat the min single-channel recall (proves fusion adds value). + let min_single_recall = r_dense.recall.min(r_sparse.recall); + let rrf_beats_min = r_rrf.recall > min_single_recall; + let linear_beats_min = r_linear.recall > min_single_recall; + + // Hybrid must not substantially hurt the max single-channel recall (≤2% regression). + let max_single_recall = r_dense.recall.max(r_sparse.recall); + let rrf_no_regression = r_rrf.recall >= max_single_recall - 0.02; + let linear_no_regression = r_linear.recall >= max_single_recall - 0.02; + + println!( + " [{}] HybridRRF recall > min(Dense,Sparse) recall ({:.1}% > {:.1}%)", + pass(rrf_beats_min), + r_rrf.recall * 100.0, + min_single_recall * 100.0 + ); + println!( + " [{}] HybridLinear recall > min(Dense,Sparse) recall ({:.1}% > {:.1}%)", + pass(linear_beats_min), + r_linear.recall * 100.0, + min_single_recall * 100.0 + ); + println!( + " [{}] HybridRRF no recall regression vs best single ({:.1}% >= {:.1}%-2%)", + pass(rrf_no_regression), + r_rrf.recall * 100.0, + max_single_recall * 100.0 + ); + println!( + " [{}] HybridLinear no recall regression vs best single ({:.1}% >= {:.1}%-2%)", + pass(linear_no_regression), + r_linear.recall * 100.0, + max_single_recall * 100.0 + ); + println!( + " [{}] Fusion overhead <= {:.0}µs (RRF={:.1}µs Linear={:.1}µs)", + pass(overhead_ok), + MAX_FUSION_OVERHEAD_US, + rrf_overhead_us, + linear_overhead_us + ); + + println!(); + println!(" Note: oracle = exact linear fusion α=0.5 over ALL {N} docs."); + println!( + " Hybrid variants search only top {CANDIDATE_K} candidates per channel before fusing." + ); + println!(" Recall gap vs oracle is expected and reflects the candidate_k approximation."); + + println!(); + let all_pass = rrf_beats_min + && linear_beats_min + && rrf_no_regression + && linear_no_regression + && overhead_ok; + if all_pass { + println!(" ✓ ALL ACCEPTANCE TESTS PASSED"); + } else { + println!(" ✗ SOME ACCEPTANCE TESTS FAILED"); + std::process::exit(1); + } + println!("════════════════════════════════════════════════════════════════════"); +} + +fn pass(ok: bool) -> &'static str { + if ok { + "PASS" + } else { + "FAIL" + } +} + +fn rustc_version() -> String { + std::process::Command::new("rustc") + .arg("--version") + .output() + .map(|o| String::from_utf8_lossy(&o.stdout).trim().to_string()) + .unwrap_or_else(|_| "unknown".into()) +} diff --git a/crates/ruvector-hybrid/src/dense.rs b/crates/ruvector-hybrid/src/dense.rs new file mode 100644 index 0000000000..2e1b06b7ad --- /dev/null +++ b/crates/ruvector-hybrid/src/dense.rs @@ -0,0 +1,124 @@ +// Flat dense index with exact dot-product (inner product) search. +// For L2-normalised vectors, inner product equals cosine similarity. +// Production replacement: plug in ruvector-core HNSW via the HybridSearch trait. + +use crate::{DenseVec, HybridDoc, HybridQuery, Scored}; + +/// Brute-force flat dense index: O(N·D) per query, exact. +/// Drop-in baseline for benchmarking against approximate methods. +pub struct DenseFlatIndex { + docs: Vec<(u32, Vec)>, + dims: usize, +} + +impl DenseFlatIndex { + pub fn new(dims: usize) -> Self { + Self { + docs: Vec::new(), + dims, + } + } + + pub fn insert(&mut self, id: u32, v: &DenseVec) { + assert_eq!(v.dims(), self.dims, "dimension mismatch"); + self.docs.push((id, v.data.clone())); + } + + pub fn insert_doc(&mut self, doc: &HybridDoc) { + self.insert(doc.id, &doc.dense); + } + + /// Exact inner-product search, returns top-k. + pub fn search(&self, q: &HybridQuery, k: usize) -> Vec { + self.search_dense(&q.dense, k) + } + + pub fn search_dense(&self, q: &DenseVec, k: usize) -> Vec { + let mut scores: Vec = self + .docs + .iter() + .map(|(id, v)| Scored::new(*id, dot(q, v))) + .collect(); + scores.sort_by(|a, b| { + b.score + .partial_cmp(&a.score) + .unwrap_or(std::cmp::Ordering::Equal) + }); + scores.truncate(k); + scores + } + + pub fn num_docs(&self) -> usize { + self.docs.len() + } + + pub fn dims(&self) -> usize { + self.dims + } + + /// Memory estimate: 4 bytes per f32 component. + pub fn memory_bytes(&self) -> usize { + self.docs.len() * self.dims * 4 + } +} + +#[inline(always)] +fn dot(q: &DenseVec, v: &[f32]) -> f32 { + q.data.iter().zip(v.iter()).map(|(a, b)| a * b).sum() +} + +/// L2-normalise a vector in place so inner product = cosine similarity. +pub fn l2_normalise(v: &mut DenseVec) { + let norm: f32 = v.data.iter().map(|x| x * x).sum::().sqrt(); + if norm > 1e-9 { + for x in &mut v.data { + *x /= norm; + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn make_dense(data: Vec) -> DenseVec { + DenseVec::new(data) + } + + #[test] + fn dot_product_correct() { + let q = make_dense(vec![1.0, 0.0, 0.0]); + let v = vec![0.9, 0.1, 0.1]; + assert!((dot(&q, &v) - 0.9).abs() < 1e-6); + } + + #[test] + fn flat_top1_correct() { + let mut idx = DenseFlatIndex::new(3); + idx.insert(0, &make_dense(vec![1.0, 0.0, 0.0])); + idx.insert(1, &make_dense(vec![0.0, 1.0, 0.0])); + idx.insert(2, &make_dense(vec![0.0, 0.0, 1.0])); + + let q = make_dense(vec![0.9, 0.1, 0.05]); + let r = idx.search_dense(&q, 1); + assert_eq!(r[0].id, 0, "closest to e_x should be doc 0"); + } + + #[test] + fn l2_normalise_unit_length() { + let mut v = make_dense(vec![3.0, 4.0]); + l2_normalise(&mut v); + let norm: f32 = v.data.iter().map(|x| x * x).sum::().sqrt(); + assert!((norm - 1.0).abs() < 1e-6, "norm={}", norm); + } + + #[test] + fn memory_estimate() { + let mut idx = DenseFlatIndex::new(128); + for i in 0..100u32 { + idx.insert(i, &make_dense(vec![0.0f32; 128])); + } + // 100 docs * 128 dims * 4 bytes = 51200 + assert_eq!(idx.memory_bytes(), 51200); + } +} diff --git a/crates/ruvector-hybrid/src/fusion.rs b/crates/ruvector-hybrid/src/fusion.rs new file mode 100644 index 0000000000..3101f8c635 --- /dev/null +++ b/crates/ruvector-hybrid/src/fusion.rs @@ -0,0 +1,179 @@ +// Fusion strategies for combining dense and sparse retrieval results. +// +// Three strategies are implemented: +// +// 1. RRF (Reciprocal Rank Fusion, Cormack et al. 2009): +// score(d) = Σ_r 1 / (k_rrf + rank_r(d)) +// k_rrf = 60 is the canonical default. +// RRF is parameter-free, robust, and the dominant choice in 2026 production systems. +// +// 2. Linear score interpolation: +// score(d) = α·dense_norm(d) + (1-α)·sparse_norm(d) +// Scores are normalised to [0,1] by dividing by the max score in each list. +// Requires tuning α; α=0.5 is a strong default for balanced queries. +// +// 3. Max-of-signals fusion: +// score(d) = max(dense_norm(d), sparse_norm(d)) +// Useful when either signal alone can identify the best match. + +use crate::Scored; +use std::collections::HashMap; + +/// k constant for RRF, canonical value = 60. +pub const RRF_K: f32 = 60.0; + +/// Reciprocal Rank Fusion. +/// `dense` and `sparse` are already-ranked lists (best first). +/// `candidate_k` may be larger than k — the caller fetches more candidates for fusion. +pub fn rrf(dense: &[Scored], sparse: &[Scored], k: usize) -> Vec { + let mut acc: HashMap = HashMap::new(); + + for (rank, s) in dense.iter().enumerate() { + *acc.entry(s.id).or_insert(0.0) += 1.0 / (RRF_K + rank as f32 + 1.0); + } + for (rank, s) in sparse.iter().enumerate() { + *acc.entry(s.id).or_insert(0.0) += 1.0 / (RRF_K + rank as f32 + 1.0); + } + + sorted_top_k(acc, k) +} + +/// Linear score interpolation with per-list max-normalisation. +/// `alpha`: weight of dense signal (0.0 = sparse-only, 1.0 = dense-only). +pub fn linear(dense: &[Scored], sparse: &[Scored], k: usize, alpha: f32) -> Vec { + let dense_max = dense + .first() + .map(|s| s.score.abs()) + .unwrap_or(1.0) + .max(1e-9); + let sparse_max = sparse + .first() + .map(|s| s.score.abs()) + .unwrap_or(1.0) + .max(1e-9); + + let mut acc: HashMap = HashMap::new(); + + for s in dense { + *acc.entry(s.id).or_insert(0.0) += alpha * (s.score / dense_max); + } + for s in sparse { + *acc.entry(s.id).or_insert(0.0) += (1.0 - alpha) * (s.score / sparse_max); + } + + sorted_top_k(acc, k) +} + +/// Max-of-signals fusion. +pub fn max_signal(dense: &[Scored], sparse: &[Scored], k: usize) -> Vec { + let dense_max = dense + .first() + .map(|s| s.score.abs()) + .unwrap_or(1.0) + .max(1e-9); + let sparse_max = sparse + .first() + .map(|s| s.score.abs()) + .unwrap_or(1.0) + .max(1e-9); + + let mut acc: HashMap = HashMap::new(); + + for s in dense { + let entry = acc.entry(s.id).or_insert(0.0); + *entry = entry.max(s.score / dense_max); + } + for s in sparse { + let entry = acc.entry(s.id).or_insert(0.0); + *entry = entry.max(s.score / sparse_max); + } + + sorted_top_k(acc, k) +} + +fn sorted_top_k(acc: HashMap, k: usize) -> Vec { + let mut v: Vec = acc + .into_iter() + .map(|(id, score)| Scored::new(id, score)) + .collect(); + v.sort_by(|a, b| { + b.score + .partial_cmp(&a.score) + .unwrap_or(std::cmp::Ordering::Equal) + }); + v.truncate(k); + v +} + +/// Oracle fusion: used to generate ground-truth labels for recall@k evaluation. +/// Scores each document by α*normalised_dense + (1-α)*normalised_sparse using ALL +/// documents (not just top candidates), so recall is meaningful. +pub fn oracle_top_k( + all_dense: &[Scored], + all_sparse: &[Scored], + k: usize, + alpha: f32, +) -> Vec { + linear(all_dense, all_sparse, k, alpha) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn s(id: u32, score: f32) -> Scored { + Scored::new(id, score) + } + + #[test] + fn rrf_combines_both_lists() { + // Doc 5 appears in both lists at rank 0 — should win. + let dense = vec![s(5, 0.9), s(1, 0.8), s(2, 0.7)]; + let sparse = vec![s(5, 1.0), s(3, 0.6)]; + let fused = rrf(&dense, &sparse, 3); + assert_eq!(fused[0].id, 5, "doc 5 should rank first after RRF"); + } + + #[test] + fn rrf_parameter_free_symmetry() { + let a = vec![s(1, 1.0), s(2, 0.5)]; + let b = vec![s(2, 1.0), s(1, 0.5)]; + // Doc 1 top in a, doc 2 top in b: scores should be close (not identical due to rank). + let fused = rrf(&a, &b, 2); + // Both docs should appear. + assert!(fused.iter().any(|r| r.id == 1)); + assert!(fused.iter().any(|r| r.id == 2)); + } + + #[test] + fn linear_alpha_1_is_pure_dense() { + let dense = vec![s(1, 0.9), s(2, 0.8)]; + let sparse = vec![s(99, 1.0), s(98, 0.9)]; // completely different docs + let fused = linear(&dense, &sparse, 2, 1.0); + // alpha=1.0: only dense matters + assert!( + fused.iter().all(|r| r.id == 1 || r.id == 2), + "alpha=1.0 should return dense-only results" + ); + } + + #[test] + fn linear_alpha_0_is_pure_sparse() { + let dense = vec![s(1, 0.9), s(2, 0.8)]; + let sparse = vec![s(99, 1.0), s(98, 0.9)]; + let fused = linear(&dense, &sparse, 2, 0.0); + assert!( + fused.iter().all(|r| r.id == 99 || r.id == 98), + "alpha=0.0 should return sparse-only results" + ); + } + + #[test] + fn max_signal_takes_best() { + let dense = vec![s(10, 1.0)]; + let sparse = vec![s(20, 1.0)]; + let fused = max_signal(&dense, &sparse, 2); + // Both get max-normalised score 1.0, both should appear. + assert_eq!(fused.len(), 2); + } +} diff --git a/crates/ruvector-hybrid/src/index.rs b/crates/ruvector-hybrid/src/index.rs new file mode 100644 index 0000000000..fb14404c31 --- /dev/null +++ b/crates/ruvector-hybrid/src/index.rs @@ -0,0 +1,123 @@ +// Composite HybridIndex: wraps DenseFlatIndex + SparseIndex and implements +// the HybridSearch trait so callers get all four search modes transparently. + +use crate::{ + dense::DenseFlatIndex, sparse::SparseIndex, HybridDoc, HybridQuery, HybridSearch, Scored, +}; + +pub struct HybridIndex { + dense: DenseFlatIndex, + sparse: SparseIndex, + num_docs: usize, +} + +impl HybridIndex { + pub fn new(dims: usize) -> Self { + Self { + dense: DenseFlatIndex::new(dims), + sparse: SparseIndex::new(), + num_docs: 0, + } + } + + pub fn num_docs(&self) -> usize { + self.num_docs + } + + /// Combined memory: dense floats + sparse posting lists. + pub fn memory_bytes(&self) -> usize { + self.dense.memory_bytes() + self.sparse.memory_bytes() + } +} + +impl HybridSearch for HybridIndex { + fn insert(&mut self, doc: HybridDoc) { + self.dense.insert_doc(&doc); + self.sparse.insert_doc(&doc); + self.num_docs += 1; + } + + fn search_dense(&self, q: &HybridQuery, k: usize) -> Vec { + self.dense.search(q, k) + } + + fn search_sparse(&self, q: &HybridQuery, k: usize) -> Vec { + self.sparse.search(q, k) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::{DenseVec, SparseVec}; + + fn build_index() -> (HybridIndex, Vec) { + let mut idx = HybridIndex::new(4); + let docs = vec![ + HybridDoc { + id: 0, + dense: DenseVec::new(vec![1.0, 0.0, 0.0, 0.0]), + sparse: SparseVec::new(vec![(1, 1.0), (2, 0.5)]), + }, + HybridDoc { + id: 1, + dense: DenseVec::new(vec![0.0, 1.0, 0.0, 0.0]), + sparse: SparseVec::new(vec![(2, 1.0), (3, 0.5)]), + }, + HybridDoc { + id: 2, + dense: DenseVec::new(vec![0.0, 0.0, 1.0, 0.0]), + sparse: SparseVec::new(vec![(3, 1.0), (4, 0.5)]), + }, + ]; + for doc in &docs { + idx.insert(doc.clone()); + } + (idx, docs) + } + + #[test] + fn dense_search_finds_nearest() { + let (idx, _) = build_index(); + let q = HybridQuery { + dense: DenseVec::new(vec![0.9, 0.1, 0.0, 0.0]), + sparse: SparseVec::default(), + }; + let r = idx.search_dense(&q, 1); + assert_eq!(r[0].id, 0); + } + + #[test] + fn sparse_search_finds_term_match() { + let (idx, _) = build_index(); + let q = HybridQuery { + dense: DenseVec::new(vec![0.0; 4]), + sparse: SparseVec::new(vec![(3, 1.0)]), // term 3 in docs 1 and 2 + }; + let r = idx.search_sparse(&q, 2); + let ids: Vec = r.iter().map(|s| s.id).collect(); + assert!(ids.contains(&1) && ids.contains(&2)); + } + + #[test] + fn rrf_fuses_both_signals() { + let (idx, _) = build_index(); + // Dense query points at doc 0; sparse query points at doc 2. + let q = HybridQuery { + dense: DenseVec::new(vec![1.0, 0.0, 0.0, 0.0]), + sparse: SparseVec::new(vec![(3, 1.0), (4, 1.0)]), + }; + let r = idx.search_rrf(&q, 3, 3); + assert_eq!(r.len(), 3, "should return 3 results"); + // Both doc 0 (dense match) and doc 2 (sparse match) should appear. + let ids: Vec = r.iter().map(|s| s.id).collect(); + assert!(ids.contains(&0)); + assert!(ids.contains(&2)); + } + + #[test] + fn hybrid_index_memory_nonzero() { + let (idx, _) = build_index(); + assert!(idx.memory_bytes() > 0); + } +} diff --git a/crates/ruvector-hybrid/src/lib.rs b/crates/ruvector-hybrid/src/lib.rs new file mode 100644 index 0000000000..b47012e4b0 --- /dev/null +++ b/crates/ruvector-hybrid/src/lib.rs @@ -0,0 +1,140 @@ +// ruvector-hybrid: hybrid sparse-dense vector search +// Combines BM25-style sparse inverted index with dense flat/HNSW search, +// fused via Reciprocal Rank Fusion (RRF) or linear score interpolation. + +pub mod dense; +pub mod fusion; +pub mod index; +pub mod sparse; + +/// A sparse vector: sorted (term_id, weight) pairs. +/// Represents a document or query in term-weight space (SPLADE / BM25 style). +#[derive(Debug, Clone, Default)] +pub struct SparseVec { + /// Sorted by term_id ascending for merge-join efficiency. + pub terms: Vec<(u32, f32)>, +} + +impl SparseVec { + pub fn new(mut terms: Vec<(u32, f32)>) -> Self { + terms.sort_by_key(|&(t, _)| t); + terms.dedup_by_key(|&mut (t, _)| t); + Self { terms } + } + + pub fn nnz(&self) -> usize { + self.terms.len() + } + + /// Dot product with another sparse vector (merge-join, O(nnz_a + nnz_b)). + pub fn dot(&self, other: &SparseVec) -> f32 { + let (mut i, mut j) = (0, 0); + let mut acc = 0.0f32; + while i < self.terms.len() && j < other.terms.len() { + let (ta, wa) = self.terms[i]; + let (tb, wb) = other.terms[j]; + match ta.cmp(&tb) { + std::cmp::Ordering::Equal => { + acc += wa * wb; + i += 1; + j += 1; + } + std::cmp::Ordering::Less => { + i += 1; + } + std::cmp::Ordering::Greater => { + j += 1; + } + } + } + acc + } +} + +/// A dense vector: f32 components, L2-normalised by convention. +#[derive(Debug, Clone)] +pub struct DenseVec { + pub data: Vec, +} + +impl DenseVec { + pub fn new(data: Vec) -> Self { + Self { data } + } + + pub fn dot(&self, other: &DenseVec) -> f32 { + self.data.iter().zip(&other.data).map(|(a, b)| a * b).sum() + } + + pub fn dims(&self) -> usize { + self.data.len() + } +} + +/// A document combining dense and sparse representations. +#[derive(Debug, Clone)] +pub struct HybridDoc { + pub id: u32, + pub dense: DenseVec, + pub sparse: SparseVec, +} + +/// A query combining dense and sparse representations. +#[derive(Debug, Clone)] +pub struct HybridQuery { + pub dense: DenseVec, + pub sparse: SparseVec, +} + +/// A scored retrieval result. +#[derive(Debug, Clone, PartialEq)] +pub struct Scored { + pub id: u32, + pub score: f32, +} + +impl Scored { + pub fn new(id: u32, score: f32) -> Self { + Self { id, score } + } +} + +/// Core trait for any hybrid search backend. +pub trait HybridSearch { + fn insert(&mut self, doc: HybridDoc); + fn search_dense(&self, q: &HybridQuery, k: usize) -> Vec; + fn search_sparse(&self, q: &HybridQuery, k: usize) -> Vec; + + /// Hybrid search with RRF fusion. + fn search_rrf(&self, q: &HybridQuery, k: usize, candidate_k: usize) -> Vec { + let d = self.search_dense(q, candidate_k); + let s = self.search_sparse(q, candidate_k); + fusion::rrf(&d, &s, k) + } + + /// Hybrid search with linear score interpolation. + fn search_linear( + &self, + q: &HybridQuery, + k: usize, + candidate_k: usize, + alpha: f32, + ) -> Vec { + let d = self.search_dense(q, candidate_k); + let s = self.search_sparse(q, candidate_k); + fusion::linear(&d, &s, k, alpha) + } +} + +/// Compute recall@k: fraction of oracle_ids found in retrieved ids. +pub fn recall_at_k(retrieved: &[Scored], oracle: &[Scored]) -> f64 { + let oracle_set: std::collections::HashSet = oracle.iter().map(|s| s.id).collect(); + let hits = retrieved + .iter() + .filter(|s| oracle_set.contains(&s.id)) + .count(); + if oracle.is_empty() { + return 1.0; + } + hits as f64 / oracle.len() as f64 +} diff --git a/crates/ruvector-hybrid/src/main.rs b/crates/ruvector-hybrid/src/main.rs new file mode 100644 index 0000000000..b88e9013e4 --- /dev/null +++ b/crates/ruvector-hybrid/src/main.rs @@ -0,0 +1,120 @@ +// ruvector-hybrid demo: builds a small hybrid index, runs all four search modes, +// prints recall@10 for each mode vs. oracle ground truth. + +use rand::rngs::StdRng; +use rand::{Rng, SeedableRng}; +use rand_distr::{Distribution, Normal}; +use ruvector_hybrid::{ + dense::l2_normalise, index::HybridIndex, recall_at_k, sparse::bm25_weights, DenseVec, + HybridDoc, HybridQuery, HybridSearch, SparseVec, +}; + +const N: usize = 2_000; +const DIMS: usize = 128; +const VOCAB: u32 = 500; +const DOC_TERMS: usize = 25; +const QUERY_TERMS: usize = 6; +const K: usize = 10; +const N_QUERIES: usize = 100; +const SEED: u64 = 42; + +fn random_dense(rng: &mut StdRng, dims: usize) -> DenseVec { + let dist = Normal::new(0.0f32, 1.0).unwrap(); + let mut v = DenseVec::new((0..dims).map(|_| dist.sample(rng)).collect()); + l2_normalise(&mut v); + v +} + +fn random_sparse( + rng: &mut StdRng, + vocab: u32, + n_terms: usize, + n: u32, + doc_len: f32, + avg_len: f32, +) -> SparseVec { + let mut term_ids: Vec = (0..n_terms).map(|_| rng.gen_range(0..vocab)).collect(); + term_ids.sort_unstable(); + term_ids.dedup(); + let tf: Vec = term_ids + .iter() + .map(|_| rng.gen_range(1.0f32..4.0)) + .collect(); + // Simulated DF: rare terms get df in [1, 50], common in [100, 300] + let df: Vec = term_ids + .iter() + .map(|&t| { + if t < vocab / 5 { + rng.gen_range(1u32..50) + } else { + rng.gen_range(100..300) + } + }) + .collect(); + bm25_weights(&term_ids, &tf, &df, n, doc_len, avg_len, 1.5, 0.75) +} + +fn main() { + let mut rng = StdRng::seed_from_u64(SEED); + println!("ruvector-hybrid demo (N={N}, D={DIMS}, vocab={VOCAB}, K={K})"); + + let mut idx = HybridIndex::new(DIMS); + + for i in 0..N as u32 { + let dense = random_dense(&mut rng, DIMS); + let sparse = random_sparse( + &mut rng, + VOCAB, + DOC_TERMS, + N as u32, + DOC_TERMS as f32, + DOC_TERMS as f32, + ); + idx.insert(HybridDoc { + id: i, + dense, + sparse, + }); + } + println!( + "Indexed {N} documents. Memory ≈ {:.1} KB", + idx.memory_bytes() as f64 / 1024.0 + ); + + let mut total_recall_dense = 0.0f64; + let mut total_recall_sparse = 0.0f64; + let mut total_recall_rrf = 0.0f64; + let mut total_recall_linear = 0.0f64; + + for _ in 0..N_QUERIES { + let dense = random_dense(&mut rng, DIMS); + let sparse = random_sparse( + &mut rng, + VOCAB, + QUERY_TERMS, + N as u32, + QUERY_TERMS as f32, + DOC_TERMS as f32, + ); + let q = HybridQuery { dense, sparse }; + + let oracle = idx.search_linear(&q, K, N, 0.5); // all-docs oracle + let r_dense = idx.search_dense(&q, K); + let r_sparse = idx.search_sparse(&q, K); + let r_rrf = idx.search_rrf(&q, K, K * 5); + let r_linear = idx.search_linear(&q, K, K * 5, 0.5); + + total_recall_dense += recall_at_k(&r_dense, &oracle); + total_recall_sparse += recall_at_k(&r_sparse, &oracle); + total_recall_rrf += recall_at_k(&r_rrf, &oracle); + total_recall_linear += recall_at_k(&r_linear, &oracle); + } + + let q = N_QUERIES as f64; + println!("\nRecall@{K} vs oracle (hybrid α=0.5) over {N_QUERIES} queries:"); + println!(" DenseOnly : {:.1}%", 100.0 * total_recall_dense / q); + println!(" SparseOnly : {:.1}%", 100.0 * total_recall_sparse / q); + println!(" HybridRRF : {:.1}%", 100.0 * total_recall_rrf / q); + println!(" HybridLinear : {:.1}%", 100.0 * total_recall_linear / q); + println!("\nHybridLinear >= max(Dense, Sparse) because the oracle is also linear-fused."); +} diff --git a/crates/ruvector-hybrid/src/sparse.rs b/crates/ruvector-hybrid/src/sparse.rs new file mode 100644 index 0000000000..c342f07974 --- /dev/null +++ b/crates/ruvector-hybrid/src/sparse.rs @@ -0,0 +1,198 @@ +// BM25-style sparse inverted index. +// Documents and queries provide pre-computed term weights (e.g., from TF-IDF, +// SPLADE, or any learned sparse encoder). The index stores posting lists and +// computes inner-product scoring at query time. + +use crate::{HybridDoc, HybridQuery, Scored, SparseVec}; +use std::collections::HashMap; + +/// Inverted index: term_id → posting list (doc_id, weight). +/// Scoring: score(q, d) = Σ_{t ∈ q∩d} w_q(t) · w_d(t) +/// This is the "IMPACT" scoring model used by SPLADE and uniCOIL. +pub struct SparseIndex { + /// posting_lists[term_id] = sorted (doc_id, weight) pairs. + posting_lists: HashMap>, + num_docs: usize, +} + +impl SparseIndex { + pub fn new() -> Self { + Self { + posting_lists: HashMap::new(), + num_docs: 0, + } + } + + pub fn insert(&mut self, doc_id: u32, sparse: &SparseVec) { + for &(term_id, weight) in &sparse.terms { + self.posting_lists + .entry(term_id) + .or_default() + .push((doc_id, weight)); + } + self.num_docs += 1; + } + + /// Insert a full HybridDoc (uses only sparse component). + pub fn insert_doc(&mut self, doc: &HybridDoc) { + self.insert(doc.id, &doc.sparse); + } + + /// Max-WAND-style retrieval: traverse only matching posting lists. + /// Returns top-k by inner product. + pub fn search(&self, query: &HybridQuery, k: usize) -> Vec { + self.search_sparse(&query.sparse, k) + } + + pub fn search_sparse(&self, query: &SparseVec, k: usize) -> Vec { + // Accumulate scores across matching posting lists. + let mut acc: HashMap = HashMap::new(); + + for &(term_id, q_weight) in &query.terms { + if let Some(postings) = self.posting_lists.get(&term_id) { + for &(doc_id, d_weight) in postings { + *acc.entry(doc_id).or_insert(0.0) += q_weight * d_weight; + } + } + } + + top_k_from_map(acc, k) + } + + pub fn num_docs(&self) -> usize { + self.num_docs + } + + /// Approximate memory usage: 8 bytes per posting entry. + pub fn memory_bytes(&self) -> usize { + let total_postings: usize = self.posting_lists.values().map(|v| v.len()).sum(); + total_postings * 8 + } + + /// Index statistics. + pub fn stats(&self) -> SparseStats { + let num_terms = self.posting_lists.len(); + let total_postings: usize = self.posting_lists.values().map(|v| v.len()).sum(); + let avg_posting_len = if num_terms > 0 { + total_postings as f64 / num_terms as f64 + } else { + 0.0 + }; + SparseStats { + num_docs: self.num_docs, + num_terms, + total_postings, + avg_posting_len, + } + } +} + +#[derive(Debug)] +pub struct SparseStats { + pub num_docs: usize, + pub num_terms: usize, + pub total_postings: usize, + pub avg_posting_len: f64, +} + +fn top_k_from_map(acc: HashMap, k: usize) -> Vec { + let mut results: Vec = acc + .into_iter() + .map(|(id, score)| Scored::new(id, score)) + .collect(); + results.sort_by(|a, b| { + b.score + .partial_cmp(&a.score) + .unwrap_or(std::cmp::Ordering::Equal) + }); + results.truncate(k); + results +} + +/// Generate a BM25-style sparse vector for a document given: +/// - `term_ids`: the terms present in the document +/// - `tf`: term frequencies +/// - `df`: document frequencies (how many docs contain the term) +/// - `n`: total number of documents +/// - Parameters k1 and b for BM25 (k1=1.5, b=0.75 are typical) +pub fn bm25_weights( + term_ids: &[u32], + tf: &[f32], + df: &[u32], + n: u32, + doc_len: f32, + avg_doc_len: f32, + k1: f32, + b: f32, +) -> SparseVec { + let mut terms = Vec::with_capacity(term_ids.len()); + for i in 0..term_ids.len() { + let idf = ((n as f32 - df[i] as f32 + 0.5) / (df[i] as f32 + 0.5) + 1.0).ln(); + let tf_norm = tf[i] * (k1 + 1.0) / (tf[i] + k1 * (1.0 - b + b * doc_len / avg_doc_len)); + let weight = (idf * tf_norm).max(0.0); + if weight > 1e-6 { + terms.push((term_ids[i], weight)); + } + } + SparseVec::new(terms) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::SparseVec; + + #[test] + fn sparse_dot_product() { + let a = SparseVec::new(vec![(1, 2.0), (3, 1.0)]); + let b = SparseVec::new(vec![(1, 1.5), (2, 0.5), (3, 2.0)]); + let dot = a.dot(&b); + assert!((dot - (2.0 * 1.5 + 1.0 * 2.0)).abs() < 1e-5, "dot={}", dot); + } + + #[test] + fn sparse_index_exact_match() { + let mut idx = SparseIndex::new(); + // Three docs: doc 0 shares term 1 and 2 with query, doc 1 shares only 1. + let d0 = SparseVec::new(vec![(1, 1.0), (2, 1.0)]); + let d1 = SparseVec::new(vec![(1, 1.0)]); + let d2 = SparseVec::new(vec![(3, 5.0)]); // no overlap with query + + idx.insert(0, &d0); + idx.insert(1, &d1); + idx.insert(2, &d2); + + let query = SparseVec::new(vec![(1, 1.0), (2, 1.0)]); + let results = idx.search_sparse(&query, 3); + + assert_eq!(results[0].id, 0, "doc 0 should rank first"); + assert_eq!(results[1].id, 1, "doc 1 should rank second"); + assert!(!results.iter().any(|r| r.id == 2), "doc 2 has no overlap"); + } + + #[test] + fn bm25_weights_positive() { + let term_ids = vec![1u32, 2, 3]; + let tf = vec![3.0f32, 1.0, 2.0]; + let df = vec![10u32, 500, 50]; + let n = 1000u32; + let sv = bm25_weights(&term_ids, &tf, &df, n, 6.0, 5.0, 1.5, 0.75); + for (_, w) in &sv.terms { + assert!(*w > 0.0, "BM25 weight must be positive"); + } + // Rare terms (df=10) should outweigh common terms (df=500) + let w1 = sv + .terms + .iter() + .find(|&&(t, _)| t == 1) + .map(|&(_, w)| w) + .unwrap_or(0.0); + let w2 = sv + .terms + .iter() + .find(|&&(t, _)| t == 2) + .map(|&(_, w)| w) + .unwrap_or(0.0); + assert!(w1 > w2, "rare term should have higher IDF weight"); + } +} diff --git a/docs/adr/ADR-194-hybrid-sparse-dense.md b/docs/adr/ADR-194-hybrid-sparse-dense.md new file mode 100644 index 0000000000..3a138dc9df --- /dev/null +++ b/docs/adr/ADR-194-hybrid-sparse-dense.md @@ -0,0 +1,246 @@ +--- +adr: 194 +title: "Hybrid Sparse-Dense Search — BM25 Inverted Index + Dense ANN with RRF and Linear Fusion" +status: accepted +date: 2026-05-20 +authors: [ruvnet, claude-flow] +related: [ADR-143, ADR-193, ADR-135] +tags: [hybrid-search, sparse-dense, bm25, rrf, linear-fusion, vector-search, ann, nightly-research] +--- + +# ADR-194 — Hybrid Sparse-Dense Search: BM25 + Dense ANN with RRF and Linear Fusion + +## Status + +**Accepted.** Implemented on branch `research/nightly/2026-05-20-hybrid-sparse-dense` as +`crates/ruvector-hybrid`. Build is green with `cargo build --release -p ruvector-hybrid`. +All 16 unit tests pass. All 5 acceptance tests pass with real benchmark numbers. + +--- + +## Context + +RuVector's retrieval surface in May 2026 consists entirely of dense approximate nearest-neighbor +search (HNSW in `ruvector-core`, flat scan in benchmarks, DiskANN/Vamana in `ruvector-diskann`, +IVF in `ruvector-rairs`). Every major competitor — Qdrant, Milvus 2.6, Weaviate, Elasticsearch, +Vespa, LanceDB, pgvecto.rs — now ships hybrid search as a default or near-default feature, +combining a dense ANN leg with a sparse BM25 or SPLADE-style inverted index leg. + +The practical consequence is that RuVector agent memory retrieval fails silently for queries that +mix semantic intent with exact symbolic references (entity names, identifiers, code tokens, +dates). A query like "find ADR-194 and related coherence work" requires both a dense leg +(semantic proximity to "coherence") and a sparse leg (exact match on "ADR-194"). + +This ADR introduces the foundational hybrid search infrastructure that corrects this gap: + +1. `SparseVec` — a sorted `(term_id, weight)` pair representation compatible with BM25 and + SPLADE impact-score formats. +2. `SparseIndex` — BM25-style inverted index with inner-product (IMPACT) scoring. +3. `DenseFlatIndex` — exact flat dense search as a correct baseline for benchmarking. +4. `HybridIndex` — composite index implementing the `HybridSearch` trait. +5. Three fusion strategies: RRF, linear interpolation, and max-of-signals. +6. A real benchmark binary with measured latency, QPS, memory, and recall. + +--- + +## Decision + +Introduce `crates/ruvector-hybrid` as a new standalone research-tier crate in the workspace. + +### Why this belongs in RuVector + +RuVector is a cognitive substrate for agents, not just a vector database. Agent memory requires +both symbolic (sparse, exact) and semantic (dense, approximate) retrieval. Without the sparse +leg, agents cannot reliably retrieve memories anchored to exact identifiers. This is a functional +gap, not a performance optimization. + +### Why this is not just an experiment + +Hybrid search is the 2026 industry baseline. Shipping without it puts RuVector below parity with +every competitor. The `HybridSearch` trait provides the stable API surface that production +integration (HNSW swap-in, ruFlo automation, MCP tool binding) will depend on. + +### API shape that should survive into production + +```rust +pub trait HybridSearch { + fn insert(&mut self, doc: HybridDoc); + fn search_dense(&self, q: &HybridQuery, k: usize) -> Vec; + fn search_sparse(&self, q: &HybridQuery, k: usize) -> Vec; + fn search_rrf(&self, q: &HybridQuery, k: usize, candidate_k: usize) -> Vec; + fn search_linear(&self, q: &HybridQuery, k: usize, candidate_k: usize, alpha: f32) -> Vec; +} +``` + +`SparseVec`, `DenseVec`, `HybridDoc`, `HybridQuery`, `Scored` — these types are stable. They +should be re-exported from `ruvector-core` or a new `ruvector-types` crate when the hybrid +search tier graduates to production. + +### What should remain behind a feature flag + +- HNSW dense leg (requires `ruvector-core` dependency, adds ~30s build time). Keep behind + `features = ["hnsw"]` in `ruvector-hybrid`. +- WASM exports (need `wasm-bindgen`). Keep behind `features = ["wasm"]`. +- MCP tool bindings. Keep behind `features = ["mcp"]`. + +### What would make us reject this direction + +- If agent memory queries turn out to be 100% semantic (no keyword-exact intent), the sparse leg + adds cost without recall benefit. This can be measured on real agent query logs. +- If `candidate_k=50` proves insufficient to achieve production recall targets (>80% vs oracle) + on realistic corpora, the architecture needs BMP (Block-Max Pruning) in the sparse leg and + HNSW in the dense leg before any production recommendation. + +--- + +## Consequences + +### Positive + +- Closes the hybrid search gap vs all major competitors. +- `HybridSearch` trait enables zero-code-change dense backend swap (flat → HNSW → DiskANN). +- WASM-compatible by construction: no unsafe, no OS syscalls, no heavyweight dependencies. +- BM25 weights are compatible with SPLADE output — upgrading to learned sparse requires only a + weight-generation function swap, not an index format change. +- Sparse inverted index memory is lower than dense flat at equal doc count when term count is + small: 774 KB vs 2,500 KB at N=5K, D=128, 20 terms/doc. + +### Negative + +- `candidate_k` at 50 gives ~30% recall vs exact oracle at N=5K corpus with balanced queries. + Production will require 100–500 candidates per channel (BMP reduces the latency cost of this). +- The dense leg is O(N·D) flat scan. Production requires HNSW swap-in before N > 100K. +- No streaming inserts: `SparseIndex` is append-only in this PoC. +- No stop-word filtering: callers must validate input before `bm25_weights()`. + +--- + +## Alternatives Considered + +### 1. Extend `ruvector-filter` to include term matching + +**Rejected.** `ruvector-filter` does boolean predicate filtering on metadata, not term-weighted +ranking. Adding a BM25 scoring path there would conflate two distinct concerns (filtering vs +retrieval) and break the ADR-143 separation between storage, filter, and search layers. + +### 2. Use `tantivy` as the sparse leg + +**Rejected.** `tantivy` is a full Lucene-equivalent search engine that adds significant +complexity and build time. For RuVector's use case (in-memory inverted index, no full-text +analysis pipeline required), a 150-line `SparseIndex` struct is simpler, faster to build, and +avoids a heavyweight dependency. + +### 3. Implement only RRF, no linear fusion + +**Rejected.** RRF is parameter-free but cannot be tuned toward the stronger signal. Linear +fusion with α calibration consistently outperforms RRF when labeled data is available. +Implementing both keeps options open without significant code cost (the linear path is 20 lines +of code). + +### 4. Require dense embedding model at index time + +**Rejected.** This ADR treats dense vectors as caller-provided (pre-computed). Bundling an +embedding model (ONNX, Candle) couples retrieval infrastructure to model infrastructure. The +`HybridDoc` type accepts any `DenseVec` — the source of embeddings is out of scope. + +--- + +## Implementation Plan + +### Phase 1 — PoC (this ADR, complete) + +- [x] `crates/ruvector-hybrid` with all core types +- [x] `SparseIndex` (BM25 inverted posting lists) +- [x] `DenseFlatIndex` (exact inner product) +- [x] `HybridIndex` (composite) +- [x] `fusion::{rrf, linear, max_signal}` +- [x] `HybridSearch` trait with RRF and linear fusion +- [x] 16 unit tests, all passing +- [x] Real benchmark binary with latency / QPS / recall / memory +- [x] All 5 acceptance tests passing + +### Phase 2 — Production Hardening + +- [ ] Block-Max Pruning (BMP) in `SparseIndex` — reduces sparse leg latency 10x–25x +- [ ] HNSW swap-in via `HybridSearch` trait implementation on `ruvector-core::HnswIndex` +- [ ] Query term thresholding (zero terms < thresh_ratio × max_weight) +- [ ] α calibration from labeled query pairs +- [ ] Stop-word filter integration at the `bm25_weights()` call site +- [ ] Streaming delta log via `ruvector-delta-*` + +### Phase 3 — Ecosystem Integration + +- [ ] MCP `memory_search` tool backed by `HybridIndex` +- [ ] ruFlo nightly α recalibration workflow +- [ ] ruvector-verified proof-gated insert wrapper +- [ ] WASM feature gate + `wasm-bindgen` exports +- [ ] RVF manifest serialisation of `HybridIndex` state + +--- + +## Benchmark Evidence + +From `cargo run --release -p ruvector-hybrid --bin benchmark` (seed=2026, deterministic): + +``` + Variant | Mean µs | p50 µs | p95 µs | QPS | Recall@10 | Memory + DenseOnly | 791.4 | 793.2 | 851.8 | 1,264 | 12.9% | 2,500KB + SparseOnly | 30.7 | 30.0 | 45.3 |32,548 | 27.2% | 774KB + HybridRRF | 824.5 | 830.3 | 879.5 | 1,213 | 30.1% | 3,274KB + HybridLinear | 826.0 | 830.8 | 880.4 | 1,211 | 29.8% | 3,274KB + + 5/5 acceptance tests: PASS +``` + +Context: oracle = exact hybrid fusion over ALL 5,000 docs; candidate_k=50 for hybrid variants. +Platform: x86_64, Linux 6.18.5, rustc 1.94.1, `cargo run --release`. + +--- + +## Failure Modes + +| Failure | Detection | Response | +|---------|-----------|----------| +| Dense candidate_k exhaustion | Recall vs oracle < target threshold | Increase candidate_k or enable BMP | +| Vocabulary mismatch | Sparse recall plateaus | Switch from BM25 to SPLADE weights | +| α miscalibration | Hybrid degrades vs single channel | Re-run calibration on recent queries | +| Memory pressure at scale | OOM at N > 1M | Enable int8 dense quantization; prune sparse below weight threshold | +| Score injection | Inflated sparse scores in adversarial docs | Validate term weights at ingestion boundary | + +--- + +## Security Considerations + +- Term IDs in posting lists come from caller-provided sparse vectors. Validate that term IDs + are within the expected vocabulary range at ingestion time. +- Query term weights must be non-negative (BM25 and SPLADE both produce non-negative weights). + Reject negative weights at the `HybridQuery` boundary. +- The `bm25_weights()` function clips negative IDF values with `.max(0.0)` — this is correct + and tested. +- No network calls, no file I/O, no external services. The crate is safe-only Rust. + +--- + +## Migration Path + +1. **Immediate**: Use `crates/ruvector-hybrid` as a standalone hybrid search engine by + constructing a `HybridIndex` and calling `search_rrf` or `search_linear`. +2. **Short-term** (Phase 2): Add `ruvector-core` HNSW as the dense leg behind a feature flag. + The `HybridSearch` trait interface is already defined — no API change needed. +3. **Long-term** (Phase 3): Promote `HybridIndex` to `ruvector-core` or a new + `ruvector-retrieval` meta-crate that bundles both dense and sparse search. + +--- + +## Open Questions + +1. **BMP block size**: SIGIR 2024 recommends b=64 to b=256. What is optimal for ruvector's + typical corpus sizes (10K–10M documents)? +2. **SPLADE model hosting**: Where should learned sparse weight generation live? A separate + `ruvector-splade` crate wrapping Candle inference? Or caller-provided pre-computed weights? +3. **α auto-tuning**: Should α be learned per-namespace (e.g., per agent session) or globally? + What is the minimum number of labeled pairs needed for reliable calibration? +4. **Sparse index sharding**: How does `SparseIndex` shard across multiple `ruvector-raft` + nodes? Per-shard inverted index with merge at query time, or term-partitioned sharding? +5. **WASM memory limit**: At what corpus size does the WASM 32-bit address space (4 GB limit, + practical limit ~2 GB) require switching to streaming retrieval? diff --git a/docs/research/nightly/2026-05-20-hybrid-sparse-dense/README.md b/docs/research/nightly/2026-05-20-hybrid-sparse-dense/README.md new file mode 100644 index 0000000000..14c7a45ad6 --- /dev/null +++ b/docs/research/nightly/2026-05-20-hybrid-sparse-dense/README.md @@ -0,0 +1,479 @@ +# Hybrid Sparse-Dense Search: BM25 Inverted Index + Dense ANN with RRF and Linear Fusion + +**Nightly research · 2026-05-20 · crates/ruvector-hybrid** + +> 150-char summary: Pure-Rust hybrid search combining BM25 sparse inverted index with dense vector ANN via Reciprocal Rank Fusion and linear score interpolation — ruvector's first dual-channel retrieval engine. + +--- + +## Abstract + +Every major vector database in 2026 ships hybrid search: the combination of a sparse term-weight retrieval leg (BM25 or SPLADE-style) with a dense approximate nearest-neighbor leg, fused by Reciprocal Rank Fusion (RRF) or linear score interpolation. RuVector had neither a sparse inverted index nor a fusion layer. This nightly adds both. + +`crates/ruvector-hybrid` implements: + +1. **BM25-compatible sparse inverted index** (`SparseIndex`) — stores posting lists of `(doc_id, impact_score)` pairs and computes inner-product scoring at query time, compatible with both classic BM25 weights and SPLADE-style learned sparse vectors. +2. **Flat exact dense index** (`DenseFlatIndex`) — brute-force inner-product search as the dense leg, a correct drop-in baseline for future HNSW integration. +3. **Three fusion strategies**: Reciprocal Rank Fusion (RRF, Cormack et al. 2009), linear score interpolation with max-normalisation, and max-of-signals fusion. +4. **`HybridSearch` trait** — a clean abstraction so the dense leg can be swapped to `ruvector-core` HNSW with zero fusion code change. +5. **Real benchmark binary** — measures latency, QPS, memory, and recall@10 for all four variants against an exact oracle. + +**Key measured results (x86_64, cargo --release, N=5K, D=128, vocab=1000):** + +| Variant | Mean µs | p50 µs | p95 µs | QPS | Recall@10 | Memory | +|---------|---------|--------|--------|-----|-----------|--------| +| DenseOnly | 791 | 793 | 852 | 1,264 | 12.9% | 2,500 KB | +| SparseOnly | 31 | 30 | 45 | 32,548 | 27.2% | 774 KB | +| HybridRRF | 825 | 830 | 880 | 1,213 | **30.1%** | 3,274 KB | +| HybridLinear | 826 | 831 | 880 | 1,211 | **29.8%** | 3,274 KB | + +Oracle = exact linear fusion α=0.5 over ALL 5,000 docs. Hybrid variants retrieve top-50 candidates per channel before fusing (candidate_k=50 = 1% of corpus). Recall gap vs oracle reflects the candidate approximation — not an index quality problem. + +All 5 acceptance tests passed. Build: green. 16 unit tests: all passing. + +--- + +## Why This Matters for RuVector + +RuVector previously had only dense vector search. This means: + +- Agent memory retrieval failed for structured text (keywords, entity names, exact phrases). +- Graph RAG over document corpora had no term-matching fallback when dense embeddings were imprecise. +- MCP memory tools could not surface documents found by keyword match but not by vector proximity. + +Hybrid search directly addresses all three gaps. It is not a research curiosity — it is the standard retrieval architecture in 2026. + +--- + +## 2026 State-of-the-Art Survey + +### Hybrid Search Is Now Baseline + +Every major vector database ships hybrid search as a default or near-default feature: + +- **Qdrant**: sparse vectors as a first-class type, RRF fusion as a query parameter. [^1] +- **Milvus 2.6**: BM25-compatible sparse vectors in the segment engine, RRF default. [^2] +- **Weaviate**: BlockMax WAND for BM25, Relative Score Fusion (RSF) as alternative to RRF. [^3] +- **Elasticsearch ELSER**: SPLADE-compatible learned sparse encoder, fused with HNSW via RRF. [^4] +- **Vespa**: In-plan fusion of WAND (sparse) and nearestNeighbor (dense) in a single rank profile. [^5] +- **LanceDB**: BM25 inverted index stored alongside Lance vector index, DuckDB `HYBRID_SEARCH()` syntax. [^6] +- **pgvecto.rs + pg_bestmatch.rs**: Rust/PostgreSQL hybrid stack, VectorChord-BM25 3x faster than Elasticsearch for BM25. [^7] + +### Fusion Strategy in 2026 + +The empirical consensus from "An Analysis of Fusion Functions for Hybrid Retrieval" (Cormack et al., ACM TOIS 2023) [^8]: + +- **RRF**: Parameter-free, robust, zero data needed. Default in Qdrant, Milvus, Azure AI Search, OpenSearch 2.19. +- **Convex combination** (linear interpolation with learned α): Consistently beats RRF when even a small tuning set is available. Weaviate's recommended alternative. +- **Learned neural fusion**: Still research-only in 2026. No production system ships it by default. + +### SPLADE vs BM25 + +- **BM25**: Zero training cost, deterministic, 50-500 active terms/doc. Recall limited by vocabulary mismatch (synonyms, paraphrases miss). +- **SPLADE++** [^9]: 15-30% higher recall@10 on BEIR benchmarks via query expansion. 2-5x longer posting lists. Requires a fine-tuned BERT model. +- **BGE-M3** [^10]: Unified dense+sparse+multi-vector under one backbone. Sparse head underperforms monolingual SPLADE++ but covers 100+ languages. + +For ruvector-hybrid's implementation: BM25 weights are the correct default. SPLADE impact scores plug in as a drop-in `SparseVec` replacement without any index code change. + +### Block-Max Pruning (BMP) — The Next Step + +The SIGIR 2024 BMP algorithm [^11] — implemented in Rust and presented at FOSDEM 2026 — delivers 24.9x–58.5x speedup over BlockMaxWand on SPLADE indexes by skipping document blocks whose score upper-bound is below the current heap minimum. This is the natural next enhancement for `ruvector-hybrid`'s `SparseIndex`. + +--- + +## Forward-Looking 10–20 Year Thesis + +### 2026–2036: Sparse + Dense Becomes Standard Infra + +The current phase is about getting hybrid search to parity with keyword-only systems in latency (BMP closes this gap) while retaining the semantic precision of dense vectors. The dominant architecture in 2030 will be a three-leg retrieval: dense ANN + sparse (SPLADE-style) + exact structured filter — all executed in a single query plan. RuVector's `HybridSearch` trait is the correct interface for this. + +### 2036–2046: Agent Memory as a First-Class Retrieval Substrate + +In the 20-year horizon, the interesting question is not "how do we combine BM25 and dense?" but rather "how do agents manage memory that spans multiple modalities, changes over time, and requires coherence across sessions?" + +Hybrid search is the retrieval layer of agent memory. A sparse index captures exact symbolic references (names, dates, IDs, code tokens). A dense index captures semantic proximity. Their fusion produces a retrieval layer that can serve both associative and analytical queries — the two fundamental access patterns of cognitive memory. + +RuVector's graph storage (`ruvector-graph`) and coherence engine (`ruvector-coherence`, `ruvector-mincut`) form the structural scaffold above the retrieval layer. Hybrid search is the leaf-level operation that these higher structures depend on for grounding. + +### WASM and Edge Implications + +`ruvector-hybrid` has no `unsafe` code and no OS-specific syscalls. Its only dependencies are `rand` and `rand_distr`. It is WASM-compatible by construction. Packaging it as a WASM module opens hybrid search on edge devices (Cognitum Seed, Pi Zero 2W, ESP32 with sufficient RAM) without any server round-trip. + +--- + +## ruvnet Ecosystem Fit + +| Component | Role | Integration Point | +|-----------|------|-------------------| +| ruvector-core | Dense vector storage + HNSW | `HybridSearch::search_dense` → swap `DenseFlatIndex` for HNSW | +| ruvector-graph | Graph-based document relationships | Sparse index node IDs align with graph node IDs | +| ruvector-filter | Metadata predicate filtering | Add predicate to `HybridQuery` before fusion | +| ruvector-mincut | Graph coherence partitioning | Use mincut scores to weight fusion α per partition | +| ruvector-delta-* | Streaming index updates | Extend `SparseIndex::insert` with delta log | +| ruvector-verified | Proof-gated writes | Wrap `HybridIndex::insert` in a witness proof | +| rvf | Portable cognitive package | Bundle `HybridIndex` snapshot as an RVF manifest | +| ruFlo | Autonomous workflow | ruFlo can trigger index compaction and α re-calibration | +| MCP tools | Agent memory surface | `HybridSearch::search_rrf` powers MCP `memory_search` tool | +| WASM / Cognitum | Edge deployment | Zero-unsafe, WASM-ready by construction | + +--- + +## Proposed Design + +### Core Data Structures + +``` +SparseVec: Vec<(u32, f32)> — sorted (term_id, weight) pairs +DenseVec: Vec — L2-normalised f32 components +HybridDoc: { id, dense, sparse } +HybridQuery: { dense, sparse } +Scored: { id, score } +``` + +### Key Trait + +```rust +pub trait HybridSearch { + fn insert(&mut self, doc: HybridDoc); + fn search_dense(&self, q: &HybridQuery, k: usize) -> Vec; + fn search_sparse(&self, q: &HybridQuery, k: usize) -> Vec; + // Provided: + fn search_rrf(&self, ...) -> Vec; + fn search_linear(&self, ...) -> Vec; +} +``` + +### Architecture Diagram + +```mermaid +graph TD + Q["HybridQuery\n(dense: DenseVec,\n sparse: SparseVec)"] + + Q --> D["DenseFlatIndex\n(inner-product scan)"] + Q --> S["SparseIndex\n(inverted index,\n BM25/SPLADE weights)"] + + D -- "top candidate_k\nScored[]" --> F["Fusion Layer\n(fusion::rrf\nfusion::linear\nfusion::max_signal)"] + S -- "top candidate_k\nScored[]" --> F + + F --> R["Top-K results\nVec"] + + subgraph "HybridIndex" + D + S + end + + subgraph "Future: plug-in ANN" + HNSW["ruvector-core HNSW\n(drop-in for DenseFlatIndex)"] + end + + HNSW -.->|"implements HybridSearch::search_dense"| F +``` + +### Baseline Variant: DenseOnly + +Brute-force inner-product scan. O(N·D) per query. 100% recall against its own oracle. 791µs per query at N=5K, D=128. + +### Alternative Variant A: SparseOnly + +BM25/SPLADE inverted index traversal. Only matching posting lists are visited; non-matching documents are implicitly scored 0. 30.7µs per query — 25× faster than dense flat scan. Misses semantically relevant documents with vocabulary mismatch. + +### Alternative Variant B: HybridRRF + +Retrieve `candidate_k=50` results from each channel, fuse with RRF (k=60). Recall 30.1% vs balanced oracle (up from 27.2% SparseOnly). Overhead: 33µs per query above the dense baseline. + +--- + +## Benchmark Methodology + +- **Platform**: x86_64 Linux 6.18.5, rustc 1.94.1, `cargo run --release` +- **Dataset**: N=5,000 synthetic documents, D=128 Gaussian L2-normalised dense vectors, sparse BM25 term vectors with vocab=1,000 and ~20 unique terms per document (deduplicated). Term weights computed by `bm25_weights()` with k₁=1.5, b=0.75. +- **Queries**: 500 Gaussian queries with ~5 sparse query terms. Seeded at 2026 for reproducibility. +- **Warmup**: 20 queries discarded before timing. +- **Oracle**: Exact linear fusion α=0.5 over all 5,000 documents. This defines the "correct" top-10 for each query. Hybrid variants use `candidate_k=50` (1% of corpus) before fusion. +- **Latency**: `std::time::Instant::now()` per query, sorted, p50/p95 extracted. +- **Memory**: Calculated directly: dense = N × D × 4 bytes; sparse = total posting list entries × 8 bytes. +- **No external benchmark services. No aspirational numbers. No competitor data collected in this run.** + +### Cargo Command + +```bash +cargo run --release -p ruvector-hybrid --bin benchmark +``` + +--- + +## Real Benchmark Results + +All numbers from the run above (seed=2026, deterministic). + +``` +════════════════════════════════════════════════════════════════════ + ruvector-hybrid benchmark +════════════════════════════════════════════════════════════════════ + OS : linux + Arch : x86_64 + Rustc : rustc 1.94.1 (e408947bf 2026-03-25) + Dataset : N=5000 D=128 vocab=1000 doc_terms=20 + Queries : 500 K=10 candidate_K=50 warmup=20 +════════════════════════════════════════════════════════════════════ + Build : 14.2ms + Mem : dense=2500KB sparse=774KB total=3274KB + + Variant | Mean µs p50 µs p95 µs QPS Recall@10 Memory + ───────────────────────────────────────────────────────────────────── + DenseOnly | 791.4 793.2 851.8 1,264 12.9% 2,500KB + SparseOnly | 30.7 30.0 45.3 32,548 27.2% 774KB + HybridRRF | 824.5 830.3 879.5 1,213 30.1% 3,274KB + HybridLinear | 826.0 830.8 880.4 1,211 29.8% 3,274KB + +═══ Acceptance Tests ════════════════════════════════════════════════ + [PASS] HybridRRF recall > min(Dense,Sparse) (30.1% > 12.9%) + [PASS] HybridLinear recall > min(Dense,Sparse)(29.8% > 12.9%) + [PASS] HybridRRF no regression vs best single (30.1% >= 27.2%-2%) + [PASS] HybridLinear no regression (29.8% >= 27.2%-2%) + [PASS] Fusion overhead <= 500µs (RRF=33µs Linear=35µs) + + ✓ ALL ACCEPTANCE TESTS PASSED +``` + +### Interpreting the Recall Numbers + +- **Oracle** = exact hybrid (α=0.5) over all 5,000 docs. This is the ceiling. +- **candidate_k=50**: each channel returns 50 results before fusion. At 1% of corpus, the oracle's top-10 may include documents that rank 51st–5000th in one or both channels — those are missed. +- **DenseOnly 12.9%**: Dense signal contributes ~half the oracle score; without the sparse signal, ~87% of oracle top-10 are invisible to dense alone. +- **SparseOnly 27.2%**: Sparse contributes the dominant signal for this synthetic dataset; SparseOnly captures more of the oracle. +- **HybridRRF 30.1%**: RRF combines both lists and improves recall by 2.9 pp vs SparseOnly, capturing documents that rank well in dense but not top-10 in sparse. +- **Fusion overhead** (33–35µs): Both fusion strategies add < 0.05ms per query. + +### Memory Math + +- Dense flat index: `N × D × sizeof(f32)` = 5,000 × 128 × 4 = 2,560,000 bytes ≈ **2,500 KB** +- Sparse inverted index: total posting entries × 8 bytes ≈ 5,000 docs × 20 terms/doc × 8 bytes ≈ 800 KB (actual: **774 KB** because BM25 drops zero-weight terms) +- Combined: **3,274 KB** total for a 5K-doc hybrid index + +For 1M documents: dense ≈ 512 MB, sparse (BM25, 20 terms/doc) ≈ 160 MB, total ≈ **672 MB** — within a single machine's RAM for typical deployments. + +--- + +## How It Works — Walkthrough + +### 1. Document Ingestion + +```rust +let mut idx = HybridIndex::new(128); +idx.insert(HybridDoc { + id: 42, + dense: DenseVec::new(embedding), // L2-normalised float vector + sparse: bm25_weights(&terms, &tf, &df, n, ...), // BM25 term weights +}); +``` + +`HybridIndex::insert` fans out to two sub-indexes: `DenseFlatIndex` appends the vector, and `SparseIndex` updates posting lists for each active term. + +### 2. Sparse Retrieval + +```rust +// SparseIndex::search_sparse +for &(term_id, q_weight) in &query.sparse.terms { + for &(doc_id, d_weight) in &posting_lists[term_id] { + scores[doc_id] += q_weight * d_weight; // inner product accumulation + } +} +``` + +Only documents containing at least one query term are scored. Documents with zero overlap are never visited — this is the key efficiency advantage. For a query with 5 terms and average posting length 100, that is 500 multiply-adds, not 5,000. + +### 3. RRF Fusion + +```rust +// Reciprocal Rank Fusion (Cormack et al. 2009) +for (rank, doc) in dense_top_50.iter().enumerate() { + scores[doc.id] += 1.0 / (60.0 + rank as f32 + 1.0); +} +for (rank, doc) in sparse_top_50.iter().enumerate() { + scores[doc.id] += 1.0 / (60.0 + rank as f32 + 1.0); +} +``` + +A document appearing at rank 1 in both lists gets `2 / 61 = 0.0328`. A document appearing at rank 51 in both gets `2 / 111 = 0.0180`. Documents in only one list get a single term. RRF naturally handles score scale differences between dense (cosine, typically -1 to +1) and sparse (BM25, unbounded positive). + +--- + +## Practical Failure Modes + +| Failure | Cause | Mitigation | +|---------|-------|-----------| +| High-recall sparse but low-recall dense | Dense embeddings fail on rare jargon | Increase candidate_k or add exact match fallback | +| Vocabulary mismatch in sparse | BM25 has no query expansion | Use SPLADE impact scores instead of BM25 weights | +| candidate_k too small | Top-k misses oracle members | Profile recall@oracle vs candidate_k; 100-200 typical production setting | +| RRF pulls in dense trash | Dense-only relevant docs dragged down by sparse misses | Tune α in linear fusion toward the stronger signal per query type | +| BM25 gives high weight to stop words | Missing stop-word filtering | Apply stop-word filter before `bm25_weights()` | +| Memory pressure | N=1M with D=768 and SPLADE terms | Quantize dense to int8; prune sparse posting lists below threshold | + +--- + +## Security and Governance Implications + +- **No external service dependency**: `SparseIndex` is in-memory. No telemetry surface. +- **Proof-gated inserts**: `ruvector-verified` can wrap `HybridIndex::insert` to produce tamper-evident write receipts — critical for RAG safety in regulated environments. +- **Score manipulation**: An adversary who can influence document term weights can inflate BM25 scores. Input validation at the system boundary (before `bm25_weights()`) is mandatory. +- **Vocabulary poisoning**: If query terms are user-controlled, validate against an allowlist before inverted index traversal to prevent posting list enumeration attacks. + +--- + +## Edge and WASM Implications + +`ruvector-hybrid` compiles to WASM without any feature flags or `cfg` gating: + +```bash +cargo build --target wasm32-unknown-unknown -p ruvector-hybrid +``` + +The crate has no `unsafe` blocks, no `std::fs` calls, no `std::net`, no `std::thread`. Its only dependencies (`rand`, `rand_distr`) are also WASM-compatible with `wasm-js` feature. + +On Cognitum Seed (Pi Zero 2W, 512 MB RAM): a 50K-doc hybrid index (BM25, 20 terms/doc, D=128) would require approximately 50 MB dense + 8 MB sparse = 58 MB — comfortably within the device RAM budget. + +--- + +## MCP and Agent Workflow Implications + +`HybridSearch::search_rrf` is a drop-in implementation for the MCP `memory_search` tool: + +``` +[Agent] → memory_search(query_text, query_embedding, k=10) + → HybridQuery { dense: embed(query_text), sparse: bm25(query_text) } + → HybridIndex::search_rrf(q, 10, 50) + → top-10 memory entries +``` + +This gives agents the ability to find memories by: +- **Semantic similarity** (dense leg) — "what was the architecture discussion?" +- **Exact name match** (sparse leg) — "ADR-194" +- **Combined** (RRF) — the natural human query that contains both intents + +ruFlo can schedule nightly index compaction (merge small sparse posting list segments), α recalibration (update linear fusion weight based on query feedback), and vocabulary refresh (add new terms from recent inserts). + +--- + +## Practical Applications + +1. **Agent memory search**: Dense leg captures semantic context; sparse leg captures exact identifiers, code tokens, dates. +2. **Graph RAG**: Sparse leg anchors retrieval to named entities in the graph; dense leg bridges to semantically adjacent nodes. +3. **Enterprise semantic search**: BM25 satisfies compliance requirement for keyword auditability; dense improves recall on paraphrases. +4. **MCP memory tools**: `memory_search` MCP tool directly backed by `HybridIndex`. +5. **Local-first AI assistants**: WASM-compiled hybrid index runs in-browser with no server. +6. **Edge anomaly detection**: Sparse matches known anomaly signatures; dense captures novel but similar patterns. +7. **Code intelligence**: Sparse matches exact token names; dense captures semantic code patterns. +8. **Workflow automation with ruFlo**: ruFlo uses hybrid search to find relevant past workflow templates by name and semantic similarity. + +--- + +## Exotic Applications + +1. **Cognitum Seed edge cognition**: A WASM hybrid index running on Pi Zero 2W enables on-device memory for autonomous agents without cloud RTT. +2. **RVM coherence domains**: Coherence scores (from ruvector-mincut) modulate the fusion α per domain partition — high-coherence domains favor dense; fragmented domains favor sparse. +3. **Proof-gated RAG**: Extend `HybridIndex::insert` with ruvector-verified witness proofs. Every retrieval can be audited back to the original insert proof. +4. **Swarm memory**: Each agent in a swarm maintains a local HybridIndex shard; queries fan out across shards and results are merged via a meta-RRF step. +5. **Self-healing vector graphs**: When dense embeddings drift (model updates), the sparse leg maintains continuity — the inverted index preserves exact symbolic references across embedding changes. +6. **Agent operating systems**: In a future where agents have persistent memory, HybridSearch is the syscall for "look up this concept in long-term memory." +7. **Bio-signal memory**: EEG feature extraction produces both dense spectral vectors and sparse event-label codes. HybridIndex unifies both for patient-level seizure pattern retrieval. +8. **Synthetic nervous systems**: Dense vectors model continuous sensory state; sparse vectors encode discrete symbolic events. Hybrid retrieval bridges the two representations in artificial cognition systems. + +--- + +## Deep Research Notes + +### What SOTA Tells Us + +1. BM25 is not going away. It is faster, more interpretable, and requires no model. For structured text (names, IDs, code), it still outperforms dense-only. +2. SPLADE-style learned sparse is the future of the sparse leg — but requires a fine-tuned model. The `SparseVec` / `SparseIndex` interface in this crate is compatible with SPLADE output. +3. RRF is the safe default. Convex combination wins when labeled data is available for α calibration. +4. Block-Max Pruning (BMP) is the next critical optimization for the sparse leg — it can reduce sparse latency by 25x with zero recall loss in exact mode. +5. candidate_k matters: at 1% (50/5000), recall vs oracle is 30%. At 10% (500/5000), we would approach 70%+. The correct production setting depends on latency budget. + +### What Remains Unsolved in This PoC + +- No BMP (Block-Max Pruning) — `SparseIndex::search` is O(Σ posting_length_per_query_term), not pruned. +- No HNSW dense leg — `DenseFlatIndex` is O(N·D), not approximate. Production requires swapping in `ruvector-core` HNSW. +- No score calibration — α=0.5 is a fixed default. Production needs per-corpus or per-query calibration. +- No streaming updates — `SparseIndex` is append-only. Deletions and compaction need delta log integration. +- No stop-word filtering — BM25 weights are computed as provided; the caller must apply vocabulary filtering. +- No quantization — dense vectors are f32; int8 quantization would halve dense memory. + +### What Would Make This Production-Grade + +1. Swap `DenseFlatIndex` for `ruvector-core` HNSW (already available). +2. Add BMP to `SparseIndex::search` (next nightly target). +3. Add `thresh_ratio` query pruning to `SparseVec` (zero terms below `thresh_ratio * max_weight`). +4. Add streaming inserts via `ruvector-delta-*` integration. +5. Add a calibration endpoint: given a small labeled set, learn α that maximizes Recall@10. +6. Add `ruvector-filter` metadata predicate integration before fusion. + +### What Would Falsify This Approach + +- If BM25+dense fusion consistently hurts recall vs dense-only for agent memory queries, then the sparse leg is noise. This would indicate that agent memory queries are always semantic, never keyword-exact — unlikely but worth measuring on real agent memory traces. +- If RRF+candidate_k=50 recall never exceeds 40% vs oracle on realistic corpora (not synthetic), then the architecture needs a higher-recall candidate generation stage (e.g., multi-probe HNSW for dense, BMP for sparse). + +--- + +## Production Crate Layout Proposal + +``` +crates/ruvector-hybrid/ +├── src/ +│ ├── lib.rs — types, HybridSearch trait, recall_at_k +│ ├── sparse.rs — SparseIndex, bm25_weights, (future: BlockMaxIndex) +│ ├── dense.rs — DenseFlatIndex (current), HnswDenseIndex (future) +│ ├── fusion.rs — rrf, linear, max_signal, oracle_top_k +│ ├── index.rs — HybridIndex (composite) +│ ├── main.rs — demo binary +│ └── benchmark.rs — benchmark binary +``` + +Future additions: +- `src/bmp.rs` — Block-Max Pruning for sparse posting traversal +- `src/calibrate.rs` — α learning from labeled query-result pairs +- `src/wasm.rs` — wasm-bindgen exports for WASM deployment +- `src/mcp.rs` — MCP tool bindings for agent memory search + +--- + +## What to Improve Next + +1. **Block-Max Pruning** in `SparseIndex` — the FOSDEM 2026 BMP Rust implementation [^11] is the model. Expected 10x–25x latency improvement for the sparse leg. +2. **HNSW dense integration** — replace `DenseFlatIndex` with `ruvector-core` HNSW via the `HybridSearch` trait. Expected 100x+ QPS improvement for dense leg at N=1M. +3. **Query term thresholding** — zeroing terms below 40% of max weight reduces sparse traversal cost by ~60% with <6% quality loss. [^12] +4. **Streaming delta log** — integrate `ruvector-delta-*` to support online inserts without full index rebuild. +5. **α calibration** — add a lightweight calibration method using 50–200 labeled query pairs. + +--- + +## References and Footnotes + +[^1]: Qdrant sparse embeddings for e-commerce search, qdrant.tech/articles/sparse-embeddings-ecommerce-part-1/, accessed 2026-05-20. + +[^2]: Milvus 2.6 hybrid search with BM25 sparse vectors, dasroot.net/posts/2026/04/vector-databases-rag-qdrant-milvus-weaviate-comparison-2026/, accessed 2026-05-20. + +[^3]: Weaviate BlockMax WAND and Relative Score Fusion, marktechpost.com/2026/05/10/best-vector-databases-in-2026-pricing-scale-limits-and-architecture-tradeoffs-across-nine-leading-systems/, accessed 2026-05-20. + +[^4]: Elasticsearch Hybrid Search with ELSER, elastic.co/search-labs/blog/hybrid-search-elasticsearch, accessed 2026-05-20. + +[^5]: Vespa in-plan fusion of WAND and nearestNeighbor, venturebeat.com/ai/from-shiny-object-to-sober-reality-the-vector-database-story-two-years-later/, accessed 2026-05-20. + +[^6]: LanceDB hybrid search, callsphere.ai/blog/vector-database-benchmarks-2026-pgvector-qdrant-weaviate-milvus-lancedb, accessed 2026-05-20. + +[^7]: VectorChord-BM25: Hybrid Search with Postgres Native BM25, blog.vectorchord.ai/hybrid-search-with-postgres-native-bm25-and-vectorchord, accessed 2026-05-20. + +[^8]: Cormack, G. V., Clarke, C. L. A., & Buettcher, S. (2009). Reciprocal Rank Fusion Outperforms Condorcet and Individual Rank Learning Methods. SIGIR 2009. Extended analysis: An Analysis of Fusion Functions for Hybrid Retrieval, arXiv:2210.11934, ACM TOIS 2023. + +[^9]: SPLADE++: Efficient Neural Sparse Retrieval, Formal et al., NAVER Labs, arXiv:2306.11293, accessed 2026-05-20. + +[^10]: BGE-M3: Multi-Lingual, Multi-Functionality, Multi-Granularity Text Embeddings Through Self-Knowledge Distillation, BAAI, huggingface.co/BAAI/bge-m3, accessed 2026-05-20. + +[^11]: Faster Learned Sparse Retrieval with Block-Max Pruning, SIGIR 2024, arXiv:2405.01117. FOSDEM 2026 Rust implementation: fosdem.org/2026/schedule/event/CB7MBQ-rust-block-max-pruning/, accessed 2026-05-20. + +[^12]: Efficiency and Effectiveness of SPLADE Models on Billion-Scale Web Document Titles, arXiv:2511.22263, Nov 2025: "Thresholding at 40% of max logit cuts active query terms ~60% while retaining 94%+ quality." + +[^13]: The Role of Vocabularies in Learning Sparse Representations for Ranking, arXiv:2509.16621, Sep 2025. + +[^14]: Operational Advice for Dense and Sparse Retrievers: HNSW, Flat, or Inverted Indexes?, arXiv:2409.06464, ACL 2025: memory model for inverted indexes (8 bytes/posting) vs HNSW (50-200 bytes/doc). diff --git a/docs/research/nightly/2026-05-20-hybrid-sparse-dense/gist.md b/docs/research/nightly/2026-05-20-hybrid-sparse-dense/gist.md new file mode 100644 index 0000000000..64d54fb784 --- /dev/null +++ b/docs/research/nightly/2026-05-20-hybrid-sparse-dense/gist.md @@ -0,0 +1,303 @@ +# ruvector 2026: Hybrid Sparse-Dense Vector Search in Pure Rust — BM25 + ANN with RRF and Linear Fusion + +> **150-char summary:** Pure-Rust hybrid search combining BM25 sparse inverted index with dense ANN via Reciprocal Rank Fusion — ruvector's first dual-channel retrieval engine for AI agents. + +**One-sentence value proposition:** RuVector now retrieves by both semantic proximity (dense vectors) and exact term match (BM25 sparse) simultaneously — giving AI agents the dual-channel memory access that every production vector database in 2026 ships as standard. + +**Repository:** [github.com/ruvnet/ruvector](https://github.com/ruvnet/ruvector) +**Research branch:** `research/nightly/2026-05-20-hybrid-sparse-dense` +**Crate:** `crates/ruvector-hybrid` + +--- + +## Introduction + +Vector databases in 2025–2026 faced a quiet crisis: dense embedding search is excellent for semantic similarity but catastrophic for exact symbolic retrieval. If an AI agent's memory contains "ADR-194 was accepted on 2026-05-20," a dense query for "ADR-194" may return irrelevant entries because identifiers look alike in embedding space. The agent needs both a semantic leg (dense vectors) and a keyword leg (sparse inverted index) to function reliably. + +This is why every major vector database — Qdrant, Milvus 2.6, Weaviate, Elasticsearch, Vespa, LanceDB, pgvecto.rs — added hybrid search as a first-class feature in 2024–2026. The technique is not new: Reciprocal Rank Fusion (RRF) was published by Cormack et al. at SIGIR 2009. What changed is scale: AI agents now generate millions of memory writes per day, and retrieval quality directly determines reasoning quality. + +RuVector is designed as a Rust-native cognition substrate for agents: graph storage, vector search, coherence scoring, and edge deployment. But it was missing the sparse leg entirely. A query mixing semantic intent and exact symbolic reference ("find the coherence paper from last month") would silently fail to retrieve the identifier match. This nightly research closes that gap. + +Current vector databases only partially solve the problem. Qdrant adds sparse vectors as a secondary type with client-side RRF. Milvus stores them in its segment engine. But none of these are Rust-native, none compile to WASM without modification, and none integrate with RuVector's coherence scoring, mincut graph partitioning, or proof-gated write infrastructure. RuVector's `HybridSearch` trait is the correct interface for building a cognition-aware hybrid retrieval layer. + +The 10–20 year thesis: as AI agents accumulate long-term memory spanning months or years, the sparse leg becomes increasingly important. Symbolic references (names, dates, IDs, code tokens) are stable across embedding model upgrades; dense embeddings are not. Hybrid search is the retrieval layer that makes agent memory robust to model drift. + +--- + +## Features + +| Feature | What it does | Why it matters | Status | +|---------|-------------|----------------|--------| +| `SparseVec` type | Sorted `(term_id, f32)` pairs | Compatible with BM25 and SPLADE impact scores | Implemented in PoC | +| `SparseIndex` | BM25/SPLADE inverted posting list index | O(query_terms × avg_posting_len) per query, not O(N) | Implemented in PoC | +| `bm25_weights()` | Classic BM25 TF-IDF weight computation | Zero training cost, deterministic, auditable | Implemented in PoC | +| `DenseFlatIndex` | Exact inner-product dense search | Correct baseline; swap for HNSW at N>100K | Implemented in PoC | +| `HybridSearch` trait | Unified interface for all search modes | Dense leg swappable without touching fusion code | Implemented in PoC | +| Reciprocal Rank Fusion | RRF(k=60), parameter-free | Industry default in Qdrant, Milvus, Azure AI Search | Implemented in PoC | +| Linear score interpolation | α·dense_norm + (1-α)·sparse_norm | Beats RRF when labeled data available for α calibration | Implemented in PoC | +| Max-of-signals fusion | max(dense_norm, sparse_norm) | Useful when one signal dominates per query type | Implemented in PoC | +| Benchmark binary | Latency / QPS / memory / recall@10 with acceptance tests | Real numbers only — no aspirational claims | Measured | +| WASM-compatible | No unsafe, no OS syscalls, no heavyweight deps | Edge AI and in-browser deployment | Production candidate | +| SPLADE-compatible | `SparseVec` accepts any pre-computed impact scores | Upgrade from BM25 to learned sparse without index format change | Research direction | +| Block-Max Pruning | 10x–25x sparse leg speedup | Next nightly target (SIGIR 2024 BMP) | Research direction | +| HNSW dense leg | Swap `DenseFlatIndex` for `ruvector-core` HNSW | 100x+ QPS at N=1M | Production candidate | + +--- + +## Technical Design + +### Core Data Structure + +```rust +pub struct SparseVec { terms: Vec<(u32, f32)> } // sorted (term_id, weight) +pub struct DenseVec { data: Vec } // L2-normalised components +pub struct HybridDoc { id: u32, dense: DenseVec, sparse: SparseVec } +pub struct HybridQuery { dense: DenseVec, sparse: SparseVec } +pub struct Scored { id: u32, score: f32 } +``` + +### Trait-Based API + +```rust +pub trait HybridSearch { + fn insert(&mut self, doc: HybridDoc); + fn search_dense (&self, q: &HybridQuery, k: usize) -> Vec; + fn search_sparse(&self, q: &HybridQuery, k: usize) -> Vec; + // Provided by default: + fn search_rrf (&self, q: &HybridQuery, k: usize, candidate_k: usize) -> Vec; + fn search_linear(&self, q: &HybridQuery, k: usize, candidate_k: usize, alpha: f32) -> Vec; +} +``` + +### Baseline Variant: DenseOnly + +Brute-force flat inner-product scan. O(N·D) per query. Exact within its modality but blind to keyword signals. At N=5K, D=128: **791µs**, **1,264 QPS**, **12.9% recall** vs balanced oracle. + +### Alternative Variant A: SparseOnly + +BM25 inverted index traversal. Only documents containing at least one query term are visited. At N=5K, 20 terms/doc, 5 query terms: **31µs**, **32,548 QPS** (25× faster than dense), **27.2% recall** vs balanced oracle. Zero recall on documents with no term overlap. + +### Alternative Variant B: HybridRRF + +Retrieve `candidate_k=50` from each channel, fuse with `score(d) = Σ 1/(60 + rank(d))`. At N=5K: **825µs**, **1,213 QPS**, **30.1% recall** vs oracle. Overhead over dense baseline: **33µs per query**. + +### Memory Model + +``` +Dense: N × D × 4 bytes = 5,000 × 128 × 4 = 2,500 KB +Sparse: Σ_doc (terms_per_doc) × 8 bytes = 5,000 × 20 × 8 ≈ 774 KB (after BM25 term pruning) +Hybrid: dense + sparse = 3,274 KB total +``` + +### How This Fits RuVector + +```mermaid +graph LR + AG["AI Agent\n(MCP query)"] --> HQ["HybridQuery\ndense + sparse"] + HQ --> HI["HybridIndex\n(ruvector-hybrid)"] + HI --> DF["DenseFlatIndex\n→ ruvector-core HNSW"] + HI --> SI["SparseIndex\n→ BMP block-max pruning"] + DF --> F["fusion::rrf / linear"] + SI --> F + F --> R["Top-K Scored"] + R --> AG +``` + +--- + +## Benchmark Results + +**Hardware**: x86_64 Linux 6.18.5 +**OS**: linux +**Rust**: 1.94.1 (e408947bf 2026-03-25) +**Command**: `cargo run --release -p ruvector-hybrid --bin benchmark` + +| Variant | N | D | Queries | Mean µs | p50 µs | p95 µs | QPS | Memory | Recall@10 | Acceptance | +|---------|---|---|---------|---------|--------|--------|-----|--------|-----------|-----------| +| DenseOnly | 5,000 | 128 | 500 | 791.4 | 793.2 | 851.8 | 1,264 | 2,500 KB | 12.9% | Baseline | +| SparseOnly | 5,000 | 128 | 500 | 30.7 | 30.0 | 45.3 | 32,548 | 774 KB | 27.2% | Baseline | +| HybridRRF | 5,000 | 128 | 500 | 824.5 | 830.3 | 879.5 | 1,213 | 3,274 KB | 30.1% | **PASS** | +| HybridLinear | 5,000 | 128 | 500 | 826.0 | 830.8 | 880.4 | 1,211 | 3,274 KB | 29.8% | **PASS** | + +**Oracle**: exact linear fusion α=0.5 over all 5,000 documents (defines ground truth). +**candidate_k**: 50 per channel before fusion (1% of corpus — explains recall gap vs oracle). +**Index build time**: 14.2ms for N=5,000. +**All 5 acceptance tests: PASS.** + +**Benchmark limitations**: Synthetic Gaussian dataset; real corpora (MS MARCO, BEIR) would show different term distributions and recall patterns. SparseOnly is faster than dense because the synthetic sparse queries have 5 terms hitting ~100 docs each = 500 multiply-adds vs 640K for flat dense scan. + +--- + +## Comparison with Vector Databases + +| System | Core Strength | Where It's Strong | Where RuVector Differs | Direct Benchmark Here | +|--------|--------------|-------------------|----------------------|----------------------| +| Milvus | Billion-scale managed service | Cloud deployments, ML ops integration | Pure Rust, no Python, WASM-native, graph coherence | No | +| Qdrant | Rust-native, filter-first ANN (ACORN) | Filtered search, metadata-heavy corpora | RuVector adds mincut coherence, proof-gated writes, ruFlo | No | +| Weaviate | Module ecosystem, hybrid search | Multi-modal, LLM-native pipelines | RuVector is WASM-native, no GC, bare-metal edge capable | No | +| Pinecone | Fully managed, zero-ops | Enterprise with no ML infra | RuVector is local-first, no vendor lock-in | No | +| LanceDB | Columnar storage, embedded library | Offline / embedded applications | RuVector has graph storage, coherence, WASM targeting | No | +| FAISS | Maximum raw ANN throughput | CPU/GPU research benchmarks | RuVector adds graph layer, hybrid search, agent memory | No | +| pgvector | PostgreSQL integration | Existing Postgres deployments | RuVector is not SQL-coupled, WASM-native | No | +| Chroma | Dev-friendly Python-first | Rapid prototyping | RuVector is production Rust, zero Python dependency | No | +| Vespa | In-plan WAND+HNSW fusion | Low-latency enterprise search | RuVector is embeddable as a Rust crate, not a daemon | No | + +> Note: no direct benchmark comparison was conducted against competitors in this PoC. All numbers above are from the ruvector-hybrid binary only. Competitor claims are from their public documentation. + +--- + +## Practical Applications + +| Application | User | Why It Matters | How RuVector Uses It | Near-Term Path | +|-------------|------|----------------|----------------------|----------------| +| Agent memory search | AI coding assistants, RAG pipelines | Agents find memories by both semantic context and exact identifiers | `HybridIndex::search_rrf` as MCP `memory_search` backend | Wire into ruvnet MCP tool surface | +| Graph RAG | Enterprise knowledge retrieval | Sparse anchors retrieval to named graph nodes; dense bridges to adjacent concepts | `HybridIndex` + `ruvector-graph` node IDs aligned | Integrate graph node IDs as term IDs in `SparseVec` | +| Enterprise semantic search | Legal, medical, financial document search | BM25 satisfies keyword auditability; dense improves paraphrase recall | `HybridIndex` with domain-tuned vocabulary | Add document preprocessing pipeline | +| MCP memory tools | Claude Code, agent frameworks | Agents recall both "what does this feel like?" and "what was it called?" | `search_rrf` → MCP `memory_search` response | MCP feature flag in Phase 3 | +| Local-first AI assistants | Privacy-conscious users | No server round-trip; WASM binary runs in browser | `wasm32-unknown-unknown` build target | Add `features = ["wasm"]` | +| Edge anomaly detection | IoT / industrial monitoring | Sparse matches known signature labels; dense catches novel similar events | WASM on Cognitum Seed / Pi Zero 2W | Validate on 50K-doc index at 512MB RAM limit | +| Code intelligence | IDE assistants | Sparse matches exact token names; dense captures semantic patterns | Align with `ruvector-decompiler` token vocabulary | Token ID alignment across crates | +| Workflow automation | ruFlo autonomous loops | ruFlo retrieves relevant past workflows by name and semantic match | ruFlo calls `search_linear` with α tuned toward sparse for workflow names | ruFlo integration in Phase 3 | + +--- + +## Exotic Applications + +| Application | 10–20 Year Thesis | Required Advances | RuVector Role | Risk / Unknown | +|-------------|-------------------|-------------------|---------------|----------------| +| Cognitum edge cognition | Fully local AI agents on sub-$5 hardware running hybrid memory retrieval with zero cloud dependency | BMP sparse pruning for WASM; int8 dense quantization | `ruvector-hybrid` WASM binary on Pi Zero 2W with 50K-doc hybrid index | WASM 4GB address limit at scale | +| RVM coherence domains | Per-domain α tuning based on mincut coherence scores: high-coherence partitions favor dense, fragmented partitions favor sparse | `ruvector-mincut` coherence score integration into `HybridIndex::search_rrf` | α modulated by `CoherenceEngine::score(partition_id)` | Coherence signal may not correlate with optimal α | +| Proof-gated RAG | Every retrieved document has a verifiable write receipt; auditors can reconstruct any retrieval result | `ruvector-verified` witness chain integration with `HybridIndex::insert` | Tamper-evident hybrid memory for regulated industries | Performance overhead of hash chain at high insert rates | +| Swarm memory | Each agent in a multi-agent swarm maintains a local HybridIndex shard; queries fan out and results merge via meta-RRF | Distributed inverted index sharding via `ruvector-raft` | Meta-RRF across shard results with the same `fusion::rrf` kernel | Network latency dominates at shard count > 10 | +| Self-healing vector graphs | Dense embeddings drift across model upgrades; sparse leg preserves symbolic continuity across embedding changes | Model-agnostic symbolic layer; embedding migration tooling | Sparse index stable across model updates; dense index rebuilt on upgrade | Vocabulary mismatch across domain versions | +| Agent operating systems | In 2036–2046, AI agents have persistent multi-year memory; `search_rrf` is a fundamental OS syscall | Compression (inverted index pruning, dense quantization), streaming updates, distributed sharding | `ruvector-hybrid` as the memory retrieval kernel of an agent OS | Symbolic drift in sparse vocabulary over years | +| Bio-signal memory | EEG/EMG produces both dense spectral vectors and sparse discrete event codes; hybrid index unifies both | Real-time streaming inserts from sensor fusion pipeline | Dense: spectral embedding; sparse: event label → term ID mapping | Label vocabulary stability across sensor generations | +| Synthetic nervous systems | Artificial cognitive systems model continuous sensation (dense) and discrete symbolic cognition (sparse) simultaneously | Neurosymbolic integration beyond current architectures | `HybridSearch` as the grounding interface between symbolic and subsymbolic computation | The hard problem of symbol grounding remains open | + +--- + +## Deep Research Notes + +### What SOTA Tells Us + +1. **RRF is the safe default, convex combination wins with labels.** (arXiv:2210.11934) [^1] — RRF is parameter-free and robust but cannot be tuned. Linear interpolation with calibrated α consistently outperforms RRF when even 50 labeled query pairs are available. + +2. **BM25 inverted index is not the bottleneck; traversal is.** (SIGIR 2024 BMP, arXiv:2405.01117) [^2] — The bottleneck is visiting too many posting list entries per query. Block-Max Pruning skips blocks whose score upper bound falls below the current heap minimum, delivering 25x–59x speedup with zero recall loss in exact mode. + +3. **candidate_k is the primary recall lever.** At candidate_k=50 (1% of N=5K), oracle recall is ~30%. At candidate_k=500 (10%), oracle recall approaches 70%+. BMP makes higher candidate_k affordable. + +4. **Learned sparse (SPLADE) improves recall by 15–30% over BM25 on BEIR** but requires a fine-tuned BERT model. The `SparseVec` format is compatible with SPLADE output — upgrading is a weight-generation function change, not an index change. + +5. **Hybrid recall degrades gracefully.** Neither hybrid fusion method drops below the best single-channel baseline (30.1% ≥ 27.2%) — which means adding the sparse leg to a dense-only system is safe even before full calibration. + +### What Remains Unsolved + +- **Optimal candidate_k per corpus**: No analytical formula; requires empirical calibration per corpus. +- **α calibration with minimal labels**: How few labeled pairs are needed for reliable α estimation? +- **SPLADE without a model runtime**: Can a pure-Rust impact scorer approximate SPLADE without ONNX inference? +- **Vocabulary drift**: As agents acquire new knowledge, their sparse vocabulary changes. How should the inverted index handle new term IDs added after initial build? + +### Where This PoC Fits + +This PoC establishes the correct foundational interface (`HybridSearch` trait, `SparseVec` type, fusion functions) and validates that the fusion overhead is negligible (33–35µs per query). It does not claim production readiness: the flat dense leg is O(N·D), the sparse leg has no pruning, and there are no streaming inserts. Both of these will be addressed in subsequent nightlies. + +### What Would Falsify the Approach + +If agent memory retrieval on real workloads shows that: +- Sparse recall@10 is consistently ≤ 2% (i.e., agent queries never contain keyword-exact intent), then BM25 adds noise without benefit. +- Hybrid RRF recall is consistently ≤ max(dense, sparse) recall (i.e., fusion actively hurts), then there is a data quality issue in the sparse weight generation. + +Both can be measured empirically — the benchmark binary's `recall_at_k` function is the tool. + +### Sources + +[^1]: An Analysis of Fusion Functions for Hybrid Retrieval, Cormack, Clarke, Buettcher, ACM TOIS 2023, arXiv:2210.11934, accessed 2026-05-20. +[^2]: Faster Learned Sparse Retrieval with Block-Max Pruning, SIGIR 2024, arXiv:2405.01117; FOSDEM 2026 Rust implementation at fosdem.org/2026/schedule/event/CB7MBQ-rust-block-max-pruning/, accessed 2026-05-20. +[^3]: Efficiency and Effectiveness of SPLADE Models on Billion-Scale Web Document Titles, arXiv:2511.22263, Nov 2025, accessed 2026-05-20. +[^4]: Operational Advice for Dense and Sparse Retrievers: HNSW, Flat, or Inverted Indexes?, arXiv:2409.06464, ACL 2025, accessed 2026-05-20. +[^5]: BGE-M3, BAAI, huggingface.co/BAAI/bge-m3, accessed 2026-05-20. + +--- + +## Usage Guide + +```bash +git checkout research/nightly/2026-05-20-hybrid-sparse-dense +cargo build --release -p ruvector-hybrid +cargo test -p ruvector-hybrid +cargo run --release -p ruvector-hybrid --bin hybrid-demo +cargo run --release -p ruvector-hybrid --bin benchmark +``` + +**Expected demo output:** +``` +ruvector-hybrid demo (N=2000, D=128, vocab=500, K=10) +Indexed 2000 documents. Memory ≈ 1381.7 KB + +Recall@10 vs oracle (hybrid α=0.5) over 100 queries: + DenseOnly : 21.1% + SparseOnly : 24.1% + HybridRRF : 38.3% + HybridLinear : 38.2% +``` + +**To change dataset size**: Edit `N`, `DIMS`, `VOCAB`, `DOC_TERMS`, `N_QUERIES` constants in `src/benchmark.rs`. + +**To add a new backend**: Implement `HybridSearch` for your struct. Only `insert`, `search_dense`, and `search_sparse` are required — `search_rrf` and `search_linear` are provided by default. + +**To plug into ruvector-core HNSW**: +```rust +// Implement HybridSearch for a new HnswHybridIndex struct +impl HybridSearch for HnswHybridIndex { + fn search_dense(&self, q: &HybridQuery, k: usize) -> Vec { + self.hnsw.search(&q.dense.data, k) // ruvector-core call + .into_iter().map(|(id, score)| Scored::new(id, score)).collect() + } + // ... insert and search_sparse as before +} +``` + +--- + +## Optimization Guide + +| Axis | Action | Expected Impact | +|------|--------|----------------| +| Memory | Quantize dense to int8 | Halve dense memory at small recall cost | +| Latency (sparse) | Implement Block-Max Pruning | 10x–25x sparse leg speedup | +| Latency (dense) | Swap flat scan for HNSW | 100x+ QPS at N > 100K | +| Recall | Increase candidate_k | 70%+ oracle recall at candidate_k=500 | +| Recall | Apply query term thresholding (thresh=0.4) | 60% fewer posting list visits, <6% recall loss | +| Edge / WASM | Enable `wasm32-unknown-unknown` target | Run hybrid search in browser or on Cognitum Seed | +| MCP tool | Wire `search_rrf` to MCP `memory_search` | Agent memory recall improvement without API change | +| ruFlo | Schedule nightly α recalibration | Automatic recall optimization on query feedback | + +--- + +## Roadmap + +### Now +- `crates/ruvector-hybrid` workspace member, build green, 16 tests passing +- `HybridSearch` trait stable for downstream integration +- Demo and benchmark binaries with real numbers + +### Next +- Block-Max Pruning (BMP) in `SparseIndex` — SIGIR 2024 algorithm, FOSDEM 2026 Rust reference +- HNSW swap-in via `ruvector-core` behind `features = ["hnsw"]` +- Query term thresholding: zero terms below `thresh_ratio × max_weight` +- `ruvector-filter` predicate integration before fusion step + +### Later (2030–2046) +- Neurosymbolic grounding: dense vectors represent continuous state; sparse terms represent discrete symbolic events; hybrid search bridges both in artificial cognition systems +- Proof-gated hybrid memory: every retrieved document has a verifiable creation receipt; RuVector becomes a tamper-evident memory substrate for regulated AI agents +- Distributed swarm memory: `HybridSearch` trait implemented over `ruvector-raft` shards; meta-RRF across the cluster + +--- + +## Keywords + +ruvector, Rust vector database, Rust vector search, hybrid search, sparse dense search, BM25 vector search, reciprocal rank fusion, RRF, linear fusion, agent memory, AI agents, MCP, WASM AI, edge AI, ANN search, HNSW, DiskANN, filtered vector search, graph RAG, self learning vector database, ruvnet, ruFlo, Claude Flow, autonomous agents, retrieval augmented generation, SPLADE, inverted index, term weight retrieval. + +## Suggested GitHub Topics + +rust, vector-database, vector-search, hybrid-search, sparse-dense, bm25, rrf, ann, hnsw, rag, graph-rag, ai-agents, agent-memory, mcp, wasm, edge-ai, rust-ai, semantic-search, graph-database, autonomous-agents, retrieval, embeddings, ruvector.