diff --git a/crates/Cargo.toml b/crates/Cargo.toml index 289e4cc3..28a348f0 100644 --- a/crates/Cargo.toml +++ b/crates/Cargo.toml @@ -9,6 +9,7 @@ members = [ "khive-query", "khive-gate", "khive-gate-rego", + "khive-fusion", "khive-runtime", "khive-request", "khive-pack-kg", diff --git a/crates/khive-fusion/Cargo.toml b/crates/khive-fusion/Cargo.toml new file mode 100644 index 00000000..6ccc1856 --- /dev/null +++ b/crates/khive-fusion/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "khive-fusion" +version.workspace = true +edition.workspace = true +authors.workspace = true +license.workspace = true +repository.workspace = true +homepage.workspace = true +keywords.workspace = true +categories.workspace = true +description = "Rank fusion strategies (RRF, Weighted, Union) with deterministic scoring" + +[dependencies] +khive-score = { version = "0.2.0", path = "../khive-score" } +serde = { workspace = true } diff --git a/crates/khive-fusion/src/fuse.rs b/crates/khive-fusion/src/fuse.rs new file mode 100644 index 00000000..0e51c880 --- /dev/null +++ b/crates/khive-fusion/src/fuse.rs @@ -0,0 +1,172 @@ +//! Main fusion entry point. + +use khive_score::DeterministicScore; +use std::hash::Hash; + +use super::rrf::reciprocal_rank_fusion; +use super::strategy::FusionStrategy; +use super::union::union_fusion; +use super::weighted::weighted_fusion; + +/// Fuse multiple ranked result lists into a single ranked list. +/// +/// This is the main entry point for rank fusion. It supports multiple fusion +/// strategies and is generic over the ID type. +/// +/// # Arguments +/// +/// * `sources` - Vector of result lists from different retrievers. +/// Each list contains `(Id, DeterministicScore)` pairs, already sorted +/// by score descending (best first). +/// * `strategy` - The fusion strategy to use. +/// * `top_k` - Maximum number of results to return. +/// +/// # Returns +/// +/// A vector of `(Id, DeterministicScore)` pairs sorted by fused score descending, +/// truncated to `top_k` results. +/// +/// # Type Parameters +/// +/// * `Id` - The identifier type. Must implement `Eq`, `Hash`, `Clone`, and `Ord`. +/// Works with `EmbeddingId`, `DocumentId`, `String`, `Uuid`, etc. +/// `Ord` is required for deterministic tie-breaking when scores are equal. +/// +/// # Example +/// +/// ```rust +/// use khive_fusion::{fuse, FusionStrategy}; +/// use khive_score::DeterministicScore; +/// +/// let sources = vec![ +/// vec![("a", DeterministicScore::from_f64(0.9))], +/// vec![("a", DeterministicScore::from_f64(0.8))], +/// ]; +/// +/// let results = fuse(sources, &FusionStrategy::default(), 10); +/// assert_eq!(results.len(), 1); +/// ``` +pub fn fuse( + sources: Vec>, + strategy: &FusionStrategy, + top_k: usize, +) -> Vec<(Id, DeterministicScore)> { + if sources.is_empty() || top_k == 0 { + return Vec::new(); + } + + let fused = match strategy { + FusionStrategy::Rrf { k } => reciprocal_rank_fusion(sources, *k), + FusionStrategy::Weighted { weights } => weighted_fusion(sources, weights), + FusionStrategy::Union => union_fusion(sources), + // VectorOnly / KeywordOnly: the caller is responsible for ensuring only + // the relevant source list is passed. Within fuse(), we take the union + // (max-score per ID) which is a no-op when there is a single source. + FusionStrategy::VectorOnly | FusionStrategy::KeywordOnly => union_fusion(sources), + }; + + // Truncate to top_k + fused.into_iter().take(top_k).collect() +} + +#[cfg(test)] +mod tests { + use super::*; + + fn make_results(items: Vec<(Id, f64)>) -> Vec<(Id, DeterministicScore)> { + items + .into_iter() + .map(|(id, score)| (id, DeterministicScore::from_f64(score))) + .collect() + } + + #[test] + fn test_fuse_rrf_strategy() { + let source = make_results(vec![("doc_a", 0.9), ("doc_b", 0.8)]); + let fused = fuse(vec![source], &FusionStrategy::rrf(), 10); + + assert_eq!(fused.len(), 2); + } + + #[test] + fn test_fuse_weighted_strategy() { + let source = make_results(vec![("doc_a", 1.0)]); + let fused = fuse(vec![source], &FusionStrategy::weighted(vec![1.0]), 10); + + assert_eq!(fused.len(), 1); + } + + #[test] + fn test_fuse_union_strategy() { + let source = make_results(vec![("doc_a", 0.9)]); + let fused = fuse(vec![source], &FusionStrategy::union(), 10); + + assert_eq!(fused.len(), 1); + } + + #[test] + fn test_fuse_top_k_truncation() { + let source = make_results(vec![ + ("doc_a", 0.9), + ("doc_b", 0.8), + ("doc_c", 0.7), + ("doc_d", 0.6), + ("doc_e", 0.5), + ]); + + let fused = fuse(vec![source], &FusionStrategy::rrf(), 3); + + assert_eq!(fused.len(), 3); + assert_eq!(fused[0].0, "doc_a"); + assert_eq!(fused[1].0, "doc_b"); + assert_eq!(fused[2].0, "doc_c"); + } + + #[test] + fn test_fuse_top_k_zero() { + let source = make_results(vec![("doc_a", 0.9)]); + let fused = fuse(vec![source], &FusionStrategy::rrf(), 0); + + assert!(fused.is_empty()); + } + + #[test] + fn test_fuse_empty_sources() { + let fused: Vec<(&str, DeterministicScore)> = fuse(vec![], &FusionStrategy::rrf(), 10); + assert!(fused.is_empty()); + } + + #[test] + fn test_fuse_top_k_larger_than_results() { + let source = make_results(vec![("doc_a", 0.9), ("doc_b", 0.8)]); + let fused = fuse(vec![source], &FusionStrategy::rrf(), 100); + + assert_eq!(fused.len(), 2); + } + + #[test] + fn test_fuse_with_string_ids() { + let source: Vec<(String, DeterministicScore)> = vec![ + ("doc_a".to_string(), DeterministicScore::from_f64(0.9)), + ("doc_b".to_string(), DeterministicScore::from_f64(0.8)), + ]; + + let fused = fuse(vec![source], &FusionStrategy::rrf(), 10); + + assert_eq!(fused.len(), 2); + assert_eq!(fused[0].0, "doc_a"); + } + + #[test] + fn test_fuse_with_integer_ids() { + let source: Vec<(u64, DeterministicScore)> = vec![ + (1, DeterministicScore::from_f64(0.9)), + (2, DeterministicScore::from_f64(0.8)), + ]; + + let fused = fuse(vec![source], &FusionStrategy::rrf(), 10); + + assert_eq!(fused.len(), 2); + assert_eq!(fused[0].0, 1); + } +} diff --git a/crates/khive-fusion/src/lib.rs b/crates/khive-fusion/src/lib.rs new file mode 100644 index 00000000..007da8d5 --- /dev/null +++ b/crates/khive-fusion/src/lib.rs @@ -0,0 +1,68 @@ +//! Fusion algorithms for combining retrieval results. +//! +//! This module implements rank fusion strategies for hybrid search, combining +//! results from multiple retrieval sources (e.g., vector search, keyword search). +//! +//! # Supported Strategies +//! +//! - **RRF (Reciprocal Rank Fusion)**: Default and recommended. Uses only ranks, +//! making it robust to score distribution differences. +//! - **Weighted**: Linear combination of scores with configurable weights. +//! - **Union**: Takes the maximum score per ID across sources. +//! +//! # Algorithm +//! +//! RRF formula: +//! ```text +//! score(d) = Σ 1/(k + rank_i(d)) +//! ``` +//! where: +//! - k = 60 (standard, dampens high-rank dominance) +//! - rank_i(d) = position of d in retriever i's results (1-indexed) +//! - If d not in retriever i, contribution = 0 +//! +//! # Example +//! +//! ```rust +//! use khive_fusion::{fuse, FusionStrategy, reciprocal_rank_fusion}; +//! use khive_score::DeterministicScore; +//! +//! // Two retrieval sources with different rankings +//! let vector_results = vec![ +//! ("doc_a", DeterministicScore::from_f64(0.95)), +//! ("doc_b", DeterministicScore::from_f64(0.90)), +//! ("doc_c", DeterministicScore::from_f64(0.85)), +//! ]; +//! +//! let keyword_results = vec![ +//! ("doc_b", DeterministicScore::from_f64(0.88)), +//! ("doc_c", DeterministicScore::from_f64(0.75)), +//! ("doc_d", DeterministicScore::from_f64(0.70)), +//! ]; +//! +//! // Fuse using RRF with k=60 +//! let fused = fuse( +//! vec![vector_results, keyword_results], +//! &FusionStrategy::Rrf { k: 60 }, +//! 5, +//! ); +//! +//! // doc_b appears in both sources, so it gets highest RRF score +//! assert_eq!(fused[0].0, "doc_b"); +//! ``` + +mod fuse; +mod rrf; +mod strategy; +mod union; +mod weighted; + +#[cfg(test)] +mod tests; + +// Re-export public types and functions +pub use fuse::fuse; +pub use rrf::reciprocal_rank_fusion; +pub use strategy::{FusionStrategy, DEFAULT_RRF_K}; +pub use union::union_fusion; +pub use weighted::{normalize_weights, weighted_fusion, weights_are_normalized}; diff --git a/crates/khive-fusion/src/rrf.rs b/crates/khive-fusion/src/rrf.rs new file mode 100644 index 00000000..377a9151 --- /dev/null +++ b/crates/khive-fusion/src/rrf.rs @@ -0,0 +1,206 @@ +//! Reciprocal Rank Fusion (RRF) algorithm. +//! +//! # Properties +//! +//! - Better rank → higher contribution: r1 < r2 → contrib(r1) > contrib(r2) +//! - Present documents always outscore absent +//! - Contribution upper bound: ≤ 1/(k+1) +//! - Total score ≤ number of sources +//! - Sum is permutation invariant (order-independent) +//! - Ties broken by ID for deterministic cross-platform ordering + +use khive_score::DeterministicScore; +use std::cmp::Ordering; +use std::collections::HashMap; +use std::hash::Hash; + +/// Reciprocal Rank Fusion (RRF) algorithm. +/// +/// Combines ranked lists using only rank information, ignoring original scores. +/// This makes it robust to different score distributions and outliers. +/// +/// # Formula +/// +/// For each document d across all sources: +/// ```text +/// score(d) = Σ 1/(k + rank_i(d)) +/// ``` +/// where rank_i(d) is the 1-indexed position of d in source i. +/// +/// # Arguments +/// +/// * `sources` - Vector of result lists. Each list should be sorted by +/// score descending (best first). The scores are ignored; only positions matter. +/// * `k` - Smoothing constant. Standard value is 60. +/// +/// # Returns +/// +/// A vector of `(Id, DeterministicScore)` pairs sorted by RRF score descending. +/// +/// # Example +/// +/// ```rust +/// use khive_fusion::reciprocal_rank_fusion; +/// use khive_score::DeterministicScore; +/// +/// let source1 = vec![ +/// ("doc_a", DeterministicScore::from_f64(0.9)), // rank 1 +/// ("doc_b", DeterministicScore::from_f64(0.8)), // rank 2 +/// ]; +/// let source2 = vec![ +/// ("doc_b", DeterministicScore::from_f64(0.95)), // rank 1 +/// ("doc_a", DeterministicScore::from_f64(0.7)), // rank 2 +/// ]; +/// +/// let fused = reciprocal_rank_fusion(vec![source1, source2], 60); +/// +/// // doc_a: 1/(60+1) + 1/(60+2) = 1/61 + 1/62 ≈ 0.0326 +/// // doc_b: 1/(60+2) + 1/(60+1) = 1/62 + 1/61 ≈ 0.0326 +/// // Scores are equal since both appear at ranks 1 and 2 (just swapped) +/// // Ties are broken by ID (lexicographic order) for determinism +/// ``` +/// +/// **Property**: better rank → higher contribution. +/// Better rank yields higher contribution: r1 < r2 implies 1/(k+r1) > 1/(k+r2). +/// +/// **Property**: present > absent. +/// Documents present in any source always outscore documents absent from all sources +/// (absent documents have score 0, present documents have score > 0). +pub fn reciprocal_rank_fusion( + sources: Vec>, + k: usize, +) -> Vec<(Id, DeterministicScore)> { + if sources.is_empty() { + return Vec::new(); + } + + // Ensure k >= 1 to avoid division issues + let k = k.max(1); + + // Estimate capacity as sum of all source lengths (upper bound on unique IDs) + let estimated_capacity: usize = sources.iter().map(|s| s.len()).sum(); + let mut combined: HashMap = HashMap::with_capacity(estimated_capacity); + + for results in sources { + for (rank_0_indexed, (id, _score)) in results.into_iter().enumerate() { + // rank is 1-indexed per ADR-002 + let rank_1_indexed = rank_0_indexed + 1; + let rrf_contribution = 1.0 / (k + rank_1_indexed) as f64; + + *combined.entry(id).or_insert(0.0) += rrf_contribution; + } + } + + // Convert to DeterministicScore and sort descending + // Permutation invariant: reordering sources yields same totals. + // The sum of contributions is permutation-invariant: reordering sources + // produces the same total score for each document. + let mut fused: Vec<(Id, DeterministicScore)> = combined + .into_iter() + .map(|(id, score)| (id, DeterministicScore::from_f64(score))) + .collect(); + + // Sort by score descending, then by ID ascending for deterministic tie-breaking + // This ensures cross-platform consistency when scores are equal + fused.sort_by( + |(id_a, score_a), (id_b, score_b)| match score_b.cmp(score_a) { + Ordering::Equal => id_a.cmp(id_b), + other => other, + }, + ); + + fused +} + +#[cfg(test)] +mod tests { + use super::*; + + fn make_results(items: Vec<(Id, f64)>) -> Vec<(Id, DeterministicScore)> { + items + .into_iter() + .map(|(id, score)| (id, DeterministicScore::from_f64(score))) + .collect() + } + + #[test] + fn test_rrf_basic_two_sources() { + let source1 = make_results(vec![("doc_a", 0.9), ("doc_b", 0.8)]); + let source2 = make_results(vec![("doc_b", 0.95), ("doc_c", 0.7)]); + + let fused = reciprocal_rank_fusion(vec![source1, source2], 60); + + // doc_b appears in both, should have highest score + assert_eq!(fused[0].0, "doc_b"); + assert_eq!(fused.len(), 3); + } + + #[test] + fn test_rrf_score_calculation() { + let source = make_results(vec![("doc_a", 0.9)]); + let fused = reciprocal_rank_fusion(vec![source], 60); + + let expected = 1.0 / 61.0; + assert!((fused[0].1.to_f64() - expected).abs() < 1e-9); + } + + #[test] + fn test_rrf_cumulative_scores() { + let source1 = make_results(vec![("doc_a", 0.9)]); + let source2 = make_results(vec![("doc_a", 0.8)]); + + let fused = reciprocal_rank_fusion(vec![source1, source2], 60); + + let expected = 2.0 / 61.0; + assert!((fused[0].1.to_f64() - expected).abs() < 1e-9); + } + + #[test] + fn test_rrf_ignores_scores() { + let source1_high = make_results(vec![("doc_a", 0.99), ("doc_b", 0.01)]); + let source1_low = make_results(vec![("doc_a", 0.6), ("doc_b", 0.5)]); + + let fused_high = reciprocal_rank_fusion(vec![source1_high], 60); + let fused_low = reciprocal_rank_fusion(vec![source1_low], 60); + + assert_eq!(fused_high[0].1, fused_low[0].1); + assert_eq!(fused_high[1].1, fused_low[1].1); + } + + #[test] + fn test_rrf_empty_sources() { + let fused: Vec<(&str, DeterministicScore)> = reciprocal_rank_fusion(vec![], 60); + assert!(fused.is_empty()); + } + + #[test] + fn test_rrf_single_source_passthrough() { + let source = make_results(vec![("doc_a", 0.9), ("doc_b", 0.8), ("doc_c", 0.7)]); + let fused = reciprocal_rank_fusion(vec![source], 60); + + assert_eq!(fused.len(), 3); + assert_eq!(fused[0].0, "doc_a"); + assert_eq!(fused[1].0, "doc_b"); + assert_eq!(fused[2].0, "doc_c"); + } + + #[test] + fn test_rrf_k_minimum_enforced() { + let source = make_results(vec![("doc_a", 0.9)]); + let fused = reciprocal_rank_fusion(vec![source], 0); + + let expected = 1.0 / 2.0; + assert!((fused[0].1.to_f64() - expected).abs() < 1e-9); + } + + #[test] + fn test_rrf_many_sources() { + let sources: Vec> = + (0..5).map(|_| make_results(vec![("doc_a", 0.9)])).collect(); + + let fused = reciprocal_rank_fusion(sources, 60); + + let expected = 5.0 / 61.0; + assert!((fused[0].1.to_f64() - expected).abs() < 1e-9); + } +} diff --git a/crates/khive-fusion/src/strategy.rs b/crates/khive-fusion/src/strategy.rs new file mode 100644 index 00000000..b098bd6e --- /dev/null +++ b/crates/khive-fusion/src/strategy.rs @@ -0,0 +1,116 @@ +//! Fusion strategy types. + +use serde::{Deserialize, Serialize}; + +/// Default RRF constant k=60, standard in literature (Craswell et al., 2009). +pub const DEFAULT_RRF_K: usize = 60; + +/// Fusion strategy for combining ranked result lists. +/// +/// See module-level docs for algorithm details. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum FusionStrategy { + /// Reciprocal Rank Fusion (default, recommended). + /// + /// Uses only ranks, making it robust to different score distributions. + /// Formula: score(d) = Σ 1/(k + rank_i(d)) + #[serde(alias = "Rrf")] + Rrf { + /// Smoothing constant. Higher values reduce impact of rank differences. + /// Default: 60 (standard in literature). + k: usize, + }, + + /// Weighted linear combination of scores. + /// + /// Requires score normalization for different score scales (e.g., vector + /// similarity 0-1 vs BM25 0-∞). + /// + /// Weights are normalized to sum to 1.0 internally. + #[serde(alias = "Weighted")] + Weighted { + /// Weights for each source (will be normalized). + weights: Vec, + }, + + /// Take union with max score per ID. + /// + /// Useful when you want the best score from any source. + #[serde(alias = "Union")] + Union, + + /// Skip BM25 entirely — return only vector (HNSW) results. + /// + /// Use when keyword search degrades quality (short queries, code search). + /// The result list is the raw HNSW output with no fusion step. + #[serde(alias = "VectorOnly")] + VectorOnly, + + /// Skip HNSW entirely — return only BM25 keyword results. + /// + /// Use for exact-match retrieval (medication names, identifiers, slugs). + /// The result list is the raw BM25 output with no fusion step. + #[serde(alias = "KeywordOnly")] + KeywordOnly, +} + +impl Default for FusionStrategy { + fn default() -> Self { + Self::Rrf { k: DEFAULT_RRF_K } + } +} + +impl FusionStrategy { + /// Create an RRF strategy with default k=60. + #[inline] + pub fn rrf() -> Self { + Self::Rrf { k: DEFAULT_RRF_K } + } + + /// Create an RRF strategy with custom k value. + #[inline] + pub fn rrf_with_k(k: usize) -> Self { + Self::Rrf { k: k.max(1) } // Ensure k >= 1 + } + + /// Create a weighted strategy with given weights. + #[inline] + pub fn weighted(weights: Vec) -> Self { + Self::Weighted { weights } + } + + /// Create a union strategy. + #[inline] + pub fn union() -> Self { + Self::Union + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_fusion_strategy_default() { + let default = FusionStrategy::default(); + assert_eq!(default, FusionStrategy::Rrf { k: 60 }); + } + + #[test] + fn test_fusion_strategy_builders() { + assert_eq!(FusionStrategy::rrf(), FusionStrategy::Rrf { k: 60 }); + assert_eq!( + FusionStrategy::rrf_with_k(20), + FusionStrategy::Rrf { k: 20 } + ); + assert_eq!(FusionStrategy::rrf_with_k(0), FusionStrategy::Rrf { k: 1 }); // min enforced + assert_eq!( + FusionStrategy::weighted(vec![0.5, 0.5]), + FusionStrategy::Weighted { + weights: vec![0.5, 0.5] + } + ); + assert_eq!(FusionStrategy::union(), FusionStrategy::Union); + } +} diff --git a/crates/khive-fusion/src/tests.rs b/crates/khive-fusion/src/tests.rs new file mode 100644 index 00000000..8fda09b8 --- /dev/null +++ b/crates/khive-fusion/src/tests.rs @@ -0,0 +1,315 @@ +//! Integration tests and property tests for fusion module. + +#[cfg(test)] +mod integration_tests { + use crate::{fuse, reciprocal_rank_fusion, union_fusion, weighted_fusion, FusionStrategy}; + use khive_score::DeterministicScore; + + pub(super) fn make_results(items: Vec<(Id, f64)>) -> Vec<(Id, DeterministicScore)> { + items + .into_iter() + .map(|(id, score)| (id, DeterministicScore::from_f64(score))) + .collect() + } + + // ========================================================================= + // RETRIEVAL-01: Deterministic Tie-Breaking Tests + // ========================================================================= + + #[test] + fn test_rrf_deterministic_tie_breaking() { + // When two documents have equal RRF scores, they should be ordered by ID + let source1 = make_results(vec![("doc_a", 0.9)]); // rank 1 + let source2 = make_results(vec![("doc_b", 0.8)]); // rank 1 + + // Run multiple times to verify consistency + for _ in 0..10 { + let fused = reciprocal_rank_fusion(vec![source1.clone(), source2.clone()], 60); + + assert_eq!(fused.len(), 2); + // Both have same RRF score (1/61), so order should be by ID + assert_eq!(fused[0].1, fused[1].1, "Scores should be equal"); + assert_eq!( + fused[0].0, "doc_a", + "doc_a should come first (lexicographic order)" + ); + assert_eq!(fused[1].0, "doc_b", "doc_b should come second"); + } + } + + #[test] + fn test_weighted_deterministic_tie_breaking() { + // Two documents with equal weighted scores + let source = make_results(vec![("z_doc", 0.5), ("a_doc", 0.5)]); + + for _ in 0..10 { + let fused = weighted_fusion(vec![source.clone()], &[1.0]); + + assert_eq!(fused.len(), 2); + assert_eq!(fused[0].1, fused[1].1, "Scores should be equal"); + assert_eq!( + fused[0].0, "a_doc", + "a_doc should come first (lexicographic order)" + ); + assert_eq!(fused[1].0, "z_doc", "z_doc should come second"); + } + } + + #[test] + fn test_union_deterministic_tie_breaking() { + // Two documents with equal max scores + let source1 = make_results(vec![("charlie", 0.8)]); + let source2 = make_results(vec![("alpha", 0.8)]); + + for _ in 0..10 { + let fused = union_fusion(vec![source1.clone(), source2.clone()]); + + assert_eq!(fused.len(), 2); + assert_eq!(fused[0].1, fused[1].1, "Scores should be equal"); + assert_eq!(fused[0].0, "alpha", "alpha should come first"); + assert_eq!(fused[1].0, "charlie", "charlie should come second"); + } + } + + #[test] + fn test_fuse_deterministic_with_many_ties() { + // Multiple documents all at same score + let source: Vec<(&str, DeterministicScore)> = vec![ + ("delta", DeterministicScore::from_f64(0.5)), + ("alpha", DeterministicScore::from_f64(0.5)), + ("charlie", DeterministicScore::from_f64(0.5)), + ("bravo", DeterministicScore::from_f64(0.5)), + ]; + + for _ in 0..10 { + let fused = fuse(vec![source.clone()], &FusionStrategy::union(), 10); + + assert_eq!(fused.len(), 4); + // All have same score, should be in lexicographic order + assert_eq!(fused[0].0, "alpha"); + assert_eq!(fused[1].0, "bravo"); + assert_eq!(fused[2].0, "charlie"); + assert_eq!(fused[3].0, "delta"); + } + } + + #[test] + fn test_rrf_large_number_of_results() { + // Test with many results to ensure no overflow/precision issues + let source: Vec<(String, DeterministicScore)> = (0..1000) + .map(|i| { + ( + format!("doc_{i}"), + DeterministicScore::from_f64(1.0 - i as f64 / 1000.0), + ) + }) + .collect(); + + let fused = fuse(vec![source], &FusionStrategy::rrf(), 100); + + assert_eq!(fused.len(), 100); + assert_eq!(fused[0].0, "doc_0"); + } + + #[test] + fn test_multiple_sources_all_same_document() { + let source1 = make_results(vec![("doc_a", 0.9), ("doc_b", 0.8)]); + let source2 = make_results(vec![("doc_b", 0.95), ("doc_a", 0.7)]); + let source3 = make_results(vec![("doc_a", 0.85)]); + + let fused = reciprocal_rank_fusion(vec![source1, source2, source3], 60); + + let doc_a = fused.iter().find(|(id, _)| *id == "doc_a").unwrap(); + let doc_b = fused.iter().find(|(id, _)| *id == "doc_b").unwrap(); + + assert!(doc_a.1 > doc_b.1); // doc_a appears in more sources + } + + #[test] + fn test_sorted_output() { + let source1 = make_results(vec![("doc_c", 0.7), ("doc_a", 0.9), ("doc_b", 0.8)]); + + let fused = reciprocal_rank_fusion(vec![source1], 60); + + // Input order determines rank, so doc_c is rank 1, doc_a rank 2, doc_b rank 3 + assert_eq!(fused[0].0, "doc_c"); + assert_eq!(fused[1].0, "doc_a"); + assert_eq!(fused[2].0, "doc_b"); + } + + #[test] + fn test_rrf_document_only_in_one_source() { + let source1 = make_results(vec![("doc_a", 0.9)]); + let source2 = make_results(vec![("doc_b", 0.8)]); + + let fused = reciprocal_rank_fusion(vec![source1, source2], 60); + + // Both at rank 1 in their respective sources -> same RRF score + assert_eq!(fused.len(), 2); + assert_eq!(fused[0].1, fused[1].1); + } + + #[test] + fn test_rrf_custom_k() { + let source = make_results(vec![("doc_a", 0.9), ("doc_b", 0.8)]); + + let fused_k20 = reciprocal_rank_fusion(vec![source.clone()], 20); + let fused_k100 = reciprocal_rank_fusion(vec![source], 100); + + let ratio_k20 = fused_k20[0].1.to_f64() / fused_k20[1].1.to_f64(); + let ratio_k100 = fused_k100[0].1.to_f64() / fused_k100[1].1.to_f64(); + + // Smaller k -> larger ratio (more difference between ranks) + assert!(ratio_k20 > ratio_k100); + } +} + +// ============================================================================= +// Property Tests (Issue #746) +// TODO(port): proptest not yet added as a dev-dependency; the proptest macro +// forms below have been converted to deterministic unit tests covering the same +// properties. Re-introduce proptest once it is added to Cargo.toml [dev-dependencies]. +// ============================================================================= + +#[cfg(test)] +mod property_tests { + use crate::{reciprocal_rank_fusion, union_fusion, weighted_fusion}; + use khive_score::DeterministicScore; + use std::collections::HashSet; + + fn make_results(items: Vec<(&'static str, f64)>) -> Vec<(String, DeterministicScore)> { + items + .into_iter() + .map(|(id, score)| (id.to_string(), DeterministicScore::from_f64(score))) + .collect() + } + + /// RRF is commutative: source order should not affect final rankings. + /// + /// Verifies the `sum_perm` property from RRF.lean. + #[test] + fn prop_rrf_is_commutative() { + let sources = vec![ + make_results(vec![("doc_a", 0.9), ("doc_b", 0.8)]), + make_results(vec![("doc_b", 0.95), ("doc_c", 0.7)]), + make_results(vec![("doc_a", 0.6), ("doc_c", 0.5)]), + ]; + + let fused_orig = reciprocal_rank_fusion(sources.clone(), 60); + + let mut reversed = sources.clone(); + reversed.reverse(); + let fused_reversed = reciprocal_rank_fusion(reversed, 60); + + let orig_set: HashSet<_> = fused_orig + .iter() + .map(|(id, score)| (id.clone(), score.to_raw())) + .collect(); + let rev_set: HashSet<_> = fused_reversed + .iter() + .map(|(id, score)| (id.clone(), score.to_raw())) + .collect(); + + assert_eq!( + orig_set, rev_set, + "RRF results should be identical regardless of source order" + ); + } + + /// Documents in more sources should score higher than those in fewer. + /// + /// Verifies the `present_gt_absent` property from RRF.lean. + #[test] + fn prop_rrf_more_sources_higher_score() { + let source1: Vec<(String, DeterministicScore)> = vec![( + "doc_common".to_string(), + DeterministicScore::from_f64(0.9), + )]; + let source2: Vec<(String, DeterministicScore)> = vec![ + ( + "doc_common".to_string(), + DeterministicScore::from_f64(0.9), + ), + ( + "doc_single".to_string(), + DeterministicScore::from_f64(0.8), + ), + ]; + + let fused = reciprocal_rank_fusion(vec![source1, source2], 60); + + let common = fused.iter().find(|(id, _)| id == "doc_common").unwrap(); + let single = fused.iter().find(|(id, _)| id == "doc_single").unwrap(); + + assert!( + common.1 >= single.1, + "Document in more sources should score >= document in fewer" + ); + } + + /// RRF scores should always be non-negative. + #[test] + fn prop_rrf_scores_nonnegative() { + let sources = vec![ + make_results(vec![("doc_a", 0.9), ("doc_b", 0.1)]), + make_results(vec![("doc_b", 0.5), ("doc_c", 0.0)]), + ]; + let fused = reciprocal_rank_fusion(sources, 60); + + for (id, score) in &fused { + assert!( + score.to_f64() >= 0.0, + "RRF score for {} should be non-negative, got {}", + id, + score.to_f64() + ); + } + } + + /// Union fusion should include all unique documents. + #[test] + fn prop_union_includes_all_docs() { + let sources = vec![ + make_results(vec![("doc_a", 0.9), ("doc_b", 0.8)]), + make_results(vec![("doc_b", 0.7), ("doc_c", 0.6)]), + make_results(vec![("doc_d", 0.5)]), + ]; + + let expected_ids: HashSet<_> = sources + .iter() + .flat_map(|s| s.iter().map(|(id, _)| id.clone())) + .collect(); + + let fused = union_fusion(sources); + let result_ids: HashSet<_> = fused.iter().map(|(id, _)| id.clone()).collect(); + + assert_eq!( + expected_ids, result_ids, + "Union should contain all unique documents" + ); + } + + /// With per-source min-max normalization, single-element sources always + /// map to 1.0, so a document present in both sources receives a combined + /// score of sum(weight_i * 1.0) = total_weight = 1.0 for equal weights. + #[test] + fn prop_weighted_single_element_sources_score_one() { + for (s1, s2) in [(0.0f64, 0.0f64), (0.5, 1.0), (0.9, 0.1), (1.0, 1.0)] { + let source1: Vec<(String, DeterministicScore)> = + vec![("doc".to_string(), DeterministicScore::from_f64(s1))]; + let source2: Vec<(String, DeterministicScore)> = + vec![("doc".to_string(), DeterministicScore::from_f64(s2))]; + + let fused = weighted_fusion(vec![source1, source2], &[0.5, 0.5]); + + if let Some((_, score)) = fused.first() { + let actual = score.to_f64(); + assert!( + (actual - 1.0).abs() < 1e-9, + "Single-element source always normalizes to 1.0; combined = 1.0, got {} (inputs: {}, {})", + actual, s1, s2 + ); + } + } + } +} diff --git a/crates/khive-fusion/src/union.rs b/crates/khive-fusion/src/union.rs new file mode 100644 index 00000000..96777535 --- /dev/null +++ b/crates/khive-fusion/src/union.rs @@ -0,0 +1,103 @@ +//! Union fusion (max score per ID). + +use khive_score::DeterministicScore; +use std::cmp::Ordering; +use std::collections::HashMap; +use std::hash::Hash; + +/// Union fusion: take max score for each ID. +/// +/// Useful when you want the best score any retriever assigned to each document. +/// +/// # Arguments +/// +/// * `sources` - Vector of result lists. +/// +/// # Returns +/// +/// A vector sorted by max score descending, with ties broken by ID +/// for deterministic cross-platform ordering. +pub fn union_fusion( + sources: Vec>, +) -> Vec<(Id, DeterministicScore)> { + if sources.is_empty() { + return Vec::new(); + } + + let estimated_capacity: usize = sources.iter().map(|s| s.len()).sum(); + let mut combined: HashMap = HashMap::with_capacity(estimated_capacity); + + for results in sources { + for (id, score) in results { + combined + .entry(id) + .and_modify(|existing| { + if score > *existing { + *existing = score; + } + }) + .or_insert(score); + } + } + + let mut fused: Vec<(Id, DeterministicScore)> = combined.into_iter().collect(); + // Sort by score descending, then by ID ascending for determinism + fused.sort_by( + |(id_a, score_a), (id_b, score_b)| match score_b.cmp(score_a) { + Ordering::Equal => id_a.cmp(id_b), + other => other, + }, + ); + fused +} + +#[cfg(test)] +mod tests { + use super::*; + + fn make_results(items: Vec<(Id, f64)>) -> Vec<(Id, DeterministicScore)> { + items + .into_iter() + .map(|(id, score)| (id, DeterministicScore::from_f64(score))) + .collect() + } + + #[test] + fn test_union_takes_max_score() { + let source1 = make_results(vec![("doc_a", 0.7)]); + let source2 = make_results(vec![("doc_a", 0.9)]); + + let fused = union_fusion(vec![source1, source2]); + + assert_eq!(fused.len(), 1); + assert!((fused[0].1.to_f64() - 0.9).abs() < 0.01); + } + + #[test] + fn test_union_disjoint_sources() { + let source1 = make_results(vec![("doc_a", 0.8)]); + let source2 = make_results(vec![("doc_b", 0.6)]); + + let fused = union_fusion(vec![source1, source2]); + + assert_eq!(fused.len(), 2); + assert_eq!(fused[0].0, "doc_a"); + assert_eq!(fused[1].0, "doc_b"); + } + + #[test] + fn test_union_empty_sources() { + let fused: Vec<(&str, DeterministicScore)> = union_fusion(vec![]); + assert!(fused.is_empty()); + } + + #[test] + fn test_union_single_source() { + let source = make_results(vec![("doc_a", 0.9), ("doc_b", 0.7)]); + let fused = union_fusion(vec![source]); + + assert_eq!(fused.len(), 2); + assert_eq!(fused[0].0, "doc_a"); + assert_eq!(fused[1].0, "doc_b"); + } +} diff --git a/crates/khive-fusion/src/weighted.rs b/crates/khive-fusion/src/weighted.rs new file mode 100644 index 00000000..6ebc9911 --- /dev/null +++ b/crates/khive-fusion/src/weighted.rs @@ -0,0 +1,489 @@ +//! Weighted linear combination fusion. +//! +//! # Weight Normalization (RETRIEVAL-07) +//! +//! Weights are automatically normalized to sum to 1.0 before fusion. +//! This ensures consistent behavior regardless of the input weight scale. +//! +//! ## Normalization Behavior +//! +//! | Input Weights | Normalized Weights | Behavior | +//! |--------------|-------------------|----------| +//! | `[0.7, 0.3]` | `[0.7, 0.3]` | Already normalized | +//! | `[7.0, 3.0]` | `[0.7, 0.3]` | Scaled to sum to 1.0 | +//! | `[1.0, 1.0, 1.0]` | `[0.333, 0.333, 0.333]` | Equal distribution | +//! | `[0.0, 0.0]` | `[0.5, 0.5]` | Fallback to equal | +//! | `[1.0, -0.5]` | `[1.0, 0.0]` | Negatives treated as 0 | +//! +//! ## Example +//! +//! ```rust +//! use khive_fusion::weighted_fusion; +//! use khive_score::DeterministicScore; +//! +//! let semantic = vec![("doc1", DeterministicScore::from_f64(0.9))]; +//! let keyword = vec![("doc1", DeterministicScore::from_f64(0.8))]; +//! +//! // These produce identical results due to normalization: +//! let result1 = weighted_fusion(vec![semantic.clone(), keyword.clone()], &[0.6, 0.4]); +//! let result2 = weighted_fusion(vec![semantic, keyword], &[6.0, 4.0]); +//! // result1 == result2 +//! ``` + +use khive_score::DeterministicScore; +use std::cmp::Ordering; +use std::collections::HashMap; +use std::hash::Hash; + +/// Min-max normalize a single source's scores to [0, 1] for cross-source fusion (#2496). +/// +/// When all scores are equal (or the source has one element) every entry +/// receives 1.0 so it still contributes to the weighted combination. +fn min_max_normalize_source( + source: Vec<(Id, DeterministicScore)>, +) -> Vec<(Id, DeterministicScore)> { + if source.is_empty() { + return source; + } + let min = source + .iter() + .map(|(_, s)| s.to_f64()) + .fold(f64::INFINITY, f64::min); + let max = source + .iter() + .map(|(_, s)| s.to_f64()) + .fold(f64::NEG_INFINITY, f64::max); + let span = max - min; + if span <= f64::EPSILON { + return source + .into_iter() + .map(|(id, _)| (id, DeterministicScore::from_f64(1.0))) + .collect(); + } + source + .into_iter() + .map(|(id, s)| { + let normalized = (s.to_f64() - min) / span; + (id, DeterministicScore::from_f64(normalized)) + }) + .collect() +} + +/// Weighted linear combination of scores. +/// +/// Combines scores using weighted averaging. Weights are automatically normalized +/// to sum to 1.0, allowing flexible input ranges (see module documentation). +/// +/// # Weight Normalization (RETRIEVAL-07) +/// +/// Weights are normalized as follows: +/// 1. Negative weights are treated as 0.0 +/// 2. If all weights are <= 0, equal distribution is used +/// 3. Otherwise, weights are divided by their sum to normalize to 1.0 +/// +/// This normalization ensures: +/// - Consistent results regardless of weight scale +/// - Graceful handling of edge cases (all zeros, negatives) +/// - No need for callers to pre-normalize weights +/// +/// # Warning +/// +/// This requires score normalization before calling, as different retrievers +/// may produce scores on different scales (e.g., cosine similarity 0-1 vs BM25 0-infinity). +/// Consider using min-max normalization or z-score normalization on individual +/// retriever results before fusion. +/// +/// # Arguments +/// +/// * `sources` - Vector of result lists with scores. +/// * `weights` - Weights for each source. Will be normalized to sum to 1.0. +/// +/// # Returns +/// +/// A vector sorted by weighted score descending, with ties broken by ID +/// for deterministic cross-platform ordering. +/// +/// # Panics +/// +/// Does not panic. Returns empty vector if sources is empty. +pub fn weighted_fusion( + sources: Vec>, + weights: &[f64], +) -> Vec<(Id, DeterministicScore)> { + if sources.is_empty() { + return Vec::new(); + } + + // Normalize weights + let weight_sum: f64 = weights.iter().filter(|w| **w > 0.0).sum(); + let normalized: Vec = if weight_sum <= 0.0 { + // All zero/negative weights -> equal distribution + vec![1.0 / sources.len() as f64; sources.len()] + } else { + weights + .iter() + .map(|w| if *w > 0.0 { w / weight_sum } else { 0.0 }) + .collect() + }; + + // Estimate capacity + let estimated_capacity: usize = sources.iter().map(|s| s.len()).sum(); + let mut combined: HashMap = HashMap::with_capacity(estimated_capacity); + + for (source_idx, results) in sources.into_iter().enumerate() { + // Sources beyond the weights array get weight 0.0 (silently ignored). + let weight = normalized.get(source_idx).copied().unwrap_or(0.0); + + // Normalize each source to [0,1] before weighted combination so that + // BM25 unbounded scores and cosine [0,1] scores contribute proportionally + // to their configured weights (#2496/#2639). + let norm_results = min_max_normalize_source(results); + for (id, score) in norm_results { + *combined.entry(id).or_insert(0.0) += score.to_f64() * weight; + } + } + + // Convert and sort by score descending, then by ID ascending for determinism + let mut fused: Vec<(Id, DeterministicScore)> = combined + .into_iter() + .map(|(id, score)| (id, DeterministicScore::from_f64(score))) + .collect(); + + fused.sort_by( + |(id_a, score_a), (id_b, score_b)| match score_b.cmp(score_a) { + Ordering::Equal => id_a.cmp(id_b), + other => other, + }, + ); + fused +} + +/// Check if weights are already normalized (sum to approximately 1.0). +/// +/// This is a utility function for callers who want to verify or log +/// whether their weights needed normalization. +/// +/// # Arguments +/// +/// * `weights` - The weights to check. +/// * `tolerance` - How close to 1.0 is acceptable (e.g., 1e-6). +/// +/// # Returns +/// +/// `true` if the sum of positive weights is within `tolerance` of 1.0. +/// +/// # Example +/// +/// ```rust +/// use khive_fusion::weights_are_normalized; +/// +/// assert!(weights_are_normalized(&[0.6, 0.4], 1e-6)); +/// assert!(!weights_are_normalized(&[6.0, 4.0], 1e-6)); +/// ``` +#[inline] +pub fn weights_are_normalized(weights: &[f64], tolerance: f64) -> bool { + let sum: f64 = weights.iter().filter(|w| **w > 0.0).sum(); + (sum - 1.0).abs() <= tolerance +} + +/// Normalize weights to sum to 1.0. +/// +/// This is the same normalization logic used internally by `weighted_fusion`, +/// exposed for callers who want to inspect or use the normalized weights. +/// +/// # Arguments +/// +/// * `weights` - Input weights (may be any positive scale). +/// +/// # Returns +/// +/// Normalized weights that sum to 1.0. Negative weights become 0.0. +/// If all weights are <= 0, returns equal distribution. +pub fn normalize_weights(weights: &[f64]) -> Vec { + if weights.is_empty() { + return Vec::new(); + } + + let weight_sum: f64 = weights.iter().filter(|w| **w > 0.0).sum(); + + if weight_sum <= 0.0 { + vec![1.0 / weights.len() as f64; weights.len()] + } else { + weights + .iter() + .map(|w| if *w > 0.0 { w / weight_sum } else { 0.0 }) + .collect() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn make_results(items: Vec<(Id, f64)>) -> Vec<(Id, DeterministicScore)> { + items + .into_iter() + .map(|(id, score)| (id, DeterministicScore::from_f64(score))) + .collect() + } + + #[test] + fn test_weighted_basic() { + let source1 = make_results(vec![("doc_a", 1.0)]); + let source2 = make_results(vec![("doc_a", 1.0)]); + + let fused = weighted_fusion(vec![source1, source2], &[0.7, 0.3]); + + assert!((fused[0].1.to_f64() - 1.0).abs() < 0.01); + } + + #[test] + fn test_weighted_normalization() { + let source1 = make_results(vec![("doc_a", 1.0)]); + let source2 = make_results(vec![("doc_a", 1.0)]); + + let fused = weighted_fusion(vec![source1, source2], &[7.0, 3.0]); + + assert!((fused[0].1.to_f64() - 1.0).abs() < 0.01); + } + + #[test] + fn test_weighted_zero_weights() { + let source1 = make_results(vec![("doc_a", 1.0)]); + let source2 = make_results(vec![("doc_a", 1.0)]); + + let fused = weighted_fusion(vec![source1, source2], &[0.0, 0.0]); + + assert!((fused[0].1.to_f64() - 1.0).abs() < 0.01); + } + + #[test] + fn test_weighted_disjoint_results() { + let source1 = make_results(vec![("doc_a", 0.9)]); + let source2 = make_results(vec![("doc_b", 0.8)]); + + let fused = weighted_fusion(vec![source1, source2], &[0.6, 0.4]); + + let doc_a = fused.iter().find(|(id, _)| *id == "doc_a").unwrap(); + let doc_b = fused.iter().find(|(id, _)| *id == "doc_b").unwrap(); + + // After per-source min-max normalization, single-element sources map to 1.0. + // doc_a contributes 1.0 * 0.6 = 0.6, doc_b contributes 1.0 * 0.4 = 0.4. + assert!((doc_a.1.to_f64() - 0.6).abs() < 0.01); + assert!((doc_b.1.to_f64() - 0.4).abs() < 0.01); + } + + #[test] + fn test_weighted_empty_sources() { + let fused: Vec<(&str, DeterministicScore)> = weighted_fusion(vec![], &[]); + assert!(fused.is_empty()); + } + + #[test] + fn test_weighted_single_source() { + let source = make_results(vec![("doc_a", 0.9)]); + let fused = weighted_fusion(vec![source], &[1.0]); + + assert_eq!(fused.len(), 1); + // Single-element source normalizes to 1.0; weight is 1.0 → final = 1.0. + assert!((fused[0].1.to_f64() - 1.0).abs() < 0.01); + } + + // RETRIEVAL-07: Normalization behavior tests + + #[test] + fn test_normalization_already_normalized() { + let source1 = make_results(vec![("doc_a", 1.0)]); + let source2 = make_results(vec![("doc_b", 1.0)]); + + // Weights already sum to 1.0 + let fused = weighted_fusion(vec![source1, source2], &[0.6, 0.4]); + + let doc_a = fused.iter().find(|(id, _)| *id == "doc_a").unwrap(); + let doc_b = fused.iter().find(|(id, _)| *id == "doc_b").unwrap(); + + assert!((doc_a.1.to_f64() - 0.6).abs() < 0.01); + assert!((doc_b.1.to_f64() - 0.4).abs() < 0.01); + } + + #[test] + fn test_normalization_scaled_weights() { + let source1 = make_results(vec![("doc_a", 1.0)]); + let source2 = make_results(vec![("doc_b", 1.0)]); + + // Weights sum to 100, should be normalized to 0.6, 0.4 + let fused = weighted_fusion(vec![source1, source2], &[60.0, 40.0]); + + let doc_a = fused.iter().find(|(id, _)| *id == "doc_a").unwrap(); + let doc_b = fused.iter().find(|(id, _)| *id == "doc_b").unwrap(); + + assert!((doc_a.1.to_f64() - 0.6).abs() < 0.01); + assert!((doc_b.1.to_f64() - 0.4).abs() < 0.01); + } + + #[test] + fn test_normalization_negative_weights() { + let source1 = make_results(vec![("doc_a", 1.0)]); + let source2 = make_results(vec![("doc_b", 1.0)]); + + // Negative weight should be treated as 0 + let fused = weighted_fusion(vec![source1, source2], &[1.0, -0.5]); + + let doc_a = fused.iter().find(|(id, _)| *id == "doc_a").unwrap(); + let doc_b = fused.iter().find(|(id, _)| *id == "doc_b"); + + // doc_a gets full weight (1.0 normalized to 1.0) + assert!((doc_a.1.to_f64() - 1.0).abs() < 0.01); + // doc_b should have 0 contribution from second source + assert!(doc_b.is_none() || doc_b.unwrap().1.to_f64() < 0.01); + } + + #[test] + fn test_normalization_three_sources_equal() { + let source1 = make_results(vec![("doc_a", 1.0)]); + let source2 = make_results(vec![("doc_b", 1.0)]); + let source3 = make_results(vec![("doc_c", 1.0)]); + + // Equal weights + let fused = weighted_fusion(vec![source1, source2, source3], &[1.0, 1.0, 1.0]); + + for (_, score) in &fused { + // Each should get 1/3 weight = 0.333... + assert!((score.to_f64() - 1.0 / 3.0).abs() < 0.01); + } + } + + #[test] + fn test_normalization_consistent_across_scales() { + let source1 = make_results(vec![("doc_a", 0.8), ("doc_b", 0.6)]); + let source2 = make_results(vec![("doc_a", 0.9), ("doc_c", 0.7)]); + + // Same ratio, different scales + let fused1 = weighted_fusion(vec![source1.clone(), source2.clone()], &[0.7, 0.3]); + let fused2 = weighted_fusion(vec![source1.clone(), source2.clone()], &[7.0, 3.0]); + let fused3 = weighted_fusion(vec![source1, source2], &[70.0, 30.0]); + + // All should produce identical results + assert_eq!(fused1.len(), fused2.len()); + assert_eq!(fused2.len(), fused3.len()); + + for i in 0..fused1.len() { + assert_eq!(fused1[i].0, fused2[i].0); + assert_eq!(fused2[i].0, fused3[i].0); + assert!( + (fused1[i].1.to_f64() - fused2[i].1.to_f64()).abs() < 1e-10, + "Score mismatch at position {}: {} vs {}", + i, + fused1[i].1.to_f64(), + fused2[i].1.to_f64() + ); + assert!( + (fused2[i].1.to_f64() - fused3[i].1.to_f64()).abs() < 1e-10, + "Score mismatch at position {}: {} vs {}", + i, + fused2[i].1.to_f64(), + fused3[i].1.to_f64() + ); + } + } + + // Helper function tests + + #[test] + fn test_weights_are_normalized() { + assert!(weights_are_normalized(&[0.5, 0.5], 1e-6)); + assert!(weights_are_normalized(&[0.7, 0.3], 1e-6)); + assert!(weights_are_normalized(&[1.0], 1e-6)); + assert!(weights_are_normalized(&[0.25, 0.25, 0.25, 0.25], 1e-6)); + + assert!(!weights_are_normalized(&[0.5, 0.6], 1e-6)); // > 1 + assert!(!weights_are_normalized(&[0.3, 0.3], 1e-6)); // < 1 + assert!(!weights_are_normalized(&[10.0, 10.0], 1e-6)); // = 20 + } + + #[test] + fn test_normalize_weights() { + let normalized = normalize_weights(&[6.0, 4.0]); + assert!((normalized[0] - 0.6).abs() < 1e-10); + assert!((normalized[1] - 0.4).abs() < 1e-10); + + let normalized = normalize_weights(&[1.0, 1.0, 1.0]); + for w in &normalized { + assert!((w - 1.0 / 3.0).abs() < 1e-10); + } + + let normalized = normalize_weights(&[0.0, 0.0]); + assert!((normalized[0] - 0.5).abs() < 1e-10); + assert!((normalized[1] - 0.5).abs() < 1e-10); + + let normalized = normalize_weights(&[1.0, -1.0]); + assert!((normalized[0] - 1.0).abs() < 1e-10); + assert!((normalized[1] - 0.0).abs() < 1e-10); + } + + #[test] + fn test_normalize_weights_empty() { + let normalized = normalize_weights(&[]); + assert!(normalized.is_empty()); + } + + // ── #2496 / #2639: per-source min-max normalization before fusion ────── + + #[test] + fn test_weighted_fusion_mixed_scales_bm25_vs_cosine() { + // BM25-like: unbounded scores (0..100) + let bm25 = vec![ + ("doc_a", DeterministicScore::from_f64(80.0)), + ("doc_b", DeterministicScore::from_f64(20.0)), + ]; + // Cosine-like: [0,1] scores + let cosine = vec![ + ("doc_a", DeterministicScore::from_f64(0.9)), + ("doc_b", DeterministicScore::from_f64(0.1)), + ]; + + let result = weighted_fusion(vec![bm25, cosine], &[0.5, 0.5]); + // Both sources agree doc_a is more relevant — it must rank first. + assert_eq!(result[0].0, "doc_a"); + assert_eq!(result[1].0, "doc_b"); + // After normalization, doc_a should score close to 1.0 (top in both). + assert!(result[0].1.to_f64() > 0.8); + } + + #[test] + fn test_weighted_fusion_inverted_scale_normalizes_correctly() { + // If one source has negative/inverted semantics, min-max still works. + let src1 = vec![ + ("x", DeterministicScore::from_f64(100.0)), + ("y", DeterministicScore::from_f64(1.0)), + ]; + let src2 = vec![ + ("x", DeterministicScore::from_f64(0.9)), + ("y", DeterministicScore::from_f64(0.1)), + ]; + + let result = weighted_fusion(vec![src1, src2], &[0.6, 0.4]); + assert_eq!(result[0].0, "x"); + // x must score strictly higher than y + assert!(result[0].1.to_f64() > result[1].1.to_f64()); + } + + #[test] + fn test_min_max_normalize_source_single() { + let src = vec![("a", DeterministicScore::from_f64(42.0))]; + let out = min_max_normalize_source(src); + assert!((out[0].1.to_f64() - 1.0).abs() < 1e-10); + } + + #[test] + fn test_min_max_normalize_source_uniform() { + let src = vec![ + ("a", DeterministicScore::from_f64(5.0)), + ("b", DeterministicScore::from_f64(5.0)), + ]; + let out = min_max_normalize_source(src); + // All equal → all 1.0 + assert!((out[0].1.to_f64() - 1.0).abs() < 1e-10); + assert!((out[1].1.to_f64() - 1.0).abs() < 1e-10); + } +}