Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions crates/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ members = [
"khive-gate",
"khive-gate-rego",
"khive-fusion",
"khive-bm25",
"khive-runtime",
"khive-request",
"khive-pack-kg",
Expand Down
18 changes: 18 additions & 0 deletions crates/khive-bm25/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
[package]
name = "khive-bm25"
version.workspace = true
edition.workspace = true
authors.workspace = true
license.workspace = true
repository.workspace = true
homepage.workspace = true
keywords.workspace = true
categories.workspace = true
description = "BM25 (Okapi BM25) keyword index with deterministic scoring"

[dependencies]
khive-score = { version = "0.2.0", path = "../khive-score" }
serde = { workspace = true }
serde_json = { workspace = true }
thiserror = { workspace = true }
parking_lot = { workspace = true }
104 changes: 104 additions & 0 deletions crates/khive-bm25/src/config.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
//! BM25 configuration types.
//!
//! See ADR-003 for recommended parameter values.

use serde::{Deserialize, Serialize};

/// BM25 configuration parameters.
///
/// Default values (k1=1.2, b=0.75) from ADR-003 work well for most use cases.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Bm25Config {
/// Term saturation parameter.
///
/// Higher values = diminishing returns for repeated terms.
/// Range: typically 1.2-2.0
/// Default: 1.2
pub k1: f64,

/// Length normalization parameter.
///
/// - 0 = no length normalization (favor longer docs)
/// - 1 = full length normalization (favor shorter docs)
///
/// Range: 0.0-1.0, Default: 0.75
pub b: f64,

/// Maximum memory budget in bytes for the index.
/// If None, no memory limit is enforced (default).
/// If Some(limit), `index_document()` calls that would exceed the budget
/// are rejected with `RetrievalError::BudgetExceeded`. Re-indexing an
/// existing document bypasses the budget check.
#[serde(default)]
pub memory_budget: Option<usize>,
}

impl Default for Bm25Config {
fn default() -> Self {
Self {
k1: 1.2,
b: 0.75,
memory_budget: None,
}
}
}

impl Bm25Config {
/// Create a new BM25 configuration.
pub fn new(k1: f64, b: f64) -> Self {
Self {
k1,
b,
memory_budget: None,
}
}

/// Set memory budget in bytes.
///
/// When set, `index_document()` calls that would cause the estimated
/// memory usage to exceed this limit are rejected with `BudgetExceeded`.
/// Re-indexing an existing document bypasses the budget check.
#[must_use]
pub fn with_memory_budget(mut self, budget: usize) -> Self {
self.memory_budget = Some(budget);
self
}

/// Validate configuration parameters.
pub fn validate(&self) -> Result<(), &'static str> {
if self.k1 < 0.0 {
return Err("k1 must be non-negative");
}
if !(0.0..=1.0).contains(&self.b) {
return Err("b must be in range [0.0, 1.0]");
}
Ok(())
}
}

#[cfg(test)]
mod tests {
use super::*;

#[test]
fn test_config_default() {
let config = Bm25Config::default();
assert!((config.k1 - 1.2).abs() < f64::EPSILON);
assert!((config.b - 0.75).abs() < f64::EPSILON);
}

#[test]
fn test_config_validation() {
assert!(Bm25Config::new(1.2, 0.75).validate().is_ok());
assert!(Bm25Config::new(-0.1, 0.75).validate().is_err());
assert!(Bm25Config::new(1.2, -0.1).validate().is_err());
assert!(Bm25Config::new(1.2, 1.5).validate().is_err());
}

#[test]
fn test_config_custom() {
let config = Bm25Config::new(2.0, 0.5);
assert!((config.k1 - 2.0).abs() < f64::EPSILON);
assert!((config.b - 0.5).abs() < f64::EPSILON);
}
}
54 changes: 54 additions & 0 deletions crates/khive-bm25/src/error.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
//! Error types for the BM25 index.

use thiserror::Error;

/// Classification of errors by recoverability.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ErrorKind {
/// Retrying will not help (e.g., budget exceeded, invalid config).
Permanent,
/// Retrying after a delay may succeed.
Transient,
}

/// Errors produced by BM25 index operations.
#[derive(Debug, Error)]
pub enum RetrievalError {
/// The memory budget was exceeded when indexing a new document.
#[error(
"memory budget exceeded: current={current_usage}, item_size={item_size}, limit={limit}"
)]
BudgetExceeded {
current_usage: usize,
item_size: usize,
limit: usize,
},

/// Invalid BM25 configuration parameters.
#[error("configuration error: {0}")]
Configuration(String),
}

impl RetrievalError {
/// Construct a `BudgetExceeded` error.
pub fn budget_exceeded(current_usage: usize, item_size: usize, limit: usize) -> Self {
Self::BudgetExceeded {
current_usage,
item_size,
limit,
}
}

/// Return the [`ErrorKind`] for this error.
pub fn kind(&self) -> ErrorKind {
ErrorKind::Permanent
}

/// Whether retrying this operation might succeed.
pub fn is_retryable(&self) -> bool {
self.kind() == ErrorKind::Transient
}
}

/// Convenience `Result` alias for BM25 operations.
pub type Result<T> = std::result::Result<T, RetrievalError>;
159 changes: 159 additions & 0 deletions crates/khive-bm25/src/index/bench_wand.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
use std::hint::black_box;
use std::time::Instant;

use super::{Bm25Index, SearchContext};
use crate::config::Bm25Config;

#[derive(Clone)]
struct XorShift64 {
state: u64,
}

impl XorShift64 {
fn new(seed: u64) -> Self {
Self { state: seed.max(1) }
}

fn next_u64(&mut self) -> u64 {
let mut x = self.state;
x ^= x << 13;
x ^= x >> 7;
x ^= x << 17;
self.state = x;
x
}

fn next_f64(&mut self) -> f64 {
((self.next_u64() >> 11) as f64) / ((1u64 << 53) as f64)
}

fn gen_range(&mut self, upper: usize) -> usize {
if upper <= 1 {
0
} else {
(self.next_u64() as usize) % upper
}
}
}

struct ZipfSampler {
cdf: Vec<f64>,
}

impl ZipfSampler {
fn new(vocab_size: usize, exponent: f64) -> Self {
let mut cumulative = Vec::with_capacity(vocab_size);
let mut running = 0.0;
for rank in 1..=vocab_size {
running += 1.0 / (rank as f64).powf(exponent);
cumulative.push(running);
}
for value in &mut cumulative {
*value /= running;
}
Self { cdf: cumulative }
}

fn sample(&self, rng: &mut XorShift64) -> usize {
let needle = rng.next_f64();
let idx = self.cdf.partition_point(|value| *value < needle);
idx.min(self.cdf.len().saturating_sub(1))
}
}

fn build_vocab(size: usize) -> Vec<String> {
(0..size).map(|idx| format!("tok_{idx:04}")).collect()
}

fn build_index(doc_count: usize, seed: u64) -> (Bm25Index, Vec<String>, ZipfSampler) {
let vocab = build_vocab(2_048);
let zipf = ZipfSampler::new(vocab.len(), 1.07);
let mut rng = XorShift64::new(seed);
let mut index = Bm25Index::new(Bm25Config::default());

for doc_idx in 0..doc_count {
let len = 24 + rng.gen_range(40);
let mut text = String::new();
for token_idx in 0..len {
if token_idx > 0 {
text.push(' ');
}
let token = &vocab[zipf.sample(&mut rng)];
text.push_str(token);
}
index
.index_document(format!("doc_{doc_idx}"), &text)
.expect("synthetic document should index");
}

(index, vocab, zipf)
}

fn build_queries(
vocab: &[String],
zipf: &ZipfSampler,
rng: &mut XorShift64,
count: usize,
terms_per_query: usize,
) -> Vec<String> {
let mut queries = Vec::with_capacity(count);
for _ in 0..count {
let mut query = String::new();
for idx in 0..terms_per_query {
if idx > 0 {
query.push(' ');
}
query.push_str(&vocab[zipf.sample(rng)]);
}
queries.push(query);
}
queries
}

/// Benchmark: WAND vs brute-force on Zipf-distributed corpora.
///
/// Note: `search_with_context` routes to WAND only when total query postings
/// exceed `SMALL_QUERY_POSTINGS_THRESHOLD` (256). For very rare terms or
/// small corpora, the brute-force path may be taken instead, so speedup
/// numbers should be interpreted accordingly.
#[test]
#[ignore = "benchmark; run with `cargo test bench_wand -- --ignored --nocapture`"]
fn bench_bm25_wand_vs_bruteforce_zipf_matrix() {
let corpus_sizes = [10_000usize, 50_000, 100_000];
let query_lengths = [1usize, 2, 3];

for &doc_count in &corpus_sizes {
let (index, vocab, zipf) = build_index(doc_count, 0xFACE_FEED ^ doc_count as u64);

println!("\nCorpus: {doc_count} docs");
println!("query_terms | brute_force_ms | bmw_ms | speedup_x");
println!("------------|----------------|--------|----------");

for &terms_per_query in &query_lengths {
let mut rng = XorShift64::new(0xDEAD_BEEF ^ ((doc_count as u64) << terms_per_query));
let queries = build_queries(&vocab, &zipf, &mut rng, 64, terms_per_query);

let mut brute_ctx = SearchContext::with_capacity(512);
let brute_start = Instant::now();
for query in &queries {
black_box(index.search_brute_force(query, 10, &mut brute_ctx));
}
let brute_ms = brute_start.elapsed().as_secs_f64() * 1000.0;

let mut wand_ctx = SearchContext::with_capacity(512);
let wand_start = Instant::now();
for query in &queries {
black_box(index.search_with_context(query, 10, &mut wand_ctx));
}
let wand_ms = wand_start.elapsed().as_secs_f64() * 1000.0;

let speedup = if wand_ms > 0.0 {
brute_ms / wand_ms
} else {
f64::INFINITY
};

println!("{terms_per_query:>11} | {brute_ms:>14.3} | {wand_ms:>6.3} | {speedup:>8.2}");
}
}
}
Loading
Loading