diff --git a/.gitignore b/.gitignore
index dbfd638..31fdee1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -39,3 +39,6 @@ qdrant_storage/
 # Generated HTML
 *.html
 !openexp/static/*.html
+
+# Benchmark datasets (download separately — see benchmarks/README.md)
+benchmarks/data/
diff --git a/benchmarks/README.md b/benchmarks/README.md
new file mode 100644
index 0000000..2c9b656
--- /dev/null
+++ b/benchmarks/README.md
@@ -0,0 +1,48 @@
+# Benchmarks
+
+Reproducible scripts for the numbers cited in `docs/benchmark-results.md`.
+
+## LongMemEval
+
+`longmemeval_bench.py` evaluates OpenExp's retrieval against the
+[LongMemEval](https://github.com/xiaowu0162/LongMemEval) benchmark — 500
+questions over long multi-session histories.
+
+### Get the data
+
+The dataset (`longmemeval_s_cleaned.json`, ~277 MB) is not committed. Download
+it from the upstream MemPalace repo:
+
+```bash
+mkdir -p benchmarks/data
+curl -L -o benchmarks/data/longmemeval_s_cleaned.json \
+  https://github.com/mem-palace/mempalace/raw/main/data/longmemeval_s_cleaned.json
+```
+
+Or any other LongMemEval mirror that publishes the cleaned variant in the same
+schema (`question`, `haystack_sessions`, `haystack_session_ids`,
+`haystack_dates`, `answer_session_ids`, `question_type`).
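+
+If the file comes from a different mirror, a quick check along these lines (a
+sketch; adjust the path if you saved the file elsewhere) confirms the schema
+before committing to a multi-hour run:
+
+```python
+import json
+
+with open("benchmarks/data/longmemeval_s_cleaned.json") as f:
+    data = json.load(f)
+
+required = {"question", "haystack_sessions", "haystack_session_ids",
+            "haystack_dates", "answer_session_ids", "question_type"}
+print(f"{len(data)} questions; missing fields: {required - set(data[0]) or 'none'}")
+```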
+
+### Run
+
+```bash
+# Raw mode — pure vector similarity (baseline)
+python benchmarks/longmemeval_bench.py benchmarks/data/longmemeval_s_cleaned.json --mode raw
+
+# Hybrid — vector + BM25 (OpenExp default)
+python benchmarks/longmemeval_bench.py benchmarks/data/longmemeval_s_cleaned.json --mode hybrid
+
+# Full — vector + BM25 + recency
+python benchmarks/longmemeval_bench.py benchmarks/data/longmemeval_s_cleaned.json --mode full
+
+# Smoke test — 20 questions only
+python benchmarks/longmemeval_bench.py benchmarks/data/longmemeval_s_cleaned.json --mode hybrid --limit 20
+```
+
+Embedding model: `BAAI/bge-small-en-v1.5` (same as production). Vector store:
+in-memory Qdrant (each question gets a fresh collection). A full run over all
+500 questions takes a few hours end-to-end; use `--limit` for quick checks.
+
+Last published numbers (hybrid mode, session granularity, 500/500 questions):
+R@1 = 0.880, R@10 = 0.986, NDCG@10 = 0.924. See `docs/benchmark-results.md` for
+the full table and per-type breakdown.
diff --git a/benchmarks/longmemeval_bench.py b/benchmarks/longmemeval_bench.py
new file mode 100644
index 0000000..2e25ed5
--- /dev/null
+++ b/benchmarks/longmemeval_bench.py
@@ -0,0 +1,488 @@
+#!/usr/bin/env python3
+"""
+OpenExp × LongMemEval Benchmark
+================================
+
+Evaluates OpenExp's retrieval against the LongMemEval benchmark.
+Apples-to-apples comparison with MemPalace's benchmark.
+
+For each of the 500 questions:
+1. Ingest all haystack sessions into a fresh Qdrant collection
+2. Query with OpenExp's multi-signal scoring (vector + BM25)
+3. Score retrieval against ground-truth answer sessions
+
+Modes:
+    raw    — pure vector similarity (baseline, like MemPalace raw)
+    hybrid — vector + BM25 keyword scoring (OpenExp default)
+    full   — vector + BM25 + recency (full OpenExp pipeline)
+
+Usage:
+    python benchmarks/longmemeval_bench.py benchmarks/data/longmemeval_s_cleaned.json
+    python benchmarks/longmemeval_bench.py benchmarks/data/longmemeval_s_cleaned.json --mode hybrid
+    python benchmarks/longmemeval_bench.py benchmarks/data/longmemeval_s_cleaned.json --limit 20
+"""
+
+import json
+import math
+import argparse
+import sys
+import uuid
+from pathlib import Path
+from collections import defaultdict
+from datetime import datetime
+
+from fastembed import TextEmbedding
+from qdrant_client import QdrantClient
+from qdrant_client.models import (
+    Distance,
+    VectorParams,
+    PointStruct,
+)
+
+# Add openexp to path
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+
+# =============================================================================
+# METRICS (same as MemPalace for fair comparison)
+# =============================================================================
+
+def dcg(relevances, k):
+    score = 0.0
+    for i, rel in enumerate(relevances[:k]):
+        score += rel / math.log2(i + 2)
+    return score
+
+
+def ndcg(rankings, correct_ids, corpus_ids, k):
+    relevances = [1.0 if corpus_ids[idx] in correct_ids else 0.0 for idx in rankings[:k]]
+    ideal = sorted(relevances, reverse=True)
+    idcg = dcg(ideal, k)
+    if idcg == 0:
+        return 0.0
+    return dcg(relevances, k) / idcg
+
+
+def evaluate_retrieval(rankings, correct_ids, corpus_ids, k):
+    top_k_ids = set(corpus_ids[idx] for idx in rankings[:k])
+    recall_any = float(any(cid in top_k_ids for cid in correct_ids))
+    recall_all = float(all(cid in top_k_ids for cid in correct_ids))
+    ndcg_score = ndcg(rankings, correct_ids, corpus_ids, k)
+    return recall_any, recall_all, ndcg_score
+
+
+def session_id_from_corpus_id(corpus_id):
+    if "_turn_" in corpus_id:
+        return corpus_id.rsplit("_turn_", 1)[0]
+    return corpus_id
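+
+
+# Illustrative example for the metrics above: with corpus_ids = ["s1", "s2", "s3"],
+# correct_ids = {"s3"} and rankings = [2, 0, 1], recall_any@1 = 1.0 and NDCG@1 = 1.0
+# because the only relevant document is ranked first; with rankings = [0, 2, 1],
+# recall_any@1 = 0.0 and NDCG@2 = 1 / log2(3) ≈ 0.63.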
+
+
+# =============================================================================
+# BM25 SCORING (from OpenExp's hybrid scoring)
+# =============================================================================
+
+def bm25_score(query_tokens, doc_tokens, avg_dl, k1=1.5, b=0.75):
+    """Simple BM25 score for a single document."""
+    dl = len(doc_tokens)
+    score = 0.0
+    doc_freq = {}
+    for t in doc_tokens:
+        doc_freq[t] = doc_freq.get(t, 0) + 1
+    for qt in set(query_tokens):
+        tf = doc_freq.get(qt, 0)
+        if tf == 0:
+            continue
+        numerator = tf * (k1 + 1)
+        denominator = tf + k1 * (1 - b + b * dl / avg_dl)
+        score += numerator / denominator
+    return score
+
+
+def tokenize(text):
+    """Lowercase and split on non-word characters."""
+    import re
+    return re.findall(r'\w+', text.lower())
+
+
+# =============================================================================
+# EMBEDDING MODEL
+# =============================================================================
+
+EMBEDDING_MODEL = "BAAI/bge-small-en-v1.5"  # same as OpenExp production
+_embedder = None
+
+
+def get_embedder():
+    global _embedder
+    if _embedder is None:
+        print(f" Loading embedding model: {EMBEDDING_MODEL}...")
+        _embedder = TextEmbedding(model_name=EMBEDDING_MODEL)
+        print(" Model ready.")
+    return _embedder
+
+
+def embed_texts(texts):
+    embedder = get_embedder()
+    return list(embedder.embed(texts))
+
+
+# =============================================================================
+# QDRANT EPHEMERAL
+# =============================================================================
+
+_qdrant = QdrantClient(":memory:")
+COLLECTION = "bench"
+VECTOR_DIM = 384
+
+
+def fresh_collection():
+    """Delete and recreate collection for clean slate."""
+    try:
+        _qdrant.delete_collection(COLLECTION)
+    except Exception:
+        pass
+    _qdrant.create_collection(
+        collection_name=COLLECTION,
+        vectors_config=VectorParams(size=VECTOR_DIM, distance=Distance.COSINE),
+    )
+
+
+# =============================================================================
+# RETRIEVAL MODES
+# =============================================================================
+
+def retrieve_raw(entry, granularity="session", n_results=50):
+    """Raw mode: pure vector similarity search (baseline)."""
+    corpus, corpus_ids, corpus_timestamps = build_corpus(entry, granularity)
+    if not corpus:
+        return [], corpus, corpus_ids, corpus_timestamps
+
+    fresh_collection()
+
+    # Embed and ingest
+    vectors = embed_texts(corpus)
+    points = [
+        PointStruct(id=i, vector=v.tolist(), payload={"corpus_id": cid, "timestamp": ts})
+        for i, (v, cid, ts) in enumerate(zip(vectors, corpus_ids, corpus_timestamps))
+    ]
+    _qdrant.upsert(collection_name=COLLECTION, points=points)
+
+    # Query
+    query = entry["question"]
+    query_vec = embed_texts([query])[0].tolist()
+    results = _qdrant.query_points(
+        collection_name=COLLECTION,
+        query=query_vec,
+        limit=min(n_results, len(corpus)),
+        with_payload=True,
+    )
+
+    ranked_indices = [p.id for p in results.points]
+
+    # Fill missing
+    seen = set(ranked_indices)
+    for i in range(len(corpus)):
+        if i not in seen:
+            ranked_indices.append(i)
+
+    return ranked_indices, corpus, corpus_ids, corpus_timestamps
+
+
+def retrieve_hybrid(entry, granularity="session", n_results=50, bm25_weight=0.10):
+    """Hybrid mode: vector similarity + BM25 keyword scoring."""
+    corpus, corpus_ids, corpus_timestamps = build_corpus(entry, granularity)
+    if not corpus:
+        return [], corpus, corpus_ids, corpus_timestamps
+
+    fresh_collection()
+
+    vectors = embed_texts(corpus)
+    points = [
+        PointStruct(id=i, vector=v.tolist(), payload={"corpus_id": cid, "timestamp": ts})
+        for i, (v, cid, ts) in enumerate(zip(vectors, corpus_ids, corpus_timestamps))
+    ]
+    _qdrant.upsert(collection_name=COLLECTION, points=points)
+
+    # Vector search
+    query = entry["question"]
+    query_vec = embed_texts([query])[0].tolist()
+    results = _qdrant.query_points(
+        collection_name=COLLECTION,
+        query=query_vec,
+        limit=min(n_results, len(corpus)),
+        with_payload=True,
+    )
+
+    # BM25 scoring
+    query_tokens = tokenize(query)
+    doc_tokens_list = [tokenize(doc) for doc in corpus]
+    avg_dl = sum(len(dt) for dt in doc_tokens_list) / max(len(doc_tokens_list), 1)
+
+    bm25_scores = {}
+    for i, dt in enumerate(doc_tokens_list):
+        bm25_scores[i] = bm25_score(query_tokens, dt, avg_dl)
+
+    # Normalize BM25
+    max_bm25 = max(bm25_scores.values()) if bm25_scores else 1.0
+    if max_bm25 > 0:
+        for k in bm25_scores:
+            bm25_scores[k] /= max_bm25
+
+    # Combine: vector_score * (1 - bm25_weight) + bm25 * bm25_weight
+    combined = []
+    for point in results.points:
+        idx = point.id
+        vec_score = point.score  # cosine similarity
+        bm25_s = bm25_scores.get(idx, 0.0)
+        final = vec_score * (1 - bm25_weight) + bm25_s * bm25_weight
+        combined.append((idx, final))
+
+    # Add any docs not in vector results (with bm25-only score)
+    seen = {idx for idx, _ in combined}
+    for i in range(len(corpus)):
+        if i not in seen:
+            bm25_s = bm25_scores.get(i, 0.0)
+            combined.append((i, bm25_s * bm25_weight))
+
+    combined.sort(key=lambda x: x[1], reverse=True)
+    ranked_indices = [idx for idx, _ in combined]
+
+    return ranked_indices, corpus, corpus_ids, corpus_timestamps
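+
+
+# Example of the hybrid combination above: with bm25_weight = 0.10, a document
+# with cosine similarity 0.82 and normalized BM25 score 0.60 gets
+# 0.82 * 0.9 + 0.60 * 0.1 = 0.798.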
+
+
+def retrieve_full(entry, granularity="session", n_results=50,
+                  bm25_weight=0.10, recency_weight=0.15):
+    """Full mode: vector + BM25 + recency boost (simulates OpenExp scoring)."""
+    corpus, corpus_ids, corpus_timestamps = build_corpus(entry, granularity)
+    if not corpus:
+        return [], corpus, corpus_ids, corpus_timestamps
+
+    fresh_collection()
+
+    vectors = embed_texts(corpus)
+    points = [
+        PointStruct(id=i, vector=v.tolist(), payload={"corpus_id": cid, "timestamp": ts})
+        for i, (v, cid, ts) in enumerate(zip(vectors, corpus_ids, corpus_timestamps))
+    ]
+    _qdrant.upsert(collection_name=COLLECTION, points=points)
+
+    query = entry["question"]
+    query_vec = embed_texts([query])[0].tolist()
+    results = _qdrant.query_points(
+        collection_name=COLLECTION,
+        query=query_vec,
+        limit=min(n_results, len(corpus)),
+        with_payload=True,
+    )
+
+    # BM25
+    query_tokens = tokenize(query)
+    doc_tokens_list = [tokenize(doc) for doc in corpus]
+    avg_dl = sum(len(dt) for dt in doc_tokens_list) / max(len(doc_tokens_list), 1)
+
+    bm25_scores = {}
+    for i, dt in enumerate(doc_tokens_list):
+        bm25_scores[i] = bm25_score(query_tokens, dt, avg_dl)
+    max_bm25 = max(bm25_scores.values()) if bm25_scores else 1.0
+    if max_bm25 > 0:
+        for k in bm25_scores:
+            bm25_scores[k] /= max_bm25
+
+    # Recency scoring
+    # Parse dates and compute recency relative to question_date
+    question_date = entry.get("question_date", "")
+    recency_scores = {}
+    try:
+        q_date = datetime.strptime(question_date, "%Y-%m-%d") if question_date else None
+    except ValueError:
+        q_date = None
+
+    for i, ts in enumerate(corpus_timestamps):
+        if q_date and ts:
+            try:
+                doc_date = datetime.strptime(ts[:10], "%Y-%m-%d")
+                days_ago = (q_date - doc_date).days
+                # Linear decay over one year: more recent = higher score
+                recency_scores[i] = max(0, 1.0 - days_ago / 365.0)
+            except (ValueError, TypeError):
+                recency_scores[i] = 0.5
+        else:
+            recency_scores[i] = 0.5
+
+    # Combine
+    vec_weight = 1.0 - bm25_weight - recency_weight
+    combined = []
+    for point in results.points:
+        idx = point.id
+        vec_s = point.score
+        bm25_s = bm25_scores.get(idx, 0.0)
+        rec_s = recency_scores.get(idx, 0.5)
+        final = vec_s * vec_weight + bm25_s * bm25_weight + rec_s * recency_weight
+        combined.append((idx, final))
+
+    seen = {idx for idx, _ in combined}
+    for i in range(len(corpus)):
+        if i not in seen:
+            bm25_s = bm25_scores.get(i, 0.0)
+            rec_s = recency_scores.get(i, 0.5)
+            combined.append((i, bm25_s * bm25_weight + rec_s * recency_weight))
+
+    combined.sort(key=lambda x: x[1], reverse=True)
+    ranked_indices = [idx for idx, _ in combined]
+
+    return ranked_indices, corpus, corpus_ids, corpus_timestamps
+
+
+# =============================================================================
+# CORPUS BUILDER
+# =============================================================================
+
+def build_corpus(entry, granularity="session"):
+    """Build corpus from haystack sessions."""
+    corpus = []
+    corpus_ids = []
+    corpus_timestamps = []
+
+    sessions = entry["haystack_sessions"]
+    session_ids = entry["haystack_session_ids"]
+    dates = entry["haystack_dates"]
+
+    for session, sess_id, date in zip(sessions, session_ids, dates):
+        if granularity == "session":
+            user_turns = [t["content"] for t in session if t["role"] == "user"]
+            if user_turns:
+                doc = "\n".join(user_turns)
+                corpus.append(doc)
+                corpus_ids.append(sess_id)
+                corpus_timestamps.append(date)
+        else:
+            turn_num = 0
+            for turn in session:
+                if turn["role"] == "user":
+                    corpus.append(turn["content"])
+                    corpus_ids.append(f"{sess_id}_turn_{turn_num}")
+                    corpus_timestamps.append(date)
+                    turn_num += 1
+
+    return corpus, corpus_ids, corpus_timestamps
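+
+
+# Shape note: each haystack session is expected to be a list of turn dicts of
+# the form {"role": ..., "content": ...}. At session granularity the user turns
+# of a session are joined into one document per session id; at turn granularity
+# each user turn becomes its own document with the id "<session_id>_turn_<n>".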
+
+
+# =============================================================================
+# BENCHMARK RUNNER
+# =============================================================================
+
+def run_benchmark(data_file, granularity="session", limit=0, mode="raw", skip=0, out_file=None):
+    with open(data_file) as f:
+        data = json.load(f)
+
+    if skip > 0:
+        data = data[skip:]
+    if limit > 0:
+        data = data[:limit]
+
+    print(f"\n{'=' * 60}")
+    print(" OpenExp × LongMemEval Benchmark")
+    print(f"{'=' * 60}")
+    print(f" Data: {Path(data_file).name}")
+    print(f" Questions: {len(data)}")
+    print(f" Granularity: {granularity}")
+    print(f" Mode: {mode}")
+    print(f" Embedding: {EMBEDDING_MODEL}")
+    print(f" Vector DB: Qdrant (in-memory)")
+    print(f"{'─' * 60}\n")
+
+    ks = [1, 3, 5, 10, 30, 50]
+    metrics_session = {f"recall_any@{k}": [] for k in ks}
+    metrics_session.update({f"ndcg_any@{k}": [] for k in ks})
+    per_type = defaultdict(lambda: defaultdict(list))
+    results_log = []
+    start_time = datetime.now()
+
+    for i, entry in enumerate(data):
+        qid = entry["question_id"]
+        qtype = entry["question_type"]
+        question = entry["question"]
+        answer_sids = set(entry["answer_session_ids"])
+
+        if mode == "hybrid":
+            rankings, corpus, corpus_ids, corpus_timestamps = retrieve_hybrid(
+                entry, granularity=granularity
+            )
+        elif mode == "full":
+            rankings, corpus, corpus_ids, corpus_timestamps = retrieve_full(
+                entry, granularity=granularity
+            )
+        else:
+            rankings, corpus, corpus_ids, corpus_timestamps = retrieve_raw(
+                entry, granularity=granularity
+            )
+
+        if not rankings:
+            print(f" [{i+1:4}/{len(data)}] {qid[:30]:30} SKIP (empty corpus)")
+            continue
+
+        session_level_ids = [session_id_from_corpus_id(cid) for cid in corpus_ids]
+        session_correct = answer_sids
+
+        for k in ks:
+            ra, rl, nd = evaluate_retrieval(rankings, session_correct, session_level_ids, k)
+            metrics_session[f"recall_any@{k}"].append(ra)
+            metrics_session[f"ndcg_any@{k}"].append(nd)
+
+        per_type[qtype]["recall_any@5"].append(metrics_session["recall_any@5"][-1])
+        per_type[qtype]["recall_any@10"].append(metrics_session["recall_any@10"][-1])
+
+        r5 = metrics_session["recall_any@5"][-1]
+        r10 = metrics_session["recall_any@10"][-1]
+        status = "HIT" if r5 > 0 else "miss"
+        print(f" [{i+1:4}/{len(data)}] {qid[:30]:30} R@5={r5:.0f} R@10={r10:.0f} {status}")
+
+        # Record per-question results so --out writes a non-empty JSONL file
+        results_log.append({
+            "question_id": qid,
+            "question_type": qtype,
+            "recall_any@5": r5,
+            "recall_any@10": r10,
+        })
+
+    elapsed = (datetime.now() - start_time).total_seconds()
+
+    print(f"\n{'=' * 60}")
+    print(f" RESULTS — OpenExp ({mode} mode, {granularity} granularity)")
+    print(f"{'=' * 60}")
+    print(f" Time: {elapsed:.1f}s ({elapsed / max(len(data), 1):.2f}s per question)\n")
+
+    print(" SESSION-LEVEL METRICS:")
+    for k in ks:
+        key = f"recall_any@{k}"
+        if metrics_session[key]:
+            ra = sum(metrics_session[key]) / len(metrics_session[key])
+            nd = sum(metrics_session[f"ndcg_any@{k}"]) / len(metrics_session[f"ndcg_any@{k}"])
+            print(f" Recall@{k:2}: {ra:.3f} NDCG@{k:2}: {nd:.3f}")
+
+    print("\n PER-TYPE BREAKDOWN (session recall_any@10):")
+    for qtype, vals in sorted(per_type.items()):
+        if vals["recall_any@10"]:
+            r10 = sum(vals["recall_any@10"]) / len(vals["recall_any@10"])
+            n = len(vals["recall_any@10"])
+            print(f" {qtype:35} R@10={r10:.3f} (n={n})")
+
+    print(f"\n{'=' * 60}\n")
+
+    if out_file:
+        with open(out_file, "w") as f:
+            for rec in results_log:
+                f.write(json.dumps(rec) + "\n")
+        print(f" Results saved to: {out_file}")
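+
+
+# run_benchmark can also be called directly from Python, e.g.:
+#     run_benchmark("benchmarks/data/longmemeval_s_cleaned.json", mode="hybrid", limit=20)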
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="OpenExp × LongMemEval Benchmark")
+    parser.add_argument("data_file", help="Path to longmemeval_s_cleaned.json")
+    parser.add_argument("--granularity", choices=["session", "turn"], default="session")
+    parser.add_argument("--mode", choices=["raw", "hybrid", "full"], default="raw")
+    parser.add_argument("--limit", type=int, default=0, help="Limit number of questions (0=all)")
+    parser.add_argument("--skip", type=int, default=0, help="Skip first N questions")
+    parser.add_argument("--out", type=str, default=None, help="Output JSONL file")
+    args = parser.parse_args()
+
+    run_benchmark(
+        data_file=args.data_file,
+        granularity=args.granularity,
+        limit=args.limit,
+        mode=args.mode,
+        skip=args.skip,
+        out_file=args.out,
+    )