diff --git a/backend/app/api/v1/arxiv.py b/backend/app/api/v1/arxiv.py
index ef38508..71d2239 100644
--- a/backend/app/api/v1/arxiv.py
+++ b/backend/app/api/v1/arxiv.py
@@ -1,7 +1,9 @@
 # filepath: backend/app/api/v1/arxiv.py
+import itertools
 import math
 import re
 import xml.etree.ElementTree as ET
+from collections import Counter
 from datetime import datetime, timezone
 from typing import List
 from urllib.parse import quote
@@ -47,6 +49,15 @@
     "authority": 0.15,
 }
 
+# relevance scoring hyperparameters
+BM25_K1 = 1.4
+BM25_B = 0.65
+TITLE_BM25_WEIGHT = 1.2
+SUMMARY_BM25_WEIGHT = 1.0
+AUTHOR_BM25_WEIGHT = 0.6
+PHRASE_MATCH_BONUS = 0.1
+RELEVANCE_COMBINATION = (0.65, 0.25, 0.10)  # bm25, token overlap, phrase bonus
+
 # multi-batch candidate expansion tuning
 DEFAULT_BATCH_SIZE = 25
 MAX_BATCH_SIZE = 50
@@ -198,6 +209,35 @@ def tokenize(text: str) -> List[str]:
     query_tokens = tokenize(query)
     query_token_set = set(query_tokens)
 
+    # pre-compute tokenization and corpus stats for stronger BM25-style relevance
+    tokenized_candidates: list[dict[str, list[str]]] = []
+    doc_freq = Counter()
+    combined_length = 0
+
+    for paper in candidates:
+        title_tokens = tokenize(paper.get("title", ""))
+        summary_tokens = tokenize(paper.get("summary", ""))
+        author_tokens: List[str] = []
+        for name in paper.get("authors", []):
+            author_tokens.extend(tokenize(name))
+
+        combined_tokens = list(itertools.chain(title_tokens, summary_tokens, author_tokens))
+        combined_length += len(combined_tokens)
+        for token in set(combined_tokens):
+            doc_freq[token] += 1
+
+        tokenized = {
+            "title": title_tokens,
+            "summary": summary_tokens,
+            "authors": author_tokens,
+            "combined": combined_tokens,
+        }
+        tokenized_candidates.append(tokenized)
+        paper["_tokens"] = tokenized
+
+    doc_count = len(tokenized_candidates)
+    avg_combined_len = combined_length / doc_count if doc_count else 1.0
+
     def parse_date(date_str: str):
         if not date_str:
             return None
@@ -211,27 +251,69 @@ def overlap_score(target_tokens: List[str]) -> float:
             return 0.0
         return len(query_token_set & set(target_tokens)) / len(query_token_set)
 
+    def compute_bm25(field_tokens: List[str]) -> float:
+        if not query_tokens or not field_tokens:
+            return 0.0
+
+        term_freq = Counter(field_tokens)
+        doc_len = len(field_tokens)
+        score = 0.0
+
+        for term in query_tokens:
+            df = doc_freq.get(term, 0)
+            if df == 0 or doc_count == 0:
+                continue
+
+            # classic BM25 idf, with 1 added inside the log to keep scores non-negative
+            idf = math.log(1 + (doc_count - df + 0.5) / (df + 0.5))
+            tf = term_freq.get(term, 0)
+            if tf == 0:
+                continue
+
+            denom = tf + BM25_K1 * (1 - BM25_B + BM25_B * (doc_len / avg_combined_len))
+            score += idf * tf * (BM25_K1 + 1) / denom
+
+        return score
+
+    def phrase_bonus(paper: dict) -> float:
+        # reward contiguous phrase matches in title or summary to bias toward focused hits
+        if len(query_tokens) < 2:
+            return 0.0
+
+        phrase = " ".join(query_tokens)
+        haystacks = [
+            " ".join(paper.get("_tokens", {}).get("title", [])),
+            " ".join(paper.get("_tokens", {}).get("summary", [])),
+        ]
+        return PHRASE_MATCH_BONUS if any(phrase and phrase in h for h in haystacks) else 0.0
+
     def compute_relevance(paper: dict) -> float:
         if not query_token_set:
             return 0.0
 
-        TITLE_W = 0.55
-        SUMMARY_W = 0.35
-        AUTHOR_W = 0.10
-        total = TITLE_W + SUMMARY_W + AUTHOR_W
+        tokens = paper.get("_tokens", {})
+        title_tokens = tokens.get("title", [])
+        summary_tokens = tokens.get("summary", [])
+        author_tokens = tokens.get("authors", [])
+        combined_tokens = tokens.get("combined", [])
 
-        title_tokens = tokenize(paper.get("title", ""))
-        summary_tokens = tokenize(paper.get("summary", ""))
-        author_tokens: List[str] = []
-        for name in paper.get("authors", []):
-            author_tokens.extend(tokenize(name))
+        bm25_title = compute_bm25(title_tokens)
+        bm25_summary = compute_bm25(summary_tokens)
+        bm25_authors = compute_bm25(author_tokens)
 
-        blended = (
-            TITLE_W * overlap_score(title_tokens)
-            + SUMMARY_W * overlap_score(summary_tokens)
-            + AUTHOR_W * overlap_score(author_tokens)
-        )
-        return blended / total if total else 0.0
+        weighted_bm25 = (
+            TITLE_BM25_WEIGHT * bm25_title
+            + SUMMARY_BM25_WEIGHT * bm25_summary
+            + AUTHOR_BM25_WEIGHT * bm25_authors
+        ) / (TITLE_BM25_WEIGHT + SUMMARY_BM25_WEIGHT + AUTHOR_BM25_WEIGHT)
+
+        normalized_bm25 = math.tanh(weighted_bm25)
+        coverage = overlap_score(combined_tokens)
+        bonus = phrase_bonus(paper)
+
+        bm25_w, overlap_w, bonus_w = RELEVANCE_COMBINATION
+        composite = bm25_w * normalized_bm25 + overlap_w * coverage + bonus_w * bonus
+        return min(1.0, composite)
 
     def compute_recency(paper: dict) -> float:
         date_str = paper.get("updated") or paper.get("published")
@@ -285,6 +367,7 @@ def score_paper(paper: dict) -> float:
     selected = candidates[:max_results]
     for paper in selected:
         paper.pop("_score", None)
+        paper.pop("_tokens", None)
 
     return {
         "query": query,
diff --git a/scripts/dev.sh b/scripts/dev.sh
old mode 100644
new mode 100755
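
To sanity-check the new scoring outside the endpoint, here is a minimal standalone sketch of the BM25-plus-overlap math this diff introduces. The toy corpus, the simplified tokenize, and the example query are hypothetical stand-ins for the fetched arXiv candidates; the constants, the idf and tf formulas, and the tanh squashing mirror the code added above, while the phrase bonus is left out for brevity.

import math
import re
from collections import Counter

BM25_K1 = 1.4
BM25_B = 0.65
BM25_W, OVERLAP_W = 0.65, 0.25  # first two entries of RELEVANCE_COMBINATION

def tokenize(text):
    # simplified stand-in for the module's tokenizer
    return re.findall(r"[a-z0-9]+", text.lower())

# hypothetical toy corpus standing in for the candidate papers
docs = [
    "Sparse attention for long context transformers",
    "A survey of graph neural networks",
    "Efficient transformers with linear attention",
]
tokenized = [tokenize(d) for d in docs]
doc_count = len(tokenized)
avg_len = sum(len(t) for t in tokenized) / doc_count
doc_freq = Counter(tok for t in tokenized for tok in set(t))

def bm25(query_tokens, field_tokens):
    term_freq = Counter(field_tokens)
    doc_len = len(field_tokens)
    score = 0.0
    for term in query_tokens:
        df, tf = doc_freq.get(term, 0), term_freq.get(term, 0)
        if df == 0 or tf == 0:
            continue
        # same non-negative idf and length-normalized tf as compute_bm25
        idf = math.log(1 + (doc_count - df + 0.5) / (df + 0.5))
        denom = tf + BM25_K1 * (1 - BM25_B + BM25_B * doc_len / avg_len)
        score += idf * tf * (BM25_K1 + 1) / denom
    return score

query = tokenize("linear attention transformers")
for text, tokens in zip(docs, tokenized):
    raw = bm25(query, tokens)
    coverage = len(set(query) & set(tokens)) / len(set(query))
    # tanh squashes the unbounded BM25 sum into [0, 1), as compute_relevance does
    composite = BM25_W * math.tanh(raw) + OVERLAP_W * coverage
    print(f"{composite:.3f}  bm25={raw:.2f}  {text}")

Running this ranks the "Efficient transformers with linear attention" entry first, since it covers all three query terms, which is the behavior the composite score is meant to reward.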