diff --git a/examples/llamaindex_hybrid_example.py b/examples/llamaindex_hybrid_example.py
new file mode 100644
--- /dev/null
+++ b/examples/llamaindex_hybrid_example.py
@@ -0,0 +1,65 @@
+"""Example: LEANN retrievers for LlamaIndex (pure vector and hybrid search)."""
+
+import os
+
+from leann.api import LeannBuilder
+from leann.integrations.llamaindex import LeannHybridRetriever, LeannRetriever
+
+
+def warn_if_missing_openai_key():
+    """Print a warning when OPENAI_API_KEY is not set.
+
+    LlamaIndex uses OpenAI by default for response generation, so the key only
+    matters for QueryEngine usage; the retrieval calls below do not need it.
+    """
+    if not os.environ.get("OPENAI_API_KEY"):
+        print(
+            "Warning: OPENAI_API_KEY is not set. The Retrieval part will work, but QueryEngine text generation will fail unless using a mock LLM."
+        )
+
+
+def print_nodes(nodes):
+    """Print the id, score and text of every retrieved node."""
+    for node in nodes:
+        print(f"ID: {node.node.id_} | Score: {node.score:.4f} | Text: {node.node.text}")
+
+
+def main():
+    """Build a tiny LEANN index, then query it with both retrievers."""
+    # Warn from main() rather than at import time so importing this module has no side effects.
+    warn_if_missing_openai_key()
+
+    index_path = "example_docs.leann"
+
+    # 1. Build the dummy database
+    print("Building LEANN index...")
+    builder = LeannBuilder(backend_name="hnsw", embedding_model="BAAI/bge-small-en-v1.5")
+    builder.add_text("LEANN achieves 97% storage reduction.", metadata={"source": "doc1"})
+    builder.add_text("Vector databases store embeddings.", metadata={"source": "doc2"})
+    builder.add_text(
+        "Hybrid search combines vector and keyword search.", metadata={"source": "doc3"}
+    )
+    builder.build_index(index_path)
+
+    # 2. Example: Pure Vector Search
+    print("\n=== Pure Vector Search ===")
+    retriever = LeannRetriever(index_path=index_path, top_k=2)
+
+    # Retrieve directly (without LLM generation) to show it works even without API keys
+    print_nodes(retriever.retrieve("How does LEANN reduce storage?"))
+
+    # 3. Example: Hybrid Search (Recommended)
+    print("\n=== Hybrid Search (70% vector, 30% keyword) ===")
+    hybrid_retriever = LeannHybridRetriever(
+        index_path=index_path,
+        top_k=2,
+        bm25_weight=0.3,  # 30% keyword weight mapping to LEANN's `gemma = 0.7` internally
+    )
+
+    print_nodes(hybrid_retriever.retrieve("hybrid search combination"))
+
+    print("\nRetrieval successful! The LlamaIndex integration is fully functional.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/packages/leann-core/src/leann/integrations/__init__.py b/packages/leann-core/src/leann/integrations/__init__.py
new file mode 100644
index 00000000..ab89e1c4
--- /dev/null
+++ b/packages/leann-core/src/leann/integrations/__init__.py
@@ -0,0 +1,3 @@
+from .llamaindex import LeannHybridRetriever, LeannRetriever
+
+__all__ = ["LeannHybridRetriever", "LeannRetriever"]
diff --git a/packages/leann-core/src/leann/integrations/llamaindex.py b/packages/leann-core/src/leann/integrations/llamaindex.py
new file mode 100644
--- /dev/null
+++ b/packages/leann-core/src/leann/integrations/llamaindex.py
@@ -0,0 +1,139 @@
+"""LlamaIndex retrievers backed by a LEANN index."""
+
+from typing import Any
+
+from llama_index.core.retrievers import BaseRetriever
+from llama_index.core.schema import NodeWithScore, QueryBundle, TextNode
+
+from leann.api import LeannSearcher
+
+
+def _results_to_nodes(results: list) -> list[NodeWithScore]:
+    """Convert LEANN search results into LlamaIndex ``NodeWithScore`` objects.
+
+    Each result must expose ``id``, ``text`` and ``score`` attributes. A missing
+    or non-dict ``metadata`` attribute is replaced with an empty dict.
+    """
+    nodes = []
+
+    for result in results:
+        metadata = getattr(result, "metadata", None)
+        if not isinstance(metadata, dict):
+            metadata = {}
+
+        node = TextNode(text=result.text, id_=result.id, metadata=metadata)
+        nodes.append(NodeWithScore(node=node, score=result.score))
+
+    return nodes
+
+
+class LeannRetriever(BaseRetriever):
+    """LlamaIndex retriever for LEANN using pure vector search.
+
+    Parameters
+    ----------
+    index_path : str
+        Path to LEANN index file (*.leann)
+    top_k : int
+        Number of results to return (default 10)
+    complexity : int
+        Passed to ``LeannSearcher.search`` as ``complexity`` (default 64)
+    recompute_embeddings : bool
+        Passed to ``LeannSearcher.search`` as ``recompute_embeddings``
+        (default True)
+    **searcher_kwargs
+        Extra keyword arguments passed to the ``LeannSearcher`` constructor
+    """
+
+    def __init__(
+        self,
+        index_path: str,
+        top_k: int = 10,
+        complexity: int = 64,
+        recompute_embeddings: bool = True,
+        **searcher_kwargs: Any,
+    ):
+        super().__init__()
+
+        self._top_k = top_k
+        self._complexity = complexity
+        self._recompute_embeddings = recompute_embeddings
+        self._searcher = LeannSearcher(index_path, **searcher_kwargs)
+
+    def _search_kwargs(self) -> dict[str, Any]:
+        """Return the keyword arguments sent to ``LeannSearcher.search`` with each query."""
+        return {
+            "top_k": self._top_k,
+            "complexity": self._complexity,
+            "recompute_embeddings": self._recompute_embeddings,
+        }
+
+    def _retrieve(self, query_bundle: QueryBundle) -> list[NodeWithScore]:
+        """Retrieve nodes from the LEANN index for the query string in ``query_bundle``."""
+        results = self._searcher.search(query=query_bundle.query_str, **self._search_kwargs())
+
+        return _results_to_nodes(results)
+
+    async def _aretrieve(self, query_bundle: QueryBundle) -> list[NodeWithScore]:
+        """Async retrieve.
+
+        Delegates to the synchronous ``_retrieve``, so the search runs inline on
+        the calling event loop.
+        """
+        return self._retrieve(query_bundle)
+
+
+class LeannHybridRetriever(LeannRetriever):
+    """LlamaIndex retriever with hybrid search (vector + BM25).
+
+    Parameters
+    ----------
+    index_path : str
+        Path to LEANN index file (*.leann)
+    top_k : int
+        Number of results to return (default 10)
+    bm25_weight : float
+        Weight for BM25 (keyword) search, range [0, 1] (default 0.3).
+        Values outside the range are clamped to it.
+        - 0.0 = pure vector search (no keywords)
+        - 0.3 = 70% vector, 30% keywords (recommended)
+        - 0.5 = balanced hybrid search
+        - 1.0 = pure keyword search (no vectors)
+    complexity : int
+        Passed to ``LeannSearcher.search`` as ``complexity`` (default 64)
+    recompute_embeddings : bool
+        Passed to ``LeannSearcher.search`` as ``recompute_embeddings``
+        (default True)
+    **searcher_kwargs
+        Extra keyword arguments passed to the ``LeannSearcher`` constructor
+
+    Notes
+    -----
+    Internally converts `bm25_weight` to LEANN's `gemma` parameter:
+        gemma = 1.0 - bm25_weight
+    """
+
+    def __init__(
+        self,
+        index_path: str,
+        top_k: int = 10,
+        bm25_weight: float = 0.3,
+        complexity: int = 64,
+        recompute_embeddings: bool = True,
+        **searcher_kwargs: Any,
+    ):
+        super().__init__(
+            index_path,
+            top_k=top_k,
+            complexity=complexity,
+            recompute_embeddings=recompute_embeddings,
+            **searcher_kwargs,
+        )
+
+        # Clamp instead of raising so slightly out-of-range weights still work.
+        self._bm25_weight = max(0.0, min(1.0, bm25_weight))
+        self._gemma = 1.0 - self._bm25_weight
+
+    def _search_kwargs(self) -> dict[str, Any]:
+        """Extend the base search arguments with the hybrid ``gemma`` weight."""
+        return {**super()._search_kwargs(), "gemma": self._gemma}
diff --git a/tests/test_llamaindex_integration.py b/tests/test_llamaindex_integration.py
new file mode 100644
--- /dev/null
+++ b/tests/test_llamaindex_integration.py
@@ -0,0 +1,82 @@
+from unittest.mock import patch
+
+import pytest
+from leann.integrations.llamaindex import LeannHybridRetriever, LeannRetriever, _results_to_nodes
+from llama_index.core.schema import NodeWithScore, QueryBundle
+
+SEARCHER_TARGET = "leann.integrations.llamaindex.LeannSearcher"
+
+
+class MockSearchResult:
+    def __init__(self, id, score, text, metadata):
+        self.id = id
+        self.score = score
+        self.text = text
+        self.metadata = metadata
+
+
+def test_results_to_nodes():
+    """Test converting LEANN SearchResults to LlamaIndex Nodes."""
+    results = [
+        MockSearchResult("1", 0.9, "text1", {"source": "doc1"}),
+        MockSearchResult("2", 0.8, "text2", None),  # Should normalize None metadata to {}
+        MockSearchResult("3", 0.7, "text3", "not a dict"),  # Should normalize string metadata to {}
+    ]
+
+    nodes = _results_to_nodes(results)
+    assert len(nodes) == 3
+    assert isinstance(nodes[0], NodeWithScore)
+    assert nodes[0].node.id_ == "1"
+    assert nodes[0].node.text == "text1"
+    assert nodes[0].node.metadata == {"source": "doc1"}
+    assert nodes[0].score == 0.9
+
+    assert nodes[1].node.metadata == {}
+    assert nodes[2].node.metadata == {}
+
+
+@pytest.mark.parametrize(
+    ("bm25_weight", "expected_gemma"),
+    [
+        (0.3, 0.7),
+        (0.0, 1.0),  # Pure vector search
+        (1.0, 0.0),  # Pure keyword search
+        (1.5, 0.0),  # clamped to 1.0 -> 0.0 gemma
+        (-0.5, 1.0),  # clamped to 0.0 -> 1.0 gemma
+    ],
+)
+def test_leann_hybrid_retriever_bm25_weight(bm25_weight, expected_gemma):
+    """Test that bm25_weight is correctly converted to gemma."""
+    with patch(SEARCHER_TARGET):
+        retriever = LeannHybridRetriever("dummy_path", bm25_weight=bm25_weight)
+
+    # gemma comes from a float subtraction, so compare approximately
+    assert retriever._gemma == pytest.approx(expected_gemma)
+
+
+def test_leann_hybrid_retriever_forwards_gemma():
+    """Test that hybrid retrieval sends gemma and the search options to the searcher."""
+    with patch(SEARCHER_TARGET) as mock_searcher:
+        search = mock_searcher.return_value.search
+        search.return_value = [MockSearchResult("1", 0.9, "text1", {})]
+        retriever = LeannHybridRetriever("dummy_path", top_k=5, bm25_weight=0.0)
+
+        nodes = retriever._retrieve(QueryBundle(query_str="q"))
+
+    mock_searcher.assert_called_once_with("dummy_path")
+    search.assert_called_once_with(
+        query="q", top_k=5, complexity=64, recompute_embeddings=True, gemma=1.0
+    )
+    assert [n.node.id_ for n in nodes] == ["1"]
+
+
+def test_leann_retriever_omits_gemma():
+    """Test that pure vector retrieval does not send gemma to the searcher."""
+    with patch(SEARCHER_TARGET) as mock_searcher:
+        search = mock_searcher.return_value.search
+        search.return_value = []
+        retriever = LeannRetriever("dummy_path", top_k=3, recompute_embeddings=False)
+
+        assert retriever._retrieve(QueryBundle(query_str="q")) == []
+
+    search.assert_called_once_with(query="q", top_k=3, complexity=64, recompute_embeddings=False)