Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ RAGLite is a Python toolkit for Retrieval-Augmented Generation (RAG) with DuckDB
- 🔌 A built-in [Model Context Protocol](https://modelcontextprotocol.io) (MCP) server that any MCP client like [Claude desktop](https://claude.ai/download) can connect with
- 💬 Optional customizable ChatGPT-like frontend for [web](https://docs.chainlit.io/deploy/copilot), [Slack](https://docs.chainlit.io/deploy/slack), and [Teams](https://docs.chainlit.io/deploy/teams) with [Chainlit](https://github.com/Chainlit/chainlit)
- ✍️ Optional conversion of any input document to Markdown with [Pandoc](https://github.com/jgm/pandoc)
- 🔎 Optional high-quality processing of PDFs, images, DOCX, and PPTX with [Mistral OCR](https://docs.mistral.ai/capabilities/document/), including automatic image descriptions
- ✅ Optional evaluation of retrieval and generation performance with [Ragas](https://github.com/explodinggradients/ragas)

## Installing
Expand Down Expand Up @@ -69,6 +70,12 @@ To add support for filetypes other than PDF, use the `pandoc` extra:
pip install raglite[pandoc]
```

To add support for high-quality document processing with [Mistral OCR](https://docs.mistral.ai/capabilities/document/), use the `mistral-ocr` extra:

```sh
pip install raglite[mistral-ocr]
```

To add support for evaluation, use the `ragas` extra:

```sh
Expand Down Expand Up @@ -152,6 +159,21 @@ my_config = RAGLiteConfig(
> [!TIP]
> ✍️ To insert documents other than PDF, install the `pandoc` extra with `pip install raglite[pandoc]`.

> [!TIP]
> 🔎 For higher-quality document processing with automatic image descriptions, install the `mistral-ocr` extra with `pip install raglite[mistral-ocr]` and configure it as follows:
> ```python
> from raglite import RAGLiteConfig, MistralOCRConfig
>
> my_config = RAGLiteConfig(
>     document_processor=MistralOCRConfig(
>         include_image_descriptions=True,  # Describe images, charts, and diagrams as text
>         image_types=frozenset({"chart", "diagram", "photo", "table", "logo", "icon"}),  # Custom image categories
>         exclude_image_types=frozenset({"logo", "icon"}),  # Filter out specific types from the output
>     ),
> )
> ```
> The `image_types` parameter defines the categories that Mistral classifies each image into — you can use the defaults or provide your own domain-specific types. Use `exclude_image_types` to filter out any classified types that are not useful for retrieval.

Next, insert some documents into the database. RAGLite will take care of the [conversion to Markdown](src/raglite/_markdown.py), [optimal level 4 semantic chunking](src/raglite/_split_chunks.py), and [multi-vector embedding with late chunking](src/raglite/_embed.py):

```python
Expand Down
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ dev = [
"pytest (>=8.3.4)",
"pytest-mock (>=3.14.0)",
"pytest-xdist (>=3.6.1)",
"python-dotenv (>=1.0.0)",
"ruff (>=0.10.0)",
"typeguard (>=4.4.1)",
]
Expand All @@ -80,6 +81,7 @@ chainlit = ["chainlit (>=2.0.0)"]
# Large Language Models:
llama-cpp-python = ["llama-cpp-python (>=0.3.9)"]
# Markdown conversion:
mistral-ocr = ["mistralai (>=1.10.1)"]
pandoc = ["pypandoc-binary (>=1.13)"]
# Evaluation:
ragas = ["pandas (>=2.1.1)", "ragas (>=0.3.3)"]
Expand Down
5 changes: 4 additions & 1 deletion src/raglite/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
"""RAGLite."""

from raglite._config import RAGLiteConfig
from raglite._config import MistralOCRConfig, RAGLiteConfig
from raglite._database import Document
from raglite._delete import delete_documents, delete_documents_by_metadata
from raglite._eval import answer_evals, evaluate, insert_evals
from raglite._extract import expand_document_metadata
from raglite._insert import insert_documents
from raglite._mistral_ocr import MistralOCRError
from raglite._query_adapter import update_query_adapter
from raglite._rag import add_context, async_rag, rag, retrieve_context
from raglite._search import (
Expand All @@ -22,6 +23,8 @@
__all__ = [
# Config
"RAGLiteConfig",
"MistralOCRConfig",
"MistralOCRError",
# Insert
"Document",
"insert_documents",
Expand Down
4 changes: 2 additions & 2 deletions src/raglite/_chainlit.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,15 +75,15 @@ async def handle_message(user_message: cl.Message) -> None:
inline_attachments = []
for file in user_message.elements:
if file.path:
doc_md = document_to_markdown(Path(file.path))
doc_md = document_to_markdown(Path(file.path), config=config)
if len(doc_md) // 3 <= 5 * (config.chunk_max_size // 3):
# Document is small enough to attach to the context.
inline_attachments.append(f"{Path(file.path).name}:\n\n{doc_md}")
else:
# Document is too large and must be inserted into the database.
async with cl.Step(name="insert", type="run") as step:
step.input = Path(file.path).name
document = Document.from_path(Path(file.path))
document = Document.from_path(Path(file.path), config=config)
await async_insert_documents([document], config=config)
# Append any inline attachments to the user prompt.
user_prompt = (
Expand Down
22 changes: 22 additions & 0 deletions src/raglite/_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,26 @@
cache_path = Path(user_data_dir("raglite", ensure_exists=True))


DEFAULT_IMAGE_TYPES = frozenset(
{"graph", "chart", "diagram", "table", "photo", "screenshot", "logo", "icon", "other"}
)


@dataclass(frozen=True)
class MistralOCRConfig:
"""Configuration for MistralOCR document processor."""

# API key - falls back to MISTRAL_API_KEY env var if None.
api_key: str | None = None
# Whether to use vision to describe images in documents.
include_image_descriptions: bool = True
# Image types that Mistral classifies each image into.
image_types: frozenset[str] = DEFAULT_IMAGE_TYPES
# Image types to exclude from the output (e.g., {"logo", "icon"}).
exclude_image_types: frozenset[str] = frozenset()
model: str = "mistral-ocr-latest"


# Lazily load the default search method to avoid circular imports.
# TODO: Replace with search_and_rerank_chunk_spans after benchmarking.
def _vector_search(
Expand Down Expand Up @@ -65,6 +85,8 @@ class RAGLiteConfig:
embedder_normalize: bool = True
# Chunk config used to partition documents into chunks.
chunk_max_size: int = 2048 # Max number of characters per chunk.
# Document processing config. None = default processor.
document_processor: MistralOCRConfig | None = None
# Vector search config.
vector_search_distance_metric: Literal["cosine", "dot", "l2"] = "cosine"
vector_search_multivector: bool = True
Expand Down
5 changes: 4 additions & 1 deletion src/raglite/_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ def from_path(
*,
id: DocumentId | None = None, # noqa: A002
url: str | None = None,
config: RAGLiteConfig | None = None,
**kwargs: Any,
) -> "Document":
"""Create a document from a file path.
Expand All @@ -120,6 +121,8 @@ def from_path(
The document id to use. If not provided, a hash of the document's content is used.
url
The URL of the document, if available.
config
The RAGLite configuration for document processing.
kwargs
Any additional metadata to store.

Expand All @@ -145,7 +148,7 @@ def from_path(
filename=doc_path.name,
url=url,
metadata_=metadata,
content=document_to_markdown(doc_path),
content=document_to_markdown(doc_path, config=config),
)

@staticmethod
Expand Down
40 changes: 38 additions & 2 deletions src/raglite/_markdown.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Convert any document to Markdown."""

import logging
import re
from copy import deepcopy
from pathlib import Path
Expand All @@ -9,6 +10,10 @@
from pdftext.extraction import dictionary_output
from sklearn.cluster import KMeans

from raglite._config import MistralOCRConfig, RAGLiteConfig

logger = logging.getLogger(__name__)


def parsed_pdf_to_markdown(pages: list[dict[str, Any]]) -> list[str]: # noqa: C901, PLR0915
"""Convert a PDF parsed with pdftext to Markdown."""
Expand Down Expand Up @@ -194,8 +199,8 @@ def _merge_split_headings(match: re.Match[str]) -> str:
return pages_md


def document_to_markdown(doc_path: Path) -> str:
"""Convert any document to GitHub Flavored Markdown."""
def _default_document_to_markdown(doc_path: Path) -> str:
"""Convert any document to GitHub Flavored Markdown using pdftext/pandoc."""
# Convert the file's content to GitHub Flavored Markdown.
if doc_path.suffix == ".pdf":
# Parse the PDF with pdftext and convert it to Markdown.
Expand All @@ -219,3 +224,34 @@ def document_to_markdown(doc_path: Path) -> str:
# File format not supported, fall back to reading the text.
doc = doc_path.read_text()
return doc


def document_to_markdown(doc_path: Path, *, config: RAGLiteConfig | None = None) -> str:
    """Convert any document to GitHub Flavored Markdown.

    Parameters
    ----------
    doc_path
        Path to the document file.
    config
        Optional RAGLite configuration. If document_processor is set to a
        MistralOCRConfig and the file's extension is supported by MistralOCR, the
        document is converted with MistralOCR. In every other case the default
        processor is used.

    Returns
    -------
    str
        Document content as GitHub Flavored Markdown.
    """
    config = config or RAGLiteConfig()
    processor = config.document_processor

    if isinstance(processor, MistralOCRConfig):
        # Lazy import to avoid requiring mistralai when not using MistralOCR.
        from raglite._mistral_ocr import SUPPORTED_EXTENSIONS, mistral_ocr_to_markdown

        # Extensions are compared case-insensitively, so ".PDF" is treated like ".pdf".
        if doc_path.suffix.lower() in SUPPORTED_EXTENSIONS:
            return mistral_ocr_to_markdown(doc_path, processor_config=processor)
        # Keep the record on a single line and report the actual file type next to the path.
        logger.debug(
            "Mistral OCR does not support file type %r (%s); "
            "falling back to the default processor.",
            doc_path.suffix,
            doc_path,
        )

    return _default_document_to_markdown(doc_path)
Loading