diff --git a/docs/user-scripts-tr.md b/docs/user-scripts-tr.md new file mode 100644 index 00000000..070ae852 --- /dev/null +++ b/docs/user-scripts-tr.md @@ -0,0 +1,224 @@ +# Kullanıcı Scriptleri: Günlük Yaşamda LEANN + +Bu dokümantasyon, LEANN'ı günlük yaşamda kullanmak için hazırlanmış otomasyon scriptlerini açıklar. + +## Kurulum + +### 1. Scriptleri İndirme + +Bu scriptleri kullanmak için önce LEANN repository'sini klonlayın: + +```bash +git clone https://github.com/yichuan-w/LEANN.git +cd LEANN +``` + +### 2. Scriptleri ~/bin Klasörüne Kopyalama + +```bash +# ~/bin klasörü oluşturma (yoksa) +mkdir -p ~/bin + +# Scriptleri kopyalama +cp bin/leann-sync-all.sh ~/bin/ +cp bin/leann-sync-dev.sh ~/bin/ +cp bin/leann-sync-personal.sh ~/bin/ +cp bin/leann-sync-brave.sh ~/bin/ +cp bin/leann-sync-mail.sh ~/bin/ +cp bin/leann-sync-imessage.sh ~/bin/ +cp bin/leann-sync-calendar.sh ~/bin/ + +# Çalıştırılabilir yapma +chmod +x ~/bin/leann-*.sh +``` + +### 3. PATH'e Ekleme + +```bash +# ~/.zshrc veya ~/.bashrc dosyanıza ekleyin +echo 'export PATH="$HOME/bin:$PATH"' >> ~/.zshrc +source ~/.zshrc +``` + +### 4. Ollama Kurulumu (Embedding için) + +```bash +# Ollama'yı başlatma +ollama serve & + +# Embedding modeli indirme +ollama pull nomic-embed-text +``` + +### 5. 
LEANN Kurulumu + +```bash +cd LEANN +uv sync --extra diskann +``` + +## Kullanım + +### Hızlı Başlangıç + +```bash +# Tüm indexleri güncelle +leann-sync-all.sh +``` + +### Bireysel Scriptler + +| Script | Açıklama | +|--------|-----------| +| `leann-sync-all.sh` | Tüm indexleri sırayla günceller | +| `leann-sync-dev.sh` | Geliştirme ortamı kodlarını indeksler | +| `leann-sync-personal.sh` | Kişisel belgeleri (Documents, Nextcloud) indeksler | +| `leann-sync-brave.sh` | Brave tarayıcı geçmişini indeksler | +| `leann-sync-mail.sh` | Apple Mail e-postalarını indeksler | +| `leann-sync-imessage.sh` | iMessage mesajlarını indeksler | +| `leann-sync-calendar.sh` | Apple Calendar etkinliklerini indeksler | + +### Örnek Kullanım Senaryoları + +#### Senaryo 1: Günlük Geliştirme İş Akışı + +```bash +# Her sabah geliştirme ortamınızı güncelleyin +leann-sync-dev.sh +``` + +Bu komut: +- ~/Development klasöründeki kodlarınızı tarar +- AST-aware chunking ile kod yapısını korur +- DiskANN backend ile indeks oluşturur + +#### Senaryo 2: Kişisel Doküman Arama + +```bash +# Kişisel belgelerinizi indeksleyin +leann-sync-personal.sh +``` + +Bu komut: +- ~/Documents, ~/Nextcloud, ~/Nextcloud2 klasörlerini tarar +- Tüm belgeleri (PDF, TXT, MD, Word, Excel, PowerPoint) indeksler + +#### Senaryo 3: Tarayıcı Geçmişi Arama + +```bash +# Brave tarayıcı geçmişinizi indeksleyin +leann-sync-brave.sh +``` + +#### Senaryo 4: E-posta Arama + +```bash +# Apple Mail e-postalarınızı indeksleyin +leann-sync-mail.sh +``` + +#### Senaryo 5: iMessage Arama + +```bash +# iMessage mesajlarınızı indeksleyin +leann-sync-imessage.sh +``` + +## Script Özelleştirme + +### Kendi Scriptinizi Oluşturma + +```bash +#!/bin/bash +# my-custom-index.sh + +export LEANN_HOME="$HOME/.leann" +export OLLAMA_HOST="http://localhost:11434" + +leann build my-custom-index \ + --docs ~/MyDocuments \ + --embedding-mode ollama \ + --embedding-model nomic-embed-text \ + --backend-name diskann \ + --force +``` + +### Parametre 
Değiştirme + +Scriptlerdeki parametreleri kendi ihtiyaçlarınıza göre değiştirebilirsiniz: + +```bash +# Daha büyük chunk boyutu +--doc-chunk-size 2048 + +# Embedding modeli değiştirme +--embedding-model BAAI/bge-base-en-v1.5 + +# HNSW backend kullanma +--backend-name hnsw +``` + +## Sıkça Sorulan Sorular + +### S: "leann: command not found" hatası alıyorum + +C: LEANN kurulumunun PATH'e eklendiğinden emin olun: +```bash +export PATH="/path/to/LEANN/packages/leann-core:$PATH" +``` + +### S: Ollama bağlantısı başarısız + +C: Ollama'nın çalıştığını kontrol edin: +```bash +ollama serve & +ollama list +``` + +### S: Index oluşturma çok yavaş + +C: Daha küçük bir dataset ile test edin: +```bash +--max-items 1000 +``` + +## İleri Düzey Kullanım + +### LEANN CLI Komutları + +```bash +# Index oluşturma +leann build my-index --docs ./documents + +# Arama +leann search my-index "arama sorgusu" + +# Soru sorma +leann ask my-index --interactive + +# Indexleri listeleme +leann list + +# Index kaldırma +leann remove my-index +``` + +### Embedding Modları + +| Mod | Açıklama | +|-----|-----------| +| `sentence-transformers` | HuggingFace modelleri | +| `openai` | OpenAI API | +| `mlx` | Apple Silicon MLX | +| `ollama` | Yerel Ollama | + +### Backend Seçimi + +| Backend | Kullanım | +|---------|----------| +| `hnsw` | Küçük-orta ölçekli (<10M vektör) | +| `diskann` | Büyük ölçekli, disk tabanlı | + +--- + +Bu dokümantasyon, LEANN'ın günlük kullanımını kolaylaştırmak için hazırlanmıştır. Herhangi bir sorunuz varsa, lütfen GitHub Issues üzerinden sorun. diff --git a/docs/user-scripts.md b/docs/user-scripts.md new file mode 100644 index 00000000..447ac94e --- /dev/null +++ b/docs/user-scripts.md @@ -0,0 +1,222 @@ +# User Scripts: Daily Life with LEANN + +This documentation describes the automation scripts prepared for using LEANN in daily life. + +## Installation + +### 1. Clone the Repository + +```bash +git clone https://github.com/yichuan-w/LEANN.git +cd LEANN +``` + +### 2. 
Copy Scripts to ~/bin Folder + +```bash +# Create ~/bin directory if it doesn't exist +mkdir -p ~/bin + +# Copy scripts +cp bin/leann-sync-all.sh ~/bin/ +cp bin/leann-sync-dev.sh ~/bin/ +cp bin/leann-sync-personal.sh ~/bin/ +cp bin/leann-sync-brave.sh ~/bin/ +cp bin/leann-sync-mail.sh ~/bin/ +cp bin/leann-sync-imessage.sh ~/bin/ +cp bin/leann-sync-calendar.sh ~/bin/ + +# Make executable +chmod +x ~/bin/leann-*.sh +``` + +### 3. Add to PATH + +```bash +# Add to ~/.zshrc or ~/.bashrc +echo 'export PATH="$HOME/bin:$PATH"' >> ~/.zshrc +source ~/.zshrc +``` + +### 4. Install Ollama (for Embedding) + +```bash +# Start Ollama +ollama serve & + +# Download embedding model +ollama pull nomic-embed-text +``` + +### 5. Install LEANN + +```bash +cd LEANN +uv sync --extra diskann +``` + +## Usage + +### Quick Start + +```bash +# Update all indexes +leann-sync-all.sh +``` + +### Individual Scripts + +| Script | Description | +|--------|-------------| +| `leann-sync-all.sh` | Updates all indexes sequentially | +| `leann-sync-dev.sh` | Indexes development environment code | +| `leann-sync-personal.sh` | Indexes personal documents (Documents, Nextcloud) | +| `leann-sync-brave.sh` | Indexes Brave browser history | +| `leann-sync-mail.sh` | Indexes Apple Mail emails | +| `leann-sync-imessage.sh` | Indexes iMessage messages | +| `leann-sync-calendar.sh` | Indexes Apple Calendar events | + +### Example Usage Scenarios + +#### Scenario 1: Daily Development Workflow + +```bash +# Update your development environment every morning +leann-sync-dev.sh +``` + +This command: +- Scans code in ~/Development folder +- Preserves code structure with AST-aware chunking +- Creates index with DiskANN backend + +#### Scenario 2: Personal Document Search + +```bash +# Index your personal documents +leann-sync-personal.sh +``` + +This command: +- Scans ~/Documents, ~/Nextcloud, ~/Nextcloud2 folders +- Indexes all documents (PDF, TXT, MD, Word, Excel, PowerPoint) + +#### Scenario 3: Browser History Search 
+ +```bash +# Index your Brave browser history +leann-sync-brave.sh +``` + +#### Scenario 4: Email Search + +```bash +# Index your Apple Mail emails +leann-sync-mail.sh +``` + +#### Scenario 5: iMessage Search + +```bash +# Index your iMessage messages +leann-sync-imessage.sh +``` + +## Script Customization + +### Create Your Own Script + +```bash +#!/bin/bash +# my-custom-index.sh + +export LEANN_HOME="$HOME/.leann" +export OLLAMA_HOST="http://localhost:11434" + +leann build my-custom-index \ + --docs ~/MyDocuments \ + --embedding-mode ollama \ + --embedding-model nomic-embed-text \ + --backend-name diskann \ + --force +``` + +### Modify Parameters + +You can modify parameters in scripts according to your needs: + +```bash +# Larger chunk size +--doc-chunk-size 2048 + +# Change embedding model +--embedding-model BAAI/bge-base-en-v1.5 + +# Use HNSW backend +--backend-name hnsw +``` + +## Frequently Asked Questions + +### Q: I get "leann: command not found" error + +Make sure LEANN installation is in your PATH: +```bash +export PATH="/path/to/LEANN/packages/leann-core:$PATH" +``` + +### Q: Ollama connection failed + +Make sure Ollama is running: +```bash +ollama serve & +ollama list +``` + +### Q: Index creation is too slow + +Try with a smaller dataset: +```bash +--max-items 1000 +``` + +## Advanced Usage + +### LEANN CLI Commands + +```bash +# Create index +leann build my-index --docs ./documents + +# Search +leann search my-index "search query" + +# Ask questions +leann ask my-index --interactive + +# List indexes +leann list + +# Remove index +leann remove my-index +``` + +### Embedding Modes + +| Mode | Description | +|------|-------------| +| `sentence-transformers` | HuggingFace models | +| `openai` | OpenAI API | +| `mlx` | Apple Silicon MLX | +| `ollama` | Local Ollama | + +### Backend Selection + +| Backend | Use Case | +|---------|----------| +| `hnsw` | Small-medium scale (<10M vectors) | +| `diskann` | Large scale, disk-based | + +--- + +This 
def add_embedding_args(target_parser: argparse.ArgumentParser) -> None:
    """Register the embedding flags shared by the ``index-*`` subcommands.

    Every flag is a plain string option; only ``--embedding-mode`` is
    restricted to a fixed set of choices.

    Args:
        target_parser: Parser (typically a subparser) that receives the flags.
    """
    # One (flag, keyword-arguments) pair per option, in registration order.
    shared_flags = (
        (
            "--embedding-model",
            {
                "default": "facebook/contriever",
                "help": "Embedding model (default: facebook/contriever)",
            },
        ),
        (
            "--embedding-mode",
            {
                "default": "sentence-transformers",
                "choices": ["sentence-transformers", "openai", "mlx", "ollama"],
                "help": "Embedding backend mode (default: sentence-transformers)",
            },
        ),
        (
            "--embedding-host",
            {
                "default": None,
                "help": "Override Ollama-compatible embedding host",
            },
        ),
        (
            "--embedding-api-base",
            {
                "default": None,
                "help": "Base URL for OpenAI-compatible embedding services",
            },
        ),
        (
            "--embedding-api-key",
            {
                "default": None,
                "help": "API key for embedding service (defaults to OPENAI_API_KEY)",
            },
        ),
    )

    for flag, settings in shared_flags:
        target_parser.add_argument(flag, type=str, **settings)
default="Default", help="Profile name (default: Default)" + ) + browser_parser.add_argument( + "--index-name", type=str, default=None, help="Custom index name" + ) + browser_parser.add_argument( + "--max-items", type=int, default=1000, help="Max history items to index" + ) + add_embedding_args(browser_parser) + + # Email indexing command + email_parser = subparsers.add_parser("index-email", help="Index Apple Mail emails") + email_parser.add_argument( + "index_name", nargs="?", default="apple-mail", help="Index name (default: apple-mail)" + ) + email_parser.add_argument( + "--max-items", type=int, default=2000, help="Max emails to index (default: 2000)" + ) + add_embedding_args(email_parser) + + # Calendar indexing command + calendar_parser = subparsers.add_parser( + "index-calendar", help="Index Apple Calendar events" + ) + calendar_parser.add_argument( + "index_name", + nargs="?", + default="apple-calendar", + help="Index name (default: apple-calendar)", + ) + calendar_parser.add_argument( + "--max-items", type=int, default=1000, help="Max events to index (default: 1000)" + ) + add_embedding_args(calendar_parser) + + # WeChat indexing command + wechat_parser = subparsers.add_parser("index-wechat", help="Index WeChat chat history") + wechat_parser.add_argument( + "index_name", nargs="?", default="wechat", help="Index name (default: wechat)" + ) + wechat_parser.add_argument( + "--export-dir", + type=str, + default="./wechat_export", + help="Directory containing exported WeChat data (default: ./wechat_export)", + ) + wechat_parser.add_argument( + "--max-items", type=int, default=1000, help="Max messages to index (default: 1000)" + ) + add_embedding_args(wechat_parser) + + # iMessage indexing command + imessage_parser = subparsers.add_parser("index-imessage", help="Index iMessage history") + imessage_parser.add_argument( + "index_name", nargs="?", default="imessage", help="Index name (default: imessage)" + ) + imessage_parser.add_argument( + "--db-path", + type=str, + 
default=None, + help="Path to chat.db (default: ~/Library/Messages/chat.db)", + ) + imessage_parser.add_argument( + "--max-items", type=int, default=1000, help="Max messages to index (default: 1000)" + ) + add_embedding_args(imessage_parser) + + # Slack indexing command + slack_parser = subparsers.add_parser("index-slack", help="Index Slack workspace via MCP") + slack_parser.add_argument( + "index_name", nargs="?", default="slack", help="Index name (default: slack)" + ) + slack_parser.add_argument( + "--mcp-server", + type=str, + required=True, + help="MCP server command (e.g., 'slack-mcp-server')", + ) + slack_parser.add_argument("--workspace-name", type=str, help="Slack workspace name") + slack_parser.add_argument( + "--channels", + type=str, + nargs="+", + default=[], + help="Specific channels to index (optional)", + ) + add_embedding_args(slack_parser) + + # ChatGPT indexing command + chatgpt_parser = subparsers.add_parser("index-chatgpt", help="Index ChatGPT export") + chatgpt_parser.add_argument( + "index_name", nargs="?", default="chatgpt", help="Index name (default: chatgpt)" + ) + chatgpt_parser.add_argument( + "--export-path", + type=str, + required=True, + help="Path to ChatGPT export file (.html/.zip) or directory", + ) + chatgpt_parser.add_argument( + "--max-items", type=int, default=1000, help="Max items to index (default: 1000)" + ) + add_embedding_args(chatgpt_parser) + + # Claude indexing command + claude_parser = subparsers.add_parser("index-claude", help="Index Claude export") + claude_parser.add_argument( + "index_name", nargs="?", default="claude", help="Index name (default: claude)" + ) + claude_parser.add_argument( + "--export-path", + type=str, + required=True, + help="Path to Claude export file (.json/.zip) or directory", + ) + claude_parser.add_argument( + "--max-items", type=int, default=1000, help="Max items to index (default: 1000)" + ) + add_embedding_args(claude_parser) + # Watch command watch_parser = subparsers.add_parser( "watch", 
def _build_index_from_documents(
    self,
    args,
    index_name,
    documents,
    *,
    label,
    unit,
    empty_message,
    max_items=None,
):
    """Embed already-loaded documents and write them to an HNSW index.

    This is the shared tail of every ``index-*`` subcommand: it reports an
    empty load, creates ``<indexes_dir>/<index_name>/``, configures a
    ``LeannBuilder`` and writes ``documents.leann`` there.

    Args:
        args: Parsed CLI namespace. ``embedding_model`` and
            ``embedding_mode`` are read directly; ``embedding_host`` is
            read if present.
        index_name: Directory name created under ``self.indexes_dir``.
        documents: Sized, sliceable sequence of objects exposing ``.text``
            and ``.metadata``.
        label: Source name used in the success message (e.g. ``"Email"``).
        unit: Plural noun used in the "Loaded N ..." progress message.
        empty_message: Text printed when ``documents`` is empty.
        max_items: When a positive integer, at most this many documents are
            indexed. ``None`` (the default) or a non-positive value means
            no cap is applied here.

    Returns:
        None. Nothing is built when ``documents`` is empty.
    """
    if not documents:
        print(empty_message)
        return

    if max_items is not None and max_items > 0:
        documents = documents[:max_items]

    print(f"📚 Loaded {len(documents)} {unit}. Building index...")

    index_dir = self.indexes_dir / index_name
    index_path = str(index_dir / "documents.leann")
    index_dir.mkdir(parents=True, exist_ok=True)

    embedding_options = {}
    if args.embedding_mode == "ollama":
        # Forward --embedding-host instead of always passing None, so the
        # flag registered by add_embedding_args actually takes effect.
        # getattr keeps namespaces built without that attribute working.
        embedding_options["host"] = resolve_ollama_host(
            getattr(args, "embedding_host", None)
        )
    # NOTE(review): --embedding-api-base / --embedding-api-key are parsed but
    # not forwarded; the option keys LeannBuilder expects for them are not
    # visible from this block - confirm and wire them up separately.

    builder = LeannBuilder(
        backend_name="hnsw",
        embedding_model=args.embedding_model,
        embedding_mode=args.embedding_mode,
        embedding_options=embedding_options or None,
        is_recompute=False,
        is_compact=False,
    )

    for doc in documents:
        builder.add_text(doc.text, metadata=doc.metadata)

    builder.build_index(index_path)
    print(f"✅ {label} index built at: {index_path}")
    print(f' Usage: leann search {index_name} "query"')


async def index_browser(self, args):
    """Build an index from browser history.

    Reads ``args.browser_type``, ``args.profile``, ``args.index_name`` and
    ``args.max_items``. The index name defaults to
    ``"<browser_type>-history"``. Returns without building when the browser
    directory cannot be located or no history entries are loaded.
    """
    from .readers import ChromeHistoryReader

    browser_type = args.browser_type
    profile = args.profile
    index_name = args.index_name or f"{browser_type}-history"

    print(f"🌐 Indexing {browser_type.capitalize()} history (profile: {profile})...")

    paths = ChromeHistoryReader.find_browser_paths()
    if browser_type not in paths:
        print(f"❌ Could not find {browser_type} profile directory automatically.")
        return

    profile_path = paths[browser_type] / profile

    reader = ChromeHistoryReader()
    documents = reader.load_data(
        chrome_profile_path=str(profile_path), max_count=args.max_items
    )

    self._build_index_from_documents(
        args,
        index_name,
        documents,
        label="Browser history",
        unit="entries",
        empty_message="❌ No history entries found to index.",
    )


async def index_email(self, args):
    """Build an index from Apple Mail emails.

    Reads ``args.index_name`` and ``args.max_items``.
    """
    from .readers import AppleMailReader

    print("📧 Indexing Apple Mail emails...")

    reader = AppleMailReader()
    documents = reader.load_data(max_count=args.max_items)

    self._build_index_from_documents(
        args,
        args.index_name,
        documents,
        label="Email",
        unit="emails",
        empty_message="❌ No emails found to index. Make sure Full Disk Access is granted.",
    )


async def index_calendar(self, args):
    """Build an index from Apple Calendar events.

    Reads ``args.index_name`` and ``args.max_items``.
    """
    from .readers import AppleCalendarReader

    print("📅 Indexing Apple Calendar events...")

    reader = AppleCalendarReader()
    documents = reader.load_data(max_count=args.max_items)

    self._build_index_from_documents(
        args,
        args.index_name,
        documents,
        label="Calendar",
        unit="events",
        empty_message=(
            "❌ No calendar events found to index. Make sure Full Disk Access is granted."
        ),
    )


async def index_wechat(self, args):
    """Build an index from WeChat chat history.

    Reads ``args.index_name``, ``args.max_items`` and, when present,
    ``args.export_dir`` (falling back to ``./wechat_export``).
    """
    from .readers import WeChatHistoryReader

    print("💬 Indexing WeChat chat history...")

    reader = WeChatHistoryReader()
    export_dir = getattr(args, "export_dir", "./wechat_export")
    documents = reader.load_data(wechat_export_dir=export_dir, max_count=args.max_items)

    self._build_index_from_documents(
        args,
        args.index_name,
        documents,
        label="WeChat",
        unit="messages",
        empty_message="❌ No WeChat data found. Make sure WeChat is exported first.",
    )


async def index_imessage(self, args):
    """Build an index from iMessage history.

    Reads ``args.index_name``, ``args.max_items`` and ``args.db_path``.
    When ``--db-path`` is given it is now honoured: the reader is pointed at
    the directory holding the database, because ``IMessageReader.load_data``
    takes a directory and appends ``chat.db`` itself.
    """
    from pathlib import Path

    from .readers import IMessageReader

    print("💬 Indexing iMessage history...")

    input_dir = None
    db_path = getattr(args, "db_path", None)
    if db_path:
        db_location = Path(db_path).expanduser()
        # Accept either the chat.db file itself or its containing directory.
        input_dir = str(db_location if db_location.is_dir() else db_location.parent)

    reader = IMessageReader()
    # NOTE(review): IMessageReader.load_data documents its extra keyword
    # arguments as unused, so max_count presumably has no effect here - confirm.
    documents = reader.load_data(input_dir=input_dir, max_count=args.max_items)

    self._build_index_from_documents(
        args,
        args.index_name,
        documents,
        label="iMessage",
        unit="messages",
        empty_message="❌ No iMessage data found. Make sure Full Disk Access is granted.",
    )


async def index_slack(self, args):
    """Build an index from a Slack workspace via MCP.

    Reads ``args.index_name``, ``args.mcp_server``, ``args.workspace_name``
    and ``args.channels``. The reader's ``load_data`` is awaited.
    """
    from .readers import SlackMCPReader

    print("📱 Indexing Slack workspace...")

    reader = SlackMCPReader(
        mcp_server_command=args.mcp_server,
        workspace_name=args.workspace_name,
    )
    documents = await reader.load_data(channels=args.channels)

    self._build_index_from_documents(
        args,
        args.index_name,
        documents,
        label="Slack",
        unit="messages",
        empty_message="❌ No Slack data found. Make sure MCP server is running.",
    )


async def index_chatgpt(self, args):
    """Build an index from a ChatGPT export.

    Reads ``args.index_name``, ``args.export_path`` and ``args.max_items``.
    ``--max-items`` was previously parsed but never used; the loaded
    documents are now capped to that many before indexing.
    """
    from .readers import ChatGPTReader

    print("🤖 Indexing ChatGPT export...")

    reader = ChatGPTReader()
    documents = reader.load_data(export_path=args.export_path)

    self._build_index_from_documents(
        args,
        args.index_name,
        documents,
        label="ChatGPT",
        unit="items",
        empty_message="❌ No ChatGPT data found. Make sure export file is valid.",
        max_items=getattr(args, "max_items", None),
    )


async def index_claude(self, args):
    """Build an index from a Claude export.

    Reads ``args.index_name``, ``args.export_path`` and ``args.max_items``.
    ``--max-items`` was previously parsed but never used; the loaded
    documents are now capped to that many before indexing.
    """
    from .readers import ClaudeReader

    print("🤖 Indexing Claude export...")

    reader = ClaudeReader()
    documents = reader.load_data(export_path=args.export_path)

    self._build_index_from_documents(
        args,
        args.index_name,
        documents,
        label="Claude",
        unit="items",
        empty_message="❌ No Claude data found. Make sure export file is valid.",
        max_items=getattr(args, "max_items", None),
    )
class ChromeHistoryReader(BaseReader):
    """
    Chrome/Brave browser history reader that extracts browsing data from SQLite database.
    Supports reading from a copy to avoid locking issues.
    """

    def __init__(self) -> None:
        pass

    def load_data(
        self, chrome_profile_path: str | None = None, max_count: int = 1000
    ) -> list[Document]:
        """Load the most recently visited URLs of one browser profile.

        Args:
            chrome_profile_path: Profile directory that contains the
                ``History`` SQLite file. ``None`` falls back to the default
                Chrome profile location on macOS.
            max_count: Maximum number of documents to return. A value of
                zero or less disables the limit.

        Returns:
            One ``Document`` per URL that has both a title and a URL, newest
            visit first. An empty list is returned when the database is
            missing; on a read error the documents collected so far are
            returned after the error is printed.
        """
        # Imported locally so the module's import block stays untouched.
        import tempfile

        docs: list[Document] = []

        if chrome_profile_path is None:
            # Default fallback for macOS
            chrome_profile_path = os.path.expanduser(
                "~/Library/Application Support/Google/Chrome/Default"
            )

        history_db_path = os.path.join(chrome_profile_path, "History")

        if not os.path.exists(history_db_path):
            print(f"⚠️ Browser history database not found at: {history_db_path}")
            return docs

        # last_visit_time is divided by 1e6 (microseconds -> seconds) and
        # shifted by 11644473600 s before being read as a Unix timestamp.
        query = """
        SELECT
            datetime(last_visit_time/1000000-11644473600,'unixepoch','localtime') as last_visit,
            url,
            title,
            visit_count,
            typed_count,
            hidden
        FROM urls
        ORDER BY last_visit_time DESC
        """

        try:
            # Work on a private copy to avoid "database is locked" while the
            # browser is running. A fresh temporary directory replaces the
            # fixed /tmp path, so concurrent runs cannot clobber each other
            # and the copy is removed on every exit path.
            with tempfile.TemporaryDirectory(prefix="leann_history_") as temp_dir:
                temp_db_path = os.path.join(temp_dir, "History")
                shutil.copy2(history_db_path, temp_db_path)

                conn = sqlite3.connect(temp_db_path)
                try:
                    # Iterate the cursor lazily instead of fetchall() so a
                    # small max_count does not load the whole history.
                    for row in conn.execute(query):
                        if 0 < max_count <= len(docs):
                            break

                        last_visit, url, title, visit_count, _typed_count, _hidden = row
                        if not title or not url:
                            continue

                        doc_content = f"""
[Title]: {title}
[URL]: {url}
[Last Visited]: {last_visit}
[Visits]: {visit_count}
"""
                        doc = Document(
                            text=doc_content, metadata={"title": title[0:150], "url": url}
                        )
                        docs.append(doc)
                finally:
                    # Close before the temporary directory is deleted, and
                    # also when the query itself fails.
                    conn.close()

        except Exception as e:
            # Deliberate best-effort boundary: report and return what we have.
            print(f"❌ Error reading browser history: {e}")

        return docs

    @staticmethod
    def find_browser_paths() -> dict[str, Path]:
        """Find common browser profile base paths.

        Returns:
            Mapping of ``"chrome"`` / ``"brave"`` to the first existing base
            directory for that browser. Browsers that are not found are
            omitted. Only POSIX systems are probed.
        """
        paths: dict[str, Path] = {}
        home = Path.home()

        if os.name == "posix":  # macOS/Linux
            # macOS location first (unchanged behaviour), then the Linux one.
            candidates = {
                "chrome": (
                    home / "Library/Application Support/Google/Chrome",
                    home / ".config/google-chrome",
                ),
                "brave": (
                    home / "Library/Application Support/BraveSoftware/Brave-Browser",
                    home / ".config/BraveSoftware/Brave-Browser",
                ),
            }
            for browser, roots in candidates.items():
                for root in roots:
                    if root.exists():
                        paths[browser] = root
                        break

        return paths
def _get_contact_name(self, handle_id: str) -> str:
    """
    Turn a raw handle into a display name.

    Args:
        handle_id: The handle ID (phone number or email)

    Returns:
        "Unknown" for an empty handle; the handle unchanged when it contains
        "@" or starts with "+"; a formatted number when it holds exactly 10
        digits, or 11 digits with a leading 1; otherwise the handle unchanged.
    """
    if not handle_id:
        return "Unknown"

    # Emails and numbers already written in international form pass through.
    if "@" in handle_id or handle_id.startswith("+"):
        return handle_id

    only_digits = "".join(ch for ch in handle_id if ch.isdigit())
    digit_count = len(only_digits)

    if digit_count == 10:
        area, prefix, line = only_digits[:3], only_digits[3:6], only_digits[6:]
        return f"({area}) {prefix}-{line}"

    if digit_count == 11 and only_digits[0] == "1":
        area, prefix, line = only_digits[1:4], only_digits[4:7], only_digits[7:]
        return f"+1 ({area}) {prefix}-{line}"

    return handle_id
+ + Args: + db_path: Path to the chat.db file + + Returns: + List of message dictionaries + """ + if not db_path.exists(): + print(f"iMessage database not found at: {db_path}") + return [] + + try: + # Connect to the database + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + + # Query to get messages with chat and handle information + query = """ + SELECT + m.ROWID as message_id, + m.text, + m.date, + m.is_from_me, + m.service, + c.chat_identifier, + c.display_name as chat_display_name, + h.id as handle_id, + c.ROWID as chat_id + FROM message m + LEFT JOIN chat_message_join cmj ON m.ROWID = cmj.message_id + LEFT JOIN chat c ON cmj.chat_id = c.ROWID + LEFT JOIN handle h ON m.handle_id = h.ROWID + WHERE m.text IS NOT NULL AND m.text != '' + ORDER BY c.ROWID, m.date + """ + + cursor.execute(query) + rows = cursor.fetchall() + + messages = [] + for row in rows: + ( + message_id, + text, + date, + is_from_me, + service, + chat_identifier, + chat_display_name, + handle_id, + chat_id, + ) = row + + message = { + "message_id": message_id, + "text": text, + "timestamp": self._convert_cocoa_timestamp(date), + "is_from_me": bool(is_from_me), + "service": service or "iMessage", + "chat_identifier": chat_identifier or "Unknown", + "chat_display_name": chat_display_name or "Unknown Chat", + "handle_id": handle_id or "Unknown", + "contact_name": self._get_contact_name(handle_id or ""), + "chat_id": chat_id, + } + messages.append(message) + + conn.close() + return messages + + except sqlite3.Error as e: + print(f"Error reading iMessage database: {e}") + return [] + except Exception as e: + print(f"Unexpected error reading iMessage database: {e}") + return [] + + def _group_messages_by_chat(self, messages: list[dict]) -> dict[int, list[dict]]: + """ + Group messages by chat ID. 
+ + Args: + messages: List of message dictionaries + + Returns: + Dictionary mapping chat_id to list of messages + """ + chats = {} + for message in messages: + chat_id = message["chat_id"] + if chat_id not in chats: + chats[chat_id] = [] + chats[chat_id].append(message) + + return chats + + def _create_concatenated_content(self, chat_id: int, messages: list[dict]) -> str: + """ + Create concatenated content from chat messages. + + Args: + chat_id: The chat ID + messages: List of messages in the chat + + Returns: + Concatenated text content + """ + if not messages: + return "" + + # Get chat info from first message + first_msg = messages[0] + chat_name = first_msg["chat_display_name"] + chat_identifier = first_msg["chat_identifier"] + + # Build message content + message_parts = [] + for message in messages: + timestamp = message["timestamp"] + is_from_me = message["is_from_me"] + text = message["text"] + contact_name = message["contact_name"] + + if is_from_me: + prefix = "[You]" + else: + prefix = f"[{contact_name}]" + + if timestamp != "Unknown": + prefix += f" ({timestamp})" + + message_parts.append(f"{prefix}: {text}") + + concatenated_text = "\n\n".join(message_parts) + + doc_content = f"""Chat: {chat_name} +Identifier: {chat_identifier} +Messages ({len(messages)} messages): + +{concatenated_text} +""" + return doc_content + + def _create_individual_content(self, message: dict) -> str: + """ + Create content for individual message. 
def load_data(self, input_dir: str | None = None, **load_kwargs: Any) -> list[Document]:
    """
    Load iMessage data and return as documents.

    Depending on ``self.concatenate_conversations`` this produces either one
    document per chat (all of its messages joined together) or one document
    per message.

    Args:
        input_dir: Optional path to directory containing chat.db file.
                  If not provided, uses default macOS location.
        **load_kwargs: Additional arguments (unused)

    Returns:
        List of Document objects containing iMessage data; empty when no
        messages could be read
    """
    # Determine database path
    if input_dir:
        db_path = Path(input_dir) / "chat.db"
    else:
        db_path = self._get_default_chat_db_path()

    # Read messages from database
    messages = self._read_messages_from_db(db_path)
    if not messages:
        return []

    docs = []

    if self.concatenate_conversations:
        # Group messages by chat and create concatenated documents
        chats = self._group_messages_by_chat(messages)

        for chat_id, chat_messages in chats.items():
            if not chat_messages:
                continue

            first_msg = chat_messages[0]
            last_msg = chat_messages[-1]

            # Sorted so the metadata is identical from run to run; bare set
            # iteration order for strings varies with hash randomization.
            participants = sorted(
                {msg["contact_name"] for msg in chat_messages if not msg["is_from_me"]}
            )

            metadata = {
                "source": "iMessage",
                "chat_id": chat_id,
                "chat_name": first_msg["chat_display_name"],
                "chat_identifier": first_msg["chat_identifier"],
                "message_count": len(chat_messages),
                "first_message_date": first_msg["timestamp"],
                "last_message_date": last_msg["timestamp"],
                "participants": participants,
            }

            content = self._create_concatenated_content(chat_id, chat_messages)
            docs.append(Document(text=content, metadata=metadata))

        return docs

    # Create individual documents for each message
    for message in messages:
        metadata = {
            "source": "iMessage",
            "message_id": message["message_id"],
            "chat_id": message["chat_id"],
            "chat_name": message["chat_display_name"],
            "chat_identifier": message["chat_identifier"],
            "timestamp": message["timestamp"],
            "is_from_me": message["is_from_me"],
            "contact_name": message["contact_name"],
            "service": message["service"],
        }

        content = self._create_individual_content(message)
        docs.append(Document(text=content, metadata=metadata))

    return docs
class AppleCalendarReader(BaseReader):
    """Reader for Apple Calendar events (macOS)."""

    def load_data(self, max_count: int = 1000) -> list[Document]:
        """
        Load Apple Calendar events as documents.

        The calendar cache database is copied to a private temporary file and
        queried from that copy; the copy is always removed before returning.

        Args:
            max_count: Maximum number of event rows to read, newest start
                date first. Rows without a summary are skipped, so fewer
                documents may come back.

        Returns:
            List of Document objects, one per event with a summary. Empty
            when the cache file is missing. Errors are printed rather than
            raised, and the documents built so far are returned.
        """
        # Function-scope imports so the block does not rely on file-level ones.
        import tempfile
        from contextlib import closing, suppress

        docs: list[Document] = []
        calendar_cache = Path.home() / "Library/Calendars/Calendar Cache"

        if not calendar_cache.exists():
            print("⚠️ Apple Calendar Cache not found.")
            return docs

        # 978307200 is the Unix time of 2001-01-01 00:00:00 UTC, so adding it
        # rebases the stored values onto the Unix epoch.
        # NOTE(review): presumes start_date/end_date are seconds since 2001 — confirm.
        query = """
            SELECT
                summary,
                description,
                location,
                datetime(start_date + 978307200, 'unixepoch', 'localtime') as start,
                datetime(end_date + 978307200, 'unixepoch', 'localtime') as end
            FROM CI_EVENT
            ORDER BY start_date DESC
            LIMIT ?
        """

        temp_db_path = None
        try:
            # A unique, owner-only temp file replaces the fixed /tmp name, which
            # collided between concurrent runs and was predictable to other users.
            fd, temp_db_path = tempfile.mkstemp(prefix="leann_calendar_index_copy_")
            os.close(fd)
            shutil.copy2(calendar_cache, temp_db_path)

            # closing() releases the connection even when the query fails.
            with closing(sqlite3.connect(temp_db_path)) as conn:
                rows = conn.execute(query, (max_count,)).fetchall()

            for summary, description, location, start, end in rows:
                if not summary:
                    continue

                content = f"Event: {summary}\nStart: {start}\nEnd: {end}\nLocation: {location or ''}\nDescription: {description or ''}"
                docs.append(Document(text=content, metadata={"event": summary, "start": start}))
        except Exception as e:
            print(f"❌ Error reading Apple Calendar: {e}")
        finally:
            # Best-effort cleanup on every path; a failed delete must not mask
            # the result or the error already reported above.
            if temp_db_path is not None:
                with suppress(OSError):
                    os.remove(temp_db_path)

        return docs
class SlackMCPReader:
    """Reader for Slack data via MCP servers."""

    def __init__(
        self,
        mcp_server_command: str,
        workspace_name: str | None = None,
        concatenate_conversations: bool = True,
    ):
        """
        Store the reader configuration.

        Args:
            mcp_server_command: Command associated with the MCP server
            workspace_name: Optional workspace name
            concatenate_conversations: Flag kept on the instance; this class
                does not read it yet
        """
        self.mcp_server_command = mcp_server_command
        self.workspace_name = workspace_name
        self.concatenate_conversations = concatenate_conversations

    async def load_data(self, channels: list[str] | None = None) -> list[Document]:
        """
        Return the loaded Slack documents.

        Args:
            channels: Optional list of channels (currently ignored)

        Returns:
            Always an empty list; no loading is implemented here
        """
        no_documents: list[Document] = []
        return no_documents
class ChatGPTReader(BaseReader):
    """Reader for ChatGPT export files (.html or .zip)."""

    def load_data(self, export_path: str) -> list[Document]:
        """
        Return the documents for a ChatGPT export.

        Args:
            export_path: Path to the export file (currently ignored)

        Returns:
            Always an empty list; no parsing is implemented here
        """
        no_documents: list[Document] = []
        return no_documents