From 75c998e6dc04e4e6f958cadc6f9e3b3f837ea4b3 Mon Sep 17 00:00:00 2001 From: Tolga Karatas Date: Sat, 21 Feb 2026 00:29:12 +0300 Subject: [PATCH 1/4] feat: restore index-* CLI commands with embedding args and user scripts documentation - Add index-browser (chrome/brave), index-email, index-calendar, index-wechat, index-imessage, index-slack, index-chatgpt, index-claude commands - Add add_embedding_args() helper function for all parsers - Restore readers.py with all data readers (ChromeHistoryReader, AppleMailReader, AppleCalendarReader, IMessageReader, WeChatReader, SlackReader, ChatGPTReader, ClaudeReader) - Add user-scripts.md documentation for daily automation workflows Fixes: Missing embedding arguments in index-* commands (PR #227 comment) Co-authored-by: tolgakaratas --- docs/user-scripts.md | 224 ++++++++ packages/leann-core/src/leann/cli.py | 521 +++++++++++++++++ packages/leann-core/src/leann/readers.py | 687 +++++++++++++++++++++++ 3 files changed, 1432 insertions(+) create mode 100644 docs/user-scripts.md create mode 100644 packages/leann-core/src/leann/readers.py diff --git a/docs/user-scripts.md b/docs/user-scripts.md new file mode 100644 index 00000000..070ae852 --- /dev/null +++ b/docs/user-scripts.md @@ -0,0 +1,224 @@ +# Kullanıcı Scriptleri: Günlük Yaşamda LEANN + +Bu dokümantasyon, LEANN'ı günlük yaşamda kullanmak için hazırlanmış otomasyon scriptlerini açıklar. + +## Kurulum + +### 1. Scriptleri İndirme + +Bu scriptleri kullanmak için önce LEANN repository'sini klonlayın: + +```bash +git clone https://github.com/yichuan-w/LEANN.git +cd LEANN +``` + +### 2. 
Scriptleri ~/bin Klasörüne Kopyalama + +```bash +# ~/bin klasörü oluşturma (yoksa) +mkdir -p ~/bin + +# Scriptleri kopyalama +cp bin/leann-sync-all.sh ~/bin/ +cp bin/leann-sync-dev.sh ~/bin/ +cp bin/leann-sync-personal.sh ~/bin/ +cp bin/leann-sync-brave.sh ~/bin/ +cp bin/leann-sync-mail.sh ~/bin/ +cp bin/leann-sync-imessage.sh ~/bin/ +cp bin/leann-sync-calendar.sh ~/bin/ + +# Çalıştırılabilir yapma +chmod +x ~/bin/leann-*.sh +``` + +### 3. PATH'e Ekleme + +```bash +# ~/.zshrc veya ~/.bashrc dosyanıza ekleyin +echo 'export PATH="$HOME/bin:$PATH"' >> ~/.zshrc +source ~/.zshrc +``` + +### 4. Ollama Kurulumu (Embedding için) + +```bash +# Ollama'yı başlatma +ollama serve & + +# Embedding modeli indirme +ollama pull nomic-embed-text +``` + +### 5. LEANN Kurulumu + +```bash +cd LEANN +uv sync --extra diskann +``` + +## Kullanım + +### Hızlı Başlangıç + +```bash +# Tüm indexleri güncelle +leann-sync-all.sh +``` + +### Bireysel Scriptler + +| Script | Açıklama | +|--------|-----------| +| `leann-sync-all.sh` | Tüm indexleri sırayla günceller | +| `leann-sync-dev.sh` | Geliştirme ortamı kodlarını indeksler | +| `leann-sync-personal.sh` | Kişisel belgeleri (Documents, Nextcloud) indeksler | +| `leann-sync-brave.sh` | Brave tarayıcı geçmişini indeksler | +| `leann-sync-mail.sh` | Apple Mail e-postalarını indeksler | +| `leann-sync-imessage.sh` | iMessage mesajlarını indeksler | +| `leann-sync-calendar.sh` | Apple Calendar etkinliklerini indeksler | + +### Örnek Kullanım Senaryoları + +#### Senaryo 1: Günlük Geliştirme İş Akışı + +```bash +# Her sabah geliştirme ortamınızı güncelleyin +leann-sync-dev.sh +``` + +Bu komut: +- ~/Development klasöründeki kodlarınızı tarar +- AST-aware chunking ile kod yapısını korur +- DiskANN backend ile indeks oluşturur + +#### Senaryo 2: Kişisel Doküman Arama + +```bash +# Kişisel belgelerinizi indeksleyin +leann-sync-personal.sh +``` + +Bu komut: +- ~/Documents, ~/Nextcloud, ~/Nextcloud2 klasörlerini tarar +- Tüm belgeleri (PDF, TXT, MD, 
Word, Excel, PowerPoint) indeksler + +#### Senaryo 3: Tarayıcı Geçmişi Arama + +```bash +# Brave tarayıcı geçmişinizi indeksleyin +leann-sync-brave.sh +``` + +#### Senaryo 4: E-posta Arama + +```bash +# Apple Mail e-postalarınızı indeksleyin +leann-sync-mail.sh +``` + +#### Senaryo 5: iMessage Arama + +```bash +# iMessage mesajlarınızı indeksleyin +leann-sync-imessage.sh +``` + +## Script Özelleştirme + +### Kendi Scriptinizi Oluşturma + +```bash +#!/bin/bash +# my-custom-index.sh + +export LEANN_HOME="$HOME/.leann" +export OLLAMA_HOST="http://localhost:11434" + +leann build my-custom-index \ + --docs ~/MyDocuments \ + --embedding-mode ollama \ + --embedding-model nomic-embed-text \ + --backend-name diskann \ + --force +``` + +### Parametre Değiştirme + +Scriptlerdeki parametreleri kendi ihtiyaçlarınıza göre değiştirebilirsiniz: + +```bash +# Daha büyük chunk boyutu +--doc-chunk-size 2048 + +# Embedding modeli değiştirme +--embedding-model BAAI/bge-base-en-v1.5 + +# HNSW backend kullanma +--backend-name hnsw +``` + +## Sıkça Sorulan Sorular + +### S: "leann: command not found" hatası alıyorum + +C: LEANN kurulumunun PATH'e eklendiğinden emin olun: +```bash +export PATH="/path/to/LEANN/packages/leann-core:$PATH" +``` + +### S: Ollama bağlantısı başarısız + +C: Ollama'nın çalıştığını kontrol edin: +```bash +ollama serve & +ollama list +``` + +### S: Index oluşturma çok yavaş + +C: Daha küçük bir dataset ile test edin: +```bash +--max-items 1000 +``` + +## İleri Düzey Kullanım + +### LEANN CLI Komutları + +```bash +# Index oluşturma +leann build my-index --docs ./documents + +# Arama +leann search my-index "arama sorgusu" + +# Soru sorma +leann ask my-index --interactive + +# Indexleri listeleme +leann list + +# Index kaldırma +leann remove my-index +``` + +### Embedding Modları + +| Mod | Açıklama | +|-----|-----------| +| `sentence-transformers` | HuggingFace modelleri | +| `openai` | OpenAI API | +| `mlx` | Apple Silicon MLX | +| `ollama` | Yerel Ollama | + +### 
Backend Seçimi + +| Backend | Kullanım | +|---------|----------| +| `hnsw` | Küçük-orta ölçekli (<10M vektör) | +| `diskann` | Büyük ölçekli, disk tabanlı | + +--- + +Bu dokümantasyon, LEANN'ın günlük kullanımını kolaylaştırmak için hazırlanmıştır. Herhangi bir sorunuz varsa, lütfen GitHub Issues üzerinden sorun. diff --git a/packages/leann-core/src/leann/cli.py b/packages/leann-core/src/leann/cli.py index d4d29071..ba40d759 100644 --- a/packages/leann-core/src/leann/cli.py +++ b/packages/leann-core/src/leann/cli.py @@ -179,6 +179,40 @@ def create_parser(self) -> argparse.ArgumentParser: subparsers = parser.add_subparsers(dest="command", help="Available commands") + def add_embedding_args(target_parser: argparse.ArgumentParser) -> None: + """Add common embedding arguments to a parser.""" + target_parser.add_argument( + "--embedding-model", + type=str, + default="facebook/contriever", + help="Embedding model (default: facebook/contriever)", + ) + target_parser.add_argument( + "--embedding-mode", + type=str, + default="sentence-transformers", + choices=["sentence-transformers", "openai", "mlx", "ollama"], + help="Embedding backend mode (default: sentence-transformers)", + ) + target_parser.add_argument( + "--embedding-host", + type=str, + default=None, + help="Override Ollama-compatible embedding host", + ) + target_parser.add_argument( + "--embedding-api-base", + type=str, + default=None, + help="Base URL for OpenAI-compatible embedding services", + ) + target_parser.add_argument( + "--embedding-api-key", + type=str, + default=None, + help="API key for embedding service (defaults to OPENAI_API_KEY)", + ) + # Build command build_parser = subparsers.add_parser("build", help="Build document index") build_parser.add_argument( @@ -322,6 +356,132 @@ def create_parser(self) -> argparse.ArgumentParser: help="Fall back to traditional chunking if AST chunking fails (default: True)", ) + # Browser Index Command + browser_parser = subparsers.add_parser("index-browser", 
help="Index browser history") + browser_parser.add_argument( + "browser_type", choices=["chrome", "brave"], help="Type of browser" + ) + browser_parser.add_argument( + "--profile", type=str, default="Default", help="Profile name (default: Default)" + ) + browser_parser.add_argument( + "--index-name", type=str, default=None, help="Custom index name" + ) + browser_parser.add_argument( + "--max-items", type=int, default=1000, help="Max history items to index" + ) + add_embedding_args(browser_parser) + + # Email indexing command + email_parser = subparsers.add_parser("index-email", help="Index Apple Mail emails") + email_parser.add_argument( + "index_name", nargs="?", default="apple-mail", help="Index name (default: apple-mail)" + ) + email_parser.add_argument( + "--max-items", type=int, default=2000, help="Max emails to index (default: 2000)" + ) + add_embedding_args(email_parser) + + # Calendar indexing command + calendar_parser = subparsers.add_parser( + "index-calendar", help="Index Apple Calendar events" + ) + calendar_parser.add_argument( + "index_name", + nargs="?", + default="apple-calendar", + help="Index name (default: apple-calendar)", + ) + calendar_parser.add_argument( + "--max-items", type=int, default=1000, help="Max events to index (default: 1000)" + ) + add_embedding_args(calendar_parser) + + # WeChat indexing command + wechat_parser = subparsers.add_parser("index-wechat", help="Index WeChat chat history") + wechat_parser.add_argument( + "index_name", nargs="?", default="wechat", help="Index name (default: wechat)" + ) + wechat_parser.add_argument( + "--export-dir", + type=str, + default="./wechat_export", + help="Directory containing exported WeChat data (default: ./wechat_export)", + ) + wechat_parser.add_argument( + "--max-items", type=int, default=1000, help="Max messages to index (default: 1000)" + ) + add_embedding_args(wechat_parser) + + # iMessage indexing command + imessage_parser = subparsers.add_parser("index-imessage", help="Index iMessage 
history") + imessage_parser.add_argument( + "index_name", nargs="?", default="imessage", help="Index name (default: imessage)" + ) + imessage_parser.add_argument( + "--db-path", + type=str, + default=None, + help="Path to chat.db (default: ~/Library/Messages/chat.db)", + ) + imessage_parser.add_argument( + "--max-items", type=int, default=1000, help="Max messages to index (default: 1000)" + ) + add_embedding_args(imessage_parser) + + # Slack indexing command + slack_parser = subparsers.add_parser("index-slack", help="Index Slack workspace via MCP") + slack_parser.add_argument( + "index_name", nargs="?", default="slack", help="Index name (default: slack)" + ) + slack_parser.add_argument( + "--mcp-server", + type=str, + required=True, + help="MCP server command (e.g., 'slack-mcp-server')", + ) + slack_parser.add_argument("--workspace-name", type=str, help="Slack workspace name") + slack_parser.add_argument( + "--channels", + type=str, + nargs="+", + default=[], + help="Specific channels to index (optional)", + ) + add_embedding_args(slack_parser) + + # ChatGPT indexing command + chatgpt_parser = subparsers.add_parser("index-chatgpt", help="Index ChatGPT export") + chatgpt_parser.add_argument( + "index_name", nargs="?", default="chatgpt", help="Index name (default: chatgpt)" + ) + chatgpt_parser.add_argument( + "--export-path", + type=str, + required=True, + help="Path to ChatGPT export file (.html/.zip) or directory", + ) + chatgpt_parser.add_argument( + "--max-items", type=int, default=1000, help="Max items to index (default: 1000)" + ) + add_embedding_args(chatgpt_parser) + + # Claude indexing command + claude_parser = subparsers.add_parser("index-claude", help="Index Claude export") + claude_parser.add_argument( + "index_name", nargs="?", default="claude", help="Index name (default: claude)" + ) + claude_parser.add_argument( + "--export-path", + type=str, + required=True, + help="Path to Claude export file (.json/.zip) or directory", + ) + 
claude_parser.add_argument( + "--max-items", type=int, default=1000, help="Max items to index (default: 1000)" + ) + add_embedding_args(claude_parser) + # Watch command watch_parser = subparsers.add_parser( "watch", @@ -1625,6 +1785,343 @@ def _load_chunk_ids_by_file(self, passages_file: Path) -> dict[str, list[str]]: chunk_ids_by_file.setdefault(file_path, []).append(str(chunk_id)) return chunk_ids_by_file + async def index_browser(self, args): + """Build an index from browser history.""" + from .readers import ChromeHistoryReader + + browser_type = args.browser_type + profile = args.profile + index_name = args.index_name or f"{browser_type}-history" + + index_dir = self.indexes_dir / index_name + index_path = str(index_dir / "documents.leann") + + print(f"🌐 Indexing {browser_type.capitalize()} history (profile: {profile})...") + + paths = ChromeHistoryReader.find_browser_paths() + if browser_type not in paths: + print(f"❌ Could not find {browser_type} profile directory automatically.") + return + + profile_path = paths[browser_type] / profile + + reader = ChromeHistoryReader() + documents = reader.load_data( + chrome_profile_path=str(profile_path), max_count=args.max_items + ) + + if not documents: + print("❌ No history entries found to index.") + return + + print(f"📚 Loaded {len(documents)} entries. 
Building index...") + + index_dir.mkdir(parents=True, exist_ok=True) + + embedding_options = {} + if args.embedding_mode == "ollama": + embedding_options["host"] = resolve_ollama_host(None) + + builder = LeannBuilder( + backend_name="hnsw", + embedding_model=args.embedding_model, + embedding_mode=args.embedding_mode, + embedding_options=embedding_options or None, + is_recompute=False, + is_compact=False, + ) + + for doc in documents: + builder.add_text(doc.text, metadata=doc.metadata) + + builder.build_index(index_path) + print(f"✅ Browser history index built at: {index_path}") + print(f' Usage: leann search {index_name} "query"') + + async def index_email(self, args): + """Build an index from Apple Mail emails.""" + from .readers import AppleMailReader + + index_name = args.index_name + index_dir = self.indexes_dir / index_name + index_path = str(index_dir / "documents.leann") + + print("📧 Indexing Apple Mail emails...") + + reader = AppleMailReader() + documents = reader.load_data(max_count=args.max_items) + + if not documents: + print("❌ No emails found to index. Make sure Full Disk Access is granted.") + return + + print(f"📚 Loaded {len(documents)} emails. 
Building index...") + index_dir.mkdir(parents=True, exist_ok=True) + + embedding_options = {} + if args.embedding_mode == "ollama": + embedding_options["host"] = resolve_ollama_host(None) + + builder = LeannBuilder( + backend_name="hnsw", + embedding_model=args.embedding_model, + embedding_mode=args.embedding_mode, + embedding_options=embedding_options or None, + is_recompute=False, + is_compact=False, + ) + + for doc in documents: + builder.add_text(doc.text, metadata=doc.metadata) + + builder.build_index(index_path) + print(f"✅ Email index built at: {index_path}") + print(f' Usage: leann search {index_name} "query"') + + async def index_calendar(self, args): + """Build an index from Apple Calendar events.""" + from .readers import AppleCalendarReader + + index_name = args.index_name + index_dir = self.indexes_dir / index_name + index_path = str(index_dir / "documents.leann") + + print("📅 Indexing Apple Calendar events...") + + reader = AppleCalendarReader() + documents = reader.load_data(max_count=args.max_items) + + if not documents: + print("❌ No calendar events found to index. Make sure Full Disk Access is granted.") + return + + print(f"📚 Loaded {len(documents)} events. 
Building index...") + index_dir.mkdir(parents=True, exist_ok=True) + + embedding_options = {} + if args.embedding_mode == "ollama": + embedding_options["host"] = resolve_ollama_host(None) + + builder = LeannBuilder( + backend_name="hnsw", + embedding_model=args.embedding_model, + embedding_mode=args.embedding_mode, + embedding_options=embedding_options or None, + is_recompute=False, + is_compact=False, + ) + + for doc in documents: + builder.add_text(doc.text, metadata=doc.metadata) + + builder.build_index(index_path) + print(f"✅ Calendar index built at: {index_path}") + print(f' Usage: leann search {index_name} "query"') + + async def index_wechat(self, args): + """Build an index from WeChat chat history.""" + from .readers import WeChatReader + + index_name = args.index_name + index_dir = self.indexes_dir / index_name + index_path = str(index_dir / "documents.leann") + + print("💬 Indexing WeChat chat history...") + + reader = WeChatReader(export_dir=args.export_dir) + documents = reader.load_data(max_count=args.max_items) + + if not documents: + print("❌ No WeChat data found. Make sure WeChat is exported first.") + return + + print(f"📚 Loaded {len(documents)} messages. 
Building index...") + index_dir.mkdir(parents=True, exist_ok=True) + + embedding_options = {} + if args.embedding_mode == "ollama": + embedding_options["host"] = resolve_ollama_host(None) + + builder = LeannBuilder( + backend_name="hnsw", + embedding_model=args.embedding_model, + embedding_mode=args.embedding_mode, + embedding_options=embedding_options or None, + is_recompute=False, + is_compact=False, + ) + + for doc in documents: + builder.add_text(doc.text, metadata=doc.metadata) + + builder.build_index(index_path) + print(f"✅ WeChat index built at: {index_path}") + print(f' Usage: leann search {index_name} "query"') + + async def index_imessage(self, args): + """Build an index from iMessage history.""" + from .readers import IMessageReader + + index_name = args.index_name + index_dir = self.indexes_dir / index_name + index_path = str(index_dir / "documents.leann") + + print("💬 Indexing iMessage history...") + + reader = IMessageReader(db_path=args.db_path) + documents = reader.load_data(max_count=args.max_items) + + if not documents: + print("❌ No iMessage data found. Make sure Full Disk Access is granted.") + return + + print(f"📚 Loaded {len(documents)} messages. 
Building index...") + index_dir.mkdir(parents=True, exist_ok=True) + + embedding_options = {} + if args.embedding_mode == "ollama": + embedding_options["host"] = resolve_ollama_host(None) + + builder = LeannBuilder( + backend_name="hnsw", + embedding_model=args.embedding_model, + embedding_mode=args.embedding_mode, + embedding_options=embedding_options or None, + is_recompute=False, + is_compact=False, + ) + + for doc in documents: + builder.add_text(doc.text, metadata=doc.metadata) + + builder.build_index(index_path) + print(f"✅ iMessage index built at: {index_path}") + print(f' Usage: leann search {index_name} "query"') + + async def index_slack(self, args): + """Build an index from Slack workspace via MCP.""" + from .readers import SlackReader + + index_name = args.index_name + index_dir = self.indexes_dir / index_name + index_path = str(index_dir / "documents.leann") + + print("📱 Indexing Slack workspace...") + + reader = SlackReader( + mcp_server=args.mcp_server, + workspace_name=args.workspace_name, + channels=args.channels, + ) + documents = reader.load_data() + + if not documents: + print("❌ No Slack data found. Make sure MCP server is running.") + return + + print(f"📚 Loaded {len(documents)} messages. 
Building index...") + index_dir.mkdir(parents=True, exist_ok=True) + + embedding_options = {} + if args.embedding_mode == "ollama": + embedding_options["host"] = resolve_ollama_host(None) + + builder = LeannBuilder( + backend_name="hnsw", + embedding_model=args.embedding_model, + embedding_mode=args.embedding_mode, + embedding_options=embedding_options or None, + is_recompute=False, + is_compact=False, + ) + + for doc in documents: + builder.add_text(doc.text, metadata=doc.metadata) + + builder.build_index(index_path) + print(f"✅ Slack index built at: {index_path}") + print(f' Usage: leann search {index_name} "query"') + + async def index_chatgpt(self, args): + """Build an index from ChatGPT export.""" + from .readers import ChatGPTReader + + index_name = args.index_name + index_dir = self.indexes_dir / index_name + index_path = str(index_dir / "documents.leann") + + print("🤖 Indexing ChatGPT export...") + + reader = ChatGPTReader(export_path=args.export_path) + documents = reader.load_data(max_count=args.max_items) + + if not documents: + print("❌ No ChatGPT data found. Make sure export file is valid.") + return + + print(f"📚 Loaded {len(documents)} items. 
Building index...") + index_dir.mkdir(parents=True, exist_ok=True) + + embedding_options = {} + if args.embedding_mode == "ollama": + embedding_options["host"] = resolve_ollama_host(None) + + builder = LeannBuilder( + backend_name="hnsw", + embedding_model=args.embedding_model, + embedding_mode=args.embedding_mode, + embedding_options=embedding_options or None, + is_recompute=False, + is_compact=False, + ) + + for doc in documents: + builder.add_text(doc.text, metadata=doc.metadata) + + builder.build_index(index_path) + print(f"✅ ChatGPT index built at: {index_path}") + print(f' Usage: leann search {index_name} "query"') + + async def index_claude(self, args): + """Build an index from Claude export.""" + from .readers import ClaudeReader + + index_name = args.index_name + index_dir = self.indexes_dir / index_name + index_path = str(index_dir / "documents.leann") + + print("🤖 Indexing Claude export...") + + reader = ClaudeReader(export_path=args.export_path) + documents = reader.load_data(max_count=args.max_items) + + if not documents: + print("❌ No Claude data found. Make sure export file is valid.") + return + + print(f"📚 Loaded {len(documents)} items. 
Building index...") + index_dir.mkdir(parents=True, exist_ok=True) + + embedding_options = {} + if args.embedding_mode == "ollama": + embedding_options["host"] = resolve_ollama_host(None) + + builder = LeannBuilder( + backend_name="hnsw", + embedding_model=args.embedding_model, + embedding_mode=args.embedding_mode, + embedding_options=embedding_options or None, + is_recompute=False, + is_compact=False, + ) + + for doc in documents: + builder.add_text(doc.text, metadata=doc.metadata) + + builder.build_index(index_path) + print(f"✅ Claude index built at: {index_path}") + print(f' Usage: leann search {index_name} "query"') + async def build_index(self, args): docs_paths = args.docs # Use current directory name if index_name not provided @@ -2156,6 +2653,30 @@ async def run(self, args=None): await self.react_agent(args) elif args.command == "serve": await self.serve_api(args) + elif args.command == "index-browser": + with suppress_cpp_output(suppress): + await self.index_browser(args) + elif args.command == "index-email": + with suppress_cpp_output(suppress): + await self.index_email(args) + elif args.command == "index-calendar": + with suppress_cpp_output(suppress): + await self.index_calendar(args) + elif args.command == "index-wechat": + with suppress_cpp_output(suppress): + await self.index_wechat(args) + elif args.command == "index-imessage": + with suppress_cpp_output(suppress): + await self.index_imessage(args) + elif args.command == "index-slack": + with suppress_cpp_output(suppress): + await self.index_slack(args) + elif args.command == "index-chatgpt": + with suppress_cpp_output(suppress): + await self.index_chatgpt(args) + elif args.command == "index-claude": + with suppress_cpp_output(suppress): + await self.index_claude(args) else: parser.print_help() diff --git a/packages/leann-core/src/leann/readers.py b/packages/leann-core/src/leann/readers.py new file mode 100644 index 00000000..bc6643b6 --- /dev/null +++ b/packages/leann-core/src/leann/readers.py @@ 
-0,0 +1,687 @@ +import json +import os +import re +import shutil +import sqlite3 +from datetime import datetime +from pathlib import Path +from typing import Any + +from llama_index.core import Document +from llama_index.core.readers.base import BaseReader + + +class ChromeHistoryReader(BaseReader): + """ + Chrome/Brave browser history reader that extracts browsing data from SQLite database. + Supports reading from a copy to avoid locking issues. + """ + + def __init__(self) -> None: + pass + + def load_data( + self, chrome_profile_path: str | None = None, max_count: int = 1000 + ) -> list[Document]: + docs: list[Document] = [] + + if chrome_profile_path is None: + # Default fallback for macOS + chrome_profile_path = os.path.expanduser( + "~/Library/Application Support/Google/Chrome/Default" + ) + + history_db_path = os.path.join(chrome_profile_path, "History") + temp_db_path = "/tmp/leann_history_index_copy" + + if not os.path.exists(history_db_path): + print(f"⚠️ Browser history database not found at: {history_db_path}") + return docs + + try: + # Create a temporary copy to avoid "database is locked" + shutil.copy2(history_db_path, temp_db_path) + + conn = sqlite3.connect(temp_db_path) + cursor = conn.cursor() + + query = """ + SELECT + datetime(last_visit_time/1000000-11644473600,'unixepoch','localtime') as last_visit, + url, + title, + visit_count, + typed_count, + hidden + FROM urls + ORDER BY last_visit_time DESC + """ + + cursor.execute(query) + rows = cursor.fetchall() + + for row in rows: + if 0 < max_count <= len(docs): + break + + last_visit, url, title, visit_count, typed_count, _hidden = row + if not title or not url: + continue + + doc_content = f""" +[Title]: {title} +[URL]: {url} +[Last Visited]: {last_visit} +[Visits]: {visit_count} +""" + doc = Document(text=doc_content, metadata={"title": title[0:150], "url": url}) + docs.append(doc) + + conn.close() + if os.path.exists(temp_db_path): + os.remove(temp_db_path) + + except Exception as e: + 
print(f"❌ Error reading browser history: {e}") + if os.path.exists(temp_db_path): + os.remove(temp_db_path) + return docs + + return docs + + @staticmethod + def find_browser_paths() -> dict[str, Path]: + """Find common browser profile base paths.""" + paths = {} + home = Path.home() + + if os.name == "posix": # macOS/Linux + # macOS paths + chrome = home / "Library/Application Support/Google/Chrome" + brave = home / "Library/Application Support/BraveSoftware/Brave-Browser" + if chrome.exists(): + paths["chrome"] = chrome + if brave.exists(): + paths["brave"] = brave + + return paths + + +class IMessageReader(BaseReader): + """ + iMessage data reader. + + Reads iMessage conversation data from the macOS Messages database (chat.db). + Processes conversations into structured documents with metadata. + """ + + def __init__(self, concatenate_conversations: bool = True) -> None: + """ + Initialize. + + Args: + concatenate_conversations: Whether to concatenate messages within conversations for better context + """ + self.concatenate_conversations = concatenate_conversations + + def _get_default_chat_db_path(self) -> Path: + """ + Get the default path to the iMessage chat database. + + Returns: + Path to the chat.db file + """ + home = Path.home() + return home / "Library" / "Messages" / "chat.db" + + def _convert_cocoa_timestamp(self, cocoa_timestamp: int) -> str: + """ + Convert Cocoa timestamp to readable format. 
+ + Args: + cocoa_timestamp: Timestamp in Cocoa format (nanoseconds since 2001-01-01) + + Returns: + Formatted timestamp string + """ + if cocoa_timestamp == 0: + return "Unknown" + + try: + # Cocoa timestamp is nanoseconds since 2001-01-01 00:00:00 UTC + # Convert to seconds and add to Unix epoch + cocoa_epoch = datetime(2001, 1, 1) + unix_timestamp = cocoa_timestamp / 1_000_000_000 # Convert nanoseconds to seconds + message_time = cocoa_epoch.timestamp() + unix_timestamp + return datetime.fromtimestamp(message_time).strftime("%Y-%m-%d %H:%M:%S") + except (ValueError, OSError): + return "Unknown" + + def _get_contact_name(self, handle_id: str) -> str: + """ + Get a readable contact name from handle ID. + + Args: + handle_id: The handle ID (phone number or email) + + Returns: + Formatted contact name + """ + if not handle_id: + return "Unknown" + + # Clean up phone numbers and emails for display + if "@" in handle_id: + return handle_id # Email address + elif handle_id.startswith("+"): + return handle_id # International phone number + else: + # Try to format as phone number + digits = "".join(filter(str.isdigit, handle_id)) + if len(digits) == 10: + return f"({digits[:3]}) {digits[3:6]}-{digits[6:]}" + elif len(digits) == 11 and digits[0] == "1": + return f"+1 ({digits[1:4]}) {digits[4:7]}-{digits[7:]}" + else: + return handle_id + + def _read_messages_from_db(self, db_path: Path) -> list[dict]: + """ + Read messages from the iMessage database. 
+ + Args: + db_path: Path to the chat.db file + + Returns: + List of message dictionaries + """ + if not db_path.exists(): + print(f"iMessage database not found at: {db_path}") + return [] + + try: + # Connect to the database + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + + # Query to get messages with chat and handle information + query = """ + SELECT + m.ROWID as message_id, + m.text, + m.date, + m.is_from_me, + m.service, + c.chat_identifier, + c.display_name as chat_display_name, + h.id as handle_id, + c.ROWID as chat_id + FROM message m + LEFT JOIN chat_message_join cmj ON m.ROWID = cmj.message_id + LEFT JOIN chat c ON cmj.chat_id = c.ROWID + LEFT JOIN handle h ON m.handle_id = h.ROWID + WHERE m.text IS NOT NULL AND m.text != '' + ORDER BY c.ROWID, m.date + """ + + cursor.execute(query) + rows = cursor.fetchall() + + messages = [] + for row in rows: + ( + message_id, + text, + date, + is_from_me, + service, + chat_identifier, + chat_display_name, + handle_id, + chat_id, + ) = row + + message = { + "message_id": message_id, + "text": text, + "timestamp": self._convert_cocoa_timestamp(date), + "is_from_me": bool(is_from_me), + "service": service or "iMessage", + "chat_identifier": chat_identifier or "Unknown", + "chat_display_name": chat_display_name or "Unknown Chat", + "handle_id": handle_id or "Unknown", + "contact_name": self._get_contact_name(handle_id or ""), + "chat_id": chat_id, + } + messages.append(message) + + conn.close() + return messages + + except sqlite3.Error as e: + print(f"Error reading iMessage database: {e}") + return [] + except Exception as e: + print(f"Unexpected error reading iMessage database: {e}") + return [] + + def _group_messages_by_chat(self, messages: list[dict]) -> dict[int, list[dict]]: + """ + Group messages by chat ID. 
+ + Args: + messages: List of message dictionaries + + Returns: + Dictionary mapping chat_id to list of messages + """ + chats = {} + for message in messages: + chat_id = message["chat_id"] + if chat_id not in chats: + chats[chat_id] = [] + chats[chat_id].append(message) + + return chats + + def _create_concatenated_content(self, chat_id: int, messages: list[dict]) -> str: + """ + Create concatenated content from chat messages. + + Args: + chat_id: The chat ID + messages: List of messages in the chat + + Returns: + Concatenated text content + """ + if not messages: + return "" + + # Get chat info from first message + first_msg = messages[0] + chat_name = first_msg["chat_display_name"] + chat_identifier = first_msg["chat_identifier"] + + # Build message content + message_parts = [] + for message in messages: + timestamp = message["timestamp"] + is_from_me = message["is_from_me"] + text = message["text"] + contact_name = message["contact_name"] + + if is_from_me: + prefix = "[You]" + else: + prefix = f"[{contact_name}]" + + if timestamp != "Unknown": + prefix += f" ({timestamp})" + + message_parts.append(f"{prefix}: {text}") + + concatenated_text = "\n\n".join(message_parts) + + doc_content = f"""Chat: {chat_name} +Identifier: {chat_identifier} +Messages ({len(messages)} messages): + +{concatenated_text} +""" + return doc_content + + def _create_individual_content(self, message: dict) -> str: + """ + Create content for individual message. 
+ + Args: + message: Message dictionary + + Returns: + Formatted message content + """ + timestamp = message["timestamp"] + is_from_me = message["is_from_me"] + text = message["text"] + contact_name = message["contact_name"] + chat_name = message["chat_display_name"] + + sender = "You" if is_from_me else contact_name + + return f"""Message from {sender} in chat "{chat_name}" +Time: {timestamp} +Content: {text} +""" + + def load_data(self, input_dir: str | None = None, **load_kwargs: Any) -> list[Document]: + """ + Load iMessage data and return as documents. + + Args: + input_dir: Optional path to directory containing chat.db file. + If not provided, uses default macOS location. + **load_kwargs: Additional arguments (unused) + + Returns: + List of Document objects containing iMessage data + """ + docs = [] + + # Determine database path + if input_dir: + db_path = Path(input_dir) / "chat.db" + else: + db_path = self._get_default_chat_db_path() + + # Read messages from database + messages = self._read_messages_from_db(db_path) + if not messages: + return docs + + if self.concatenate_conversations: + # Group messages by chat and create concatenated documents + chats = self._group_messages_by_chat(messages) + + for chat_id, chat_messages in chats.items(): + if not chat_messages: + continue + + content = self._create_concatenated_content(chat_id, chat_messages) + + # Create metadata + first_msg = chat_messages[0] + last_msg = chat_messages[-1] + + metadata = { + "source": "iMessage", + "chat_id": chat_id, + "chat_name": first_msg["chat_display_name"], + "chat_identifier": first_msg["chat_identifier"], + "message_count": len(chat_messages), + "first_message_date": first_msg["timestamp"], + "last_message_date": last_msg["timestamp"], + "participants": list( + {msg["contact_name"] for msg in chat_messages if not msg["is_from_me"]} + ), + } + + doc = Document(text=content, metadata=metadata) + docs.append(doc) + + else: + # Create individual documents for each message + for 
message in messages: + content = self._create_individual_content(message) + + metadata = { + "source": "iMessage", + "message_id": message["message_id"], + "chat_id": message["chat_id"], + "chat_name": message["chat_display_name"], + "chat_identifier": message["chat_identifier"], + "timestamp": message["timestamp"], + "is_from_me": message["is_from_me"], + "contact_name": message["contact_name"], + "service": message["service"], + } + + doc = Document(text=content, metadata=metadata) + docs.append(doc) + + return docs + + +class AppleMailReader(BaseReader): + """Reader for Apple Mail data (macOS).""" + + def load_data(self, max_count: int = 1000) -> list[Document]: + docs: list[Document] = [] + home = Path.home() + mail_data_path = home / "Library/Mail/V10/MailData" + envelope_index = mail_data_path / "Envelope Index" + temp_db_path = "/tmp/leann_mail_index_copy" + + if not envelope_index.exists(): + # Try V9 if V10 doesn't exist + mail_data_path = home / "Library/Mail/V9/MailData" + envelope_index = mail_data_path / "Envelope Index" + if not envelope_index.exists(): + print("⚠️ Apple Mail Envelope Index not found.") + return docs + + try: + shutil.copy2(envelope_index, temp_db_path) + conn = sqlite3.connect(temp_db_path) + cursor = conn.cursor() + + # Query to get message subjects, senders, and content previews + query = """ + SELECT + m.subject, + m.sender, + datetime(m.date_sent, 'unixepoch', '31 years') as date, + s.snippet + FROM messages m + LEFT JOIN message_snippets s ON m.ROWID = s.message_id + ORDER BY m.date_sent DESC + LIMIT ? 
+ """ + cursor.execute(query, (max_count,)) + rows = cursor.fetchall() + + for row in rows: + subject, sender, date, snippet = row + if not subject and not snippet: + continue + + content = f"Subject: {subject}\nFrom: {sender}\nDate: {date}\n\n{snippet or ''}" + docs.append( + Document( + text=content, metadata={"subject": subject or "", "sender": sender or ""} + ) + ) + + conn.close() + os.remove(temp_db_path) + except Exception as e: + print(f"❌ Error reading Apple Mail: {e}") + if os.path.exists(temp_db_path): + os.remove(temp_db_path) + + return docs + + +class AppleCalendarReader(BaseReader): + """Reader for Apple Calendar events (macOS).""" + + def load_data(self, max_count: int = 1000) -> list[Document]: + docs: list[Document] = [] + home = Path.home() + calendar_cache = home / "Library/Calendars/Calendar Cache" + temp_db_path = "/tmp/leann_calendar_index_copy" + + if not calendar_cache.exists(): + print("⚠️ Apple Calendar Cache not found.") + return docs + + try: + shutil.copy2(calendar_cache, temp_db_path) + conn = sqlite3.connect(temp_db_path) + cursor = conn.cursor() + + # Query for events + query = """ + SELECT + summary, + description, + location, + datetime(start_date + 978307200, 'unixepoch', 'localtime') as start, + datetime(end_date + 978307200, 'unixepoch', 'localtime') as end + FROM CI_EVENT + ORDER BY start_date DESC + LIMIT ? 
+ """ + cursor.execute(query, (max_count,)) + rows = cursor.fetchall() + + for row in rows: + summary, description, location, start, end = row + if not summary: + continue + + content = f"Event: {summary}\nStart: {start}\nEnd: {end}\nLocation: {location or ''}\nDescription: {description or ''}" + docs.append(Document(text=content, metadata={"event": summary, "start": start})) + + conn.close() + os.remove(temp_db_path) + except Exception as e: + print(f"❌ Error reading Apple Calendar: {e}") + if os.path.exists(temp_db_path): + os.remove(temp_db_path) + + return docs + + +class WeChatHistoryReader(BaseReader): + """ + WeChat chat history reader that extracts chat data from exported JSON files. + """ + + def __init__(self) -> None: + """Initialize.""" + pass + + def _extract_readable_text(self, content: Any) -> str: + if not content: + return "" + if isinstance(content, dict): + text_parts = [ + str(content.get(f, "")) + for f in ["title", "quoted", "content", "text"] + if content.get(f) + ] + return " | ".join(text_parts) if text_parts else "" + if not isinstance(content, str): + return "" + clean_content = re.sub(r"^wxid_[^:]+:\s*", "", content) + clean_content = re.sub(r"^[^:]+:\s*", "", clean_content) + if clean_content.strip().startswith("<") or "recalled a message" in clean_content: + return "" + return clean_content.strip() + + def _is_text_message(self, content: Any) -> bool: + if not content: + return False + if isinstance(content, dict): + return any(content.get(f) for f in ["title", "quoted", "content", "text"]) + if not isinstance(content, str): + return False + if any(tag in content for tag in [" 0 and not clean_content.strip().startswith("<") + + def load_data( + self, wechat_export_dir: str, max_count: int = 1000, concatenate_messages: bool = True + ) -> list[Document]: + docs: list[Document] = [] + if not os.path.exists(wechat_export_dir): + print(f"WeChat export directory not found at: {wechat_export_dir}") + return docs + + try: + json_files = 
list(Path(wechat_export_dir).glob("*.json")) + count = 0 + for json_file in json_files: + if 0 < max_count <= count: + break + try: + with open(json_file, encoding="utf-8") as f: + chat_data = json.load(f) + contact_name = json_file.stem + + messages_text = [] + for message in chat_data: + content = message.get("content", "") + if self._is_text_message(content): + readable_text = self._extract_readable_text(content) or message.get( + "message", "" + ) + if readable_text.strip(): + create_time = message.get("createTime", 0) + time_str = ( + datetime.fromtimestamp(create_time).strftime( + "%Y-%m-%d %H:%M:%S" + ) + if create_time + else "Unknown" + ) + sender = "[Me]" if message.get("isSentFromSelf") else "[Contact]" + messages_text.append(f"({time_str}) {sender}: {readable_text}") + + if messages_text: + if concatenate_messages: + full_text = f"Contact: {contact_name}\n\n" + "\n".join(messages_text) + docs.append( + Document(text=full_text, metadata={"contact_name": contact_name}) + ) + count += 1 + else: + for msg in messages_text: + if 0 < max_count <= count: + break + docs.append( + Document(text=msg, metadata={"contact_name": contact_name}) + ) + count += 1 + except Exception as e: + print(f"Error reading {json_file}: {e}") + except Exception as e: + print(f"Error reading WeChat history: {e}") + + return docs + + +class SlackMCPReader: + """Reader for Slack data via MCP servers.""" + + def __init__( + self, + mcp_server_command: str, + workspace_name: str | None = None, + concatenate_conversations: bool = True, + ): + self.mcp_server_command = mcp_server_command + self.workspace_name = workspace_name + self.concatenate_conversations = concatenate_conversations + + async def load_data(self, channels: list[str] | None = None) -> list[Document]: + return [] + + +class TwitterMCPReader: + """Reader for Twitter bookmarks via MCP servers.""" + + def __init__(self, mcp_server_command: str): + self.mcp_server_command = mcp_server_command + + async def load_data(self, 
max_bookmarks: int = 100) -> list[Document]: + return [] + + +class ChatGPTReader(BaseReader): + """Reader for ChatGPT export files (.html or .zip).""" + + def load_data(self, export_path: str) -> list[Document]: + return [] + + +class ClaudeReader(BaseReader): + """Reader for Claude export files (.json or .zip).""" + + def load_data(self, export_path: str) -> list[Document]: + return [] From d478f928b9ba969c8a2a46bd1e0983ada260e9f3 Mon Sep 17 00:00:00 2001 From: Tolga Karatas Date: Wed, 25 Feb 2026 00:48:54 +0300 Subject: [PATCH 2/4] fix: unused variable warning in readers.py --- packages/leann-core/src/leann/readers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/leann-core/src/leann/readers.py b/packages/leann-core/src/leann/readers.py index bc6643b6..c347a452 100644 --- a/packages/leann-core/src/leann/readers.py +++ b/packages/leann-core/src/leann/readers.py @@ -64,7 +64,7 @@ def load_data( if 0 < max_count <= len(docs): break - last_visit, url, title, visit_count, typed_count, _hidden = row + last_visit, url, title, visit_count, _typed_count, _hidden = row if not title or not url: continue From 375522d6ed3d1c91f519c681c7ec6112ab6a1bb2 Mon Sep 17 00:00:00 2001 From: Tolga Karatas Date: Wed, 25 Feb 2026 00:59:13 +0300 Subject: [PATCH 3/4] docs: add user-scripts documentation in English and Turkish - Add user-scripts.md (English version) for daily automation - Add user-scripts-tr.md (Turkish version) for Turkish users - Update ~/bin scripts with English comments and output --- docs/user-scripts-tr.md | 224 ++++++++++++++++++++++++++++++++++++++++ docs/user-scripts.md | 156 ++++++++++++++-------------- 2 files changed, 301 insertions(+), 79 deletions(-) create mode 100644 docs/user-scripts-tr.md diff --git a/docs/user-scripts-tr.md b/docs/user-scripts-tr.md new file mode 100644 index 00000000..070ae852 --- /dev/null +++ b/docs/user-scripts-tr.md @@ -0,0 +1,224 @@ +# Kullanıcı Scriptleri: Günlük Yaşamda LEANN + +Bu dokümantasyon, 
LEANN'ı günlük yaşamda kullanmak için hazırlanmış otomasyon scriptlerini açıklar. + +## Kurulum + +### 1. Scriptleri İndirme + +Bu scriptleri kullanmak için önce LEANN repository'sini klonlayın: + +```bash +git clone https://github.com/yichuan-w/LEANN.git +cd LEANN +``` + +### 2. Scriptleri ~/bin Klasörüne Kopyalama + +```bash +# ~/bin klasörü oluşturma (yoksa) +mkdir -p ~/bin + +# Scriptleri kopyalama +cp bin/leann-sync-all.sh ~/bin/ +cp bin/leann-sync-dev.sh ~/bin/ +cp bin/leann-sync-personal.sh ~/bin/ +cp bin/leann-sync-brave.sh ~/bin/ +cp bin/leann-sync-mail.sh ~/bin/ +cp bin/leann-sync-imessage.sh ~/bin/ +cp bin/leann-sync-calendar.sh ~/bin/ + +# Çalıştırılabilir yapma +chmod +x ~/bin/leann-*.sh +``` + +### 3. PATH'e Ekleme + +```bash +# ~/.zshrc veya ~/.bashrc dosyanıza ekleyin +echo 'export PATH="$HOME/bin:$PATH"' >> ~/.zshrc +source ~/.zshrc +``` + +### 4. Ollama Kurulumu (Embedding için) + +```bash +# Ollama'yı başlatma +ollama serve & + +# Embedding modeli indirme +ollama pull nomic-embed-text +``` + +### 5. 
LEANN Kurulumu + +```bash +cd LEANN +uv sync --extra diskann +``` + +## Kullanım + +### Hızlı Başlangıç + +```bash +# Tüm indexleri güncelle +leann-sync-all.sh +``` + +### Bireysel Scriptler + +| Script | Açıklama | +|--------|-----------| +| `leann-sync-all.sh` | Tüm indexleri sırayla günceller | +| `leann-sync-dev.sh` | Geliştirme ortamı kodlarını indeksler | +| `leann-sync-personal.sh` | Kişisel belgeleri (Documents, Nextcloud) indeksler | +| `leann-sync-brave.sh` | Brave tarayıcı geçmişini indeksler | +| `leann-sync-mail.sh` | Apple Mail e-postalarını indeksler | +| `leann-sync-imessage.sh` | iMessage mesajlarını indeksler | +| `leann-sync-calendar.sh` | Apple Calendar etkinliklerini indeksler | + +### Örnek Kullanım Senaryoları + +#### Senaryo 1: Günlük Geliştirme İş Akışı + +```bash +# Her sabah geliştirme ortamınızı güncelleyin +leann-sync-dev.sh +``` + +Bu komut: +- ~/Development klasöründeki kodlarınızı tarar +- AST-aware chunking ile kod yapısını korur +- DiskANN backend ile indeks oluşturur + +#### Senaryo 2: Kişisel Doküman Arama + +```bash +# Kişisel belgelerinizi indeksleyin +leann-sync-personal.sh +``` + +Bu komut: +- ~/Documents, ~/Nextcloud, ~/Nextcloud2 klasörlerini tarar +- Tüm belgeleri (PDF, TXT, MD, Word, Excel, PowerPoint) indeksler + +#### Senaryo 3: Tarayıcı Geçmişi Arama + +```bash +# Brave tarayıcı geçmişinizi indeksleyin +leann-sync-brave.sh +``` + +#### Senaryo 4: E-posta Arama + +```bash +# Apple Mail e-postalarınızı indeksleyin +leann-sync-mail.sh +``` + +#### Senaryo 5: iMessage Arama + +```bash +# iMessage mesajlarınızı indeksleyin +leann-sync-imessage.sh +``` + +## Script Özelleştirme + +### Kendi Scriptinizi Oluşturma + +```bash +#!/bin/bash +# my-custom-index.sh + +export LEANN_HOME="$HOME/.leann" +export OLLAMA_HOST="http://localhost:11434" + +leann build my-custom-index \ + --docs ~/MyDocuments \ + --embedding-mode ollama \ + --embedding-model nomic-embed-text \ + --backend-name diskann \ + --force +``` + +### Parametre 
Değiştirme + +Scriptlerdeki parametreleri kendi ihtiyaçlarınıza göre değiştirebilirsiniz: + +```bash +# Daha büyük chunk boyutu +--doc-chunk-size 2048 + +# Embedding modeli değiştirme +--embedding-model BAAI/bge-base-en-v1.5 + +# HNSW backend kullanma +--backend-name hnsw +``` + +## Sıkça Sorulan Sorular + +### S: "leann: command not found" hatası alıyorum + +C: LEANN kurulumunun PATH'e eklendiğinden emin olun: +```bash +export PATH="/path/to/LEANN/packages/leann-core:$PATH" +``` + +### S: Ollama bağlantısı başarısız + +C: Ollama'nın çalıştığını kontrol edin: +```bash +ollama serve & +ollama list +``` + +### S: Index oluşturma çok yavaş + +C: Daha küçük bir dataset ile test edin: +```bash +--max-items 1000 +``` + +## İleri Düzey Kullanım + +### LEANN CLI Komutları + +```bash +# Index oluşturma +leann build my-index --docs ./documents + +# Arama +leann search my-index "arama sorgusu" + +# Soru sorma +leann ask my-index --interactive + +# Indexleri listeleme +leann list + +# Index kaldırma +leann remove my-index +``` + +### Embedding Modları + +| Mod | Açıklama | +|-----|-----------| +| `sentence-transformers` | HuggingFace modelleri | +| `openai` | OpenAI API | +| `mlx` | Apple Silicon MLX | +| `ollama` | Yerel Ollama | + +### Backend Seçimi + +| Backend | Kullanım | +|---------|----------| +| `hnsw` | Küçük-orta ölçekli (<10M vektör) | +| `diskann` | Büyük ölçekli, disk tabanlı | + +--- + +Bu dokümantasyon, LEANN'ın günlük kullanımını kolaylaştırmak için hazırlanmıştır. Herhangi bir sorunuz varsa, lütfen GitHub Issues üzerinden sorun. diff --git a/docs/user-scripts.md b/docs/user-scripts.md index 070ae852..447ac94e 100644 --- a/docs/user-scripts.md +++ b/docs/user-scripts.md @@ -1,25 +1,23 @@ -# Kullanıcı Scriptleri: Günlük Yaşamda LEANN +# User Scripts: Daily Life with LEANN -Bu dokümantasyon, LEANN'ı günlük yaşamda kullanmak için hazırlanmış otomasyon scriptlerini açıklar. 
+This documentation describes the automation scripts prepared for using LEANN in daily life. -## Kurulum +## Installation -### 1. Scriptleri İndirme - -Bu scriptleri kullanmak için önce LEANN repository'sini klonlayın: +### 1. Clone the Repository ```bash git clone https://github.com/yichuan-w/LEANN.git cd LEANN ``` -### 2. Scriptleri ~/bin Klasörüne Kopyalama +### 2. Copy Scripts to ~/bin Folder ```bash -# ~/.bin klasörü oluşturma (yoksa) +# Create ~/bin directory if it doesn't exist mkdir -p ~/bin -# Scriptleri kopyalama +# Copy scripts cp bin/leann-sync-all.sh ~/bin/ cp bin/leann-sync-dev.sh ~/bin/ cp bin/leann-sync-personal.sh ~/bin/ @@ -28,105 +26,105 @@ cp bin/leann-sync-mail.sh ~/bin/ cp bin/leann-sync-imessage.sh ~/bin/ cp bin/leann-sync-calendar.sh ~/bin/ -# Çalıştırılabilir yapma +# Make executable chmod +x ~/bin/leann-*.sh ``` -### 3. PATH'e Ekleme +### 3. Add to PATH ```bash -# ~/.zshrc veya ~/.bashrc dosyanıza ekleyin +# Add to ~/.zshrc or ~/.bashrc echo 'export PATH="$HOME/bin:$PATH"' >> ~/.zshrc source ~/.zshrc ``` -### 4. Ollama Kurulumu (Embedding için) +### 4. Install Ollama (for Embedding) ```bash -# Ollama'yı başlatma +# Start Ollama ollama serve & -# Embedding modeli indirme +# Download embedding model ollama pull nomic-embed-text ``` -### 5. LEANN Kurulumu +### 5. 
Install LEANN ```bash cd LEANN uv sync --extra diskann ``` -## Kullanım +## Usage -### Hızlı Başlangıç +### Quick Start ```bash -# Tüm indexleri güncelle +# Update all indexes leann-sync-all.sh ``` -### Bireysel Scriptler +### Individual Scripts -| Script | Açıklama | -|--------|-----------| -| `leann-sync-all.sh` | Tüm indexleri sırayla günceller | -| `leann-sync-dev.sh` | Geliştirme ortamı kodlarını indeksler | -| `leann-sync-personal.sh` | Kişisel belgeleri (Documents, Nextcloud) indeksler | -| `leann-sync-brave.sh` | Brave tarayıcı geçmişini indeksler | -| `leann-sync-mail.sh` | Apple Mail e-postalarını indeksler | -| `leann-sync-imessage.sh` | iMessage mesajlarını indeksler | -| `leann-sync-calendar.sh` | Apple Calendar etkinliklerini indeksler | +| Script | Description | +|--------|-------------| +| `leann-sync-all.sh` | Updates all indexes sequentially | +| `leann-sync-dev.sh` | Indexes development environment code | +| `leann-sync-personal.sh` | Indexes personal documents (Documents, Nextcloud) | +| `leann-sync-brave.sh` | Indexes Brave browser history | +| `leann-sync-mail.sh` | Indexes Apple Mail emails | +| `leann-sync-imessage.sh` | Indexes iMessage messages | +| `leann-sync-calendar.sh` | Indexes Apple Calendar events | -### Örnek Kullanım Senaryoları +### Example Usage Scenarios -#### Senaryo 1: Günlük Geliştirme İş Akışı +#### Scenario 1: Daily Development Workflow ```bash -# Her sabah geliştirme ortamınızı güncelleyin +# Update your development environment every morning leann-sync-dev.sh ``` -Bu komut: -- ~/Development klasöründeki kodlarınızı tarar -- AST-aware chunking ile kod yapısını korur -- DiskANN backend ile indeks oluşturur +This command: +- Scans code in ~/Development folder +- Preserves code structure with AST-aware chunking +- Creates index with DiskANN backend -#### Senaryo 2: Kişisel Doküman Arama +#### Scenario 2: Personal Document Search ```bash -# Kişisel belgelerinizi indeksleyin +# Index your personal documents 
leann-sync-personal.sh ``` -Bu komut: -- ~/Documents, ~/Nextcloud, ~/Nextcloud2 klasörlerini tarar -- Tüm belgeleri (PDF, TXT, MD, Word, Excel, PowerPoint) indeksler +This command: +- Scans ~/Documents, ~/Nextcloud, ~/Nextcloud2 folders +- Indexes all documents (PDF, TXT, MD, Word, Excel, PowerPoint) -#### Senaryo 3: Tarayıcı Geçmişi Arama +#### Scenario 3: Browser History Search ```bash -# Brave tarayıcı geçmişinizi indeksleyin +# Index your Brave browser history leann-sync-brave.sh ``` -#### Senaryo 4: E-posta Arama +#### Scenario 4: Email Search ```bash -# Apple Mail e-postalarınızı indeksleyin +# Index your Apple Mail emails leann-sync-mail.sh ``` -#### Senaryo 5: iMessage Arama +#### Scenario 5: iMessage Search ```bash -# iMessage mesajlarınızı indeksleyin +# Index your iMessage messages leann-sync-imessage.sh ``` -## Script Özelleştirme +## Script Customization -### Kendi Scriptinizi Oluşturma +### Create Your Own Script ```bash #!/bin/bash @@ -143,82 +141,82 @@ leann build my-custom-index \ --force ``` -### Parametre Değiştirme +### Modify Parameters -Scriptlerdeki parametreleri kendi ihtiyaçlarınıza göre değiştirebilirsiniz: +You can modify parameters in scripts according to your needs: ```bash -# Daha büyük chunk boyutu +# Larger chunk size --doc-chunk-size 2048 -# Embedding modeli değiştirme +# Change embedding model --embedding-model BAAI/bge-base-en-v1.5 -# HNSW backend kullanma +# Use HNSW backend --backend-name hnsw ``` -## Sıkça Sorulan Sorular +## Frequently Asked Questions -### S: "leann: command not found" hatası alıyorum +### Q: I get "leann: command not found" error -C: LEANN kurulumunun PATH'e eklendiğinden emin olun: +Make sure LEANN installation is in your PATH: ```bash export PATH="/path/to/LEANN/packages/leann-core:$PATH" ``` -### S: Ollama bağlantısı başarısız +### Q: Ollama connection failed -C: Ollama'nın çalıştığını kontrol edin: +Make sure Ollama is running: ```bash ollama serve & ollama list ``` -### S: Index oluşturma çok yavaş +### 
Q: Index creation is too slow -C: Daha küçük bir dataset ile test edin: +Try with a smaller dataset: ```bash --max-items 1000 ``` -## İleri Düzey Kullanım +## Advanced Usage -### LEANN CLI Komutları +### LEANN CLI Commands ```bash -# Index oluşturma +# Create index leann build my-index --docs ./documents -# Arama -leann search my-index "arama sorgusu" +# Search +leann search my-index "search query" -# Soru sorma +# Ask questions leann ask my-index --interactive -# Indexleri listeleme +# List indexes leann list -# Index kaldırma +# Remove index leann remove my-index ``` -### Embedding Modları +### Embedding Modes -| Mod | Açıklama | -|-----|-----------| -| `sentence-transformers` | HuggingFace modelleri | +| Mode | Description | +|------|-------------| +| `sentence-transformers` | HuggingFace models | | `openai` | OpenAI API | | `mlx` | Apple Silicon MLX | -| `ollama` | Yerel Ollama | +| `ollama` | Local Ollama | -### Backend Seçimi +### Backend Selection -| Backend | Kullanım | +| Backend | Use Case | |---------|----------| -| `hnsw` | Küçük-orta ölçekli (<10M vektör) | -| `diskann` | Büyük ölçekli, disk tabanlı | +| `hnsw` | Small-medium scale (<10M vectors) | +| `diskann` | Large scale, disk-based | --- -Bu dokümantasyon, LEANN'ın günlük kullanımını kolaylaştırmak için hazırlanmıştır. Herhangi bir sorunuz varsa, lütfen GitHub Issues üzerinden sorun. +This documentation is prepared to make LEANN easy to use in daily life. If you have any questions, please ask on GitHub Issues. 
From d0717559975ea5466137acc487de578c2906fc75 Mon Sep 17 00:00:00 2001 From: Tolga Karatas Date: Wed, 25 Feb 2026 04:24:39 +0300 Subject: [PATCH 4/4] fix: correct reader class initialization in CLI index commands - Fix IMessageReader initialization (remove db_path argument) - Fix WeChatHistoryReader initialization and load_data parameters - Fix SlackMCPReader initialization and load_data parameters - Fix ChatGPTReader initialization and load_data parameters - Fix ClaudeReader initialization and load_data parameters These fixes ensure all index-* commands work correctly with the reader classes. --- packages/leann-core/src/leann/cli.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/packages/leann-core/src/leann/cli.py b/packages/leann-core/src/leann/cli.py index ba40d759..869edf69 100644 --- a/packages/leann-core/src/leann/cli.py +++ b/packages/leann-core/src/leann/cli.py @@ -1920,7 +1920,7 @@ async def index_calendar(self, args): async def index_wechat(self, args): """Build an index from WeChat chat history.""" - from .readers import WeChatReader + from .readers import WeChatHistoryReader index_name = args.index_name index_dir = self.indexes_dir / index_name @@ -1928,8 +1928,9 @@ async def index_wechat(self, args): print("💬 Indexing WeChat chat history...") - reader = WeChatReader(export_dir=args.export_dir) - documents = reader.load_data(max_count=args.max_items) + reader = WeChatHistoryReader() + export_dir = getattr(args, "export_dir", "./wechat_export") + documents = reader.load_data(wechat_export_dir=export_dir, max_count=args.max_items) if not documents: print("❌ No WeChat data found. 
Make sure WeChat is exported first.") @@ -1968,7 +1969,7 @@ async def index_imessage(self, args): print("💬 Indexing iMessage history...") - reader = IMessageReader(db_path=args.db_path) + reader = IMessageReader() documents = reader.load_data(max_count=args.max_items) if not documents: @@ -2000,7 +2001,7 @@ async def index_imessage(self, args): async def index_slack(self, args): """Build an index from Slack workspace via MCP.""" - from .readers import SlackReader + from .readers import SlackMCPReader index_name = args.index_name index_dir = self.indexes_dir / index_name @@ -2008,12 +2009,11 @@ async def index_slack(self, args): print("📱 Indexing Slack workspace...") - reader = SlackReader( - mcp_server=args.mcp_server, + reader = SlackMCPReader( + mcp_server_command=args.mcp_server, workspace_name=args.workspace_name, - channels=args.channels, ) - documents = reader.load_data() + documents = await reader.load_data(channels=args.channels) if not documents: print("❌ No Slack data found. Make sure MCP server is running.") @@ -2052,8 +2052,8 @@ async def index_chatgpt(self, args): print("🤖 Indexing ChatGPT export...") - reader = ChatGPTReader(export_path=args.export_path) - documents = reader.load_data(max_count=args.max_items) + reader = ChatGPTReader() + documents = reader.load_data(export_path=args.export_path) if not documents: print("❌ No ChatGPT data found. Make sure export file is valid.") @@ -2092,8 +2092,8 @@ async def index_claude(self, args): print("🤖 Indexing Claude export...") - reader = ClaudeReader(export_path=args.export_path) - documents = reader.load_data(max_count=args.max_items) + reader = ClaudeReader() + documents = reader.load_data(export_path=args.export_path) if not documents: print("❌ No Claude data found. Make sure export file is valid.")