versila22 · zhaog100 · Apr 2, 2026
diff --git a/README.md b/README.md
@@ -18,7 +18,7 @@ Fini les appels paniqués à 22h pour "la télé qui s'allume plus". **Hotline D
 | Domaine | Implémentation |
 |---------|---------------|
 | **LLM / Multimodal** | Gemini 2.5 Flash — texte, image, audio natif |
-| **RAG familial** | Embedding Google + cosine search sur knowledge base `.md` |
+| **RAG familial** | Embedding Google + cosine search sur knowledge base `.md` + `.pdf` |
 | **Bot Telegram** | Handlers async, gestion de session, file d'attente photo→question |
 | **Sécurité IA** | Filtre PII (IBAN, CB, mots de passe) sur texte et images |
 | **Escalade intelligente** | Détection de niveau de complexité → transfert à l'humain |
@@ -113,8 +113,25 @@ Envoyez `/start` à [@userinfobot](https://t.me/userinfobot) sur Telegram — il
 ...
 ```
 
+**Nouveau !** Ajoutez des fichiers PDF (modes d'emploi, manuels) :
+
+```bash
+# Ajouter des manuels PDF
+cp ~/Downloads/manuel_tv.pdf knowledge/
+cp ~/Downloads/guide_freebox.pdf knowledge/
+
+# Redémarrer le bot pour recharger la base
+docker-compose restart
+```
+
 Le bot recharge automatiquement la base au démarrage. Voir [`knowledge/README.md`](knowledge/README.md) pour le guide complet.
 
+**Support PDF** :
+- ✅ Extraction automatique du texte
+- ✅ Découpage par page (chaque page non vide devient un chunk)
+- ✅ Compatible avec les PDF contenant du texte (les PDF scannés en image nécessitent un OCR préalable)
+- ⚠️ Nécessite `PyPDF2>=3.0.0` (inclus dans requirements.txt)
+
 ---
 
 ## Fonctionnalités

diff --git a/bot/pdf_parser.py b/bot/pdf_parser.py
@@ -0,0 +1,106 @@
+"""PDF Parser for Knowledge Base
+
+Simple PDF text extraction using PyPDF2.
+Extracts text from PDF files for RAG indexing.
+"""
+
+import logging
+from pathlib import Path
+from typing import List, Dict
+try:
+    from PyPDF2 import PdfReader
+    PDF_SUPPORT = True
+except ImportError:
+    PDF_SUPPORT = False
+    logging.warning("PyPDF2 not installed. PDF support disabled.")
+
+logger = logging.getLogger(__name__)
+
+
+def extract_text_from_pdf(pdf_path: Path) -> List[Dict]:
+    """Extract the text of a PDF file, one chunk per non-empty page.
+
+    Best-effort: any failure is logged and the chunks collected so far
+    (possibly none) are returned; a page whose extraction fails is skipped.
+
+    Args:
+        pdf_path: Path to the PDF file.
+
+    Returns:
+        List of {"text": ..., "source": "<file name> (page N)"} dicts.
+    """
+    if not PDF_SUPPORT:
+        logger.error("PyPDF2 not available - cannot parse PDFs")
+        return []
+
+    chunks = []
+    try:
+        reader = PdfReader(str(pdf_path))
+        logger.info("📖 Parsing %s: %d pages", pdf_path.name, len(reader.pages))
+        for page_num, page in enumerate(reader.pages, 1):
+            try:
+                text = (page.extract_text() or "").strip()
+            except Exception as e:  # one broken page must not drop the rest
+                logger.warning("⚠️ Skipping %s page %d: %s", pdf_path.name, page_num, e)
+                continue
+            if text:
+                chunks.append({"text": text, "source": f"{pdf_path.name} (page {page_num})"})
+        logger.info("✅ Extracted %d chunks from %s", len(chunks), pdf_path.name)
+    except Exception as e:  # unreadable/encrypted file: log and return what we have
+        logger.error("❌ Failed to parse %s: %s", pdf_path.name, e)
+
+    return chunks
+
+
+def parse_pdf_files(knowledge_dir: Path) -> List[Dict]:
+    """Parse every ``*.pdf`` file directly inside the knowledge directory.
+
+    Files are processed in sorted name order so the chunk order is stable.
+
+    Args:
+        knowledge_dir: Path to the knowledge directory (not searched recursively).
+
+    Returns:
+        Page chunks from all PDFs, in file order; [] without PyPDF2 or PDFs.
+    """
+    if not PDF_SUPPORT:
+        return []
+
+    # glob() order is filesystem-dependent; sort for reproducible indexing.
+    pdf_files = sorted(knowledge_dir.glob("*.pdf"))
+    if not pdf_files:
+        logger.debug("No PDF files found in knowledge directory")
+        return []
+
+    logger.info("📚 Found %d PDF file(s)", len(pdf_files))
+
+    all_chunks = []
+    for pdf_path in pdf_files:
+        all_chunks.extend(extract_text_from_pdf(pdf_path))
+    return all_chunks
+
+
+if __name__ == "__main__":
+    # Manual smoke test: parse ./knowledge and preview the first three chunks.
+    import sys
+
+    print("🧪 Testing PDF parser...")
+
+    # Bail out with exit status 1 when a precondition is not met.
+    knowledge_path = Path("knowledge")
+    if not PDF_SUPPORT:
+        print("❌ PyPDF2 not installed. Install with: pip install PyPDF2")
+        sys.exit(1)
+    if not knowledge_path.exists():
+        print("❌ knowledge/ directory not found")
+        sys.exit(1)
+
+    extracted = parse_pdf_files(knowledge_path)
+    if not extracted:
+        print("⚠️ No PDF files found or extraction failed")
+    else:
+        print(f"\n✅ Successfully extracted {len(extracted)} chunks:")
+        for rank, item in enumerate(extracted[:3], 1):
+            preview = item["text"][:100]
+            print(f"\n{rank}. {item['source']}")
+            print(f"   {preview}...")
diff --git a/bot/rag.py b/bot/rag.py
@@ -1,19 +1,26 @@
 """RAG familial — Knowledge Base avec embeddings Google text-embedding-004.
 
-Charge les fichiers Markdown du dossier knowledge/, les découpe par sections (##),
+Charge les fichiers Markdown (découpés par sections ##) et PDF (découpés par page) du dossier knowledge/,
 calcule les embeddings en mémoire, et expose une fonction search() par cosine similarity.
 """
 
 import logging
 import re
 from pathlib import Path
-from typing import Optional
+from typing import Optional, List, Dict
 
 import numpy as np
 from google import genai
 
 from .config import EMBEDDING_MODEL, GEMINI_API_KEY, KNOWLEDGE_DIR
 
+# Import PDF parser if available
+try:
+    from .pdf_parser import parse_pdf_files
+    PDF_SUPPORT = True
+except ImportError:
+    PDF_SUPPORT = False
+
 logger = logging.getLogger(__name__)
 
 
@@ -26,7 +33,7 @@ def _cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
     return float(np.dot(a, b) / (norm_a * norm_b))
 
 
-def _split_markdown_by_headers(content: str, source: str) -> list[dict]:
+def _split_markdown_by_headers(content: str, source: str = "unknown") -> list[dict]:
     """Découpe un fichier Markdown en chunks par headers de niveau ## (H2).
 
     Chaque chunk est un dict {"text": str, "source": str}.
@@ -65,13 +72,17 @@ def __init__(
     # ── Chargement ────────────────────────────────────────────────────────────
 
     def load(self) -> None:
-        """Charge tous les fichiers .md du dossier knowledge/ et calcule les embeddings."""
+        """Charge tous les fichiers .md et .pdf du dossier knowledge/ et calcule les embeddings."""
         md_files = sorted(self._knowledge_dir.glob("*.md"))
-        if not md_files:
-            logger.warning("No .md files found in %s", self._knowledge_dir)
+        pdf_files = sorted(self._knowledge_dir.glob("*.pdf")) if PDF_SUPPORT else []
+
+        if not md_files and not pdf_files:
+            logger.warning("No .md or .pdf files found in %s", self._knowledge_dir)
             return
 
         all_chunks: list[dict] = []
+
+        # Load Markdown files
         for md_path in md_files:
             try:
                 content = md_path.read_text(encoding="utf-8")
@@ -80,6 +91,13 @@ def load(self) -> None:
                 logger.info("Loaded %d chunks from %s", len(chunks), md_path.name)
             except Exception as exc:
                 logger.error("Failed to read %s: %s", md_path, exc)
+
+        # Load PDF files
+        if PDF_SUPPORT and pdf_files:
+            logger.info("📄 Loading %d PDF file(s)...", len(pdf_files))
+            pdf_chunks = parse_pdf_files(self._knowledge_dir)
+            all_chunks.extend(pdf_chunks)
+            logger.info("Loaded %d chunks from PDFs", len(pdf_chunks))
 
         if not all_chunks:
             logger.warning("No chunks to embed.")
@@ -91,7 +109,8 @@ def load(self) -> None:
         self._chunks = all_chunks
         self._embeddings = embeddings
         self._loaded = True
-        logger.info("RAG loaded: %d chunks, %d embeddings", len(self._chunks), len(self._embeddings))
+        logger.info("RAG loaded: %d chunks (%d from Markdown, %d from PDFs), %d embeddings", 
+                    len(self._chunks), len(md_files), len(pdf_files), len(self._embeddings))
 
     def _embed_batch(self, texts: list[str]) -> list[np.ndarray]:
         """Embed une liste de textes via text-embedding-004."""

diff --git a/knowledge/README.md b/knowledge/README.md
@@ -1,11 +1,23 @@
 # Knowledge Base — Comment personnaliser ?
 
 Ce dossier contient la base de connaissances chargée par le bot au démarrage.
-Tous les fichiers `.md` sont automatiquement indexés.
+**Tous les fichiers `.md` et `.pdf` sont automatiquement indexés.**
+
+## 🆕 Nouveau : Support PDF !
+
+Vous pouvez maintenant ajouter des fichiers PDF (modes d'emploi, manuels techniques) :
+
+```bash
+# Exemple : ajouter des manuels
+cp ~/Downloads/manuel_tv_samsung.pdf knowledge/
+cp ~/Downloads/guide_freebox.pdf knowledge/
+```
+
+Le bot extraira automatiquement le texte de chaque page PDF et l'indexera dans la base RAG.
 
 ## Structure recommandée
 
-Chaque fichier doit être structuré avec des titres `##` (H2) — ils définissent les chunks de recherche.
+Chaque fichier Markdown doit être structuré avec des titres `##` (H2) — ils définissent les chunks de recherche.
 
 ```markdown
 # Titre du document
@@ -17,9 +29,11 @@ Contenu de la section...
 Contenu de la section...
 ```
 
+**Pour les PDF** : Pas besoin de formatage spécial. Le bot extrait le texte page par page.
+
 ## Comment modifier
 
-1. Éditer `famille_jacq.md` (ou créer un nouveau fichier `.md`)
+1. Éditer `famille_jacq.md` (ou créer un nouveau fichier `.md`, ou ajouter un fichier `.pdf`)
 2. Remplacer les `[À remplir]` par les vraies informations
 3. Relancer le bot (il recharge la base au démarrage)
 

diff --git a/requirements.txt b/requirements.txt
@@ -2,3 +2,6 @@ python-telegram-bot>=21.0
 google-genai>=1.0.0
 numpy
 pytest>=8.0
+
+# Optional: PDF support
+PyPDF2>=3.0.0