diff --git a/.env.example b/.env.example index 558cced..636dd69 100644 --- a/.env.example +++ b/.env.example @@ -5,5 +5,8 @@ AI_PROVIDER=ollama # Daca AI_PROVIDER este 'google', introdu aici cheia ta secreta de API GOOGLE_API_KEY=your_google_api_key_here +# Modelul implicit daca folosesti google +GOOGLE_MODEL_NAME=gemini-2.5-flash + # URL-ul de baza pentru containerul local de ollama -OLLAMA_BASE_URL=http://ollama:11434 +OLLAMA_BASE_URL=http://localhost:11434 diff --git a/README.md b/README.md index ba7e6fc..2184e79 100644 --- a/README.md +++ b/README.md @@ -139,24 +139,48 @@ To ensure consistent code quality and formatting, this project is configured to Once installed, `ruff` will automatically format your code on `git commit`. If changes are made by the formatter, the commit will abort—simply `git add` the updated files and run `git commit` again. -## 🐳 Docker & Local AI Setup +## 🐳 Environment & Local AI Setup The application leverages Docker to seamlessly run local AI models without complicating the host system. -### Setup Instructions: -1. Start the Docker containers: +### Step-by-Step Setup Instructions: + +1. **Configure Environment Variables**: ```bash - docker-compose up -d + cp .env.example .env ``` -2. Create the custom AI model (Gemma 2 based) configured for precision: + +2. **Start the Docker container** (for Ollama): + ```bash + docker-compose up -d ollama + ``` + +3. **Pull base model and Create Custom AI Models**: + ClutterKill uses two distinct models for processing (Classifier and Extractor): ```bash + # Pull the base model (Wait for the download to finish) + docker exec -it clutterkill_ollama ollama pull gemma2:2b + # Note: If you get a 'manifest does not exist' error on older machines, use 'gemma:2b' instead and update the Modelfiles. 
+ + # Create Agent 0 & 2 (Classifier) docker exec -it clutterkill_ollama ollama create ck-model -f /app/ai/Modelfile + + # Create Agent 1 (Extractor) + docker exec -it clutterkill_ollama ollama create ck-extractor -f /app/ai/Modelfile.extractor ``` -3. Verify the model is running: + +4. **Verify the models are running**: ```bash curl http://localhost:11434/api/tags ``` -*Note: The `docker-compose.yml` configuration also provides environment variables (`AI_PROVIDER`, `GOOGLE_API_KEY`) to easily switch between local `ollama` processing and cloud-based alternatives like `google`.* +5. **Run the Application**: + Activate your virtual environment and run the graphical interface: + ```bash + source .venv/bin/activate + python main.py + ``` + +*Note: The project configuration also provides environment variables (`AI_PROVIDER`, `GOOGLE_API_KEY`) to easily switch between local `ollama` processing and cloud-based alternatives like `google`.* For more detailed DevOps and QA instructions, please refer to [README_ingineri.md](README_ingineri.md). diff --git a/ai/agent_decider.py b/ai/agent_decider.py index 55addd8..a86c79a 100644 --- a/ai/agent_decider.py +++ b/ai/agent_decider.py @@ -72,9 +72,15 @@ def sanitize_filename(cls, v: str) -> str: Instructions: 1. If the Document Summary MATCHES the Rule Category, your status must be "move". 2. If it DOES NOT match, or if you are unsure, your status must be "quarantine". -3. Calculate the new filename based on the Naming Convention. If the naming convention includes {{original_filename}}, replace it with the actual original filename. -4. If the status is "quarantine", the folder must be "Quarantine". -5. If the status is "quarantine", the suggested_name MUST be exactly the Original Filename. +3. Build the new filename using the Naming Convention as a TEMPLATE: + - The Naming Convention may contain camelCase or descriptive placeholder words like "abreviereaMateriei", "NumarulCursului", "Data", "Emitent", "Suma", etc. 
+ - YOU MUST extract the actual values from the Document Summary and substitute them into each placeholder. + - Example: If Naming Convention is "abreviereaMateriei_Curs_NumarulCursului_Data" and the document is about "Algoritmi Avansati, Cursul 4, 01.01.2026", the result must be "AlgoritmiAvansati_Curs_4_01012026". + - If a placeholder value cannot be determined from the Document Summary, use a sensible short abbreviation (e.g. "Unknown"). + - If the Naming Convention is literally "{{original_filename}}", keep the original filename unchanged. +4. CRITICAL: The new filename MUST keep the exact same file extension as the Original Filename (e.g. .pdf, .docx). +5. CRITICAL: Do NOT include spaces in the filename. Use underscores (_) instead. +6. If the status is "quarantine", the folder must be "Quarantine". IMPORTANT: You must return ONLY the raw JSON object containing the actual values. Do NOT return a JSON schema, and do NOT wrap your answer in markdown fences (like ```json). diff --git a/ai/llm_config.py b/ai/llm_config.py index cbaefbd..7b9f723 100644 --- a/ai/llm_config.py +++ b/ai/llm_config.py @@ -30,9 +30,15 @@ import logging import os +from dotenv import load_dotenv +from langchain_core.language_models.chat_models import BaseChatModel +from langchain_google_genai import ChatGoogleGenerativeAI from langchain_ollama import ChatOllama +# Încarcă variabilele de mediu din fișierul .env, dacă există +load_dotenv() + logger = logging.getLogger(__name__) # ─── Defaults (match docker-compose.yml & .env.example) ───────────── @@ -40,6 +46,8 @@ _DEFAULT_OLLAMA_BASE_URL = "http://localhost:11434" _DEFAULT_REQUEST_TIMEOUT = 120.0 # seconds — OCR docs can be big +_DEFAULT_GOOGLE_MODEL = "gemini-2.0-flash" + # Model registry — one entry per Modelfile MODEL_CLASSIFIER = "ck-model" # ai/Modelfile MODEL_EXTRACTOR = "ck-extractor" # ai/Modelfile.extractor @@ -56,35 +64,48 @@ def get_llm( temperature: float | None = None, num_ctx: int | None = None, timeout: float | None = None, 
-) -> ChatOllama: - """Return a ChatOllama instance for the requested model. +) -> BaseChatModel: + """Return a chat model instance based on chosen AI_PROVIDER. Parameters ---------- model : str - Ollama model name. Use the module constants: - ``MODEL_CLASSIFIER`` (default) or ``MODEL_EXTRACTOR``. + Ollama model name or placeholder. For Google, uses the + defined DEFAULT_GOOGLE_MODEL override. temperature : float, optional - Override sampling temperature (model default: 0.1). + Override sampling temperature (default: 0.1). num_ctx : int, optional - Override context-window size. + Override context-window size (Ollama only). timeout : float, optional Override HTTP request timeout (default: 120 s). Returns ------- - ChatOllama + BaseChatModel A LangChain chat-model instance ready for ``.invoke()`` / ``.ainvoke()`` / agent binding. """ provider = os.getenv("AI_PROVIDER", _DEFAULT_PROVIDER).lower() if provider == "google": - # Future: return ChatGoogleGenerativeAI(...) - raise NotImplementedError( - "Google AI provider is not yet implemented. " - "Set AI_PROVIDER=ollama or leave unset." 
import logging
from pathlib import Path

from PyQt6.QtCore import QThread, pyqtSignal

from ai.agent_compiler import CompilerAgent
from ai.agent_extractor import ExtractorAgent
from ai.agent_decider import DeciderAgent
from ai.tools import extract_text_from_pdf, extract_text_from_image
from core.quarantine_db import quarantine_db

logger = logging.getLogger(__name__)

# Extensions handled by each text-extraction strategy.
_IMAGE_EXTENSIONS = frozenset({".png", ".jpg", ".jpeg", ".bmp", ".tiff"})
_PLAIN_TEXT_EXTENSIONS = frozenset({".txt", ".csv", ".md"})

# Per-file routing outcomes used for the final tally.
_MOVED = "moved"
_QUARANTINED = "quarantined"


class ScanWorker(QThread):
    """Background scan thread that runs the AI agent pipeline over a folder.

    For the rule typed by the user and every file found recursively under
    ``source_dir``:

    1. ``CompilerAgent.compile`` turns the natural-language rule into a
       compiled rule (done once per scan).
    2. ``ExtractorAgent.extract`` produces a technical summary of the
       file's text.
    3. ``DeciderAgent.decide`` applies the compiled rule to that summary.
    4. A ``"move"`` decision moves/renames the file under ``dest_dir``
       immediately; any other status records the file in ``quarantine_db``
       for manual review.

    Files whose path is already present in ``quarantine_db`` are skipped.

    Signals
    -------
    progress_updated(int)
        Percentage (0-100) of files handled so far.
    log_updated(str)
        Human-readable progress line for the UI log.
    scan_finished(int)
        Emitted exactly once per run with the number of files that were
        either moved or quarantined (0 when the scan aborts early).
    """

    progress_updated = pyqtSignal(int)
    log_updated = pyqtSignal(str)
    scan_finished = pyqtSignal(int)

    def __init__(self, source_dir: str, dest_dir: str, user_rule: str):
        super().__init__()
        self.source_dir = Path(source_dir)
        self.dest_dir = Path(dest_dir)
        self.user_rule = user_rule

    def run(self):
        """Execute the scan; every exit path emits ``scan_finished``."""
        # Validate the rule before loading agents: there is no point paying
        # for model initialisation when the scan cannot start anyway.
        if not self.user_rule.strip():
            self.log_updated.emit("⚠️ Regula nu a fost completată!")
            self.scan_finished.emit(0)
            return

        self.log_updated.emit("🤖 Se încarcă agenții AI...")
        try:
            compiler = CompilerAgent()
            extractor = ExtractorAgent()
            decider = DeciderAgent()
        except Exception as e:
            self.log_updated.emit(f"❌ Eroare la inițializarea agenților: {e}")
            self.scan_finished.emit(0)
            return

        self.log_updated.emit(f"🧠 Compilare regulă: '{self.user_rule}'")
        try:
            compiled_rule = compiler.compile(self.user_rule)
            self.log_updated.emit(
                f"✅ Regulă compilată:\n{compiled_rule.model_dump_json(indent=2)}"
            )
        except Exception as e:
            self.log_updated.emit(f"❌ Eroare la compilarea regulii: {e}")
            self.scan_finished.emit(0)
            return

        files = [f for f in self.source_dir.rglob("*") if f.is_file()]
        total = len(files)
        if total == 0:
            self.log_updated.emit("⚠️ Niciun fișier găsit în folderul sursă.")
            self.scan_finished.emit(0)
            return

        self.log_updated.emit(f"🔍 {total} fișiere găsite. Se începe scanarea cu AI...")

        moved_count = 0
        quarantined_count = 0
        skipped_count = 0

        # Built once so the per-file duplicate check is a set lookup.
        existing_paths = {r["original_path"] for r in quarantine_db.get_all()}

        for index, file_path in enumerate(files, start=1):
            if str(file_path) in existing_paths:
                skipped_count += 1
                self.log_updated.emit(f"⏭️ {file_path.name} — deja în carantină, skip")
            else:
                self.log_updated.emit(f"📄 Procesare: {file_path.name}...")
                text = self._extract_text(file_path)
                summary = self._summarize(extractor, text)
                outcome = self._decide_and_route(
                    decider, compiled_rule, file_path, summary
                )
                if outcome == _MOVED:
                    moved_count += 1
                elif outcome == _QUARANTINED:
                    quarantined_count += 1

            self.progress_updated.emit(int(index / total * 100))

        # Moved files are not awaiting review, so report them separately
        # from the quarantined ones.
        self.log_updated.emit(
            f"\n✅ Scanare AI completă! {moved_count} fișiere mutate, "
            f"{quarantined_count} trimise spre review, "
            f"{skipped_count} existente ignorate."
        )
        # Same total as before (moved + quarantined) so existing slots keep
        # receiving the value they expect.
        self.scan_finished.emit(moved_count + quarantined_count)

    def _extract_text(self, file_path: Path) -> str:
        """Return the text of *file_path*, or a placeholder describing why not."""
        ext = file_path.suffix.lower()
        try:
            if ext == ".pdf":
                return extract_text_from_pdf(file_path)
            if ext in _IMAGE_EXTENSIONS:
                return extract_text_from_image(file_path)
            if ext in _PLAIN_TEXT_EXTENSIONS:
                # Explicit UTF-8: the platform default encoding differs
                # between machines and would change what the agents see.
                return file_path.read_text(encoding="utf-8", errors="ignore")
            return f"Fișier de tip necunoscut ({ext}). Conținut text nedisponibil."
        except Exception as e:
            # Best effort: the error text itself is fed to the extractor so
            # the file still gets a decision instead of aborting the scan.
            logger.warning(
                "Eroare extragere text pentru %s: %s", file_path.name, e
            )
            return f"Eroare extracție: {e}"

    def _summarize(self, extractor, text: str) -> str:
        """Run the extractor agent on *text*; fall back to an error summary."""
        try:
            extraction_result = extractor.extract(
                text or "Conținut gol sau necitibil"
            )
            return extraction_result.get_technical_summary()
        except Exception as e:
            logger.error("Eroare ExtractorAgent: %s", e)
            return f"Eroare procesare text: {e}"

    def _decide_and_route(
        self, decider, compiled_rule, file_path: Path, summary: str
    ) -> str | None:
        """Ask the decider agent what to do with the file and carry it out.

        Returns ``_MOVED``, ``_QUARANTINED``, or ``None`` when the decision
        or the move failed (the failure is logged, the scan continues).
        """
        try:
            decision = decider.decide(summary, file_path.name, compiled_rule)

            if decision.status == "move":
                target_folder = self._safe_target_folder(decision.suggested_folder)
                if target_folder is not None:
                    moved = self._move(
                        file_path, target_folder, decision.suggested_name
                    )
                    return _MOVED if moved else None

                # The folder name comes from model output; refuse anything
                # that would land outside the destination root and leave
                # the file for manual review instead.
                quarantine_db.add(
                    original_path=str(file_path),
                    ai_proposed_name=decision.suggested_name,
                    ai_proposed_folder="Quarantine",
                    reason=(
                        "Folder sugerat de AI în afara destinației: "
                        f"{decision.suggested_folder}"
                    ),
                )
                self.log_updated.emit(
                    " ↳ QUARANTINE: Folder sugerat nesigur "
                    f"({decision.suggested_folder}). Trimis în carantină."
                )
                return _QUARANTINED

            quarantine_db.add(
                original_path=str(file_path),
                ai_proposed_name=decision.suggested_name,
                ai_proposed_folder="Quarantine",
                reason=f"Decizie AI ({decision.status}) bazată pe: {summary[:100]}...",
            )
            self.log_updated.emit(
                f" ↳ QUARANTINE: Trimis în carantină. Nume sugerat: {decision.suggested_name}"
            )
            return _QUARANTINED
        except Exception as e:
            logger.error("Eroare DeciderAgent: %s", e)
            self.log_updated.emit(f" ↳ ❌ Eroare la luarea deciziei: {e}")
            return None

    def _safe_target_folder(self, suggested_folder: str) -> Path | None:
        """Return ``dest_dir / suggested_folder`` if it stays inside ``dest_dir``.

        Returns ``None`` when the resolved candidate escapes the destination
        root (absolute path, ``..`` segments, symlink pointing elsewhere) or
        cannot be resolved at all.
        """
        candidate = self.dest_dir / suggested_folder
        try:
            inside = candidate.resolve().is_relative_to(self.dest_dir.resolve())
        except (OSError, RuntimeError, ValueError):
            inside = False
        return candidate if inside else None

    def _move(self, file_path: Path, target_folder: Path, suggested_name: str) -> bool:
        """Move and rename the file; return ``True`` on success."""
        proposed_folder = str(target_folder)
        try:
            # Imported lazily, as in the original code, so an import problem
            # in the file manager is reported per file rather than at module
            # load time.
            from core.file_manager import move_and_rename_file

            move_and_rename_file(str(file_path), proposed_folder, suggested_name)
        except Exception as e:
            logger.error(
                "Eroare la mutarea fișierului %s: %s", file_path.name, e
            )
            self.log_updated.emit(f" ↳ ❌ Eroare la mutare: {e}")
            return False

        self.log_updated.emit(
            f" ↳ MOVE: Mutat și redenumit cu succes în -> {proposed_folder}/{suggested_name}"
        )
        return True
Adaugă fiecare fișier în quarantine_db pentru review-ul utilizatorului - """ - - progress_updated = pyqtSignal(int) - log_updated = pyqtSignal(str) - scan_finished = pyqtSignal(int) # emite numărul total de fișiere adăugate - - def __init__(self, source_dir: str, dest_dir: str): - super().__init__() - self.source_dir = Path(source_dir) - self.dest_dir = Path(dest_dir) - def run(self): - # Colectăm toate fișierele din source (recursiv) - files = [f for f in self.source_dir.rglob("*") if f.is_file()] - total = len(files) - if total == 0: - self.log_updated.emit("⚠️ Niciun fișier găsit în folderul sursă.") - self.scan_finished.emit(0) - return - - self.log_updated.emit(f"🔍 {total} fișiere găsite. Se începe scanarea...") - - added_count = 0 - skipped_count = 0 - - # Preluăm fișierele deja existente în carantină (pentru a evita duplicatele) - existing_paths = {r["original_path"] for r in quarantine_db.get_all()} - - for i, file_path in enumerate(files): - str_path = str(file_path) - - # Verificăm dacă fișierul e deja în carantină - if str_path in existing_paths: - skipped_count += 1 - self.log_updated.emit( - f"⏭️ {file_path.name} — deja în carantină, skip" - ) - else: - # Categorizare pe bază de extensie - ext = file_path.suffix.lower() - category = EXTENSION_CATEGORIES.get(ext, "Altele") - proposed_folder = str(self.dest_dir / category) - - quarantine_db.add( - original_path=str_path, - ai_proposed_name=file_path.name, - ai_proposed_folder=proposed_folder, - reason=f"Categorizat automat după extensie: {ext or 'fără extensie'}", - ) - added_count += 1 - - self.log_updated.emit( - f"📄 {file_path.name} → 📂 {category}" - ) - - # Actualizăm progress bar-ul - progress = int((i + 1) / total * 100) - self.progress_updated.emit(progress) - - self.log_updated.emit( - f"\n✅ Scanare completă! {added_count} fișiere adăugate, " - f"{skipped_count} existente (skip)." 
- ) - self.scan_finished.emit(added_count) +from core.scan_worker import ScanWorker class ScanTab(QWidget): @@ -154,6 +44,16 @@ def init_ui(self): dest_layout.addWidget(self.dest_input) dest_layout.addWidget(self.dest_browse_btn) + # Rule layout + rule_layout = QHBoxLayout() + self.rule_label = QLabel("AI Organizing Rule:") + self.rule_input = QLineEdit() + self.rule_input.setPlaceholderText( + "e.g., Pune facturile in folderul FacturiNou" + ) + rule_layout.addWidget(self.rule_label) + rule_layout.addWidget(self.rule_input) + # Start button self.start_btn = QPushButton("Start Scan") self.start_btn.setObjectName( @@ -172,6 +72,7 @@ def init_ui(self): # Add all to main layout layout.addLayout(source_layout) layout.addLayout(dest_layout) + layout.addLayout(rule_layout) layout.addWidget(self.start_btn) layout.addWidget(self.progress_bar) layout.addWidget(QLabel("Logs:")) @@ -192,9 +93,13 @@ def browse_dest(self): self.dest_input.setText(directory) def start_scan(self): - if not self.source_input.text() or not self.dest_input.text(): + if ( + not self.source_input.text() + or not self.dest_input.text() + or not self.rule_input.text() + ): self.log_area.append( - "⚠️ Selectează atât folderul sursă cât și cel destinație." + "⚠️ Selectează folderele și introdu o regulă de organizare AI." ) return @@ -202,10 +107,11 @@ def start_scan(self): self.progress_bar.setValue(0) self.log_area.clear() - # Pornim thread-ul real de scanare - self.scan_thread = ScanThread( + # Pornim worker-ul AI de scanare + self.scan_thread = ScanWorker( source_dir=self.source_input.text(), dest_dir=self.dest_input.text(), + user_rule=self.rule_input.text(), ) self.scan_thread.progress_updated.connect(self.update_progress) self.scan_thread.log_updated.connect(self.append_log)