diff --git a/.env.example b/.env.example
index 636dd69..d3db4b4 100644
--- a/.env.example
+++ b/.env.example
@@ -1,12 +1,13 @@
-# Model Configuration
-# Poate fi 'ollama' sau 'google'
+# ─── AI Provider ──────────────────────────────────────────────────────────────
+# Can be 'ollama' (local, free) or 'google' (cloud, requires an API key)
 AI_PROVIDER=ollama
 
-# Daca AI_PROVIDER este 'google', introdu aici cheia ta secreta de API
+# ─── Google Gemini (used only when AI_PROVIDER=google) ─────────────────────
 GOOGLE_API_KEY=your_google_api_key_here
+GOOGLE_MODEL_NAME=gemini-2.0-flash
 
-# Modelul implicit daca folosesti google
-GOOGLE_MODEL_NAME=gemini-2.5-flash
-
-# URL-ul de baza pentru containerul local de ollama
+# ─── Ollama (used when AI_PROVIDER=ollama) ─────────────────────────────────
+# For local runs (python main.py directly):
 OLLAMA_BASE_URL=http://localhost:11434
+# For runs inside Docker (docker-compose run --rm app pytest):
+# OLLAMA_BASE_URL=http://ollama:11434
diff --git a/ai/agent_extractor.py b/ai/agent_extractor.py
index ffd87d2..477d72b 100644
--- a/ai/agent_extractor.py
+++ b/ai/agent_extractor.py
@@ -26,7 +26,6 @@
 import logging
 from typing import Any
 
-
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.prompts import ChatPromptTemplate
 from pydantic import BaseModel, Field, ValidationError
diff --git a/ai/tools.py b/ai/tools.py
index d101f34..aadccff 100644
--- a/ai/tools.py
+++ b/ai/tools.py
@@ -7,6 +7,8 @@
 """
 
 import logging
+import platform
+import shutil
 from pathlib import Path
 from typing import Union
 
@@ -16,6 +18,21 @@
 
 logger = logging.getLogger(__name__)
 
+# Auto-detect the Tesseract path on Windows when it is not on PATH.
+if platform.system() == "Windows":
+    _win_paths = [
+        r"C:\Program Files\Tesseract-OCR\tesseract.exe",
+        r"C:\Program Files (x86)\Tesseract-OCR\tesseract.exe",
+        # Per-user install: resolve the current user's profile, not "User".
+        str(Path.home() / "AppData/Local/Programs/Tesseract-OCR/tesseract.exe"),
+    ]
+    _tesseract_found = shutil.which("tesseract")
+    if not _tesseract_found:
+        for _p in _win_paths:
+            if Path(_p).exists():
+                pytesseract.pytesseract.tesseract_cmd = _p
+                break
+
 
 def extract_text_from_pdf(path: Union[str, Path], max_pages: int = 10) -> str:
     """
diff --git a/core/scan_worker.py b/core/scan_worker.py
index d1684af..3db19e5 100644
--- a/core/scan_worker.py
+++ b/core/scan_worker.py
@@ -7,6 +7,7 @@
 from ai.agent_extractor import ExtractorAgent
 from ai.agent_decider import DeciderAgent
 from ai.tools import extract_text_from_pdf, extract_text_from_image
+from core.file_manager import move_and_rename_file
 from core.quarantine_db import quarantine_db
 
 logger = logging.getLogger(__name__)
@@ -122,8 +123,6 @@ def run(self):
 
                 # Mutăm și redenumim fișierul fizic imediat
                 try:
-                    from core.file_manager import move_and_rename_file
-
                     move_and_rename_file(
                         str_path, proposed_folder, decision.suggested_name
                     )
diff --git a/docker-compose.yml b/docker-compose.yml
index dd9eb26..30dafd6 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -10,18 +10,49 @@ services:
       - ollama_data:/root/.ollama
       - ./ai:/app/ai
     restart: unless-stopped
+    healthcheck:
+      test: ["CMD", "ollama", "list"]
+      interval: 10s
+      timeout: 5s
+      retries: 10
+      start_period: 30s
+
+  ollama-setup:
+    image: ollama/ollama
+    container_name: clutterkill_ollama_setup
+    depends_on:
+      ollama:
+        condition: service_healthy
+    volumes:
+      - ollama_data:/root/.ollama
+      - ./ai:/app/ai
+    environment:
+      - OLLAMA_HOST=http://ollama:11434
+    entrypoint: >
+      sh -c "
+        echo '=== ClutterKill: Initializing AI models ===' &&
+        ollama pull gemma2:2b &&
+        ollama create ck-model -f /app/ai/Modelfile &&
+        ollama create ck-extractor -f /app/ai/Modelfile.extractor &&
+        echo '=== All models ready ==='
+      "
+    restart: "no"
 
   app:
     build: .
     container_name: clutterkill_app
     depends_on:
-      - ollama
+      ollama:
+        condition: service_healthy
     volumes:
       - .:/app
     working_dir: /app
     env_file:
       - .env
-    command: tail -f /dev/null # Keep-alive container pentru rulare scripturi/teste
+    environment:
+      # Override the URL from .env with the internal Docker hostname
+      - OLLAMA_BASE_URL=http://ollama:11434
+    command: tail -f /dev/null
 
 volumes:
   ollama_data:
diff --git a/requirements.txt b/requirements.txt
index 77527ac..e7490b9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,13 +3,13 @@ langchain
 langchain-community
 langchain-ollama
 langchain-core
+langchain-google-genai
 PyMuPDF
 pytesseract
 pydantic
 pytest
 ruff
-fpdf
+fpdf2
 python-docx
 Pillow
-langchain-google-genai
 python-dotenv
diff --git a/scripts/create_test_pdf.py b/scripts/create_test_pdf.py
index fda980b..f367b4f 100644
--- a/scripts/create_test_pdf.py
+++ b/scripts/create_test_pdf.py
@@ -1,26 +1,24 @@
-from fpdf import FPDF
 import os
 
+from fpdf import FPDF
+
 
 def create_fake_pdf():
-    # Ne asigurăm că există directorul (în caz că vrem să organizăm mai târziu)
     os.makedirs("test_data/source", exist_ok=True)
 
     pdf = FPDF()
     pdf.add_page()
-
-    # FPDF default include doar câteva fonturi. Arial e un alias pentru Helvetica.
     pdf.set_font("Helvetica", size=12)
 
-    pdf.cell(200, 10, txt="Universitatea X - Curs MDS", align="C")
+    pdf.cell(200, 10, text="Universitatea X - Curs MDS", align="C")
     pdf.ln(10)
-    pdf.cell(200, 10, txt="Semestrul 2 - Note de Curs", align="C")
+    pdf.cell(200, 10, text="Semestrul 2 - Note de Curs", align="C")
     pdf.ln(10)
-    pdf.cell(200, 10, txt="Acesta este un document generat automat pentru testare.")
+    pdf.cell(200, 10, text="Acesta este un document generat automat pentru testare.")
 
-    file_path = "Curs_MDS_Sem2.pdf"
+    file_path = os.path.join("test_data", "source", "Curs_MDS_Sem2.pdf")
     pdf.output(file_path)
-    print(f"✅ Fișierul PDF de test a fost creat cu succes: {file_path}")
+    print(f"Fisierul PDF de test a fost creat cu succes: {file_path}")
 
 
 if __name__ == "__main__":
diff --git a/setup.bat b/setup.bat
new file mode 100644
index 0000000..bb3e2cb
--- /dev/null
+++ b/setup.bat
@@ -0,0 +1,76 @@
+@echo off
+REM ClutterKill — First-time setup script (Windows)
+setlocal enabledelayedexpansion
+
+echo === ClutterKill Setup (Windows) ===
+
+REM 1. Python virtualenv
+if not exist ".venv\" (
+    echo [1/5] Creating virtual environment...
+    python -m venv .venv
+)
+call .venv\Scripts\activate.bat
+
+REM 2. Dependencies
+echo [2/5] Installing Python dependencies...
+pip install --upgrade pip -q
+pip install -r requirements.txt -q
+
+REM 3. .env
+if not exist ".env" (
+    echo [3/5] Creating .env from template...
+    copy .env.example .env
+) else (
+    echo [3/5] .env already exists -- skipping.
+)
+
+REM 4. Ollama models
+echo [4/5] Setting up Ollama models...
+where ollama >nul 2>&1
+if %ERRORLEVEL% == 0 (
+    ollama pull gemma2:2b
+    ollama create ck-model -f ai\Modelfile
+    ollama create ck-extractor -f ai\Modelfile.extractor
+    echo Models created locally.
+) else (
+    echo Ollama not found. Trying Docker...
+    where docker >nul 2>&1
+    REM Delayed expansion is required here: percent variables are expanded once, when the enclosing block is parsed.
+    if !ERRORLEVEL! == 0 (
+        docker-compose up -d ollama
+        echo Waiting for Ollama to start...
+        timeout /t 20 /nobreak >nul
+        docker exec clutterkill_ollama ollama pull gemma2:2b
+        docker exec clutterkill_ollama ollama create ck-model -f /app/ai/Modelfile
+        docker exec clutterkill_ollama ollama create ck-extractor -f /app/ai/Modelfile.extractor
+        echo Models created inside Docker container.
+    ) else (
+        echo ERROR: Neither Ollama nor Docker is available.
+        echo Install Ollama from https://ollama.com
+        echo Or Docker Desktop from https://docker.com
+        pause
+        exit /b 1
+    )
+)
+
+REM 5. Tesseract check
+echo [5/5] Checking Tesseract OCR...
+where tesseract >nul 2>&1
+if %ERRORLEVEL% == 0 (
+    echo Tesseract found.
+) else (
+    if exist "C:\Program Files\Tesseract-OCR\tesseract.exe" (
+        echo Tesseract found at default path.
+    ) else (
+        REM Parentheses inside a block must be caret-escaped or they close the block early.
+        echo Tesseract not found ^(OCR on images will be disabled^).
+        echo Download from: https://github.com/UB-Mannheim/tesseract/wiki
+    )
+)
+
+echo.
+REM The exclamation mark below is escaped because delayed expansion would otherwise drop it.
+echo === Setup complete^^! Run the app with: ===
+echo .venv\Scripts\activate
+echo python main.py
+pause
diff --git a/setup.sh b/setup.sh
new file mode 100644
index 0000000..b8b712e
--- /dev/null
+++ b/setup.sh
@@ -0,0 +1,68 @@
+#!/usr/bin/env bash
+# ClutterKill — First-time setup script (Linux / macOS)
+set -e
+
+echo "=== ClutterKill Setup ==="
+
+# 1. Python virtualenv
+if [ ! -d ".venv" ]; then
+    echo "[1/5] Creating virtual environment..."
+    python3 -m venv .venv
+fi
+source .venv/bin/activate
+
+# 2. Dependencies
+echo "[2/5] Installing Python dependencies..."
+pip install --upgrade pip -q
+pip install -r requirements.txt -q
+
+# 3. .env
+if [ ! -f ".env" ]; then
+    echo "[3/5] Creating .env from template..."
+    cp .env.example .env
+else
+    echo "[3/5] .env already exists — skipping."
+fi
+
+# 4. Ollama models
+echo "[4/5] Setting up Ollama models..."
+if ! command -v ollama &>/dev/null; then
+    echo " Ollama not found. Trying Docker..."
+    if command -v docker &>/dev/null && docker info &>/dev/null; then
+        docker-compose up -d ollama
+        echo " Waiting for Ollama to start..."
+        sleep 15
+        docker exec clutterkill_ollama ollama pull gemma2:2b
+        docker exec clutterkill_ollama ollama create ck-model -f /app/ai/Modelfile
+        docker exec clutterkill_ollama ollama create ck-extractor -f /app/ai/Modelfile.extractor
+        echo " Models created inside Docker container."
+    else
+        echo " ERROR: Neither Ollama nor Docker is available."
+        echo " Install Ollama from https://ollama.com or Docker from https://docker.com"
+        exit 1
+    fi
+else
+    # Ollama is installed locally
+    ollama pull gemma2:2b
+    ollama create ck-model -f ai/Modelfile
+    ollama create ck-extractor -f ai/Modelfile.extractor
+    echo " Models created locally."
+fi
+
+# 5. Tesseract (optional, for OCR on images)
+echo "[5/5] Checking Tesseract OCR..."
+if command -v tesseract &>/dev/null; then
+    echo " Tesseract found: $(tesseract --version 2>&1 | head -1)"
+else
+    echo " Tesseract not found (OCR on images will be disabled)."
+    if [[ "$OSTYPE" == "darwin"* ]]; then
+        echo " Install with: brew install tesseract"
+    else
+        echo " Install with: sudo apt-get install tesseract-ocr"
+    fi
+fi
+
+echo ""
+echo "=== Setup complete! Run the app with: ==="
+echo " source .venv/bin/activate"
+echo " python main.py"