Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 8 additions & 7 deletions .env.example
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
# Model Configuration
# Poate fi 'ollama' sau 'google'
# ─── AI Provider ──────────────────────────────────────────────────────────────
# Poate fi 'ollama' (local, gratuit) sau 'google' (cloud, necesita API key)
AI_PROVIDER=ollama

# Daca AI_PROVIDER este 'google', introdu aici cheia ta secreta de API
# ─── Google Gemini (folosit doar cand AI_PROVIDER=google) ─────────────────────
GOOGLE_API_KEY=your_google_api_key_here
GOOGLE_MODEL_NAME=gemini-2.0-flash

# Modelul implicit daca folosesti google
GOOGLE_MODEL_NAME=gemini-2.5-flash

# URL-ul de baza pentru containerul local de ollama
# ─── Ollama (folosit cand AI_PROVIDER=ollama) ─────────────────────────────────
# Pentru rulare locala (python main.py direct):
OLLAMA_BASE_URL=http://localhost:11434
# Pentru rulare din interiorul Docker (docker-compose run --rm app pytest):
# OLLAMA_BASE_URL=http://ollama:11434
1 change: 0 additions & 1 deletion ai/agent_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@
import logging
from typing import Any


from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field, ValidationError
Expand Down
16 changes: 16 additions & 0 deletions ai/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
"""

import logging
import platform
import shutil
from pathlib import Path
from typing import Union

Expand All @@ -16,6 +18,20 @@

logger = logging.getLogger(__name__)

# Auto-detect the Tesseract executable on Windows.
#
# Nothing is changed when "tesseract" is already resolvable through PATH.
# Otherwise the common install locations below are probed in order and the
# first one that exists is handed to pytesseract via ``tesseract_cmd``.
if platform.system() == "Windows":
    _win_paths = [
        r"C:\Program Files\Tesseract-OCR\tesseract.exe",
        r"C:\Program Files (x86)\Tesseract-OCR\tesseract.exe",
        # Per-user install location: resolved against the current user's home
        # directory rather than a hard-coded account name.
        str(
            Path.home()
            / "AppData"
            / "Local"
            / "Programs"
            / "Tesseract-OCR"
            / "tesseract.exe"
        ),
    ]
    _tesseract_found = shutil.which("tesseract")
    if not _tesseract_found:
        for _p in _win_paths:
            # is_file() so a stray directory with that name is never selected.
            if Path(_p).is_file():
                pytesseract.pytesseract.tesseract_cmd = _p
                break


def extract_text_from_pdf(path: Union[str, Path], max_pages: int = 10) -> str:
"""
Expand Down
3 changes: 1 addition & 2 deletions core/scan_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from ai.agent_extractor import ExtractorAgent
from ai.agent_decider import DeciderAgent
from ai.tools import extract_text_from_pdf, extract_text_from_image
from core.file_manager import move_and_rename_file
from core.quarantine_db import quarantine_db

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -122,8 +123,6 @@ def run(self):

# Mutăm și redenumim fișierul fizic imediat
try:
from core.file_manager import move_and_rename_file

move_and_rename_file(
str_path, proposed_folder, decision.suggested_name
)
Expand Down
35 changes: 33 additions & 2 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,18 +10,49 @@ services:
- ollama_data:/root/.ollama
- ./ai:/app/ai
restart: unless-stopped
healthcheck:
test: ["CMD", "ollama", "list"]
interval: 10s
timeout: 5s
retries: 10
start_period: 30s

ollama-setup:
image: ollama/ollama
container_name: clutterkill_ollama_setup
depends_on:
ollama:
condition: service_healthy
volumes:
- ollama_data:/root/.ollama
- ./ai:/app/ai
environment:
- OLLAMA_HOST=http://ollama:11434
entrypoint: >
sh -c "
echo '=== ClutterKill: Initializing AI models ===' &&
ollama pull gemma2:2b &&
ollama create ck-model -f /app/ai/Modelfile &&
ollama create ck-extractor -f /app/ai/Modelfile.extractor &&
echo '=== All models ready ==='
"
restart: "no"

app:
build: .
container_name: clutterkill_app
depends_on:
- ollama
ollama:
condition: service_healthy
volumes:
- .:/app
working_dir: /app
env_file:
- .env
command: tail -f /dev/null # Keep-alive container pentru rulare scripturi/teste
environment:
# Suprascrie URL-ul din .env cu hostname-ul intern Docker
- OLLAMA_BASE_URL=http://ollama:11434
command: tail -f /dev/null

volumes:
ollama_data:
4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,13 @@ langchain
langchain-community
langchain-ollama
langchain-core
langchain-google-genai
PyMuPDF
pytesseract
pydantic
pytest
ruff
fpdf
fpdf2
python-docx
Pillow
langchain-google-genai
python-dotenv
16 changes: 7 additions & 9 deletions scripts/create_test_pdf.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,24 @@
from fpdf import FPDF
import os

from fpdf import FPDF


def create_fake_pdf():
# Ne asigurăm că există directorul (în caz că vrem să organizăm mai târziu)
os.makedirs("test_data/source", exist_ok=True)

pdf = FPDF()
pdf.add_page()

# FPDF default include doar câteva fonturi. Arial e un alias pentru Helvetica.
pdf.set_font("Helvetica", size=12)

pdf.cell(200, 10, txt="Universitatea X - Curs MDS", align="C")
pdf.cell(200, 10, text="Universitatea X - Curs MDS", align="C")
pdf.ln(10)
pdf.cell(200, 10, txt="Semestrul 2 - Note de Curs", align="C")
pdf.cell(200, 10, text="Semestrul 2 - Note de Curs", align="C")
pdf.ln(10)
pdf.cell(200, 10, txt="Acesta este un document generat automat pentru testare.")
pdf.cell(200, 10, text="Acesta este un document generat automat pentru testare.")

file_path = "Curs_MDS_Sem2.pdf"
file_path = os.path.join("test_data", "source", "Curs_MDS_Sem2.pdf")
pdf.output(file_path)
print(f"✅ Fișierul PDF de test a fost creat cu succes: {file_path}")
print(f"Fisierul PDF de test a fost creat cu succes: {file_path}")


if __name__ == "__main__":
Expand Down
73 changes: 73 additions & 0 deletions setup.bat
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
@echo off
REM ClutterKill — First-time setup script (Windows)
REM
REM Steps: create and activate .venv, install requirements, seed .env from the
REM template, build the Ollama models with a local Ollama or the Docker
REM container, and report whether Tesseract OCR is available.
REM
REM Delayed expansion is intentionally NOT enabled: no variable needs it, and
REM it would swallow the exclamation mark in the final banner.
setlocal

echo === ClutterKill Setup (Windows) ===

REM 1. Python virtualenv
if not exist ".venv\" (
    echo [1/5] Creating virtual environment...
    python -m venv .venv
)
call .venv\Scripts\activate.bat

REM 2. Dependencies
echo [2/5] Installing Python dependencies...
REM pip is upgraded through the interpreter because on Windows the pip
REM launcher cannot replace itself while it is running.
python -m pip install --upgrade pip -q
python -m pip install -r requirements.txt -q
if errorlevel 1 (
    echo ERROR: Failed to install Python dependencies.
    pause
    exit /b 1
)

REM 3. .env
if not exist ".env" (
    echo [3/5] Creating .env from template...
    copy .env.example .env
) else (
    echo [3/5] .env already exists -- skipping.
)

REM 4. Ollama models
REM The exit codes below are tested with "if errorlevel", which is evaluated
REM when the line runs. A percent-style ERRORLEVEL inside a parenthesized
REM block is substituted when the whole block is parsed, so the nested Docker
REM check would see the stale result of the Ollama lookup.
echo [4/5] Setting up Ollama models...
where ollama >nul 2>&1
if not errorlevel 1 (
    ollama pull gemma2:2b
    ollama create ck-model -f ai\Modelfile
    ollama create ck-extractor -f ai\Modelfile.extractor
    echo Models created locally.
) else (
    echo Ollama not found. Trying Docker...
    where docker >nul 2>&1
    if not errorlevel 1 (
        docker-compose up -d ollama
        echo Waiting for Ollama to start...
        timeout /t 20 /nobreak >nul
        docker exec clutterkill_ollama ollama pull gemma2:2b
        docker exec clutterkill_ollama ollama create ck-model -f /app/ai/Modelfile
        docker exec clutterkill_ollama ollama create ck-extractor -f /app/ai/Modelfile.extractor
        echo Models created inside Docker container.
    ) else (
        echo ERROR: Neither Ollama nor Docker is available.
        echo Install Ollama from https://ollama.com
        echo Or Docker Desktop from https://docker.com
        pause
        exit /b 1
    )
)

REM 5. Tesseract check
REM Parentheses inside the echo text are caret-escaped; an unescaped closing
REM parenthesis would terminate the surrounding block early.
echo [5/5] Checking Tesseract OCR...
where tesseract >nul 2>&1
if not errorlevel 1 (
    echo Tesseract found.
) else (
    if exist "C:\Program Files\Tesseract-OCR\tesseract.exe" (
        echo Tesseract found at default path.
    ) else (
        echo Tesseract not found ^(OCR on images will be disabled^).
        echo Download from: https://github.com/UB-Mannheim/tesseract/wiki
    )
)

echo.
echo === Setup complete! Run the app with: ===
echo .venv\Scripts\activate
echo python main.py
pause
68 changes: 68 additions & 0 deletions setup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#!/usr/bin/env bash
# ClutterKill — First-time setup script (Linux / macOS)
#
# Steps: create and activate .venv, install requirements, seed .env from the
# template, build the Ollama models (local Ollama or the Docker container),
# and report whether Tesseract OCR is available.
# Any failing command aborts the script (set -e).
set -e

echo "=== ClutterKill Setup ==="

# 1. Python virtualenv
if [ ! -d ".venv" ]; then
    echo "[1/5] Creating virtual environment..."
    python3 -m venv .venv
fi
source .venv/bin/activate

# 2. Dependencies
echo "[2/5] Installing Python dependencies..."
# Run pip through the venv interpreter so the upgrade targets the right pip.
python -m pip install --upgrade pip -q
python -m pip install -r requirements.txt -q

# 3. .env
if [ ! -f ".env" ]; then
    echo "[3/5] Creating .env from template..."
    cp .env.example .env
else
    echo "[3/5] .env already exists — skipping."
fi

# 4. Ollama models
echo "[4/5] Setting up Ollama models..."
if ! command -v ollama &>/dev/null; then
    echo " Ollama not found. Trying Docker..."
    if command -v docker &>/dev/null && docker info &>/dev/null; then
        docker-compose up -d ollama
        echo " Waiting for Ollama to start..."
        # Poll for readiness instead of sleeping a fixed time: a slow start
        # would otherwise make the first "ollama pull" fail. Up to 30 probes,
        # 2 seconds apart.
        ollama_ready=0
        for ((attempt = 0; attempt < 30; attempt++)); do
            if docker exec clutterkill_ollama ollama list &>/dev/null; then
                ollama_ready=1
                break
            fi
            sleep 2
        done
        if [ "$ollama_ready" -ne 1 ]; then
            echo " ERROR: Ollama container did not become ready in time."
            exit 1
        fi
        docker exec clutterkill_ollama ollama pull gemma2:2b
        docker exec clutterkill_ollama ollama create ck-model -f /app/ai/Modelfile
        docker exec clutterkill_ollama ollama create ck-extractor -f /app/ai/Modelfile.extractor
        echo " Models created inside Docker container."
    else
        echo " ERROR: Neither Ollama nor Docker is available."
        echo " Install Ollama from https://ollama.com or Docker from https://docker.com"
        exit 1
    fi
else
    # Ollama is installed locally
    ollama pull gemma2:2b
    ollama create ck-model -f ai/Modelfile
    ollama create ck-extractor -f ai/Modelfile.extractor
    echo " Models created locally."
fi

# 5. Tesseract (optional, for OCR on images)
echo "[5/5] Checking Tesseract OCR..."
if command -v tesseract &>/dev/null; then
    echo " Tesseract found: $(tesseract --version 2>&1 | head -1)"
else
    echo " Tesseract not found (OCR on images will be disabled)."
    if [[ "$OSTYPE" == "darwin"* ]]; then
        echo " Install with: brew install tesseract"
    else
        echo " Install with: sudo apt-get install tesseract-ocr"
    fi
fi

echo ""
echo "=== Setup complete! Run the app with: ==="
echo " source .venv/bin/activate"
echo " python main.py"
Loading