Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
124 changes: 109 additions & 15 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,19 @@ name: CI Pipeline

on:
push:
branches:
- master
branches: [master, main, develop]
pull_request:
branches:
- master
branches: [master, main, develop]

# Cancel in-flight runs for the same branch on new pushes
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
build-and-test:
# ─── Job 1: Lint ────────────────────────────────────────────────────────────
lint:
name: Lint (Ruff)
runs-on: ubuntu-latest

steps:
Expand All @@ -22,17 +27,106 @@ jobs:
python-version: "3.13"
cache: "pip"

- name: Install dependencies
- name: Install Ruff
run: pip install ruff

- name: Run Ruff check
run: ruff check --output-format=github .

- name: Run Ruff format check
run: ruff format --check .

# ─── Job 2: Unit & Integration Tests ────────────────────────────────────────
test:
name: Tests (Python ${{ matrix.python-version }})
runs-on: ubuntu-latest
needs: lint

strategy:
fail-fast: false
matrix:
python-version: ["3.12", "3.13"]

steps:
- name: Checkout repository
uses: actions/checkout@v4

- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
cache: "pip"

# PyQt6 needs a display; Xvfb provides a virtual one
- name: Install system dependencies
run: |
sudo apt-get update -qq
sudo apt-get install -y --no-install-recommends \
tesseract-ocr \
libgl1 \
xvfb \
libxkbcommon-x11-0 \
libxcb-icccm4 \
libxcb-image0 \
libxcb-keysyms1 \
libxcb-randr0 \
libxcb-render-util0 \
libxcb-xinerama0 \
libxcb-xfixes0

- name: Install Python dependencies
run: |
python -m pip install --upgrade pip
# Instalăm dependențele din requirements.txt dacă există
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
# Adăugăm extensia pentru adnotări pytest pe GitHub
pip install ruff pytest pytest-github-actions-annotate-failures
pip install -r requirements.txt
pip install pytest pytest-github-actions-annotate-failures pytest-cov

- name: Run Ruff Check
run: ruff check --output-format=github .
# Create a minimal .env so the app doesn't crash on import
- name: Set up environment variables
run: |
cp .env.example .env
# Override to use a dummy Google key (Ollama not available in CI)
sed -i 's/AI_PROVIDER=ollama/AI_PROVIDER=google/' .env
sed -i 's/GOOGLE_API_KEY=your_google_api_key_here/GOOGLE_API_KEY=ci-dummy-key/' .env
env:
GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}

- name: Run unit tests (core + ai models — no live AI calls)
run: |
xvfb-run --auto-servernum --server-args="-screen 0 1920x1080x24" \
pytest tests/ \
--ignore=tests/evals \
--ignore=tests/test_agent_decider.py \
-v \
--tb=short \
--cov=core \
--cov=ai \
--cov-report=term-missing \
--cov-report=xml:coverage.xml
env:
AI_PROVIDER: google
GOOGLE_API_KEY: ci-dummy-key
PYTHONDONTWRITEBYTECODE: "1"
QT_QPA_PLATFORM: offscreen

- name: Upload coverage report
if: matrix.python-version == '3.13'
uses: actions/upload-artifact@v4
with:
name: coverage-report
path: coverage.xml
retention-days: 7

- name: Run tests with Pytest
if: always() # Rulează testele chiar dacă Ruff a picat, ca să vezi TOATE erorile
run: pytest tests/
# ─── Job 3: Summary gate (required by branch protection rule) ───────────────
build-and-test:
name: build-and-test
runs-on: ubuntu-latest
needs: [lint, test]
if: always()
steps:
- name: Check all jobs passed
run: |
if [[ "${{ needs.lint.result }}" != "success" || "${{ needs.test.result }}" != "success" ]]; then
echo "One or more required jobs failed."
exit 1
fi
echo "All checks passed."
Binary file added Quarantine/Saptamana2LaboratorMDS.pdf
Binary file not shown.
8 changes: 4 additions & 4 deletions ai/agent_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,21 +48,21 @@ class CompiledRule(BaseModel):
You are an expert rule translator for the ClutterKill system.
Your job is to translate a user's natural language instruction about where and how to save files into a structured JSON rule.

{format_instructions}

User instruction: "{user_prompt}"

Extract the category, folder structure, and naming convention.
If the naming convention is not explicitly stated, use a default placeholder like "{{original_filename}}" or infer a sensible one if the context implies it.

IMPORTANT: You must return ONLY the raw JSON object containing the actual values. Do NOT return a JSON schema, and do NOT wrap your answer in "properties".
CRITICAL: You must return ONLY the raw JSON object containing the ACTUAL values based on the user instruction. Do NOT return a JSON schema. Do NOT return properties definitions. DO NOT echo back the format instructions.

Example of valid output:
Example of expected valid output:
{{
"category": "factura",
"folder_structure": "Facturi",
"naming_convention": "factura_data.pdf"
}}

{format_instructions}
"""


Expand Down
8 changes: 4 additions & 4 deletions ai/agent_decider.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ def sanitize_filename(cls, v: str) -> str:
You are an expert decision-making agent for the ClutterKill system.
Your job is to analyze a document summary and a set of organization rules, and decide if the document should be moved to the correct folder or placed in quarantine.

{format_instructions}

Rule Category: {rule_category}
Target Folder: {rule_folder}
Naming Convention: {rule_naming}
Expand All @@ -82,9 +84,7 @@ def sanitize_filename(cls, v: str) -> str:
5. CRITICAL: Do NOT include spaces in the filename. Use underscores (_) instead.
6. If the status is "quarantine", the folder must be "Quarantine".

IMPORTANT: You must return ONLY the raw JSON object containing the actual values. Do NOT return a JSON schema, and do NOT wrap your answer in markdown fences (like ```json).

{format_instructions}
CRITICAL: You must return ONLY the raw JSON object containing the ACTUAL values based on your decision. Do NOT return a JSON schema. Do NOT return properties definitions. DO NOT echo back the format instructions.
"""

_REPAIR_PROMPT = ChatPromptTemplate.from_messages(
Expand Down Expand Up @@ -215,7 +215,7 @@ def decide(
test_filename = "doc_scanned_123.pdf"

print(f"\n{'=' * 60}")
print("TEST 1: Sanitizare și Retry")
print("TEST 1: Sanitizare si Retry")
try:
decision1 = agent.decide(test_summary_match, test_filename, test_rule)
print("Output JSON (observă cum / a fost înlocuit):")
Expand Down
29 changes: 28 additions & 1 deletion ai/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
Extraction Tools — ai/tools.py

Acest modul conține funcții utilitare pentru extragerea textului din
diferite tipuri de fișiere (PDF, imagini), folosite ulterior de către
diferite tipuri de fișiere (PDF, imagini, Word), folosite ulterior de către
agenții AI pentru procesare.
"""

Expand All @@ -12,6 +12,7 @@
from pathlib import Path
from typing import Union

import docx # python-docx
import fitz # PyMuPDF
import pytesseract
from PIL import Image
Expand Down Expand Up @@ -91,3 +92,29 @@ def extract_text_from_image(path: Union[str, Path]) -> str:
except Exception as e:
logger.error(f"Eroare la extragerea textului din imagine ({file_path}): {e}")
return ""


def extract_text_from_docx(path: Union[str, Path]) -> str:
    """
    Extract the text of a Word (.docx) file using python-docx.

    Walks the document's paragraphs, drops the ones that are empty or
    whitespace-only, and joins the remaining ones with newlines.

    Args:
        path: Path to the .docx file.

    Returns:
        The extracted text as a string. An empty string is returned when the
        file does not exist or when reading it raises an exception.
    """
    file_path = Path(path)
    if not file_path.exists():
        logger.error(f"Fișierul Word nu a fost găsit: {file_path}")
        return ""

    try:
        document = docx.Document(str(file_path))
        kept_lines = []
        for paragraph in document.paragraphs:
            content = paragraph.text
            # Skip paragraphs that carry no visible text.
            if content.strip():
                kept_lines.append(content)
        return "\n".join(kept_lines).strip()
    except Exception as exc:
        # Best-effort extraction: log the failure and fall back to an empty string.
        logger.error(f"Eroare la extragerea textului din Word ({file_path}): {exc}")
        return ""
4 changes: 1 addition & 3 deletions core/quarantine_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,9 +179,7 @@ def remove(self, record_id: int) -> bool:
"""
conn = self._get_connection()
try:
cursor = conn.execute(
"DELETE FROM quarantine WHERE id = ?", (record_id,)
)
cursor = conn.execute("DELETE FROM quarantine WHERE id = ?", (record_id,))
conn.commit()
return cursor.rowcount > 0
finally:
Expand Down
8 changes: 7 additions & 1 deletion core/scan_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,11 @@
from ai.agent_compiler import CompilerAgent
from ai.agent_extractor import ExtractorAgent
from ai.agent_decider import DeciderAgent
from ai.tools import extract_text_from_pdf, extract_text_from_image
from ai.tools import (
extract_text_from_pdf,
extract_text_from_image,
extract_text_from_docx,
)
from core.file_manager import move_and_rename_file
from core.quarantine_db import quarantine_db

Expand Down Expand Up @@ -94,6 +98,8 @@ def run(self):
text = extract_text_from_pdf(file_path)
elif ext in [".png", ".jpg", ".jpeg", ".bmp", ".tiff"]:
text = extract_text_from_image(file_path)
elif ext == ".docx":
text = extract_text_from_docx(file_path)
elif ext in [".txt", ".csv", ".md"]:
text = file_path.read_text(errors="ignore")
else:
Expand Down
13 changes: 13 additions & 0 deletions pyrightconfig.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"pythonVersion": "3.13",
"pythonPlatform": "Windows",
"pythonPath": "C:\\Python313\\python.exe",
"venvPath": ".",
"include": [
"."
],
"extraPaths": [
"."
],
"typeCheckingMode": "basic"
}
Loading
Loading