Fix by linters

Ariel Vernaza · Ariel Vernaza · commit b61c0796fd32 · 2026-01-29T13:38:48.000+01:00
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -7,6 +7,29 @@ on:
     branches: [main, master]
 
 jobs:
+  lint:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v7
+        with:
+          enable-cache: true
+
+      - name: Set up Python
+        run: uv python install
+
+      - name: Install dependencies
+        run: uv sync
+
+      - name: Ruff check
+        run: uv run ruff check .
+
+      - name: Ruff format check
+        run: uv run ruff format --check .
+
   test:
     runs-on: ubuntu-latest
     steps:
diff --git a/README.md b/README.md
@@ -271,7 +271,20 @@ uv run pytest -v
 ```
 
 **4. CI (GitHub Actions)**  
-On every push and pull request to `main` or `master`, the workflow in `.github/workflows/tests.yml` runs `uv sync` and `uv run pytest -v`. No secrets required; embedding tests that need optional deps are skipped in CI.
+On every push and pull request to `main` or `master`, the workflow in `.github/workflows/tests.yml` runs **lint** (Ruff check + format check) and **tests** (`uv run pytest -v`). No secrets required; embedding tests that need optional deps are skipped in CI.
+
+**5. Linting (Ruff)**  
+Locally: `uv run ruff check .` and `uv run ruff format --check .`. To auto-fix: `uv run ruff check . --fix` and `uv run ruff format .`. Config is in `pyproject.toml` under `[tool.ruff]`.
+
+**6. Linting with Docker Compose**  
+Build the test image (which includes Ruff) and run the linter against the mounted code:
+
+```bash
+docker compose --profile lint build lint
+docker compose --profile lint run --rm lint
+```
+
+(With Podman: `podman compose --profile lint build lint` and `podman compose --profile lint run --rm lint`.) The `lint` service mounts `app/`, `tests/` and `pyproject.toml`, so there is no need to rebuild the image to lint new changes. To run the tests in the same container: `docker compose --profile lint run --rm lint uv run pytest -v`.
 
 ---
 
diff --git a/app/api/schemas.py b/app/api/schemas.py
@@ -36,7 +36,11 @@ class ChunkCreate(BaseModel):
     """
 
     text: str = Field(..., min_length=1, description="Chunk text content.")
-    embedding: list[float] = Field(..., min_length=1, description="Vector embedding (e.g. from Cohere or Sentence Transformers).")
+    embedding: list[float] = Field(
+        ...,
+        min_length=1,
+        description="Vector embedding (e.g. from Cohere or Sentence Transformers).",
+    )
     name: str | None = Field(default=None, description="Optional label for the chunk.")
 
 
@@ -60,7 +64,9 @@ class IndexRequest(BaseModel):
 class SearchRequest(BaseModel):
     """Request body for POST /libraries/{id}/search. Query vector and number of neighbors."""
 
-    embedding: list[float] = Field(..., min_length=1, description="Query vector (same dimension as indexed chunks).")
+    embedding: list[float] = Field(
+        ..., min_length=1, description="Query vector (same dimension as indexed chunks)."
+    )
     k: int = Field(..., ge=1, le=1000, description="Number of nearest neighbors to return.")
 
 
@@ -74,4 +80,6 @@ class SearchResultItem(BaseModel):
 class SearchResponse(BaseModel):
     """Response for POST /libraries/{id}/search. List of (chunk_id, distance) sorted by distance ascending."""
 
-    results: list[SearchResultItem] = Field(..., description="Up to k nearest chunks with their distances.")
+    results: list[SearchResultItem] = Field(
+        ..., description="Up to k nearest chunks with their distances."
+    )
diff --git a/app/core/concurrency.py b/app/core/concurrency.py
@@ -9,8 +9,8 @@
 from __future__ import annotations
 
 import threading
+from collections.abc import Generator
 from contextlib import contextmanager
-from typing import Generator
 
 
 class RWLock:
diff --git a/app/core/embedding/cohere.py b/app/core/embedding/cohere.py
@@ -7,9 +7,10 @@
 
 from __future__ import annotations
 
-import httpx
 from typing import Any
 
+import httpx
+
 # Default Cohere model dimensions (embed-english-v3.0, embed-multilingual-v3.0)
 COHERE_EMBED_DIM = 1024
 
@@ -64,4 +65,4 @@ def _embed(self, texts: list[str], input_type: str) -> list[list[float]]:
             r.raise_for_status()
             data: dict[str, Any] = r.json()
         embeddings = data.get("embeddings", [])
-        return [e for e in embeddings]
+        return list(embeddings)
diff --git a/app/core/embedding/image.py b/app/core/embedding/image.py
@@ -22,6 +22,7 @@ def _lazy_import() -> tuple[object, object, object, object]:
     import torch  # type: ignore[import-untyped]
     from PIL import Image  # type: ignore[import-untyped]
     from transformers import AutoModel, AutoProcessor  # type: ignore[import-untyped]
+
     return AutoModel, AutoProcessor, torch, Image
 
 
@@ -66,7 +67,9 @@ def dimension(self) -> int:
 
     def embed_documents(self, texts: list[str]) -> list[list[float]]:
         """Not supported: use a text embedder (Cohere or Transformer) for text chunks."""
-        raise NotImplementedError("ImageEmbedder only supports embed_images; use a text embedder for documents.")
+        raise NotImplementedError(
+            "ImageEmbedder only supports embed_images; use a text embedder for documents."
+        )
 
     def embed_queries(self, texts: list[str]) -> list[list[float]]:
         """Not supported: use embed_images for image query."""
@@ -76,8 +79,8 @@ def embed_images(self, images: list[bytes]) -> list[list[float]]:
         """Indexing/search: embed image chunks (raw bytes, e.g. from PDF image extraction). Each item is one image."""
         if not images:
             return []
-        from PIL import Image  # type: ignore[import-untyped]
         import torch  # type: ignore[import-untyped]
+        from PIL import Image  # type: ignore[import-untyped]
 
         pil_images = [Image.open(io.BytesIO(img)).convert("RGB") for img in images]
         processor, model, device = self._get_processor_and_model()
diff --git a/app/core/embedding/registry.py b/app/core/embedding/registry.py
@@ -18,8 +18,8 @@
 
 from app.core.embedding.base import Embedder, EmbedderKind
 from app.core.embedding.cohere import CohereEmbedder
-from app.core.embedding.transformer import TransformerEmbedder
 from app.core.embedding.image import ImageEmbedder
+from app.core.embedding.transformer import TransformerEmbedder
 
 
 def _parse_embedder_uri(uri: str) -> tuple[EmbedderKind, str | None]:
@@ -54,11 +54,15 @@ def get_embedder(
             raise ValueError("COHERE_API_KEY is required for cohere embedder")
         return CohereEmbedder(api_key=api_key)
     if kind == EmbedderKind.EMBEDDING_TRANSFORMER:
-        model_name = model or os.environ.get("EMBEDDING_TRANSFORMER_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
+        model_name = model or os.environ.get(
+            "EMBEDDING_TRANSFORMER_MODEL", "sentence-transformers/all-MiniLM-L6-v2"
+        )
         dev = device or os.environ.get("EMBEDDING_DEVICE")
         return TransformerEmbedder(model_name=model_name, device=dev)
     if kind == EmbedderKind.EMBEDDING_IMAGE:
-        model_name = model or os.environ.get("EMBEDDING_IMAGE_MODEL", "openai/clip-vit-base-patch32")
+        model_name = model or os.environ.get(
+            "EMBEDDING_IMAGE_MODEL", "openai/clip-vit-base-patch32"
+        )
         dev = device or os.environ.get("EMBEDDING_DEVICE")
         return ImageEmbedder(model_name=model_name, device=dev)
     raise ValueError(f"Unknown embedder URI: {uri}")
diff --git a/app/core/embedding/transformer.py b/app/core/embedding/transformer.py
@@ -20,6 +20,7 @@
 
 def _lazy_import() -> tuple[object, object]:
     import sentence_transformers  # type: ignore[import-untyped]
+
     return sentence_transformers.SentenceTransformer, sentence_transformers  # type: ignore[attr-defined,no-any-return]
 
 
diff --git a/app/indexes/ivf.py b/app/indexes/ivf.py
@@ -93,7 +93,9 @@ def build(
         for i, label in enumerate(labels):
             cluster_ids[label].append(ids[i])
             cluster_vectors[label].append(vectors[i])
-        cluster_vectors_arr = [np.array(cv, dtype=np.float64) if cv else np.empty((0, d)) for cv in cluster_vectors]
+        cluster_vectors_arr = [
+            np.array(cv, dtype=np.float64) if cv else np.empty((0, d)) for cv in cluster_vectors
+        ]
         return cls(cluster_ids, cluster_vectors_arr, centroids, n_probe)
 
     def search(self, query: list[float], k: int) -> SearchResult:
diff --git a/app/indexes/kd_tree.py b/app/indexes/kd_tree.py
@@ -19,13 +19,15 @@
 @dataclass
 class _Leaf:
     """Leaf node: holds a single point (id, vector)."""
+
     id: UUID
     vector: np.ndarray
 
 
 @dataclass
 class _Internal:
     """Internal node: split dimension and value, left and right subtrees."""
+
     split_dim: int
     split_value: float
     left: _Leaf | _Internal
diff --git a/app/models/schemas.py b/app/models/schemas.py
@@ -9,15 +9,15 @@
 
 from __future__ import annotations
 
-from datetime import datetime, timezone
+from datetime import UTC, datetime
 from uuid import UUID, uuid4
 
 from pydantic import BaseModel, Field
 
 
 def _utc_now() -> datetime:
     """Return current UTC time. Used as default for created_at on all entities."""
-    return datetime.now(timezone.utc)
+    return datetime.now(UTC)
 
 
 class Chunk(BaseModel):
diff --git a/app/repositories/chunk_repository.py b/app/repositories/chunk_repository.py
@@ -31,9 +31,7 @@ def get(self, id: UUID) -> Chunk | None:
     def list_by_document(self, document_id: UUID) -> list[Chunk]:
         """Return all chunks for a document."""
         return [
-            c.model_copy(deep=True)
-            for c in self._store.values()
-            if c.document_id == document_id
+            c.model_copy(deep=True) for c in self._store.values() if c.document_id == document_id
         ]
 
     def list_all(self) -> list[Chunk]:
diff --git a/app/services/chunk_service.py b/app/services/chunk_service.py
@@ -9,8 +9,8 @@
 
 from uuid import UUID
 
-from app.models import Chunk, Document
-from app.repositories import ChunkRepository, DocumentRepository, LibraryRepository
+from app.models import Chunk
+from app.repositories import ChunkRepository, DocumentRepository
 from app.services.exceptions import ChunkNotFoundError, DocumentNotFoundError
 
 
diff --git a/app/services/document_service.py b/app/services/document_service.py
@@ -9,7 +9,7 @@
 
 from uuid import UUID
 
-from app.models import Document, Library
+from app.models import Document
 from app.repositories import ChunkRepository, DocumentRepository, LibraryRepository
 from app.services.exceptions import DocumentNotFoundError, LibraryNotFoundError
 
diff --git a/app/services/search_service.py b/app/services/search_service.py
@@ -12,7 +12,6 @@
 from app.repositories import LibraryRepository
 from app.services.exceptions import IndexNotBuiltError, LibraryNotFoundError
 
-
 # Result: list of (chunk_id, distance) sorted by distance ascending.
 SearchResult = list[tuple[UUID, float]]
 
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -18,3 +18,22 @@ services:
       timeout: 5s
       retries: 3
       start_period: 5s
+
+  # Lint (Ruff) and tests using the test image. Mounts the source so lint and tests run against the current code.
+  # Build: docker compose build lint   OR  podman compose build lint
+  # Lint:  docker compose run --rm lint   OR  podman compose run --rm lint
+  # Tests: docker compose run --rm lint uv run pytest -v
+  lint:
+    build:
+      context: .
+      dockerfile: Dockerfile
+      target: test
+    image: vector-db-api-test:latest
+    volumes:
+      - ./app:/app/app:ro
+      - ./tests:/app/tests:ro
+      - ./pyproject.toml:/app/pyproject.toml:ro
+    working_dir: /app
+    command: ["sh", "-c", "uv run ruff check . && uv run ruff format --check ."]
+    profiles:
+      - lint
diff --git a/pyproject.toml b/pyproject.toml
@@ -27,8 +27,36 @@ packages = ["app"]
 dev = [
     "pytest>=8.0.0",
     "pytest-cov>=4.0.0",
+    "ruff>=0.8.0",
 ]
 
 [tool.pytest.ini_options]
 testpaths = ["tests"]
 pythonpath = ["."]
+
+[tool.ruff]
+target-version = "py311"
+line-length = 100
+src = ["app", "tests"]
+exclude = [".venv", ".git", "__pycache__", "build", "dist"]
+
+[tool.ruff.lint]
+select = [
+    "E",      # pycodestyle errors
+    "W",      # pycodestyle warnings
+    "F",      # Pyflakes
+    "I",      # isort
+    "B",      # flake8-bugbear
+    "C4",     # flake8-comprehensions
+    "UP",     # pyupgrade
+]
+ignore = [
+    "E501",   # line too long (handled by formatter)
+    "B008",   # do not perform function calls in argument defaults
+]
+
+[tool.ruff.format]
+quote-style = "double"
+indent-style = "space"
+skip-magic-trailing-comma = false
+line-ending = "auto"
diff --git a/tests/test_api.py b/tests/test_api.py
@@ -9,10 +9,8 @@
 
 import uuid
 
-import pytest
-from fastapi.testclient import TestClient
-
 from app.main import app
+from fastapi.testclient import TestClient
 
 client = TestClient(app)
 
@@ -37,7 +35,7 @@ def test_libraries_crud() -> None:
     assert r.status_code == 200
     libs = r.json()
     assert len(libs) >= 1
-    assert any(l["id"] == lib_id for l in libs)
+    assert any(lib_item["id"] == lib_id for lib_item in libs)
 
     r = client.get(f"/libraries/{lib_id}")
     assert r.status_code == 200
diff --git a/tests/test_concurrency.py b/tests/test_concurrency.py
@@ -5,8 +5,6 @@
 import threading
 import time
 
-import pytest
-
 from app.core.concurrency import RWLock
 
 
diff --git a/tests/test_config.py b/tests/test_config.py
@@ -3,7 +3,6 @@
 from __future__ import annotations
 
 import pytest
-
 from app.core.config import get_cohere_api_key, get_embedding_dimension
 
 
diff --git a/tests/test_embedding.py b/tests/test_embedding.py
@@ -6,13 +6,10 @@
 from unittest.mock import MagicMock, patch
 
 import pytest
-
-from app.core.embedding.base import EmbedderKind
 from app.core.embedding.cohere import COHERE_EMBED_DIM, CohereEmbedder
+from app.core.embedding.image import ImageEmbedder
 from app.core.embedding.registry import get_embedder
 from app.core.embedding.transformer import TransformerEmbedder
-from app.core.embedding.image import ImageEmbedder
-
 
 # --- Registry: URI parsing and get_embedder ---
 
@@ -33,7 +30,9 @@ def test_get_embedder_cohere_with_key_returns_cohere_embedder() -> None:
 def test_get_embedder_transformer_uri_returns_transformer_embedder() -> None:
     """get_embedder('embedding_transformer://...') returns TransformerEmbedder (skip if not installed)."""
     pytest.importorskip("sentence_transformers")
-    emb = get_embedder("embedding_transformer://sentence-transformers/all-MiniLM-L6-v2", device="cpu")
+    emb = get_embedder(
+        "embedding_transformer://sentence-transformers/all-MiniLM-L6-v2", device="cpu"
+    )
     assert isinstance(emb, TransformerEmbedder)
     assert emb.dimension == 384
 
diff --git a/tests/test_indexes.py b/tests/test_indexes.py
@@ -6,11 +6,10 @@
 
 import numpy as np
 import pytest
-
 from app.indexes import (
     BruteForceIndex,
-    IVFIndex,
     IndexRegistry,
+    IVFIndex,
     KDTreeIndex,
 )
 from app.indexes.base import IndexEntry
diff --git a/tests/test_models.py b/tests/test_models.py
diff --git a/tests/test_repositories.py b/tests/test_repositories.py
diff --git a/tests/test_services.py b/tests/test_services.py
diff --git a/uv.lock b/uv.lock