Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions backend/src/cms_backend/db/book.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,8 +102,11 @@ def get_next_book_to_move_files_or_none(
) -> Book | None:
return session.scalars(
select(Book)
.where(Book.needs_file_operation.is_(True))
.where(Book.has_error.is_(False))
.where(
Book.needs_file_operation.is_(True),
Book.has_error.is_(False),
Book.location_kind.not_in(["to_delete", "deleted"]),
)
.order_by(Book.created_at)
.limit(1)
).one_or_none()
Expand Down
1 change: 1 addition & 0 deletions backend/src/cms_backend/db/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ class Book(Base):
location_kind: Mapped[str] = mapped_column(
init=False, default="quarantine", server_default="quarantine"
)
deletion_date: Mapped[datetime | None] = mapped_column(default=None, init=False)
events: Mapped[list[str]] = mapped_column(init=False, default_factory=list)

title_id: Mapped[UUID | None] = mapped_column(
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
"""add deletion_date to book

Revision ID: a5f67b148119
Revises: df6a64dec5a0
Create Date: 2026-02-09 13:41:08.318866

"""

import sqlalchemy as sa
from alembic import op

# revision identifiers, used by Alembic.
revision = "a5f67b148119"
down_revision = "df6a64dec5a0"
branch_labels = None
depends_on = None


def upgrade():
    """Add the nullable `deletion_date` DateTime column to the `book` table."""
    deletion_date = sa.Column("deletion_date", sa.DateTime(), nullable=True)
    op.add_column("book", deletion_date)


def downgrade():
    """Drop the `deletion_date` column from the `book` table."""
    table, column = "book", "deletion_date"
    op.drop_column(table, column)
4 changes: 4 additions & 0 deletions backend/src/cms_backend/mill/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,3 +41,7 @@ class Context:
default=UUID(get_mandatory_env("STAGING_WAREHOUSE_ID"))
)
staging_base_path: Path = field(default=Path(os.getenv("STAGING_BASE_PATH", "")))

old_book_deletion_delay: timedelta = timedelta(
seconds=parse_timespan(os.getenv("OLD_BOOK_DELETION_DELAY", default="1d"))
)
94 changes: 93 additions & 1 deletion backend/src/cms_backend/mill/processors/title.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import datetime
from collections import defaultdict
from dataclasses import dataclass
from pathlib import Path
from uuid import UUID
Expand All @@ -9,7 +11,11 @@
from cms_backend.db.models import Book, Title
from cms_backend.mill.context import Context as MillContext
from cms_backend.utils.datetime import getnow
from cms_backend.utils.filename import compute_target_filename
from cms_backend.utils.filename import (
PERIOD_LENGTH,
compute_target_filename,
get_period_and_suffix_from_filename,
)


@dataclass(eq=True, frozen=True)
Expand All @@ -19,6 +25,71 @@ class FileLocation:
filename: str


def apply_retention_rules(session: OrmSession, title: Title):
    """Flag outdated `prod` books of a title for deferred deletion.

    Retention policy reference: https://wiki.openzim.org/wiki/ZIM_Updates

    Only books that are in `prod`, have no error, are not awaiting a file
    operation and were created at least 30 days ago are candidates; every other
    book is left untouched. Candidates without a `date` are ignored as well.

    Candidates are then processed per flavour (a missing flavour is treated as
    the empty string):
    - in each of the two most recent periods found among the candidates, the
      highest-ranked book (e.g. `2024-06b` over `2024-06a` and `2024-06`) is
      kept and the other ones are flagged;
    - every candidate belonging to an older period is flagged.

    A flagged book gets `location_kind = "to_delete"`, a `deletion_date` set to
    now plus `MillContext.old_book_deletion_delay`, and
    `needs_file_operation = True`. An event is appended to both the book and the
    title, and the session is flushed at the end.
    """
    now = getnow()
    age_cutoff = now - datetime.timedelta(days=30)

    def is_candidate(book: Book) -> bool:
        # Books created after the cutoff are never considered for deletion.
        return (
            book.location_kind == "prod"
            and not book.has_error
            and book.created_at <= age_cutoff
            and book.needs_file_operation is False
        )

    candidates_by_flavour: dict[str, list[Book]] = defaultdict(list)
    for book in title.books:
        if is_candidate(book):
            candidates_by_flavour[book.flavour or ""].append(book)

    expired: list[Book] = []

    for flavour_books in candidates_by_flavour.values():
        # Bucket by period, i.e. the date stripped of its optional suffix
        per_period: dict[str, list[Book]] = defaultdict(list)
        for book in flavour_books:
            if book.date:
                per_period[book.date[:PERIOD_LENGTH]].append(book)

        periods_newest_first = sorted(per_period, reverse=True)
        recent_periods = periods_newest_first[:2]
        stale_periods = periods_newest_first[2:]

        # Two most recent periods: keep only the top-ranked book of each
        for period in recent_periods:
            ranked = sort_books_by_filename_period(per_period[period])
            expired.extend(ranked[1:])

        # Older periods: nothing is kept
        for period in stale_periods:
            expired.extend(per_period[period])

    deletion_date = now + MillContext.old_book_deletion_delay

    for book in expired:
        logger.info(
            f"Marking book {book.id} for deletion, deletion_date={deletion_date}"
        )
        book.location_kind = "to_delete"
        book.deletion_date = deletion_date
        book.needs_file_operation = True
        book.events.append(
            f"{now}: marked for deletion due to retention policy, "
            f"will be deleted after {deletion_date}"
        )
        title.events.append(f"{now}: book {book.id} marked for deletion.")
        session.add(book)
        session.add(title)

    session.flush()


def add_book_to_title(session: OrmSession, book: Book, title: Title):
try:
# Retrieve name from book.name directly
Expand Down Expand Up @@ -76,6 +147,9 @@ def add_book_to_title(session: OrmSession, book: Book, title: Title):
)
book.location_kind = "staging" if goes_to_staging else "prod"

if not goes_to_staging:
apply_retention_rules(session, title)

except Exception as exc:
book.events.append(
f"{getnow()}: error encountered while adding to title {title.id}\n{exc}"
Expand Down Expand Up @@ -167,3 +241,21 @@ def create_book_target_locations(
)

book.needs_file_operation = True


def sort_books_by_filename_period(books: list[Book]) -> list[Book]:
    """Return a new list of the books, highest filename period first.

    The period and suffix are read from the filename of each book's first
    location, so every book is assumed to have at least one location. Any
    error raised by `get_period_and_suffix_from_filename` is propagated.
    """

    def ranking(book: Book) -> tuple[str, int, str]:
        filename = book.locations[0].filename
        period, suffix = get_period_and_suffix_from_filename(filename)
        # Suffix length is compared before the suffix itself so that a longer
        # suffix always outranks a shorter one (e.g. "aa" ranks above "z").
        return period, len(suffix), suffix

    return sorted(books, key=ranking, reverse=True)
4 changes: 4 additions & 0 deletions backend/src/cms_backend/shuttle/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,4 +39,8 @@ class Context:
seconds=parse_timespan(os.getenv("MOVE_FILES_INTERVAL", default="1m"))
)

delete_files_interval: timedelta = timedelta(
seconds=parse_timespan(os.getenv("DELETE_FILES_INTERVAL", default="1h"))
)

local_warehouse_paths: ClassVar[dict[UUID, Path]] = _parse_local_warehouse_paths()
96 changes: 96 additions & 0 deletions backend/src/cms_backend/shuttle/delete_files.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import datetime

from sqlalchemy import select
from sqlalchemy.orm import Session as OrmSession

from cms_backend import logger
from cms_backend.db.models import Book
from cms_backend.shuttle.context import Context as ShuttleContext
from cms_backend.utils.datetime import getnow


def delete_files(session: OrmSession):
    """Delete books from filesystem that have passed their deletion_date.

    Finds books with location_kind='to_delete' and deletion_date <= now,
    deletes their files from filesystem, and marks them as 'deleted'.

    Each book is processed inside its own `session.begin_nested()` block. A
    book whose deletion raises is flagged with `has_error` and an event. A book
    that `delete_book_files` leaves untouched (it returns without changing the
    book when one of its warehouses is not locally accessible) is remembered
    and excluded from the following lookups of this run; otherwise the very
    same book would be selected again forever. Such books are picked up again
    on the next run.
    """
    now = getnow()
    nb_zim_files_deleted = 0
    # Ids of books examined during this run that did not end up deleted.
    skipped_book_ids: set = set()

    while True:
        with session.begin_nested():
            book = get_next_book_to_delete(
                session, now, skipped_book_ids=skipped_book_ids
            )
            if not book:
                break

            deleted = False
            try:
                logger.debug(f"Deleting files for book {book.id}")
                delete_book_files(session, book)
                # delete_book_files only switches the kind to "deleted" once
                # all files are gone; anything else means the book was skipped.
                deleted = book.location_kind == "deleted"
            except Exception as exc:
                book.events.append(
                    f"{getnow()}: error encountered while deleting files\n{exc}"
                )
                logger.exception(f"Failed to delete files for book {book.id}")
                book.has_error = True

            if deleted:
                nb_zim_files_deleted += 1
            else:
                # Never look at this book again during this run, whatever the
                # reason it was not deleted (skipped or failed).
                skipped_book_ids.add(book.id)

    logger.info(f"Done deleting {nb_zim_files_deleted} ZIM files")


def get_next_book_to_delete(
    session: OrmSession,
    now: datetime.datetime,
    skipped_book_ids: set | None = None,
) -> Book | None:
    """Get the next book that needs deletion.

    A book qualifies when it is marked `to_delete`, its `deletion_date` is not
    later than `now`, it has no error and it still needs a file operation. The
    qualifying book with the earliest `deletion_date` is returned, or None when
    there is none.

    `skipped_book_ids` optionally lists ids of books that must not be
    returned; when empty or None no exclusion is applied.
    """
    stmt = select(Book).where(
        Book.location_kind == "to_delete",
        Book.deletion_date <= now,
        Book.has_error.is_(False),
        Book.needs_file_operation.is_(True),
    )
    if skipped_book_ids:
        stmt = stmt.where(Book.id.not_in(list(skipped_book_ids)))

    return session.scalars(
        stmt.order_by(Book.deletion_date).limit(1)
    ).one_or_none()


def delete_book_files(session: OrmSession, book: Book):
    """Remove the files of a book's `current` locations and mark it deleted.

    Nothing is changed (and the function simply returns) when at least one of
    the book's locations sits in a warehouse that has no entry in
    `ShuttleContext.local_warehouse_paths`.

    Otherwise each location whose status is `current` has its file unlinked (a
    missing file is not an error) and its row deleted from the session. The
    book is then switched to `location_kind = "deleted"`, its
    `needs_file_operation` flag is cleared and the session is flushed.

    Any exception raised while handling a location is logged and re-raised.
    """
    local_paths = ShuttleContext.local_warehouse_paths
    inaccessible_warehouse_names = {
        location.warehouse.name
        for location in book.locations
        if location.warehouse_id not in local_paths
    }

    # All warehouses must be reachable from here, otherwise leave the book as is
    if inaccessible_warehouse_names:
        logger.debug(
            f"Cannot delete book {book.id}, no access to "
            f"{','.join(inaccessible_warehouse_names)} warehouses"
        )
        return

    for location in book.locations:
        # Only files at their current location are removed
        if location.status != "current":
            continue
        try:
            file_path = location.full_local_path(local_paths)
            file_path.unlink(missing_ok=True)
            logger.info(f"Deleted file for book {book.id} at {file_path}")
            book.events.append(f"{getnow()}: deleted file at {location.full_str}")
            session.delete(location)
        except Exception:
            logger.exception(
                f"Failed to delete file at {location.full_str} for book {book.id}"
            )
            raise

    # Every current file is gone: record the final state of the book
    book.location_kind = "deleted"
    book.needs_file_operation = False
    book.events.append(f"{getnow()}: all files deleted, book marked as deleted")
    session.add(book)
    session.flush()

    logger.info(f"Book {book.id} files have been deleted")
5 changes: 5 additions & 0 deletions backend/src/cms_backend/shuttle/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from cms_backend.context import Context
from cms_backend.db import Session
from cms_backend.shuttle.context import Context as ShuttleContext
from cms_backend.shuttle.delete_files import delete_files
from cms_backend.shuttle.move_files import move_files
from cms_backend.utils.database import upgrade_db_schema
from cms_backend.utils.datetime import getnow
Expand All @@ -20,6 +21,10 @@
func=move_files,
interval=ShuttleContext.move_files_interval,
),
TaskConfig(
func=delete_files,
interval=ShuttleContext.delete_files_interval,
),
]


Expand Down
13 changes: 13 additions & 0 deletions backend/src/cms_backend/utils/filename.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Utilities for computing and managing book target filenames."""

import re
from uuid import UUID

from sqlalchemy import select
Expand All @@ -8,6 +9,9 @@
from cms_backend.db.models import BookLocation

# Number of characters in a period string; the pattern below captures the
# period as four digits, a dash and two digits.
PERIOD_LENGTH = 7
# Captures the `period` (four digits, dash, two digits) and the optional
# lowercase-letter `suffix` located between the last matching underscore and
# the `.zim` extension of a filename. The pattern has no end anchor, so with
# `.match()` any text following `.zim` is not rejected.
FILENAME_PERIOD_SUFFIX_PATTERN = re.compile(
r".*_(?P<period>\d{4}-\d{2})(?P<suffix>[a-z]*)\.zim"
)


def get_next_suffix(current_suffix: str) -> str:
Expand Down Expand Up @@ -162,3 +166,12 @@ def compute_target_filename(
next_suffix = get_next_suffix(last_suffix)

return f"{base_pattern}{next_suffix}.zim"


def get_period_and_suffix_from_filename(filename: str) -> tuple[str, str]:
    """Extract the period and its version suffix from a ZIM filename.

    For instance `wikipedia_en_all_2024-06b.zim` gives `("2024-06", "b")`; the
    suffix is an empty string when the filename has none.

    Raises:
        ValueError: if the filename does not match
            `FILENAME_PERIOD_SUFFIX_PATTERN`.
    """
    if (found := FILENAME_PERIOD_SUFFIX_PATTERN.match(filename)) is None:
        raise ValueError("Unable to retrieve period from filename")
    return found.group("period"), found.group("suffix")
Loading