Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion codex/urls/api/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from codex.views.reader.page import ReaderPageView
from codex.views.reader.reader import ReaderView
from codex.views.reader.settings import ReaderSettingsView
from codex.views.util import cache_control_2xx

app_name = "issue"
urlpatterns = [
Expand All @@ -28,7 +29,12 @@
),
path(
"<int:pk>/<int:page>/page.jpg",
cache_control(max_age=PAGE_MAX_AGE, public=True)(ReaderPageView.as_view()),
# ``cache_control_2xx`` (not Django's ``cache_control``) — error
# responses from this endpoint must NOT be cached. The view can
# return 404 from DRF content negotiation, ACL filter misses,
# missing-file errors, etc; with ``public, max-age=PAGE_MAX_AGE``
# the browser would pin a transient failure for a week.
cache_control_2xx(max_age=PAGE_MAX_AGE, public=True)(ReaderPageView.as_view()),
name="page",
),
path(
Expand Down
37 changes: 35 additions & 2 deletions codex/views/reader/_archive_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
import time
from collections import OrderedDict
from contextlib import contextmanager
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Any

from comicbox.box import Comicbox
from loguru import logger
Expand Down Expand Up @@ -79,13 +79,27 @@ def _env_bool(name: str, *, default: bool) -> bool:
class _ArchiveEntry:
"""One cached Comicbox + its per-path lock + last-access timestamp."""

__slots__ = ("comicbox", "last_access", "lock", "path")
__slots__ = (
"comicbox",
"last_access",
"lock",
"path",
"verdicts",
)

def __init__(self, path: str, comicbox: Comicbox, last_access: float) -> None:
    """
    Bind the open archive and initialise the per-entry bookkeeping.

    ``path`` and ``comicbox`` identify the cached archive,
    ``last_access`` is the caller-supplied access timestamp. A fresh
    ``threading.Lock`` and an empty verdict memo are created for
    every entry.
    """
    self.path, self.comicbox = path, comicbox
    self.last_access = last_access
    self.lock = threading.Lock()
    # Memo of per-page image-serve verdicts (``pdffile.PageVerdict``
    # objects, keyed on the zero-based page index). Keeping it on the
    # entry means repeated ``ReaderPageView`` requests against the same
    # archive skip re-classification, so prev/curr/next prefetch costs
    # next to nothing even though ``classify_page`` itself is cheap.
    # The value type is deliberately ``Any`` so this module does not
    # have to import ``pdffile``.
    self.verdicts: dict[int, Any] = {}

def close(self) -> None:
"""Close the cached archive; tolerate already-closed state."""
Expand Down Expand Up @@ -178,6 +192,25 @@ def open(self, path: str) -> Generator[Comicbox]:
with entry.lock:
yield entry.comicbox

@contextmanager
def open_entry(self, path: str) -> Generator[_ArchiveEntry]:
    """
    Yield the whole ``_ArchiveEntry`` for ``path``, not just its ``Comicbox``.

    Intended for callers that need both the archive handle and the
    per-page verdict memo (the image-serve fast path in
    ``ReaderPageView``). With the cache enabled the entry's lock is
    held for the duration of the ``with`` body, exactly as ``open()``
    does. With the cache disabled a throwaway entry wrapping a freshly
    opened ``Comicbox`` is yielded so the caller sees the same API;
    that entry is never stored, so anything written to its
    ``verdicts`` is lost once the context exits.
    """
    if self.enabled:
        cached_entry = self._open_or_get(path)
        with cached_entry.lock:
            yield cached_entry
        return
    # Cache disabled: open the archive just for this call and wrap it
    # in a transient entry. Its lock is created but not acquired here.
    with Comicbox(path, config=COMICBOX_CONFIG, logger=logger) as comicbox:
        yield _ArchiveEntry(path, comicbox, time.monotonic())

def shutdown(self) -> None:
"""Close every cached archive. Wired to ``atexit`` at module load."""
with self._struct_lock:
Expand Down
130 changes: 107 additions & 23 deletions codex/views/reader/page.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,47 @@
"""Views for reading comic books."""

from __future__ import annotations

import time
from typing import TYPE_CHECKING, Final

from django.http import HttpResponse
from drf_spectacular.types import OpenApiTypes
from drf_spectacular.utils import OpenApiParameter, extend_schema
from loguru import logger
from pdffile import PageFormat
from pdffile import PageMode, PDFFile
from rest_framework.exceptions import NotFound

from codex.librarian.bookmark.tasks import BookmarkUpdateTask
from codex.librarian.mp_queue import LIBRARIAN_QUEUE
from codex.models.choices import FileTypeChoices
from codex.models.comic import Comic
from codex.settings import FALSY
from codex.views.auth import AuthFilterAPIView
from codex.views.bookmark import BookmarkAuthMixin
from codex.views.reader._archive_cache import archive_cache, page_acl_cache

_PDF_MIME_TYPE = "application/pdf"
_PDF_FORMAT_NON_PDF_TYPES = frozenset(
{e.value for e in (PageFormat.PIXMAP, PageFormat.IMAGE)}
if TYPE_CHECKING:
from pdffile import PageVerdict

from codex.views.reader._archive_cache import _ArchiveEntry

_PDF_MIME_TYPE: Final[str] = "application/pdf"

#: Query-parameter name that selects the PDF serving mode. Picked to
#: avoid colliding with DRF's reserved ``?format=`` (``URL_FORMAT_OVERRIDE``)
#: which DRF interprets as a renderer-format selector — it raises
#: ``NotFound`` for unknown values *before* the view's ``get`` runs.
_SERVE_PARAM: Final[str] = "serve"

#: Permitted ``?serve=`` values. ``auto`` runs the detector; ``pdf``
#: skips the detector and forces the legacy single-page-PDF path;
#: ``image`` forces a server-side rasterize (works for any PDF page
#: but spends more CPU than the detector path on vector-heavy pages).
_SERVE_AUTO: Final[str] = "auto"
_SERVE_PDF: Final[str] = "pdf"
_SERVE_IMAGE: Final[str] = "image"
_SERVE_HINTS: Final[frozenset[str]] = frozenset(
{_SERVE_AUTO, _SERVE_PDF, _SERVE_IMAGE}
)


Expand Down Expand Up @@ -76,18 +97,77 @@ def _resolve_path_and_type(self, pk) -> tuple[str, str | None]:
page_acl_cache.put(cache_key, path, file_type, now)
return path, file_type

def _get_page_image(self) -> tuple:
def _serve_hint(self) -> str:
    """
    Return the normalised PDF serving mode requested by the client.

    Reads the ``_SERVE_PARAM`` query parameter, lower-cases it, and
    falls back to ``_SERVE_AUTO`` when the parameter is absent or is
    not one of ``_SERVE_HINTS``.
    """
    requested = self.request.GET.get(_SERVE_PARAM, _SERVE_AUTO).lower()
    if requested in _SERVE_HINTS:
        return requested
    return _SERVE_AUTO

@staticmethod
def _classify_cached(entry: _ArchiveEntry, pdf: PDFFile, page: int) -> PageVerdict:
"""Memoize ``pdf.classify_page`` on the cache entry."""
cached = entry.verdicts.get(page)
if cached is not None:
return cached
verdict = pdf.classify_page(page)
entry.verdicts[page] = verdict
return verdict

def _try_pdf_image_serve(
    self, path: str, page: int, serve_hint: str
) -> tuple[bytes, str] | None:
    """
    Try to serve a PDF page as a plain image instead of a one-page PDF.

    Returns ``(bytes, content_type)`` when an image could be produced,
    either because the caller forced ``image`` or because the detector
    accepted the page and extraction succeeded. Returns ``None`` when
    the caller should fall back to the legacy single-page-PDF path:
    the archive is not a ``PDFFile``, the verdict is
    ``PageMode.PDF_FALLBACK``, or extraction yielded nothing.
    """
    with archive_cache.open_entry(path) as entry:
        # ``Comicbox._get_archive`` hands back the underlying archive
        # object (zip / rar / 7z / tar / pdf). The caller only gets here
        # for ``file_type == PDF``, so a ``PDFFile`` is expected; the
        # ``isinstance`` check narrows the type for the checker. This is
        # private comicbox API for now, to be replaced by a public
        # getter once upstream provides one.
        archive = entry.comicbox._get_archive()  # noqa: SLF001
        if not isinstance(archive, PDFFile):
            return None
        index = int(page)

        if serve_hint == _SERVE_IMAGE:
            # Forced image mode: rasterize the whole page, which also
            # covers vector pages the detector would reject.
            extracted = archive.read_full_pixmap_jpeg(index)
        else:
            verdict = self._classify_cached(entry, archive, index)
            if verdict.mode is PageMode.PDF_FALLBACK:
                return None
            extracted = archive.read_image_if_dominant(index)
            if extracted is None:
                # The detector accepted the page but extraction came
                # back empty; prefer the PDF path over a broken image.
                return None

        data, ext = extracted
        return data, f"image/{ext}"

def _get_page_image(self) -> tuple[bytes, str]:
"""Get the image data and content type."""
pk = self.kwargs.get("pk")
path, file_type = self._resolve_path_and_type(pk)

# page_image
page = self.kwargs.get("page")
pdf_format = (
PageFormat.PIXMAP.value
if self.request.GET.get("pixmap", "").lower() not in FALSY
else ""
)
serve_hint = self._serve_hint()

is_pdf = file_type == FileTypeChoices.PDF.value # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]

# Image-dominant fast path for PDFs (skipped when the caller
# forces ``?serve=pdf``).
if is_pdf and serve_hint != _SERVE_PDF:
served = self._try_pdf_image_serve(path, page, serve_hint)
if served is not None:
return served

# Process-wide LRU of open Comicbox archives — the web reader's
# prev/curr/next prefetch fires 3-5 page hits on the same archive
# within a second, and ``cacheBook`` mode bursts a whole-book
Expand All @@ -96,28 +176,32 @@ def _get_page_image(self) -> tuple:
# held inside ``archive_cache.open(...)`` serializes extraction
# because ZipFile / RarFile / PDF backends aren't thread-safe.
with archive_cache.open(path) as cb:
page_image = cb.get_page_by_index(page, pdf_format=pdf_format)
page_image = cb.get_page_by_index(page, pdf_format="")
if not page_image:
page_image = b""

# content type
if (
file_type == FileTypeChoices.PDF.value # pyright: ignore[reportAttributeAccessIssue], # ty: ignore[unresolved-attribute]
and pdf_format not in _PDF_FORMAT_NON_PDF_TYPES
):
content_type = _PDF_MIME_TYPE
else:
content_type = self.content_type

content_type = _PDF_MIME_TYPE if is_pdf else self.content_type
return page_image, content_type

@extend_schema(
parameters=[
OpenApiParameter("bookmark", OpenApiTypes.BOOL, default=True),
OpenApiParameter("pixmap", OpenApiTypes.BOOL, default=False),
OpenApiParameter(
_SERVE_PARAM,
OpenApiTypes.STR,
default=_SERVE_AUTO,
enum=sorted(_SERVE_HINTS),
description=(
"PDF serving mode: 'auto' (detector), "
"'pdf' (legacy single-page PDF), "
"'image' (always rasterize). Ignored for non-PDF archives."
),
),
],
responses={
(200, content_type): OpenApiTypes.BINARY,
(200, "image/png"): OpenApiTypes.BINARY,
(200, "image/webp"): OpenApiTypes.BINARY,
(200, _PDF_MIME_TYPE): OpenApiTypes.BINARY,
},
)
Expand Down
34 changes: 34 additions & 0 deletions codex/views/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,42 @@

from collections.abc import Mapping
from dataclasses import dataclass
from functools import wraps
from typing import override

from django.utils.cache import patch_cache_control


def cache_control_2xx(**kwargs):
    """
    Build a view decorator that sets ``Cache-Control`` on 2xx responses only.

    This mirrors ``django.views.decorators.cache.cache_control`` except
    that the header is applied solely to success responses.

    Django's ``cache_control`` stamps the header on every response,
    4xx and 5xx included. Combined with ``public, max-age=<long>`` a
    transient failure (missing file, ACL miss, one-off backend hiccup)
    becomes a week-long cache poison: a browser that saw the error
    keeps replaying it from cache and never asks the server again.
    With this wrapper only success responses are marked cacheable;
    errors keep whatever headers DRF / Django already gave them
    (typically uncached).

    ``kwargs`` are forwarded unchanged to ``patch_cache_control``.
    """

    def _decorate(viewfunc):
        @wraps(viewfunc)
        def _guarded(request, *args, **kw):
            response = viewfunc(request, *args, **kw)
            # Objects without ``status_code`` count as status 0, i.e.
            # not a success, and are passed through untouched.
            status = getattr(response, "status_code", 0)
            is_success = 200 <= status < 300  # noqa: PLR2004
            if is_success:
                patch_cache_control(response, **kwargs)
            return response

        return _guarded

    return _decorate


@dataclass
class Route:
Expand Down
23 changes: 19 additions & 4 deletions frontend/src/api/v3/reader.js
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,24 @@ export const getReaderInfo = (pk, data, ts, options = {}) => {
const _getReaderAPIPath = (pk) =>
globalThis.CODEX.API_V3_PATH + _getBookPath(pk);

export const getComicPageSource = ({ pk, page, mtime }) => {
export const getComicPageSource = ({ pk, page, mtime, serve }) => {
// ``serve`` is the optional PDF serving-mode hint forwarded to the
// backend ``ReaderPageView``: ``auto`` (detector decides),
// ``image`` (always rasterize), or ``pdf`` (skip the detector and
// serve a single-page PDF blob). Ignored by the backend for
// non-PDF archives. Omitting the param keeps the URL identical to
// the legacy shape so HTTP caches don't fragment.
//
// The query name is ``serve`` rather than ``format`` because DRF
// reserves ``?format=`` (URL_FORMAT_OVERRIDE) as a renderer-format
// selector and raises NotFound for unknown values before the view
// dispatches.
const bookAPIPath = _getReaderAPIPath(pk);
return `${bookAPIPath}/${page}/page.jpg?ts=${mtime}`;
let url = `${bookAPIPath}/${page}/page.jpg?ts=${mtime}`;
if (serve && serve !== "auto") {
url += `&serve=${serve}`;
}
return url;
};

export const getComicDownloadURL = ({ pk }, fn, ts) => {
Expand All @@ -45,8 +60,8 @@ export const getDownloadPageURL = ({ pk, page, mtime }) => {
};

export const getPDFInBrowserURL = ({ pk, mtime }) => {
// Consumed by ``<embed src=...>``, not ``HTTP.get`` — needs an
// absolute path so the browser doesn't resolve it relative to the
// Consumed by the "Read in Tab" link (`<a target="_blank">`) — needs
// an absolute path so the browser doesn't resolve it relative to the
// current SPA route.
const bookPath = _getBookPath(pk);
return `/${bookPath}/book.pdf?ts=${mtime}`;
Expand Down
Loading