ajslater · ajslater · May 9, 2026 · May 9, 2026 · May 9, 2026 · May 9, 2026
diff --git a/codex/urls/api/reader.py b/codex/urls/api/reader.py
@@ -10,6 +10,7 @@
 from codex.views.reader.page import ReaderPageView
 from codex.views.reader.reader import ReaderView
 from codex.views.reader.settings import ReaderSettingsView
+from codex.views.util import cache_control_2xx
 
 app_name = "issue"
 urlpatterns = [
@@ -28,7 +29,12 @@
     ),
     path(
         "<int:pk>/<int:page>/page.jpg",
-        cache_control(max_age=PAGE_MAX_AGE, public=True)(ReaderPageView.as_view()),
+        # ``cache_control_2xx`` (not Django's ``cache_control``) — error
+        # responses from this endpoint must NOT be cached. The view can
+        # return 404 from DRF content negotiation, ACL filter misses,
+        # missing-file errors, etc; with ``public, max-age=PAGE_MAX_AGE``
+        # the browser would pin a transient failure for a week.
+        cache_control_2xx(max_age=PAGE_MAX_AGE, public=True)(ReaderPageView.as_view()),
         name="page",
     ),
     path(

diff --git a/codex/views/reader/_archive_cache.py b/codex/views/reader/_archive_cache.py
@@ -44,7 +44,7 @@
 import time
 from collections import OrderedDict
 from contextlib import contextmanager
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Any
 
 from comicbox.box import Comicbox
 from loguru import logger
@@ -79,13 +79,27 @@ def _env_bool(name: str, *, default: bool) -> bool:
 class _ArchiveEntry:
     """One cached Comicbox + its per-path lock + last-access timestamp."""
 
-    __slots__ = ("comicbox", "last_access", "lock", "path")
+    __slots__ = (
+        "comicbox",
+        "last_access",
+        "lock",
+        "path",
+        "verdicts",
+    )
 
     def __init__(self, path: str, comicbox: Comicbox, last_access: float) -> None:
         self.path = path
         self.comicbox = comicbox
         self.lock = threading.Lock()
         self.last_access = last_access
+        # Per-page image-serve verdict cache (``pdffile.PageVerdict``
+        # instances keyed on zero-based page index). Memoized here so
+        # repeated ``ReaderPageView`` hits on the same archive don't
+        # re-classify; the underlying ``classify_page`` call is cheap
+        # but the cache makes prev/curr/next prefetch effectively
+        # free. Typed loosely to keep ``pdffile`` out of this module's
+        # import surface.
+        self.verdicts: dict[int, Any] = {}
 
     def close(self) -> None:
         """Close the cached archive; tolerate already-closed state."""
@@ -178,6 +192,25 @@ def open(self, path: str) -> Generator[Comicbox]:
         with entry.lock:
             yield entry.comicbox
 
+    @contextmanager
+    def open_entry(self, path: str) -> Generator[_ArchiveEntry]:
+        """
+        Yield the full ``_ArchiveEntry`` for direct cache-state access.
+
+        Used by callers that need the ``Comicbox`` and the per-page
+        verdict memo (``ReaderPageView``'s image-serve fast path).
+        Same locking shape as ``open()``. When the cache is disabled,
+        synthesizes a transient entry so callers see a uniform API;
+        verdict memoization is a no-op in that mode.
+        """
+        if not self.enabled:
+            with Comicbox(path, config=COMICBOX_CONFIG, logger=logger) as cb:
+                yield _ArchiveEntry(path, cb, time.monotonic())
+            return
+        entry = self._open_or_get(path)
+        with entry.lock:
+            yield entry
+
     def shutdown(self) -> None:
         """Close every cached archive. Wired to ``atexit`` at module load."""
         with self._struct_lock:

diff --git a/codex/views/reader/page.py b/codex/views/reader/page.py
@@ -1,26 +1,47 @@
 """Views for reading comic books."""
 
+from __future__ import annotations
+
 import time
+from typing import TYPE_CHECKING, Final
 
 from django.http import HttpResponse
 from drf_spectacular.types import OpenApiTypes
 from drf_spectacular.utils import OpenApiParameter, extend_schema
 from loguru import logger
-from pdffile import PageFormat
+from pdffile import PageMode, PDFFile
 from rest_framework.exceptions import NotFound
 
 from codex.librarian.bookmark.tasks import BookmarkUpdateTask
 from codex.librarian.mp_queue import LIBRARIAN_QUEUE
 from codex.models.choices import FileTypeChoices
 from codex.models.comic import Comic
-from codex.settings import FALSY
 from codex.views.auth import AuthFilterAPIView
 from codex.views.bookmark import BookmarkAuthMixin
 from codex.views.reader._archive_cache import archive_cache, page_acl_cache
 
-_PDF_MIME_TYPE = "application/pdf"
-_PDF_FORMAT_NON_PDF_TYPES = frozenset(
-    {e.value for e in (PageFormat.PIXMAP, PageFormat.IMAGE)}
+if TYPE_CHECKING:
+    from pdffile import PageVerdict
+
+    from codex.views.reader._archive_cache import _ArchiveEntry
+
+_PDF_MIME_TYPE: Final[str] = "application/pdf"
+
+#: Query-parameter name that selects the PDF serving mode. Picked to
+#: avoid colliding with DRF's reserved ``?format=`` (``URL_FORMAT_OVERRIDE``)
+#: which DRF interprets as a renderer-format selector — it raises
+#: ``NotFound`` for unknown values *before* the view's ``get`` runs.
+_SERVE_PARAM: Final[str] = "serve"
+
+#: Permitted ``?serve=`` values. ``auto`` runs the detector; ``pdf``
+#: skips the detector and forces the legacy single-page-PDF path;
+#: ``image`` forces a server-side rasterize (works for any PDF page
+#: but spends more CPU than the detector path on vector-heavy pages).
+_SERVE_AUTO: Final[str] = "auto"
+_SERVE_PDF: Final[str] = "pdf"
+_SERVE_IMAGE: Final[str] = "image"
+_SERVE_HINTS: Final[frozenset[str]] = frozenset(
+    {_SERVE_AUTO, _SERVE_PDF, _SERVE_IMAGE}
 )
 
 
@@ -76,18 +97,77 @@ def _resolve_path_and_type(self, pk) -> tuple[str, str | None]:
         page_acl_cache.put(cache_key, path, file_type, now)
         return path, file_type
 
-    def _get_page_image(self) -> tuple:
+    def _serve_hint(self) -> str:
+        raw = self.request.GET.get(_SERVE_PARAM, _SERVE_AUTO).lower()
+        return raw if raw in _SERVE_HINTS else _SERVE_AUTO
+
+    @staticmethod
+    def _classify_cached(entry: _ArchiveEntry, pdf: PDFFile, page: int) -> PageVerdict:
+        """Memoize ``pdf.classify_page`` on the cache entry."""
+        cached = entry.verdicts.get(page)
+        if cached is not None:
+            return cached
+        verdict = pdf.classify_page(page)
+        entry.verdicts[page] = verdict
+        return verdict
+
+    def _try_pdf_image_serve(
+        self, path: str, page: int, serve_hint: str
+    ) -> tuple[bytes, str] | None:
+        """
+        Image-serve fast path for PDF pages.
+
+        Returns ``(bytes, content_type)`` when the page can be served
+        as a raw image (detector matched, or caller forced ``image``);
+        ``None`` when the caller should fall back to the legacy
+        single-page-PDF path.
+        """
+        with archive_cache.open_entry(path) as entry:
+            # ``Comicbox._get_archive`` returns the underlying archive
+            # union (zip / rar / 7z / tar / pdf). Caller has gated on
+            # ``file_type == PDF`` so the runtime type is ``PDFFile``;
+            # ``isinstance`` narrows for the type checker. Private
+            # comicbox API for now — swap to a public getter once one
+            # lands upstream.
+            archive = entry.comicbox._get_archive()  # noqa: SLF001
+            if not isinstance(archive, PDFFile):
+                return None
+            pdf = archive
+            page_index = int(page)
+
+            if serve_hint == _SERVE_IMAGE:
+                # Always-image override — pixmap fallback for vector pages.
+                blob, ext = pdf.read_full_pixmap_jpeg(page_index)
+                return blob, f"image/{ext}"
+
+            verdict = self._classify_cached(entry, pdf, page_index)
+            if verdict.mode is PageMode.PDF_FALLBACK:
+                return None
+            served = pdf.read_image_if_dominant(page_index)
+            if served is None:
+                # Detection said dominant but extraction failed; fall
+                # through to PDF rather than serve a broken response.
+                return None
+            blob, ext = served
+            return blob, f"image/{ext}"
+
+    def _get_page_image(self) -> tuple[bytes, str]:
         """Get the image data and content type."""
         pk = self.kwargs.get("pk")
         path, file_type = self._resolve_path_and_type(pk)
 
-        # page_image
         page = self.kwargs.get("page")
-        pdf_format = (
-            PageFormat.PIXMAP.value
-            if self.request.GET.get("pixmap", "").lower() not in FALSY
-            else ""
-        )
+        serve_hint = self._serve_hint()
+
+        is_pdf = file_type == FileTypeChoices.PDF.value  # pyright: ignore[reportAttributeAccessIssue]  # ty: ignore[unresolved-attribute]
+
+        # Image-dominant fast path for PDFs (skipped when the caller
+        # forces ``?serve=pdf``).
+        if is_pdf and serve_hint != _SERVE_PDF:
+            served = self._try_pdf_image_serve(path, page, serve_hint)
+            if served is not None:
+                return served
+
         # Process-wide LRU of open Comicbox archives — the web reader's
         # prev/curr/next prefetch fires 3-5 page hits on the same archive
         # within a second, and ``cacheBook`` mode bursts a whole-book
@@ -96,28 +176,32 @@ def _get_page_image(self) -> tuple:
         # held inside ``archive_cache.open(...)`` serializes extraction
         # because ZipFile / RarFile / PDF backends aren't thread-safe.
         with archive_cache.open(path) as cb:
-            page_image = cb.get_page_by_index(page, pdf_format=pdf_format)
+            page_image = cb.get_page_by_index(page, pdf_format="")
         if not page_image:
             page_image = b""
 
-        # content type
-        if (
-            file_type == FileTypeChoices.PDF.value  # pyright: ignore[reportAttributeAccessIssue], # ty: ignore[unresolved-attribute]
-            and pdf_format not in _PDF_FORMAT_NON_PDF_TYPES
-        ):
-            content_type = _PDF_MIME_TYPE
-        else:
-            content_type = self.content_type
-
+        content_type = _PDF_MIME_TYPE if is_pdf else self.content_type
         return page_image, content_type
 
     @extend_schema(
         parameters=[
             OpenApiParameter("bookmark", OpenApiTypes.BOOL, default=True),
-            OpenApiParameter("pixmap", OpenApiTypes.BOOL, default=False),
+            OpenApiParameter(
+                _SERVE_PARAM,
+                OpenApiTypes.STR,
+                default=_SERVE_AUTO,
+                enum=sorted(_SERVE_HINTS),
+                description=(
+                    "PDF serving mode: 'auto' (detector), "
+                    "'pdf' (legacy single-page PDF), "
+                    "'image' (always rasterize). Ignored for non-PDF archives."
+                ),
+            ),
         ],
         responses={
             (200, content_type): OpenApiTypes.BINARY,
+            (200, "image/png"): OpenApiTypes.BINARY,
+            (200, "image/webp"): OpenApiTypes.BINARY,
             (200, _PDF_MIME_TYPE): OpenApiTypes.BINARY,
         },
     )

diff --git a/codex/views/util.py b/codex/views/util.py
@@ -2,8 +2,42 @@
 
 from collections.abc import Mapping
 from dataclasses import dataclass
+from functools import wraps
 from typing import override
 
+from django.utils.cache import patch_cache_control
+
+
+def cache_control_2xx(**kwargs):
+    """
+    Patch ``Cache-Control`` only on 2xx responses.
+
+    Like ``django.views.decorators.cache.cache_control`` but only
+    emits the header for success responses.
+
+    Django's ``cache_control`` patches the header onto every response,
+    including 4xx and 5xx. With ``public, max-age=<long>`` that turns
+    a transient error (a missing file, an ACL miss, a one-off
+    backend hiccup) into a week-long cache poison: every browser that
+    saw the failure keeps serving it from cache without ever reaching
+    the server. This wrapper only marks success responses cacheable;
+    errors are returned with whatever default headers DRF / Django
+    already set (typically uncached).
+    """
+
+    def _wrap(viewfunc):
+        @wraps(viewfunc)
+        def _wrapped(request, *args, **kw):
+            response = viewfunc(request, *args, **kw)
+            status = getattr(response, "status_code", 0)
+            if 200 <= status < 300:  # noqa: PLR2004
+                patch_cache_control(response, **kwargs)
+            return response
+
+        return _wrapped
+
+    return _wrap
+
 
 @dataclass
 class Route:

diff --git a/frontend/src/api/v3/reader.js b/frontend/src/api/v3/reader.js
@@ -25,9 +25,24 @@ export const getReaderInfo = (pk, data, ts, options = {}) => {
 const _getReaderAPIPath = (pk) =>
   globalThis.CODEX.API_V3_PATH + _getBookPath(pk);
 
-export const getComicPageSource = ({ pk, page, mtime }) => {
+export const getComicPageSource = ({ pk, page, mtime, serve }) => {
+  // ``serve`` is the optional PDF serving-mode hint forwarded to the
+  // backend ``ReaderPageView``: ``auto`` (detector decides),
+  // ``image`` (always rasterize), or ``pdf`` (skip the detector and
+  // serve a single-page PDF blob). Ignored by the backend for
+  // non-PDF archives. Omitting the param keeps the URL identical to
+  // the legacy shape so HTTP caches don't fragment.
+  //
+  // The query name is ``serve`` rather than ``format`` because DRF
+  // reserves ``?format=`` (URL_FORMAT_OVERRIDE) as a renderer-format
+  // selector and raises NotFound for unknown values before the view
+  // dispatches.
   const bookAPIPath = _getReaderAPIPath(pk);
-  return `${bookAPIPath}/${page}/page.jpg?ts=${mtime}`;
+  let url = `${bookAPIPath}/${page}/page.jpg?ts=${mtime}`;
+  if (serve && serve !== "auto") {
+    url += `&serve=${serve}`;
+  }
+  return url;
 };
 
 export const getComicDownloadURL = ({ pk }, fn, ts) => {
@@ -45,8 +60,8 @@ export const getDownloadPageURL = ({ pk, page, mtime }) => {
 };
 
 export const getPDFInBrowserURL = ({ pk, mtime }) => {
-  // Consumed by ``<embed src=...>``, not ``HTTP.get`` — needs an
-  // absolute path so the browser doesn't resolve it relative to the
+  // Consumed by the "Read in Tab" link (`<a target="_blank">`) — needs
+  // an absolute path so the browser doesn't resolve it relative to the
   // current SPA route.
   const bookPath = _getBookPath(pk);
   return `/${bookPath}/book.pdf?ts=${mtime}`;