From bf3a46f0abc084506197534b700853ccf3131bbc Mon Sep 17 00:00:00 2001 From: AJ Slater Date: Fri, 8 May 2026 20:36:41 -0700 Subject: [PATCH 1/6] reader: serve image-dominant PDF pages as , drop full-PDF mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Most "comic PDFs" are scanned-image wrappers — one full-bleed JPEG or PNG per page, no real vector content. Previously every PDF page was routed through ``vue-pdf-embed`` (and through pdf.js on the client) regardless. Now the backend runs an image-dominant page detector via ``comicbox-pdffile`` 0.6 and serves matched pages as plain image bytes; the browser renders them through ```` like any CBZ page. Vector-content pages keep the existing single-page-PDF + ``vue-pdf-embed`` path. Backend ======= * ``codex/views/reader/page.py`` - ``?format=auto|pdf|image`` query parameter. ``auto`` (default) runs the detector. ``pdf`` skips the detector and forces the legacy single-page-PDF path. ``image`` always rasterizes — works for any PDF page but spends more CPU on vector-heavy pages. - ``_try_pdf_image_serve`` reaches through Comicbox to the underlying ``PDFFile`` (private API for now; comments mark the seam for a future public-getter swap) and dispatches to ``classify_page`` / ``read_image_if_dominant`` / ``read_full_pixmap_jpeg``. Per-page verdicts are memoized on the ``_ArchiveEntry`` so prev/curr/next prefetch is effectively free. - Old ``?pixmap=`` query parameter dropped — it returned PPM bytes with ``image/jpeg`` content-type (latent labeling bug; no caller). - OpenAPI schema advertises three possible response content-types: ``application/pdf`` (fallback), ``image/jpeg``, ``image/png``. * ``codex/views/reader/_archive_cache.py`` - Adds a ``verdicts: dict[int, PageVerdict]`` slot on ``_ArchiveEntry`` for the per-page detector cache. - New ``open_entry()`` context manager yields the entry directly (existing ``open()`` keeps yielding ``Comicbox`` for callers that don't need verdict state). Frontend ======== * ``BookPage`` (``page/page.vue``) always tries ```` first. On ``error`` for a PDF book it sets ``pdfFallback=true`` and the page re-mounts as ```` against the same URL with ``?format=pdf`` appended. No HEAD pre-flight, no verdict threaded through the API response — the browser's image-load failure on a ``application/pdf`` body is the natural signal. * Drops ``PagerFullPDF`` and the whole-document-load mode it served. ``pager.vue`` now picks between ``PagerHorizontal`` / ``PagerVertical`` based purely on reading direction. * Drops the ``cacheBook`` carve-out for vertical PDFs in ``stores/reader.js`` — PDFs prefetch alongside CBZ now. * Adds a per-comic "PDF Rendering" radio in the reader settings drawer (Auto / Force image / Force vector) wired to a new ``clientSettings.pdfRenderMode`` field. Forwarded to the page endpoint as ``?format=``. * ``getComicPageSource`` accepts an optional ``format`` parameter. Omitting it preserves the URL shape so HTTP caches don't fragment. Sequencing ========== This change depends on two upstream PRs: * ``comicbox-pdffile`` 0.6 — image-dominant detector + extractors: https://github.com/ajslater/pdffile/pull/22 * ``comicbox`` widened pdffile pin (>=0.6,<0.7): https://github.com/ajslater/comicbox/pull/131 The ``[tool.uv.sources]`` block in ``pyproject.toml`` temporarily points both deps at their PR branches so this branch's CI can resolve. Once both upstreams land on PyPI, drop the sources block and the explicit ``comicbox-pdffile`` direct dep. The full design + empirical validation against a 14-PDF private corpus lives in ``tasks/pdf-image-detection/``. Co-Authored-By: Claude Opus 4.7 (1M context) --- codex/views/reader/_archive_cache.py | 37 ++- codex/views/reader/page.py | 124 ++++++-- frontend/src/api/v3/reader.js | 18 +- .../drawer/reader-settings-controls.vue | 42 ++- .../src/components/reader/pager/page/page.vue | 63 +++- .../reader/pager/pager-full-pdf.vue | 36 --- .../src/components/reader/pager/pager.vue | 30 +- frontend/src/stores/reader.js | 27 +- pyproject.toml | 13 + tasks/pdf-image-detection/00-plan.md | 131 ++++++++ tasks/pdf-image-detection/01-detector.md | 175 +++++++++++ .../pdf-image-detection/02-implementation.md | 297 ++++++++++++++++++ .../99-prototype-results.md | 223 +++++++++++++ uv.lock | 20 +- 14 files changed, 1118 insertions(+), 118 deletions(-) delete mode 100644 frontend/src/components/reader/pager/pager-full-pdf.vue create mode 100644 tasks/pdf-image-detection/00-plan.md create mode 100644 tasks/pdf-image-detection/01-detector.md create mode 100644 tasks/pdf-image-detection/02-implementation.md create mode 100644 tasks/pdf-image-detection/99-prototype-results.md diff --git a/codex/views/reader/_archive_cache.py b/codex/views/reader/_archive_cache.py index 95ffffe00..f8cfe3831 100644 --- a/codex/views/reader/_archive_cache.py +++ b/codex/views/reader/_archive_cache.py @@ -44,7 +44,7 @@ import time from collections import OrderedDict from contextlib import contextmanager -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any from comicbox.box import Comicbox from loguru import logger @@ -79,13 +79,27 @@ def _env_bool(name: str, *, default: bool) -> bool: class _ArchiveEntry: """One cached Comicbox + its per-path lock + last-access timestamp.""" - __slots__ = ("comicbox", "last_access", "lock", "path") + __slots__ = ( + "comicbox", + "last_access", + "lock", + "path", + "verdicts", + ) def __init__(self, path: str, comicbox: Comicbox, last_access: float) -> None: self.path = path self.comicbox = comicbox self.lock = threading.Lock() self.last_access = last_access + # Per-page image-serve verdict cache (``pdffile.PageVerdict`` + # instances keyed on zero-based page index). Memoized here so + # repeated ``ReaderPageView`` hits on the same archive don't + # re-classify; the underlying ``classify_page`` call is cheap + # but the cache makes prev/curr/next prefetch effectively + # free. Typed loosely to keep ``pdffile`` out of this module's + # import surface. + self.verdicts: dict[int, Any] = {} def close(self) -> None: """Close the cached archive; tolerate already-closed state.""" @@ -178,6 +192,25 @@ def open(self, path: str) -> Generator[Comicbox]: with entry.lock: yield entry.comicbox + @contextmanager + def open_entry(self, path: str) -> Generator[_ArchiveEntry]: + """ + Yield the full ``_ArchiveEntry`` for direct cache-state access. + + Used by callers that need the ``Comicbox`` and the per-page + verdict memo (``ReaderPageView``'s image-serve fast path). + Same locking shape as ``open()``. When the cache is disabled, + synthesizes a transient entry so callers see a uniform API; + verdict memoization is a no-op in that mode. + """ + if not self.enabled: + with Comicbox(path, config=COMICBOX_CONFIG, logger=logger) as cb: + yield _ArchiveEntry(path, cb, time.monotonic()) + return + entry = self._open_or_get(path) + with entry.lock: + yield entry + def shutdown(self) -> None: """Close every cached archive. Wired to ``atexit`` at module load.""" with self._struct_lock: diff --git a/codex/views/reader/page.py b/codex/views/reader/page.py index 10f43a75b..94bbda5d8 100644 --- a/codex/views/reader/page.py +++ b/codex/views/reader/page.py @@ -1,26 +1,41 @@ """Views for reading comic books.""" +from __future__ import annotations + import time +from typing import TYPE_CHECKING, Final from django.http import HttpResponse from drf_spectacular.types import OpenApiTypes from drf_spectacular.utils import OpenApiParameter, extend_schema from loguru import logger -from pdffile import PageFormat +from pdffile import PageMode, PDFFile from rest_framework.exceptions import NotFound from codex.librarian.bookmark.tasks import BookmarkUpdateTask from codex.librarian.mp_queue import LIBRARIAN_QUEUE from codex.models.choices import FileTypeChoices from codex.models.comic import Comic -from codex.settings import FALSY from codex.views.auth import AuthFilterAPIView from codex.views.bookmark import BookmarkAuthMixin from codex.views.reader._archive_cache import archive_cache, page_acl_cache -_PDF_MIME_TYPE = "application/pdf" -_PDF_FORMAT_NON_PDF_TYPES = frozenset( - {e.value for e in (PageFormat.PIXMAP, PageFormat.IMAGE)} +if TYPE_CHECKING: + from pdffile import PageVerdict + + from codex.views.reader._archive_cache import _ArchiveEntry + +_PDF_MIME_TYPE: Final[str] = "application/pdf" + +#: Permitted ``?format=`` values. ``auto`` runs the detector; ``pdf`` +#: skips the detector and forces the legacy single-page-PDF path; +#: ``image`` forces a server-side rasterize (works for any PDF page +#: but spends more CPU than the detector path on vector-heavy pages). +_FORMAT_AUTO: Final[str] = "auto" +_FORMAT_PDF: Final[str] = "pdf" +_FORMAT_IMAGE: Final[str] = "image" +_FORMAT_HINTS: Final[frozenset[str]] = frozenset( + {_FORMAT_AUTO, _FORMAT_PDF, _FORMAT_IMAGE} ) @@ -76,18 +91,77 @@ def _resolve_path_and_type(self, pk) -> tuple[str, str | None]: page_acl_cache.put(cache_key, path, file_type, now) return path, file_type - def _get_page_image(self) -> tuple: + def _format_hint(self) -> str: + raw = self.request.GET.get("format", _FORMAT_AUTO).lower() + return raw if raw in _FORMAT_HINTS else _FORMAT_AUTO + + @staticmethod + def _classify_cached(entry: _ArchiveEntry, pdf: PDFFile, page: int) -> PageVerdict: + """Memoize ``pdf.classify_page`` on the cache entry.""" + cached = entry.verdicts.get(page) + if cached is not None: + return cached + verdict = pdf.classify_page(page) + entry.verdicts[page] = verdict + return verdict + + def _try_pdf_image_serve( + self, path: str, page: int, fmt_hint: str + ) -> tuple[bytes, str] | None: + """ + Image-serve fast path for PDF pages. + + Returns ``(bytes, content_type)`` when the page can be served + as a raw image (detector matched, or caller forced ``image``); + ``None`` when the caller should fall back to the legacy + single-page-PDF path. + """ + with archive_cache.open_entry(path) as entry: + # ``Comicbox._get_archive`` returns the underlying archive + # union (zip / rar / 7z / tar / pdf). Caller has gated on + # ``file_type == PDF`` so the runtime type is ``PDFFile``; + # ``isinstance`` narrows for the type checker. Private + # comicbox API for now — swap to a public getter once one + # lands upstream. + archive = entry.comicbox._get_archive() # noqa: SLF001 + if not isinstance(archive, PDFFile): + return None + pdf = archive + page_index = int(page) + + if fmt_hint == _FORMAT_IMAGE: + # Always-image override — pixmap fallback for vector pages. + blob, ext = pdf.read_full_pixmap_jpeg(page_index) + return blob, f"image/{ext}" + + verdict = self._classify_cached(entry, pdf, page_index) + if verdict.mode is PageMode.PDF_FALLBACK: + return None + served = pdf.read_image_if_dominant(page_index) + if served is None: + # Detection said dominant but extraction failed; fall + # through to PDF rather than serve a broken response. + return None + blob, ext = served + return blob, f"image/{ext}" + + def _get_page_image(self) -> tuple[bytes, str]: """Get the image data and content type.""" pk = self.kwargs.get("pk") path, file_type = self._resolve_path_and_type(pk) - # page_image page = self.kwargs.get("page") - pdf_format = ( - PageFormat.PIXMAP.value - if self.request.GET.get("pixmap", "").lower() not in FALSY - else "" - ) + fmt_hint = self._format_hint() + + is_pdf = file_type == FileTypeChoices.PDF.value # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] + + # Image-dominant fast path for PDFs (skipped when the caller + # forces ``?format=pdf``). + if is_pdf and fmt_hint != _FORMAT_PDF: + served = self._try_pdf_image_serve(path, page, fmt_hint) + if served is not None: + return served + # Process-wide LRU of open Comicbox archives — the web reader's # prev/curr/next prefetch fires 3-5 page hits on the same archive # within a second, and ``cacheBook`` mode bursts a whole-book @@ -96,28 +170,32 @@ def _get_page_image(self) -> tuple: # held inside ``archive_cache.open(...)`` serializes extraction # because ZipFile / RarFile / PDF backends aren't thread-safe. with archive_cache.open(path) as cb: - page_image = cb.get_page_by_index(page, pdf_format=pdf_format) + page_image = cb.get_page_by_index(page, pdf_format="") if not page_image: page_image = b"" - # content type - if ( - file_type == FileTypeChoices.PDF.value # pyright: ignore[reportAttributeAccessIssue], # ty: ignore[unresolved-attribute] - and pdf_format not in _PDF_FORMAT_NON_PDF_TYPES - ): - content_type = _PDF_MIME_TYPE - else: - content_type = self.content_type - + content_type = _PDF_MIME_TYPE if is_pdf else self.content_type return page_image, content_type @extend_schema( parameters=[ OpenApiParameter("bookmark", OpenApiTypes.BOOL, default=True), - OpenApiParameter("pixmap", OpenApiTypes.BOOL, default=False), + OpenApiParameter( + "format", + OpenApiTypes.STR, + default=_FORMAT_AUTO, + enum=sorted(_FORMAT_HINTS), + description=( + "PDF rendering hint: 'auto' (detector), " + "'pdf' (legacy single-page PDF), " + "'image' (always rasterize). Ignored for non-PDF archives." + ), + ), ], responses={ (200, content_type): OpenApiTypes.BINARY, + (200, "image/png"): OpenApiTypes.BINARY, + (200, "image/webp"): OpenApiTypes.BINARY, (200, _PDF_MIME_TYPE): OpenApiTypes.BINARY, }, ) diff --git a/frontend/src/api/v3/reader.js b/frontend/src/api/v3/reader.js index 1e7e79b4b..e6d7f0ac8 100644 --- a/frontend/src/api/v3/reader.js +++ b/frontend/src/api/v3/reader.js @@ -25,9 +25,19 @@ export const getReaderInfo = (pk, data, ts, options = {}) => { const _getReaderAPIPath = (pk) => globalThis.CODEX.API_V3_PATH + _getBookPath(pk); -export const getComicPageSource = ({ pk, page, mtime }) => { +export const getComicPageSource = ({ pk, page, mtime, format }) => { + // ``format`` is the optional PDF rendering hint forwarded to the + // backend ``ReaderPageView``: ``auto`` (detector decides), + // ``image`` (always rasterize), or ``pdf`` (skip the detector and + // serve a single-page PDF blob). Ignored by the backend for + // non-PDF archives. Omitting the param keeps the URL identical to + // the legacy shape so HTTP caches don't fragment. const bookAPIPath = _getReaderAPIPath(pk); - return `${bookAPIPath}/${page}/page.jpg?ts=${mtime}`; + let url = `${bookAPIPath}/${page}/page.jpg?ts=${mtime}`; + if (format && format !== "auto") { + url += `&format=${format}`; + } + return url; }; export const getComicDownloadURL = ({ pk }, fn, ts) => { @@ -45,8 +55,8 @@ export const getDownloadPageURL = ({ pk, page, mtime }) => { }; export const getPDFInBrowserURL = ({ pk, mtime }) => { - // Consumed by ````, not ``HTTP.get`` — needs an - // absolute path so the browser doesn't resolve it relative to the + // Consumed by the "Read in Tab" link (``) — needs + // an absolute path so the browser doesn't resolve it relative to the // current SPA route. const bookPath = _getBookPath(pk); return `/${bookPath}/book.pdf?ts=${mtime}`; diff --git a/frontend/src/components/reader/drawer/reader-settings-controls.vue b/frontend/src/components/reader/drawer/reader-settings-controls.vue index 3236bd048..b6bd75b49 100644 --- a/frontend/src/components/reader/drawer/reader-settings-controls.vue +++ b/frontend/src/components/reader/drawer/reader-settings-controls.vue @@ -89,7 +89,6 @@ :model-value="settings.cacheBook" class="scopedCheckbox" density="compact" - :disabled="disableCacheBook" label="Cache Entire Book" hide-details="auto" :true-value="true" @@ -98,6 +97,25 @@ " @update:model-value="$emit('update', { cacheBook: $event })" /> + + + + + diff --git a/frontend/src/components/reader/pager/page/page.vue b/frontend/src/components/reader/pager/page/page.vue index 4caf4d312..538b09eaf 100644 --- a/frontend/src/components/reader/pager/page/page.vue +++ b/frontend/src/components/reader/pager/page/page.vue @@ -74,16 +74,27 @@ export default { loaded: false, error: "", ts: 0, + /* + * ``true`` once we've seen an ```` load fail on a PDF + * book — the response was ``application/pdf`` (the detector + * declined to serve as image), so we re-mount the page through + * ```` against the same URL with ``?format=pdf``. + */ + pdfFallback: false, }; }, computed: { ...mapState(useReaderStore, { scale: (state) => state.clientSettings?.scale, + pdfRenderMode: (state) => state.clientSettings?.pdfRenderMode || "auto", }), + isPDF() { + return this.book.fileType === "PDF"; + }, style() { // Magic for transform: scale() not sizing elements right. const s = {}; - if (this.book.fileType === "PDF" || this.scale == 1) { + if (this.usingPDFDoc || this.scale == 1) { return s; } const img = this.$refs.pageComponent?.$el; @@ -100,11 +111,34 @@ export default { pk: this.book.pk, page: this.page, mtime, + format: this.activeFormat, }; return getComicPageSource(params); }, + activeFormat() { + /* + * For non-PDF books the format param is ignored by the + * backend; we still pass it through so the URL is stable. + */ + if (!this.isPDF) { + return "auto"; + } + if (this.pdfFallback) { + // Image attempt failed → re-fetch with PDF response. + return "pdf"; + } + return this.pdfRenderMode; + }, + usingPDFDoc() { + /* + * Render through ```` when: + * • the user explicitly forced PDF rendering, or + * • the ```` first attempt failed for a PDF book. + */ + return this.isPDF && (this.pdfRenderMode === "pdf" || this.pdfFallback); + }, component() { - return this.book.fileType === "PDF" ? PDFDoc : ImgPage; + return this.usingPDFDoc ? PDFDoc : ImgPage; }, bookSettings() { return this.getBookSettings(this.book); @@ -113,6 +147,17 @@ export default { return this.bookSettings?.twoPages ?? false; }, }, + watch: { + /* + * If the user toggles render mode mid-read, drop the fallback flag + * so the new mode takes effect on the next mount. + */ + pdfRenderMode() { + this.pdfFallback = false; + this.error = ""; + this.loaded = false; + }, + }, mounted() { /* * Show the spinner only if the image is still loading after @@ -148,6 +193,18 @@ export default { this.error = false; }, onError() { + /* + * For PDF books on the first ```` attempt, the failure + * means the backend sent ``application/pdf`` — the detector + * declined to serve as image. Swap to ```` against + * the same URL and let it render via vue-pdf-embed. For all + * other failures, surface the error. + */ + if (this.isPDF && !this.pdfFallback && !this.usingPDFDoc) { + this.pdfFallback = true; + this.loaded = false; + return; + } this.error = "load"; this.showProgress = false; }, @@ -157,6 +214,8 @@ export default { }, onRetry() { this.ts = Date.now(); + this.pdfFallback = false; + this.error = ""; }, }, }; diff --git a/frontend/src/components/reader/pager/pager-full-pdf.vue b/frontend/src/components/reader/pager/pager-full-pdf.vue deleted file mode 100644 index c258c4e1b..000000000 --- a/frontend/src/components/reader/pager/pager-full-pdf.vue +++ /dev/null @@ -1,36 +0,0 @@ - - - diff --git a/frontend/src/components/reader/pager/pager.vue b/frontend/src/components/reader/pager/pager.vue index 341b3291e..5fd4cd360 100644 --- a/frontend/src/components/reader/pager/pager.vue +++ b/frontend/src/components/reader/pager/pager.vue @@ -12,14 +12,8 @@