From 0c8af70db0f8b783d9eac4607f9c76c40fc95431 Mon Sep 17 00:00:00 2001 From: AJ Slater Date: Fri, 8 May 2026 17:12:01 -0700 Subject: [PATCH] reader: add ?hide_text=1 to suppress visible OCR text on PDF pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some scanned PDFs draw their OCR layer with rendering mode 0 (visible) on top of the page's rasterized scan, doubling the text under any renderer that respects the content stream — including PDF.js as embedded by vue-pdf-embed. Setting ``textLayer={false}`` client-side doesn't help because that prop only gates the selectable overlay, not text drawn from the content stream. Forward the new ``hide_text`` kwarg from comicbox >= 3.0.1 (which forwards to comicbox-pdffile >= 0.5.1) when ``?hide_text=1`` is present on the page request. The PDF / pixmap response still contains the text content — only the rendering mode changes — so the selectable overlay continues to work. The pyright/ty ignores cover the dev gap until ``pyproject.toml`` deps are bumped to the released versions of comicbox / comicbox-pdffile. Co-Authored-By: Claude Opus 4.7 (1M context) --- NEWS.md | 4 ++++ codex/views/reader/page.py | 18 +++++++++++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index d5a7061e1..65a29cfcd 100644 --- a/NEWS.md +++ b/NEWS.md @@ -8,6 +8,10 @@ border-radius: 128px; ## v1.11.5 +- Features + - Reader: add `?hide_text=1` query param for PDF pages — suppresses + visible text rendering on badly-OCR'd scans where the OCR layer + doubles up against the page's raster. - Fixes - Fix occasional recreation of comics on docker or network filesystems. - Fix double polling of some libraries. 
diff --git a/codex/views/reader/page.py b/codex/views/reader/page.py index 10f43a75b..06632130e 100644 --- a/codex/views/reader/page.py +++ b/codex/views/reader/page.py @@ -88,6 +88,13 @@ def _get_page_image(self) -> tuple: if self.request.GET.get("pixmap", "").lower() not in FALSY else "" ) + # ``?hide_text=1`` suppresses visible text rendering on PDF + # pages — useful for badly-OCR'd scans that draw the OCR + # layer with rendering mode 0 (visible) on top of the page's + # raster, doubling the text. Forwarded straight through + # comicbox to the pdffile backend; non-PDF archives ignore + # it. + hide_text = self.request.GET.get("hide_text", "").lower() not in FALSY # Process-wide LRU of open Comicbox archives — the web reader's # prev/curr/next prefetch fires 3-5 page hits on the same archive # within a second, and ``cacheBook`` mode bursts a whole-book @@ -96,7 +103,15 @@ def _get_page_image(self) -> tuple: # held inside ``archive_cache.open(...)`` serializes extraction # because ZipFile / RarFile / PDF backends aren't thread-safe. with archive_cache.open(path) as cb: - page_image = cb.get_page_by_index(page, pdf_format=pdf_format) + # ``hide_text`` requires comicbox > 3.0.0 / comicbox-pdffile + # > 0.5.0; pyright's lock-file-pinned typeshed lags behind + # the editable install used during dev. Drop the ignore once + # both deps land on PyPI and ``pyproject.toml`` is bumped. + page_image = cb.get_page_by_index( + page, + pdf_format=pdf_format, + hide_text=hide_text, # pyright: ignore[reportCallIssue] # ty: ignore[unknown-argument] + ) if not page_image: page_image = b"" @@ -115,6 +130,7 @@ def _get_page_image(self) -> tuple: parameters=[ OpenApiParameter("bookmark", OpenApiTypes.BOOL, default=True), OpenApiParameter("pixmap", OpenApiTypes.BOOL, default=False), + OpenApiParameter("hide_text", OpenApiTypes.BOOL, default=False), ], responses={ (200, content_type): OpenApiTypes.BINARY,