From 0c8af70db0f8b783d9eac4607f9c76c40fc95431 Mon Sep 17 00:00:00 2001
From: AJ Slater <aj@slater.net>
Date: Fri, 8 May 2026 17:12:01 -0700
Subject: [PATCH] reader: add ?hide_text=1 to suppress visible OCR text on PDF
 pages
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some scanned PDFs draw their OCR layer with rendering mode 0
(visible) on top of the page's rasterized scan, doubling the text
under any renderer that respects the content stream — including
PDF.js as embedded by vue-pdf-embed. Setting ``textLayer={false}``
client-side doesn't help because that prop only gates the
selectable overlay, not text drawn from the content stream.

Pass the new ``hide_text`` kwarg to comicbox >= 3.0.1 (which
forwards it to comicbox-pdffile >= 0.5.1), enabling it when
``?hide_text=1`` is present on the page request. The PDF / pixmap
response still contains the text content — only the rendering mode
changes — so the selectable overlay continues to work.

The pyright/ty ignores cover the dev gap until the ``pyproject.toml``
deps are bumped to the released versions of comicbox and
comicbox-pdffile.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 NEWS.md                    |  4 ++++
 codex/views/reader/page.py | 18 +++++++++++++++++-
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/NEWS.md b/NEWS.md
index d5a7061e1..65a29cfcd 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -8,6 +8,10 @@ border-radius: 128px;
 
 ## v1.11.5
 
+- Features
+    - Reader: add `?hide_text=1` query param for PDF pages — suppresses
+      visible text rendering on badly-OCR'd scans where the OCR layer
+      doubles up against the page's raster.
 - Fixes
     - Fix occasional recreation of comics on docker or network filesystems.
     - Fix double polling of some libraries.
diff --git a/codex/views/reader/page.py b/codex/views/reader/page.py
index 10f43a75b..06632130e 100644
--- a/codex/views/reader/page.py
+++ b/codex/views/reader/page.py
@@ -88,6 +88,13 @@ def _get_page_image(self) -> tuple:
             if self.request.GET.get("pixmap", "").lower() not in FALSY
             else ""
         )
+        # ``?hide_text=1`` suppresses visible text rendering on PDF
+        # pages — useful for badly-OCR'd scans that draw the OCR
+        # layer with rendering mode 0 (visible) on top of the page's
+        # raster, doubling the text. Forwarded straight through
+        # comicbox to the pdffile backend; non-PDF archives ignore
+        # it.
+        hide_text = self.request.GET.get("hide_text", "").lower() not in FALSY
         # Process-wide LRU of open Comicbox archives — the web reader's
         # prev/curr/next prefetch fires 3-5 page hits on the same archive
         # within a second, and ``cacheBook`` mode bursts a whole-book
@@ -96,7 +103,15 @@ def _get_page_image(self) -> tuple:
         # held inside ``archive_cache.open(...)`` serializes extraction
         # because ZipFile / RarFile / PDF backends aren't thread-safe.
         with archive_cache.open(path) as cb:
-            page_image = cb.get_page_by_index(page, pdf_format=pdf_format)
+            # ``hide_text`` requires comicbox >= 3.0.1 / comicbox-pdffile
+            # >= 0.5.1; pyright's lock-file-pinned typeshed lags behind
+            # the editable install used during dev. Drop both ignores once
+            # both deps land on PyPI and ``pyproject.toml`` is bumped.
+            page_image = cb.get_page_by_index(
+                page,
+                pdf_format=pdf_format,
+                hide_text=hide_text,  # pyright: ignore[reportCallIssue]  # ty: ignore[unknown-argument]
+            )
         if not page_image:
             page_image = b""
 
@@ -115,6 +130,7 @@ def _get_page_image(self) -> tuple:
         parameters=[
             OpenApiParameter("bookmark", OpenApiTypes.BOOL, default=True),
             OpenApiParameter("pixmap", OpenApiTypes.BOOL, default=False),
+            OpenApiParameter("hide_text", OpenApiTypes.BOOL, default=False),
         ],
         responses={
             (200, content_type): OpenApiTypes.BINARY,