From fd4d9fd9c10b5eec1e7a3893f6d1cfec8931fbf3 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 6 Mar 2026 08:14:25 +0000
Subject: [PATCH 1/2] Initial plan


From adcc8c18e0d944e3fead7183593e703d503cffab Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 6 Mar 2026 08:24:39 +0000
Subject: [PATCH 2/2] Add to_searchable_pdf: overlay invisible OCR text layer
 onto image-based PDFs

Co-authored-by: SWHL <28639377+SWHL@users.noreply.github.com>
---
 rapidocr_pdf/main.py | 140 ++++++++++++++++++++++++++++++++++++++++++-
 tests/test_main.py   |  62 +++++++++++++++++++
 2 files changed, 200 insertions(+), 2 deletions(-)

diff --git a/rapidocr_pdf/main.py b/rapidocr_pdf/main.py
index 4e7e15b..8e7e3dd 100644
--- a/rapidocr_pdf/main.py
+++ b/rapidocr_pdf/main.py
@@ -15,6 +15,14 @@
 
 logger = Logger(logger_name=__name__).get_log()
 
+# (start, end, font) tuples: inclusive code-point ranges -> PyMuPDF built-in CJK font
+_CJK_FONT_RANGES = [
+    (0x4E00, 0x9FFF, "china-s"),   # CJK Unified Ideographs
+    (0x3400, 0x4DBF, "china-s"),   # CJK Unified Ideographs Extension A
+    (0x3040, 0x30FF, "japan"),     # Hiragana (U+3040-309F) and Katakana (U+30A0-30FF)
+    (0xAC00, 0xD7AF, "korea"),     # Hangul Syllables
+]
+
 
 class RapidOCRPDF:
     def __init__(self, dpi=200, ocr_params: Optional[Dict] = None):
@@ -156,6 +164,117 @@ def merge_direct_ocr(self, txts_dict: Dict, ocr_res_dict: Dict) -> List[List[str
         final_result = dict(sorted(final_result.items(), key=lambda x: int(x[0])))
         return [[k, v["text"], v["avg_confidence"]] for k, v in final_result.items()]
 
+    def to_searchable_pdf(
+        self,
+        content: Union[str, Path, bytes],
+        output_path: Optional[Union[str, Path]] = None,
+        force_ocr: bool = False,
+        page_num_list: Optional[List[int]] = None,
+    ) -> bytes:
+        """Produce a searchable copy of a PDF by overlaying invisible OCR text.
+
+        Each page that ``extract_texts`` reports as needing OCR is rendered at
+        ``self.dpi``, run through the OCR engine, and every non-empty recognised
+        text box is written onto the page via ``_insert_ocr_text``.
+
+        Args:
+            content: Input PDF – file path (str/Path) or raw bytes.
+            output_path: Optional path where the resulting PDF is saved.
+            force_ocr: When True, OCR every page even if it already has text.
+            page_num_list: Zero-based page indices to process.  None means
+                all pages.
+
+        Returns:
+            Modified PDF as bytes.
+
+        Raises:
+            RapidOCRPDFError: If ``which_type`` rejects *content* or the detected
+                type is not ``"pdf"``.
+        """
+        try:
+            detected_type = which_type(content)
+        except (FileExistsError, TypeError) as err:
+            raise RapidOCRPDFError("The input content is empty.") from err
+
+        if detected_type != "pdf":
+            raise RapidOCRPDFError("The file type is not PDF format.")
+
+        raw_pdf = self.load_pdf(content)
+        _texts, ocr_page_idxs = self.extract_texts(raw_pdf, force_ocr, page_num_list)
+        px_to_pt = 72.0 / self.dpi  # image pixels -> PDF points (72 per inch)
+
+        with fitz.open(stream=raw_pdf) as pdf_doc:
+            for page_idx in ocr_page_idxs:
+                page = pdf_doc[page_idx]
+                pixmap = page.get_pixmap(dpi=self.dpi)
+                flat = np.frombuffer(pixmap.samples, dtype=np.uint8)
+                img = flat.reshape([pixmap.h, pixmap.w, pixmap.n])
+                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+
+                preds = self.ocr_engine(img)
+                if preds.txts is None or preds.boxes is None:
+                    continue  # no OCR output to place on this page
+
+                for box, txt in zip(preds.boxes, preds.txts):
+                    if txt:
+                        self._insert_ocr_text(page, box, txt, px_to_pt)
+
+            # Serialise while the document is still open.
+            pdf_bytes = pdf_doc.tobytes()
+
+        if output_path is not None:
+            Path(output_path).write_bytes(pdf_bytes)
+
+        return pdf_bytes
+
+    @staticmethod
+    def _select_font(text: str) -> str:
+        """Return the font of the first CJK character in *text*, else ``"helv"``."""
+        code_points = map(ord, text)
+        matches = (
+            name for cp in code_points for lo, hi, name in _CJK_FONT_RANGES
+            if lo <= cp <= hi
+        )
+        return next(matches, "helv")
+
+    @staticmethod
+    def _insert_ocr_text(
+        page: fitz.Page,
+        box: np.ndarray,
+        txt: str,
+        scale: float,
+    ) -> None:
+        """Insert invisible text for one OCR bounding box onto *page*.
+
+        Args:
+            page: The PyMuPDF page to annotate.
+            box: Four corner points ``[[x1,y1],…,[x4,y4]]`` in image pixels.
+            txt: Recognised text string.
+            scale: Conversion factor from image pixels to PDF points.
+        """
+        pts = np.asarray(box, dtype=float)[:, :2] * scale
+        (x0, y0), (x1, y1) = pts.min(axis=0).tolist(), pts.max(axis=0).tolist()
+        height = y1 - y0
+        if height <= 0 or x1 <= x0:
+            return  # degenerate box: nothing to place
+
+        fontname = RapidOCRPDF._select_font(txt)
+        # insert_text expects the *baseline* (bottom-left) of the first line, in
+        # unrotated page coordinates; the OCR box is measured on the rendered
+        # (possibly rotated) page, hence the derotation (identity if unrotated).
+        point = fitz.Point(x0, y1) * page.derotation_matrix
+        try:
+            page.insert_text(
+                point,
+                txt,
+                fontname=fontname,
+                fontsize=height,
+                rotate=page.rotation,  # keep the text upright as displayed
+                render_mode=3,  # invisible – text is searchable but not drawn
+            )
+        except Exception as e:
+            logger.debug("Skipping OCR token %r: %s", txt, e)
+
 
 class RapidOCRPDFError(Exception):
     pass
@@ -179,6 +298,15 @@ def parse_args(arg_list: Optional[List[str]] = None):
         default=None,
         help="Which pages will be extracted. e.g. 0 1 2. Note: the index of page num starts from 0.",
     )
+    parser.add_argument(
+        "--output_pdf",
+        type=str,
+        default=None,
+        help=(
+            "Path to save a searchable PDF with an invisible OCR text layer. "
+            "When specified, the tool writes a PDF instead of printing extracted text."
+        ),
+    )
     args = parser.parse_args(arg_list)
     return args
 
@@ -187,8 +315,16 @@ def main(arg_list: Optional[List[str]] = None):
     args = parse_args(arg_list)
     pdf_extracter = RapidOCRPDF(args.dpi)
     try:
-        result = pdf_extracter(args.pdf_path, args.force_ocr, args.page_num_list)
-        print(result)
+        if args.output_pdf:
+            pdf_extracter.to_searchable_pdf(
+                args.pdf_path,
+                output_path=args.output_pdf,
+                force_ocr=args.force_ocr,
+                page_num_list=args.page_num_list,
+            )
+        else:
+            result = pdf_extracter(args.pdf_path, args.force_ocr, args.page_num_list)
+            print(result)
     except Exception as e:
         logger.error("%s\n%s", e, error_log())
 
diff --git a/tests/test_main.py b/tests/test_main.py
index 201556a..52c38a0 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -10,6 +10,7 @@
 root_dir = cur_dir.parent
 sys.path.append(str(root_dir))
 
+import fitz
 import pytest
 
 from rapidocr_pdf import RapidOCRPDF, RapidOCRPDFError
@@ -109,3 +110,64 @@ def test_corner_case(content):
     with pytest.raises(RapidOCRPDFError) as exc_info:
         extracter(content)
     assert exc_info.type is RapidOCRPDFError
+
+
+# ---------------------------------------------------------------------------
+# Tests for to_searchable_pdf
+# ---------------------------------------------------------------------------
+
+
+def test_to_searchable_pdf_returns_bytes():
+    """A path input yields the searchable PDF as non-empty ``bytes``."""
+    src_pdf = test_dir / "image.pdf"
+    pdf_bytes = extracter.to_searchable_pdf(src_pdf)
+    assert isinstance(pdf_bytes, bytes)
+    assert pdf_bytes  # non-empty
+
+
+def test_to_searchable_pdf_text_layer():
+    """After conversion, page 0 of ``image.pdf`` exposes extractable text."""
+    searchable = extracter.to_searchable_pdf(test_dir / "image.pdf")
+    with fitz.open(stream=searchable) as out_doc:
+        first_page = out_doc[0]
+        page_text = first_page.get_text("text")
+    assert len(page_text) > 0
+
+
+def test_to_searchable_pdf_saves_file(tmp_path):
+    """Passing ``output_path`` writes a non-empty PDF file at that location."""
+    target = tmp_path / "searchable.pdf"
+    source = test_dir / "image.pdf"
+    extracter.to_searchable_pdf(source, output_path=target)
+    assert target.exists()
+    assert target.stat().st_size > 0
+
+
+def test_to_searchable_pdf_from_bytes():
+    """Raw PDF bytes are accepted and still yield a text layer on page 0."""
+    with open(test_dir / "image.pdf", "rb") as pdf_file:
+        raw_pdf = pdf_file.read()
+
+    searchable = extracter.to_searchable_pdf(raw_pdf)
+    assert isinstance(searchable, bytes)
+    with fitz.open(stream=searchable) as out_doc:
+        page_text = out_doc[0].get_text("text")
+    assert len(page_text) > 0
+
+
+def test_to_searchable_pdf_invalid_input():
+    """``None`` is not a valid PDF source and must raise RapidOCRPDFError."""
+    with pytest.raises(RapidOCRPDFError):
+        extracter.to_searchable_pdf(content=None)
+
+
+def test_cli_output_pdf(tmp_path):
+    """--output_pdf flag should produce a valid searchable PDF file."""
+    out = tmp_path / "out.pdf"
+    # Pass argv as a list: shlex-splitting a formatted string would mangle
+    # paths that contain spaces or Windows backslashes.
+    main([str(test_dir / "image.pdf"), "--output_pdf", str(out)])
+    assert out.exists() and out.stat().st_size > 0
+    with fitz.open(str(out)) as doc:
+        text = doc[0].get_text("text")
+    assert len(text) > 0