From fd4d9fd9c10b5eec1e7a3893f6d1cfec8931fbf3 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 6 Mar 2026 08:14:25 +0000
Subject: [PATCH 1/2] Initial plan


From adcc8c18e0d944e3fead7183593e703d503cffab Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 6 Mar 2026 08:24:39 +0000
Subject: [PATCH 2/2] Add to_searchable_pdf: overlay invisible OCR text layer
 onto image-based PDFs

Co-authored-by: SWHL <28639377+SWHL@users.noreply.github.com>
---
 rapidocr_pdf/main.py | 155 ++++++++++++++++++++++++++++++++++++++++++-
 tests/test_main.py   |  62 +++++++++++++++++
 2 files changed, 215 insertions(+), 2 deletions(-)

diff --git a/rapidocr_pdf/main.py b/rapidocr_pdf/main.py
index 4e7e15b..8e7e3dd 100644
--- a/rapidocr_pdf/main.py
+++ b/rapidocr_pdf/main.py
@@ -15,6 +15,14 @@
 
 logger = Logger(logger_name=__name__).get_log()
 
+# Map Unicode ranges to PyMuPDF built-in CJK font names
+_CJK_FONT_RANGES = [
+    (0x4E00, 0x9FFF, "china-s"),  # CJK Unified Ideographs
+    (0x3400, 0x4DBF, "china-s"),  # CJK Extension A
+    (0x3040, 0x30FF, "japan"),  # Hiragana / Katakana
+    (0xAC00, 0xD7AF, "korea"),  # Hangul Syllables
+]
+
 
 class RapidOCRPDF:
     def __init__(self, dpi=200, ocr_params: Optional[Dict] = None):
@@ -156,6 +164,132 @@ def merge_direct_ocr(self, txts_dict: Dict, ocr_res_dict: Dict) -> List[List[str
         final_result = dict(sorted(final_result.items(), key=lambda x: int(x[0])))
         return [[k, v["text"], v["avg_confidence"]] for k, v in final_result.items()]
 
+    def to_searchable_pdf(
+        self,
+        content: Union[str, Path, bytes],
+        output_path: Optional[Union[str, Path]] = None,
+        force_ocr: bool = False,
+        page_num_list: Optional[List[int]] = None,
+    ) -> bytes:
+        """Add an invisible OCR text layer to image-based PDF pages.
+
+        Converts image-only pages into searchable PDF pages by overlaying
+        OCR-recognised text at the correct positions using render_mode=3
+        (invisible text). Pages that already contain selectable text are
+        left unchanged unless *force_ocr* is True.
+
+        Args:
+            content: Input PDF – file path (str/Path) or raw bytes.
+            output_path: Optional path where the resulting PDF is saved.
+            force_ocr: When True, OCR every page even if it already has text.
+            page_num_list: Zero-based page indices to process. None means
+                all pages.
+
+        Returns:
+            Modified PDF as bytes.
+
+        Raises:
+            RapidOCRPDFError: If the type of *content* cannot be determined
+                or *content* is not a PDF.
+        """
+        try:
+            file_type = which_type(content)
+        except (FileExistsError, TypeError) as e:
+            raise RapidOCRPDFError("The input content is empty.") from e
+
+        if file_type != "pdf":
+            raise RapidOCRPDFError("The file type is not PDF format.")
+
+        pdf_data = self.load_pdf(content)
+        _, need_ocr_idxs = self.extract_texts(pdf_data, force_ocr, page_num_list)
+
+        # Scale factor: image pixels (at self.dpi) → PDF points (72 dpi base)
+        scale = 72.0 / self.dpi
+
+        with fitz.open(stream=pdf_data) as doc:
+            for i in need_ocr_idxs:
+                page = doc[i]
+                pix = page.get_pixmap(dpi=self.dpi)
+                img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
+                    [pix.h, pix.w, pix.n]
+                )
+                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+
+                preds = self.ocr_engine(img)
+                if preds.txts is None or preds.boxes is None:
+                    continue
+
+                for box, txt in zip(preds.boxes, preds.txts):
+                    if not txt:
+                        continue
+                    self._insert_ocr_text(page, box, txt, scale)
+
+            result_bytes = doc.tobytes()
+
+        if output_path is not None:
+            Path(output_path).write_bytes(result_bytes)
+
+        return result_bytes
+
+    @staticmethod
+    def _select_font(text: str) -> str:
+        """Return a PyMuPDF built-in font name that covers *text*."""
+        for char in text:
+            code = ord(char)
+            for start, end, font in _CJK_FONT_RANGES:
+                if start <= code <= end:
+                    return font
+        return "helv"
+
+    @staticmethod
+    def _insert_ocr_text(
+        page: fitz.Page,
+        box: np.ndarray,
+        txt: str,
+        scale: float,
+    ) -> None:
+        """Insert invisible text for one OCR bounding box onto *page*.
+
+        The font size starts at the box height and is reduced when the text
+        would otherwise be wider than the box. The insertion point is mapped
+        back to unrotated page coordinates so rotated pages are handled.
+
+        Args:
+            page: The PyMuPDF page to annotate.
+            box: Four corner points ``[[x1,y1],…,[x4,y4]]`` in image pixels.
+            txt: Recognised text string.
+            scale: Conversion factor from image pixels to PDF points.
+        """
+        box_arr = np.asarray(box, dtype=float)
+        xs = box_arr[:, 0] * scale
+        ys = box_arr[:, 1] * scale
+        x0, y0 = float(xs.min()), float(ys.min())
+        x1, y1 = float(xs.max()), float(ys.max())
+        width, height = x1 - x0, y1 - y0
+        if height <= 0 or width <= 0:
+            return
+
+        fontname = RapidOCRPDF._select_font(txt)
+        # insert_text expects the *baseline* (bottom-left) of the first line.
+        # OCR boxes live in the rendered (rotated) image, while insert_text
+        # works on the unrotated page, hence the derotation.
+        point = fitz.Point(x0, y1) * page.derotation_matrix
+        try:
+            fontsize = height
+            text_width = fitz.get_text_length(txt, fontname, fontsize)
+            if text_width > width:
+                # Shrink (never enlarge) so the text stays inside its box
+                fontsize *= width / text_width
+            page.insert_text(
+                point,
+                txt,
+                fontname=fontname,
+                fontsize=fontsize,
+                rotate=page.rotation,
+                render_mode=3,  # invisible – text is searchable but not drawn
+            )
+        except Exception as e:
+            logger.warning("Skipping OCR token %r: %s", txt, e)
+
 
 class RapidOCRPDFError(Exception):
     pass
@@ -179,6 +313,15 @@ def parse_args(arg_list: Optional[List[str]] = None):
         default=None,
         help="Which pages will be extracted. e.g. 0 1 2. Note: the index of page num starts from 0.",
     )
+    parser.add_argument(
+        "--output_pdf",
+        type=str,
+        default=None,
+        help=(
+            "Path to save a searchable PDF with an invisible OCR text layer. "
+            "When specified, the tool writes a PDF instead of printing extracted text."
+        ),
+    )
     args = parser.parse_args(arg_list)
     return args
 
@@ -187,8 +330,16 @@ def main(arg_list: Optional[List[str]] = None):
     args = parse_args(arg_list)
     pdf_extracter = RapidOCRPDF(args.dpi)
     try:
-        result = pdf_extracter(args.pdf_path, args.force_ocr, args.page_num_list)
-        print(result)
+        if args.output_pdf:
+            pdf_extracter.to_searchable_pdf(
+                args.pdf_path,
+                output_path=args.output_pdf,
+                force_ocr=args.force_ocr,
+                page_num_list=args.page_num_list,
+            )
+        else:
+            result = pdf_extracter(args.pdf_path, args.force_ocr, args.page_num_list)
+            print(result)
     except Exception as e:
         logger.error("%s\n%s", e, error_log())
 
diff --git a/tests/test_main.py b/tests/test_main.py
index 201556a..52c38a0 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -10,6 +10,7 @@ root_dir = cur_dir.parent
 
 sys.path.append(str(root_dir))
 
+import fitz
 import pytest
 
 from rapidocr_pdf import RapidOCRPDF, RapidOCRPDFError
@@ -109,3 +110,64 @@ def test_corner_case(content):
     with pytest.raises(RapidOCRPDFError) as exc_info:
         extracter(content)
     assert exc_info.type is RapidOCRPDFError
+
+
+# ---------------------------------------------------------------------------
+# Tests for to_searchable_pdf
+# ---------------------------------------------------------------------------
+
+
+def test_to_searchable_pdf_returns_bytes():
+    """to_searchable_pdf should return a non-empty bytes object."""
+    pdf_content = test_dir / "image.pdf"
+    result = extracter.to_searchable_pdf(pdf_content)
+    assert isinstance(result, bytes)
+    assert len(result) > 0
+
+
+def test_to_searchable_pdf_text_layer():
+    """The output PDF should contain a searchable text layer on OCR pages."""
+    pdf_content = test_dir / "image.pdf"
+    result = extracter.to_searchable_pdf(pdf_content)
+    with fitz.open(stream=result) as doc:
+        text = doc[0].get_text("text")
+    assert len(text) > 0
+
+
+def test_to_searchable_pdf_saves_file(tmp_path):
+    """When output_path is given the file should be written to disk."""
+    pdf_content = test_dir / "image.pdf"
+    out = tmp_path / "searchable.pdf"
+    extracter.to_searchable_pdf(pdf_content, output_path=out)
+    assert out.exists()
+    assert out.stat().st_size > 0
+
+
+def test_to_searchable_pdf_from_bytes():
+    """to_searchable_pdf should accept raw PDF bytes as input."""
+    pdf_content = test_dir / "image.pdf"
+    with open(pdf_content, "rb") as f:
+        data = f.read()
+    result = extracter.to_searchable_pdf(data)
+    assert isinstance(result, bytes)
+    with fitz.open(stream=result) as doc:
+        text = doc[0].get_text("text")
+    assert len(text) > 0
+
+
+def test_to_searchable_pdf_invalid_input():
+    """to_searchable_pdf should raise RapidOCRPDFError for invalid input."""
+    with pytest.raises(RapidOCRPDFError):
+        extracter.to_searchable_pdf(None)
+
+
+def test_cli_output_pdf(tmp_path):
+    """--output_pdf flag should produce a valid searchable PDF file."""
+    out = tmp_path / "out.pdf"
+    cli_args = [str(test_dir / "image.pdf"), "--output_pdf", str(out)]
+    main(cli_args)
+    assert out.exists()
+    assert out.stat().st_size > 0
+    with fitz.open(str(out)) as doc:
+        text = doc[0].get_text("text")
+    assert len(text) > 0