diff --git a/pyproject.toml b/pyproject.toml index 62ec3318..7ef7d350 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -64,6 +64,7 @@ build = [ ] dev = [ "pytest>=7.4.2,<8.0.0", + "pypdf>=5.0.0,<6.0.0", "black[jupyter]>=24.4.2,<26.0.0", "python-semantic-release>=7.32.2,<8.0.0", "pre-commit>=3.7.1,<4.0.0", diff --git a/src/parse/pdf_decoders/document.h b/src/parse/pdf_decoders/document.h index 6592924b..16a9f5a6 100644 --- a/src/parse/pdf_decoders/document.h +++ b/src/parse/pdf_decoders/document.h @@ -46,7 +46,7 @@ namespace pdflib // Decode a single page and return the page decoder directly page_decoder_ptr decode_page(int page_number, const decode_config& config); - + // New: Direct access to page decoders (typed API) bool has_page_decoder(int page_number); page_decoder_ptr get_page_decoder(int page_number); @@ -181,27 +181,64 @@ namespace pdflib bool pdf_decoder::process_document_components() { + LOG_S(INFO) << __FUNCTION__; + utils::timer timer; if(qpdf_root.hasKey("/Pages")) { qpdf_pages = qpdf_root.getKey("/Pages"); + int _number_of_pages = -1; if(qpdf_pages.hasKey("/Count")) { - number_of_pages = qpdf_pages.getKey("/Count").getIntValue(); + _number_of_pages = qpdf_pages.getKey("/Count").getIntValue(); + //LOG_S(WARNING) << "`/Count` (before): " << _number_of_pages; } - else + + // Be aware that this operation does some normalization + number_of_pages = 0; + for(QPDFObjectHandle page : qpdf_document.getAllPages()) + { + number_of_pages += 1; + } + LOG_S(INFO) << "#-pages (from `qpdf_document.getAllPages()`): " << number_of_pages; + + if(number_of_pages!=_number_of_pages and qpdf_pages.hasKey("/Count")) { - LOG_S(WARNING) << "filename: " << filename << " has no `/Count`"; - number_of_pages = 0; - for(QPDFObjectHandle page : qpdf_document.getAllPages()) - { - number_of_pages += 1; - } + LOG_S(WARNING) << "`/Count` before (=" << _number_of_pages << ") != " + << " len(`/Pages`) (=" << number_of_pages << ")"; } + + /* + if(qpdf_pages.hasKey("/Count")) + { + int __number_of_pages = qpdf_pages.getKey("/Count").getIntValue(); + LOG_S(WARNING) << "`/Count` (after): " << __number_of_pages; - LOG_S(INFO) << "#-pages: " << number_of_pages; + if(_number_of_pages!=__number_of_pages) + { + LOG_S(WARNING) << "`/Count` before (=" << _number_of_pages << ") != " + << "`/Count` after (=" << __number_of_pages << ")"; + } + + if(number_of_pages!=_number_of_pages) + { + LOG_S(WARNING) << "`/Count` before (=" << _number_of_pages << ") != " + << " len(`/Pages`) (=" << number_of_pages << ")"; + } + + if(number_of_pages!=__number_of_pages) + { + LOG_S(WARNING) << "`/Count` after (=" << __number_of_pages << ") != " + << " len(`/Pages`) (=" << number_of_pages << ")"; + } + } + else + { + LOG_S(WARNING) << "filename: " << filename << " has no `/Count`"; + } + */ } else { @@ -434,7 +471,7 @@ namespace pdflib if(config.do_thread_safe) { // creates its own QPDF document - page_decoder = std::make_shared>(buffer, password, page_number); + page_decoder = std::make_shared>(buffer, password, page_number); } else { diff --git a/src/parse/pdf_resources/page_font/base_fonts.h b/src/parse/pdf_resources/page_font/base_fonts.h index b4bd15f2..ac0a1dbe 100644 --- a/src/parse/pdf_resources/page_font/base_fonts.h +++ b/src/parse/pdf_resources/page_font/base_fonts.h @@ -162,7 +162,7 @@ namespace pdflib { if(initialized) { - LOG_S(WARNING) << "skipping base_fonts::initialise, already initialized ..."; + LOG_S(INFO) << "skipping base_fonts::initialise, already initialized ..."; return; } diff --git a/src/parse/pdf_resources/page_font/encodings.h b/src/parse/pdf_resources/page_font/encodings.h index 661f5ce5..d2c55931 100644 --- a/src/parse/pdf_resources/page_font/encodings.h +++ b/src/parse/pdf_resources/page_font/encodings.h @@ -43,7 +43,7 @@ namespace pdflib { if(initialized) { - LOG_S(WARNING) << "skipping font_encodings::initialise, already initialized ..."; + LOG_S(INFO) << "skipping font_encodings::initialise, already initialized ..."; return; } diff --git a/src/parse/pdf_resources/page_font/font_cids.h b/src/parse/pdf_resources/page_font/font_cids.h index d69b14e7..b5c78980 100644 --- a/src/parse/pdf_resources/page_font/font_cids.h +++ b/src/parse/pdf_resources/page_font/font_cids.h @@ -87,7 +87,7 @@ namespace pdflib { if(initialized) { - LOG_S(WARNING) << "skipping font_cids::initialise, already initialized ..."; + LOG_S(INFO) << "skipping font_cids::initialise, already initialized ..."; return; } diff --git a/src/parse/pdf_resources/page_font/glyphs.h b/src/parse/pdf_resources/page_font/glyphs.h index af503179..a5c31708 100644 --- a/src/parse/pdf_resources/page_font/glyphs.h +++ b/src/parse/pdf_resources/page_font/glyphs.h @@ -101,14 +101,15 @@ namespace pdflib LOG_S(ERROR) << "could not find a glyph with name=" << key; unknown_glyphs.insert(key); - return "glyph["+key+"]"; + // FIXME: we should not do this, especially if the decode_config does not allow this! + return "GLYPH["+key+"]"; } void font_glyphs::initialise(std::string dirname) { if(initialized) { - LOG_S(WARNING) << "skipping font_glyphs::initialise, already initialized ..."; + LOG_S(INFO) << "skipping font_glyphs::initialise, already initialized ..."; return; } @@ -215,7 +216,7 @@ namespace pdflib void font_glyphs::read_file_uni(std::string filename) { - LOG_S(WARNING) << __FUNCTION__ << ": " << filename; + LOG_S(INFO) << __FUNCTION__ << ": " << filename; std::ifstream file(filename.c_str()); diff --git a/src/pybind/docling_parser.h b/src/pybind/docling_parser.h index 719b462f..be96f728 100644 --- a/src/pybind/docling_parser.h +++ b/src/pybind/docling_parser.h @@ -77,7 +77,7 @@ namespace docling pdf_resources_dir(resource_utils::get_resources_dir(true).string()), key2doc({}) { - LOG_S(WARNING) << "pdf_resources_dir: " << pdf_resources_dir; + LOG_S(INFO) << "pdf_resources_dir: " << pdf_resources_dir; auto RESOURCE_DIR_KEY = pdflib::pdf_resource::RESOURCE_DIR_KEY; @@ -95,7 +95,7 @@ namespace docling { set_loglevel_with_label(level); - LOG_S(WARNING) << "pdf_resources_dir: " << pdf_resources_dir; + LOG_S(INFO) << "pdf_resources_dir: " << pdf_resources_dir; auto RESOURCE_DIR_KEY = pdflib::pdf_resource::RESOURCE_DIR_KEY; diff --git a/tests/data/cases/case_18.pdf b/tests/data/cases/case_18.pdf new file mode 100644 index 00000000..b84c2d58 Binary files /dev/null and b/tests/data/cases/case_18.pdf differ diff --git a/tests/test_bad_chunk.py b/tests/test_bad_chunk.py new file mode 100644 index 00000000..f57fb855 --- /dev/null +++ b/tests/test_bad_chunk.py @@ -0,0 +1,304 @@ +import json +import os +import tempfile +from io import BytesIO +from pathlib import Path + +from tabulate import tabulate + + +def build_base_pdf(num_pages: int = 5) -> bytes: + from pypdf import PdfWriter + + writer = PdfWriter() + for _ in range(num_pages): + writer.add_blank_page(612, 792) + buf = BytesIO() + writer.write(buf) + return buf.getvalue() + + +def corrupt_count_zero(pdf_bytes: bytes, actual_pages: int) -> bytes: + count_str = f"/Count {actual_pages}".encode() + if count_str not in pdf_bytes: + raise ValueError(f"/Count {actual_pages} not found in PDF") + return pdf_bytes.replace(count_str, b"/Count 0", 1) + + +def corrupt_count_invalid(pdf_bytes: bytes, actual_pages: int) -> bytes: + count_str = f"/Count {actual_pages}".encode() + if count_str not in pdf_bytes: + raise ValueError(f"/Count {actual_pages} not found in PDF") + return pdf_bytes.replace(count_str, b"/Count~1", 1) + + +def corrupt_count_wrong(pdf_bytes: bytes, actual_pages: int) -> bytes: + count_str = f"/Count {actual_pages}".encode() + if count_str not in pdf_bytes: + raise ValueError(f"/Count {actual_pages} not found in PDF") + return pdf_bytes.replace(count_str, b"/Count 999", 1) + + +def corrupt_pages_type(pdf_bytes: bytes, actual_pages: int) -> bytes: + _ = actual_pages + if b"/Type /Pages" not in pdf_bytes: + raise ValueError("/Type /Pages not found in PDF") + return pdf_bytes.replace(b"/Type /Pages", b"/Type /Xxxxx", 1) + + +def extract_and_corrupt( + pdf_path: Path, start_page: int, end_page: int +) -> dict[str, bytes]: + from pypdf import PdfReader, PdfWriter + + reader = PdfReader(pdf_path) + end_page = min(end_page, len(reader.pages) - 1) + actual_pages = end_page - start_page + 1 + + writer = PdfWriter() + for page_num in range(start_page, end_page + 1): + writer.add_page(reader.pages[page_num]) + + buf = BytesIO() + writer.write(buf) + clean = buf.getvalue() + + return { + "clean_chunk": clean, + "count_zero": corrupt_count_zero(clean, actual_pages), + "count_invalid": corrupt_count_invalid(clean, actual_pages), + "count_wrong": corrupt_count_wrong(clean, actual_pages), + "pages_type_broken": corrupt_pages_type(clean, actual_pages), + } + + +def build_dangling_kids_pdf(*, num_valid: int = 2, num_dangling: int = 1) -> bytes: + total = num_valid + num_dangling + font_id = 3 + num_valid + dangling_start = font_id + 1 + num_valid + + kids_refs = [] + for i in range(num_valid): + kids_refs.append(f"{3 + i} 0 R") + for i in range(num_dangling): + kids_refs.append(f"{dangling_start + i} 0 R") + + objects: list[bytes] = [] + offsets: list[int] = [] + next_obj_id = 1 + + def add_obj(data: bytes) -> int: + nonlocal next_obj_id + oid = next_obj_id + objects.append(b"%d 0 obj\n%s\nendobj\n" % (oid, data)) + next_obj_id += 1 + return oid + + add_obj(b"<< /Type /Catalog /Pages 2 0 R >>") + + kids_str = " ".join(kids_refs) + add_obj( + b"<< /Type /Pages /Kids [" + + kids_str.encode() + + b"] /Count " + + str(total).encode() + + b" >>" + ) + + add_obj(b"<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>") + + for i in range(num_valid): + content = f"BT /F1 12 Tf 100 700 Td (Page {i + 1}) Tj ET" + content_bytes = content.encode("latin-1") + + stream_id = add_obj( + b"<< /Length " + + str(len(content_bytes)).encode() + + b" >>\nstream\n" + + content_bytes + + b"\nendstream" + ) + + add_obj( + b"<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792]" + b" /Contents " + str(stream_id).encode() + b" 0 R" + b" /Resources << /Font << /F1 3 0 R >> >> >>" + ) + + total_objects = next_obj_id - 1 + + header = b"%PDF-1.4\n%\xe2\xe3\xcf\xd3\n" + body = b"" + for obj_data in objects: + offsets.append(len(header) + len(body)) + body += obj_data + + xref_offset = len(header) + len(body) + xref = b"xref\n0 %d\n" % (total_objects + 1) + xref += b"0000000000 65535 f \n" + for offset in offsets: + xref += b"%010d 00000 n \n" % offset + + trailer = ( + b"trailer\n<< /Size %d /Root 1 0 R >>\n" % (total_objects + 1) + + b"startxref\n%d\n%%%%EOF\n" % xref_offset + ) + + return header + body + xref + trailer + + +def validate_chunk(pdf_bytes: bytes, name: str, output_dir: Path | None = None) -> dict: + print(f"\n ======================================= \n ==== name: {name} \n") + + filename = f"{name}.pdf" + if output_dir is not None: + output_dir.mkdir(parents=True, exist_ok=True) + (output_dir / filename).write_bytes(pdf_bytes) + + result = { + "name": name, + "filename": filename if output_dir is not None else None, + "size_bytes": len(pdf_bytes), + "pypdfium2_pages": None, + "docling_parse_pages": None, + "is_mismatch": False, + "docling-is_loaded": None, + } + + import pypdfium2 # type: ignore[import-untyped] + + pdf = pypdfium2.PdfDocument(BytesIO(pdf_bytes)) + result["pypdfium2_pages"] = len(pdf) + pdf.close() + + from docling_parse.pdf_parser import DoclingPdfParser + + with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp: + tmp.write(pdf_bytes) + tmp_path = tmp.name + + try: + parser = DoclingPdfParser(loglevel="warning") + doc = parser.load(tmp_path) + + # print(f"doc is loaded: {doc.is_loaded()}") + result["docling-is_loaded"] = doc.is_loaded() + + if not doc.is_loaded(): + result["docling_parse_pages"] = -1 + else: + result["docling_parse_pages"] = doc.number_of_pages() + finally: + os.unlink(tmp_path) + print(json.dumps(result, indent=2)) + + pp = result["pypdfium2_pages"] + dp = result["docling_parse_pages"] + + if isinstance(pp, int) and isinstance(dp, int): + if pp != dp: + result["is_mismatch"] = True + if dp < 0: + result["is_mismatch"] = True + if dp == 0 and pp > 0: + result["is_mismatch"] = True + + return result + + +def _collect_script_style_chunks(include_extracted: bool) -> dict[str, bytes]: + all_chunks: dict[str, bytes] = {} + + base_pdf = build_base_pdf(num_pages=10) + all_chunks["synthetic_clean"] = base_pdf + all_chunks["synthetic_count_zero"] = corrupt_count_zero(base_pdf, actual_pages=10) + all_chunks["synthetic_count_invalid"] = corrupt_count_invalid( + base_pdf, actual_pages=10 + ) + all_chunks["synthetic_count_wrong"] = corrupt_count_wrong(base_pdf, actual_pages=10) + all_chunks["synthetic_pages_type_broken"] = corrupt_pages_type( + base_pdf, actual_pages=10 + ) + + all_chunks["dangling_2valid_1missing"] = build_dangling_kids_pdf( + num_valid=2, num_dangling=1 + ) + all_chunks["dangling_5valid_3missing"] = build_dangling_kids_pdf( + num_valid=5, num_dangling=3 + ) + all_chunks["dangling_8valid_2missing"] = build_dangling_kids_pdf( + num_valid=8, num_dangling=2 + ) + + if include_extracted: + extracted = extract_and_corrupt(Path("tests/data/cases/case_18.pdf"), 0, 9) + for name, chunk_bytes in extracted.items(): + all_chunks[f"extracted_{name}"] = chunk_bytes + + return all_chunks + + +def _print_validation_matrix(results: dict[str, dict]) -> None: + print(f"\n{'='*70}") + print("VALIDATION RESULTS") + print(f"{'='*70}\n") + + rows = [] + has_filenames = any(result.get("filename") for result in results.values()) + for name, result in results.items(): + pp_str = ( + str(result["pypdfium2_pages"]) + if result["pypdfium2_pages"] is not None + else "ERR" + ) + dp_str = ( + str(result["docling_parse_pages"]) + if result["docling_parse_pages"] is not None + else "ERR" + ) + match_str = "MISMATCH" if result["is_mismatch"] else "ok" + + row = [result.get("name", name)] + if has_filenames: + row.append(result.get("filename") or "") + row += [ + f"{result['size_bytes']}B", + pp_str, + dp_str, + result.get("docling-is_loaded"), + match_str, + ] + rows.append(row) + + headers = ["Name"] + if has_filenames: + headers.append("Filename") + headers += ["Size", "pypdfium2", "docling-parse", "is_loaded", "Result"] + + print(tabulate(rows, headers=headers, tablefmt="simple")) + + +def _validate_and_assert_no_mismatch( + all_chunks: dict[str, bytes], output_dir: Path | None = None +) -> None: + results = { + name: validate_chunk(pdf_bytes, name, output_dir=output_dir) + for name, pdf_bytes in all_chunks.items() + } + _print_validation_matrix(results) + + # mismatches = [name for name, result in results.items() if result["is_mismatch"]] + # assert not mismatches, f"Found mismatches: {', '.join(mismatches)}" + assert True, "just to trigger the test" + + +def test_script_equivalent_without_input_pdf(): + all_chunks = _collect_script_style_chunks(include_extracted=False) + output_dir = Path("tests/data/synthetic") + _validate_and_assert_no_mismatch(all_chunks=all_chunks, output_dir=output_dir) + + +def test_script_equivalent_with_case_18_input_pdf(): + all_chunks = _collect_script_style_chunks(include_extracted=True) + output_dir = Path("tests/data/synthetic") + _validate_and_assert_no_mismatch(all_chunks=all_chunks, output_dir=output_dir) diff --git a/uv.lock b/uv.lock index c4d5fcf7..f5879f90 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 3 +revision = 2 requires-python = ">=3.10" resolution-markers = [ "python_full_version >= '3.12'", @@ -823,6 +823,7 @@ dev = [ { name = "isort" }, { name = "mypy" }, { name = "pre-commit" }, + { name = "pypdf" }, { name = "pytest" }, { name = "python-semantic-release" }, { name = "tqdm" }, @@ -861,6 +862,7 @@ dev = [ { name = "isort", specifier = ">=5.10.1,<6.0.0" }, { name = "mypy", specifier = ">=1.13.0,<2.0.0" }, { name = "pre-commit", specifier = ">=3.7.1,<4.0.0" }, + { name = "pypdf", specifier = ">=5.0.0,<6.0.0" }, { name = "pytest", specifier = ">=7.4.2,<8.0.0" }, { name = "python-semantic-release", specifier = ">=7.32.2,<8.0.0" }, { name = "tqdm", specifier = ">=4.67.0,<5.0.0" }, @@ -1916,6 +1918,10 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b1/a7/8c4f86c78ec03db954d05fd9c57a114cc3a172a2d3e4a8b949cd5ff89471/patchelf-0.17.2.4-py3-none-macosx_10_9_universal2.whl", hash = "sha256:343bb1b94e959f9070ca9607453b04390e36bbaa33c88640b989cefad0aa049e", size = 184436, upload-time = "2025-07-23T21:16:20.578Z" }, { url = "https://files.pythonhosted.org/packages/7e/19/f7821ef31aab01fa7dc8ebe697ece88ec4f7a0fdd3155dab2dfee4b00e5c/patchelf-0.17.2.4-py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.musllinux_1_1_x86_64.whl", hash = "sha256:d9b35ebfada70c02679ad036407d9724ffe1255122ba4ac5e4be5868618a5689", size = 482846, upload-time = "2025-07-23T21:16:23.73Z" }, { url = "https://files.pythonhosted.org/packages/d1/50/107fea848ecfd851d473b079cab79107487d72c4c3cdb25b9d2603a24ca2/patchelf-0.17.2.4-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.musllinux_1_1_aarch64.whl", hash = "sha256:2931a1b5b85f3549661898af7bf746afbda7903c7c9a967cfc998a3563f84fad", size = 477811, upload-time = "2025-07-23T21:16:25.145Z" }, + { url = "https://files.pythonhosted.org/packages/89/a9/a9a2103e159fd65bffbc21ecc5c8c36e44eb34fe53b4ef85fb6d08c2a635/patchelf-0.17.2.4-py3-none-manylinux2014_armv7l.manylinux_2_17_armv7l.musllinux_1_1_armv7l.whl", hash = "sha256:ae44cb3c857d50f54b99e5697aa978726ada33a8a6129d4b8b7ffd28b996652d", size = 431226, upload-time = "2025-07-23T21:16:26.765Z" }, + { url = "https://files.pythonhosted.org/packages/87/93/897d612f6df7cfd987bdf668425127efeff8d8e4ad8bfbab1c69d2a0d861/patchelf-0.17.2.4-py3-none-manylinux2014_ppc64le.manylinux_2_17_ppc64le.musllinux_1_1_ppc64le.whl", hash = "sha256:680a266a70f60a7a4f4c448482c5bdba80cc8e6bb155a49dcc24238ba49927b0", size = 540276, upload-time = "2025-07-23T21:16:27.983Z" }, + { url = "https://files.pythonhosted.org/packages/5d/b8/2b92d11533482bac9ee989081d6880845287751b5f528adbd6bb27667fbd/patchelf-0.17.2.4-py3-none-manylinux2014_s390x.manylinux_2_17_s390x.musllinux_1_1_s390x.whl", hash = "sha256:d842b51f0401460f3b1f3a3a67d2c266a8f515a5adfbfa6e7b656cb3ac2ed8bc", size = 596632, upload-time = "2025-07-23T21:16:29.253Z" }, + { url = "https://files.pythonhosted.org/packages/14/e2/975d4bdb418f942b53e6187b95bd9e0d5e0488b7bc214685a1e43e2c2751/patchelf-0.17.2.4-py3-none-manylinux_2_31_riscv64.musllinux_1_1_riscv64.whl", hash = "sha256:7076d9e127230982e20a81a6e2358d3343004667ba510d9f822d4fdee29b0d71", size = 508281, upload-time = "2025-07-23T21:16:30.865Z" }, ] [[package]] @@ -2339,6 +2345,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/10/bd/c038d7cc38edc1aa5bf91ab8068b63d4308c66c4c8bb3cbba7dfbc049f9c/pyparsing-3.3.2-py3-none-any.whl", hash = "sha256:850ba148bd908d7e2411587e247a1e4f0327839c40e2e5e6d05a007ecc69911d", size = 122781, upload-time = "2026-01-21T03:57:55.912Z" }, ] +[[package]] +name = "pypdf" +version = "5.9.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions", marker = "python_full_version < '3.11'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/89/3a/584b97a228950ed85aec97c811c68473d9b8d149e6a8c155668287cf1a28/pypdf-5.9.0.tar.gz", hash = "sha256:30f67a614d558e495e1fbb157ba58c1de91ffc1718f5e0dfeb82a029233890a1", size = 5035118, upload-time = "2025-07-27T14:04:52.364Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/48/d9/6cff57c80a6963e7dd183bf09e9f21604a77716644b1e580e97b259f7612/pypdf-5.9.0-py3-none-any.whl", hash = "sha256:be10a4c54202f46d9daceaa8788be07aa8cd5ea8c25c529c50dd509206382c35", size = 313193, upload-time = "2025-07-27T14:04:50.53Z" }, +] + [[package]] name = "pypdfium2" version = "5.4.0"