diff --git a/README.md b/README.md index 2b0b1dc2..aa7d101e 100644 --- a/README.md +++ b/README.md @@ -111,18 +111,18 @@ parser = DoclingThreadedPdfParser( # load one or more documents for source in ["doc_a.pdf", "doc_b.pdf"]: - parser.load(source) + doc_key = parser.load(source) + print(doc_key, parser.page_count(doc_key)) # consume decoded pages as they become available -while parser.has_tasks(): - task = parser.get_task() - - if task.success: - page_decoder, timings = task.get() - print(f"{task.doc_key} p{task.page_number}: " - f"{len(list(page_decoder.get_word_cells()))} words") +for result in parser.iterate_results(): + if result.success: + seg_page = result.get_page() + timings = result.get_timings() + print(f"{result.doc_key} p{result.page_number}: " + f"{len(seg_page.word_cells)} words in {timings.total():.3f}s") else: - print(f"error on {task.doc_key} p{task.page_number}: {task.error()}") + print(f"error on {result.doc_key} p{result.page_number}: {result.error_message}") ``` Use the CLI diff --git a/app/pybind_parse.cpp b/app/pybind_parse.cpp index a2ba4f6c..7b2eff3f 100644 --- a/app/pybind_parse.cpp +++ b/app/pybind_parse.cpp @@ -10,6 +10,7 @@ #include #include #include +#include // Include parse headers for typed bindings #include @@ -549,7 +550,29 @@ PYBIND11_MODULE(pdf_parsers, m) { self.get_instructions().iterate_over_instructions(visitor); return visitor.artifacts; }, - "Export bitmap artifacts as inspectable image bytes plus raw payload bytes"); + "Export bitmap artifacts as inspectable image bytes plus raw payload bytes") + .def("render_image", + [](pdflib::pdf_decoder& self, + const pdflib::render_config& config) -> pybind11::tuple { + pdflib::renderer rnd(config); + { + pybind11::gil_scoped_release release; + self.get_instructions().iterate_over_instructions(rnd); + } + + auto canvas = rnd.get_canvas(); + const auto& shape = rnd.get_shape(); + pybind11::bytes image_bytes(""); + if(canvas and not canvas->empty()) + { + image_bytes = 
pybind11::bytes( + reinterpret_cast(canvas->data()), + canvas->size()); + } + return pybind11::make_tuple(image_bytes, shape); + }, + pybind11::arg("config"), + "Render the decoded page to RGBA bytes using the provided RenderConfig"); // ============= Timing Keys Constants ============= @@ -799,10 +822,10 @@ PYBIND11_MODULE(pdf_parsers, m) { // ============= Threaded PDF Parser ============= - // PageDecodeResult - result of a threaded page decode task - pybind11::class_(m, "PageDecodeResult", + // _PageDecodeResult - internal result of a threaded page decode task + pybind11::class_(m, "_PageDecodeResult", R"( - Result of a threaded page decoding task. + Internal result of a threaded page decoding task. Attributes: doc_key (str): The document key this page belongs to. @@ -839,10 +862,10 @@ PYBIND11_MODULE(pdf_parsers, m) { Returns: str: The error message.)"); - // threaded_pdf_parser - parallel PDF parser with bounded result queue - pybind11::class_(m, "threaded_pdf_parser", + // _threaded_pdf_parser - internal parallel PDF parser with bounded result queue + pybind11::class_(m, "_threaded_pdf_parser", R"( - Threaded PDF parser that processes pages in parallel. + Internal threaded PDF parser that processes pages in parallel. Loads multiple documents and decodes their pages using a thread pool. Results are available via a bounded queue to control memory usage. @@ -865,12 +888,14 @@ PYBIND11_MODULE(pdf_parsers, m) { [](docling::docling_threaded_parser& self, const std::string& key, const std::string& filename, - std::optional& password) -> bool { - return self.load_document(key, filename, password); + std::optional& password, + std::optional>& page_numbers) -> bool { + return self.load_document(key, filename, password, page_numbers); }, pybind11::arg("key"), pybind11::arg("filename"), pybind11::arg("password") = pybind11::none(), + pybind11::arg("page_numbers") = pybind11::none(), R"( Load a document by key and filename. 
@@ -878,6 +903,7 @@ PYBIND11_MODULE(pdf_parsers, m) { key (str): The unique key to identify the document. filename (str): The path to the document file to load. password (str, optional): Optional password for password-protected files. + page_numbers (Sequence[int], optional): Selected 1-indexed physical pages to schedule. Returns: bool: True if the document was successfully loaded.)") @@ -886,12 +912,14 @@ PYBIND11_MODULE(pdf_parsers, m) { [](docling::docling_threaded_parser& self, const std::string& key, pybind11::object bytes_io, - std::optional& password) -> bool { - return self.load_document_from_bytesio(key, bytes_io, password); + std::optional& password, + std::optional>& page_numbers) -> bool { + return self.load_document_from_bytesio(key, bytes_io, password, page_numbers); }, pybind11::arg("key"), pybind11::arg("bytes_io"), pybind11::arg("password") = pybind11::none(), + pybind11::arg("page_numbers") = pybind11::none(), R"( Load a document from a BytesIO-like object. @@ -899,10 +927,54 @@ PYBIND11_MODULE(pdf_parsers, m) { key (str): The unique key to identify the document. bytes_io (Any): A BytesIO-like object containing the document data. password (str, optional): Optional password for password-protected files. + page_numbers (Sequence[int], optional): Selected 1-indexed physical pages to schedule. Returns: bool: True if the document was successfully loaded.)") + .def("number_of_pages", + [](docling::docling_threaded_parser& self, const std::string& key) -> int { + return self.number_of_pages(key); + }, + pybind11::arg("key"), + R"( + Return the number of pages in a loaded document. + + Parameters: + key (str): The unique key identifying the document. 
+ + Returns: + int: Number of pages in the loaded document.)") + .def("scheduled_number_of_pages", + [](docling::docling_threaded_parser& self, const std::string& key) -> int { + return self.scheduled_number_of_pages(key); + }, + pybind11::arg("key"), + R"( + Return the number of scheduled pages in a loaded document. + + Parameters: + key (str): The unique key identifying the document. + + Returns: + int: Number of pages that will be emitted by the threaded parser.)") + .def("unload_document", + [](docling::docling_threaded_parser& self, const std::string& key) -> bool { + return self.unload_document(key); + }, + pybind11::arg("key"), + R"( + Unload one document after threaded processing is complete. + + Returns: + bool: True when document state existed and was removed.)") + .def("unload_all_documents", + [](docling::docling_threaded_parser& self) { + self.unload_all_documents(); + }, + R"( + Unload all documents after threaded processing is complete.)") + .def("has_tasks", [](docling::docling_threaded_parser& self) -> bool { return self.has_tasks(); @@ -926,7 +998,7 @@ PYBIND11_MODULE(pdf_parsers, m) { Blocks until a result is available. Releases the GIL while waiting. Returns: - PageDecodeResult: The result of a page decoding task.)"); + _PageDecodeResult: The result of a page decoding task.)"); // ============= Threaded PDF Renderer ============= @@ -942,6 +1014,7 @@ PYBIND11_MODULE(pdf_parsers, m) { fit_glyph_bbox_to_target (bool): Uniformly rescale measured glyph outlines so the rendered bbox fits inside the target glyph bbox, with either width or height matching exactly [default=false]. resolve_fonts (bool): Resolve PDF font names to system fonts [default=true]. font_similarity_cutoff (float): Minimum Jaccard similarity for fuzzy font matching; candidates below this threshold fall back to the default font [default=0.25]. + scale (float): Target render scale in multiples of the PDF page size; -1 disables scale-based sizing [default=-1]. 
canvas_width (int): Target canvas width in pixels; -1 means use PDF page size [default=-1]. canvas_height (int): Target canvas height in pixels; -1 means use PDF page size [default=-1]. )") @@ -952,15 +1025,16 @@ PYBIND11_MODULE(pdf_parsers, m) { .def_readwrite("fit_glyph_bbox_to_target",&pdflib::render_config::fit_glyph_bbox_to_target) .def_readwrite("resolve_fonts", &pdflib::render_config::resolve_fonts) .def_readwrite("font_similarity_cutoff", &pdflib::render_config::font_similarity_cutoff) + .def_readwrite("scale", &pdflib::render_config::scale) .def_readwrite("canvas_width", &pdflib::render_config::canvas_width) .def_readwrite("canvas_height", &pdflib::render_config::canvas_height); - // PageRenderResult - result of a threaded page render task - pybind11::class_(m, "PageRenderResult", + // _PageRenderResult - internal result of a threaded page render task + pybind11::class_(m, "_PageRenderResult", R"( - Result of a threaded page rendering task. + Internal result of a threaded page rendering task. - Inherits all attributes of PageDecodeResult and adds rendered image data. + Inherits all attributes of _PageDecodeResult and adds rendered image data. Attributes: image_data: Raw RGBA bytes of the rendered page (height x width x 4, row-major). @@ -988,10 +1062,10 @@ PYBIND11_MODULE(pdf_parsers, m) { Returns: bytes: Raw RGBA pixel data, or empty bytes on failure.)"); - // threaded_pdf_renderer - parallel PDF renderer with bounded result queue - pybind11::class_(m, "threaded_pdf_renderer", + // _threaded_pdf_renderer - internal parallel PDF renderer with bounded result queue + pybind11::class_(m, "_threaded_pdf_renderer", R"( - Threaded PDF renderer that decodes and renders pages in parallel. + Internal threaded PDF renderer that decodes and renders pages in parallel. Loads multiple documents and renders their pages using a thread pool. Each result contains both the decoded page data and the rendered RGBA image. 
@@ -1018,23 +1092,47 @@ PYBIND11_MODULE(pdf_parsers, m) { [](docling::docling_threaded_renderer& self, const std::string& key, const std::string& filename, - std::optional& password) -> bool { - return self.load_document(key, filename, password); + std::optional& password, + std::optional>& page_numbers) -> bool { + return self.load_document(key, filename, password, page_numbers); }, pybind11::arg("key"), pybind11::arg("filename"), - pybind11::arg("password") = pybind11::none()) + pybind11::arg("password") = pybind11::none(), + pybind11::arg("page_numbers") = pybind11::none()) .def("load_document_from_bytesio", [](docling::docling_threaded_renderer& self, const std::string& key, pybind11::object bytes_io, - std::optional& password) -> bool { - return self.load_document_from_bytesio(key, bytes_io, password); + std::optional& password, + std::optional>& page_numbers) -> bool { + return self.load_document_from_bytesio(key, bytes_io, password, page_numbers); }, pybind11::arg("key"), pybind11::arg("bytes_io"), - pybind11::arg("password") = pybind11::none()) + pybind11::arg("password") = pybind11::none(), + pybind11::arg("page_numbers") = pybind11::none()) + + .def("number_of_pages", + [](docling::docling_threaded_renderer& self, const std::string& key) -> int { + return self.number_of_pages(key); + }, + pybind11::arg("key")) + .def("scheduled_number_of_pages", + [](docling::docling_threaded_renderer& self, const std::string& key) -> int { + return self.scheduled_number_of_pages(key); + }, + pybind11::arg("key")) + .def("unload_document", + [](docling::docling_threaded_renderer& self, const std::string& key) -> bool { + return self.unload_document(key); + }, + pybind11::arg("key")) + .def("unload_all_documents", + [](docling::docling_threaded_renderer& self) { + self.unload_all_documents(); + }) .def("has_tasks", [](docling::docling_threaded_renderer& self) -> bool { @@ -1052,5 +1150,5 @@ PYBIND11_MODULE(pdf_parsers, m) { Blocks until a result is available. 
Releases the GIL while waiting. Returns: - PageRenderResult: The result of a page rendering task.)"); + _PageRenderResult: The result of a page rendering task.)"); } diff --git a/app/render.cpp b/app/render.cpp index 193de5dd..7d7369c4 100644 --- a/app/render.cpp +++ b/app/render.cpp @@ -218,6 +218,7 @@ int main(int argc, char* argv[]) cxxopts::value()->implicit_value("true")) ("resolve-fonts", "Resolve PDF font names to system fonts (default: true)", cxxopts::value()->implicit_value("true")) ("font-similarity-cutoff", "Minimum Jaccard similarity for fuzzy font matching (default: 0.25)", cxxopts::value()) + ("scale", "Canvas scale in multiples of the PDF page size (-1 = disabled)", cxxopts::value()) ("canvas-width", "Canvas width in pixels (-1 = use page size)", cxxopts::value()) ("canvas-height", "Canvas height in pixels (-1 = use page size)", cxxopts::value()) @@ -317,6 +318,7 @@ int main(int argc, char* argv[]) if (result.count("fit-glyph-bbox-to-target")) { cfg.fit_glyph_bbox_to_target = result["fit-glyph-bbox-to-target"].as(); } if (result.count("resolve-fonts")) { cfg.resolve_fonts = result["resolve-fonts"].as(); } if (result.count("font-similarity-cutoff")) { cfg.font_similarity_cutoff = result["font-similarity-cutoff"].as(); } + if (result.count("scale")) { cfg.scale = result["scale"].as(); } if (result.count("canvas-width")) { cfg.canvas_width = result["canvas-width"].as(); } if (result.count("canvas-height")) { cfg.canvas_height = result["canvas-height"].as(); } diff --git a/docling_parse/pdf_parser.py b/docling_parse/pdf_parser.py index 36c727d4..656f2d1e 100644 --- a/docling_parse/pdf_parser.py +++ b/docling_parse/pdf_parser.py @@ -2,9 +2,10 @@ import hashlib import logging +import math from io import BytesIO from pathlib import Path -from typing import Any, Dict, Iterator, List, Optional, Tuple, Union +from typing import Any, Dict, Iterator, List, Optional, Sequence, Tuple, Union from docling_core.types.doc.base import BoundingBox, CoordOrigin, 
ImageRefMode from docling_core.types.doc.document import ImageRef @@ -59,15 +60,14 @@ TIMING_PREFIX_DECODE_XOBJECT, TIMING_PREFIX_DECODING_PAGE, DecodePageConfig, # type: ignore[import] - PageDecodeResult, # type: ignore[import] PdfPageDecoder, # type: ignore[import] RenderConfig, # type: ignore[import] + _threaded_pdf_parser, # type: ignore[import] + _threaded_pdf_renderer, # type: ignore[import] get_decode_page_timing_keys, get_static_timing_keys, is_static_timing_key, pdf_parser, # type: ignore[import] - threaded_pdf_parser, # type: ignore[import] - threaded_pdf_renderer, # type: ignore[import] ) # Configure logging @@ -182,6 +182,306 @@ def decode_page_keys() -> List[str]: return get_decode_page_timing_keys() +def _to_bounding_rectangle( + bbox: tuple[float, float, float, float], +) -> BoundingRectangle: + return BoundingRectangle( + r_x0=bbox[0], + r_y0=bbox[1], + r_x1=bbox[2], + r_y1=bbox[1], + r_x2=bbox[2], + r_y2=bbox[3], + r_x3=bbox[0], + r_y3=bbox[3], + coord_origin=CoordOrigin.BOTTOMLEFT, + ) + + +def _to_bounding_box(bbox: tuple[float, float, float, float]) -> BoundingBox: + return BoundingBox( + l=bbox[0], + b=bbox[1], + r=bbox[2], + t=bbox[3], + coord_origin=CoordOrigin.BOTTOMLEFT, + ) + + +def _get_boundary_bbox( + page_dim, + boundary_type: PdfPageBoundaryType, +) -> tuple[float, float, float, float]: + media_bbox = tuple(page_dim.get_media_bbox()) + crop_bbox = tuple(page_dim.get_crop_bbox()) + + if boundary_type == PdfPageBoundaryType.MEDIA_BOX: + return media_bbox + + return crop_bbox + + +def _to_page_geometry_from_decoder( + page_dim, + boundary_type: PdfPageBoundaryType, +) -> PdfPageGeometry: + crop_bbox = tuple(page_dim.get_crop_bbox()) + media_bbox = tuple(page_dim.get_media_bbox()) + boundary_bbox = _get_boundary_bbox(page_dim, boundary_type) + + return PdfPageGeometry( + angle=page_dim.get_angle(), + boundary_type=boundary_type, + rect=_to_bounding_rectangle(boundary_bbox), + art_bbox=_to_bounding_box(crop_bbox), + 
media_bbox=_to_bounding_box(media_bbox), + trim_bbox=_to_bounding_box(crop_bbox), + crop_bbox=_to_bounding_box(crop_bbox), + bleed_bbox=_to_bounding_box(crop_bbox), + ) + + +def _to_cells_from_decoder(cells_container) -> List[Union[PdfTextCell, TextCell]]: + result: List[Union[PdfTextCell, TextCell]] = [] + + for ind, cell in enumerate(cells_container): + result.append( + PdfTextCell( + rect=BoundingRectangle( + r_x0=cell.r_x0, + r_y0=cell.r_y0, + r_x1=cell.r_x1, + r_y1=cell.r_y1, + r_x2=cell.r_x2, + r_y2=cell.r_y2, + r_x3=cell.r_x3, + r_y3=cell.r_y3, + ), + text=cell.text, + orig=cell.text, + font_key=cell.font_key, + font_name=cell.font_name, + widget=cell.widget, + text_direction=( + TextDirection.LEFT_TO_RIGHT + if cell.left_to_right + else TextDirection.RIGHT_TO_LEFT + ), + index=ind, + rendering_mode=cell.rendering_mode, + ) + ) + + return result + + +def _to_shapes_from_decoder(shapes_container) -> List[PdfShape]: + result: List[PdfShape] = [] + + for ind, shape in enumerate(shapes_container): + x_coords = shape.get_x() + y_coords = shape.get_y() + indices = shape.get_i() + + for pair_idx in range(0, len(indices), 2): + i0: int = indices[pair_idx + 0] + i1: int = indices[pair_idx + 1] + + points: List[Coord2D] = [] + for k in range(i0, i1): + points.append(Coord2D(x_coords[k], y_coords[k])) + + rgb_s = shape.get_rgb_stroking_ops() + rgb_f = shape.get_rgb_filling_ops() + + result.append( + PdfShape( + index=ind, + parent_id=pair_idx, + points=points, + has_graphics_state=shape.get_has_graphics_state(), + line_width=shape.get_line_width(), + miter_limit=shape.get_miter_limit(), + line_cap=shape.get_line_cap(), + line_join=shape.get_line_join(), + dash_phase=shape.get_dash_phase(), + dash_array=list(shape.get_dash_array()), + flatness=shape.get_flatness(), + rgb_stroking=ColorRGBA(r=rgb_s[0], g=rgb_s[1], b=rgb_s[2]), + rgb_filling=ColorRGBA(r=rgb_f[0], g=rgb_f[1], b=rgb_f[2]), + ) + ) + + return result + + +def _to_widgets_from_decoder(widgets_container) -> 
List[PdfWidget]: + result: List[PdfWidget] = [] + + for ind, widget in enumerate(widgets_container): + result.append( + PdfWidget( + index=ind, + rect=BoundingRectangle( + r_x0=widget.x0, + r_y0=widget.y0, + r_x1=widget.x1, + r_y1=widget.y0, + r_x2=widget.x1, + r_y2=widget.y1, + r_x3=widget.x0, + r_y3=widget.y1, + ), + widget_text=widget.text or None, + widget_description=widget.description or None, + widget_field_name=widget.field_name or None, + widget_field_type=widget.field_type or None, + ) + ) + + return result + + +def _to_hyperlinks_from_decoder(hyperlinks_container) -> List[PdfHyperlink]: + result: List[PdfHyperlink] = [] + + for ind, hyperlink in enumerate(hyperlinks_container): + result.append( + PdfHyperlink( + index=ind, + rect=BoundingRectangle( + r_x0=hyperlink.x0, + r_y0=hyperlink.y0, + r_x1=hyperlink.x1, + r_y1=hyperlink.y0, + r_x2=hyperlink.x1, + r_y2=hyperlink.y1, + r_x3=hyperlink.x0, + r_y3=hyperlink.y1, + ), + uri=hyperlink.uri or None, + ) + ) + + return result + + +def _to_bitmap_resources_from_decoder(images_container) -> List[BitmapResource]: + result: List[BitmapResource] = [] + + for ind, image in enumerate(images_container): + image_ref = None + mode = ImageRefMode.PLACEHOLDER + + try: + image_bytes = image.get_image_as_bytes() + + if image_bytes and len(image_bytes) > 0: + fmt = image.get_image_format() + pil_image: PILImage.Image | None = None + + if fmt in ("jpeg", "jp2"): + pil_image = PILImage.open(BytesIO(image_bytes)) + elif fmt in ("raw", "jbig2"): + pil_mode = image.get_pil_mode() + w = image.image_width + h = image.image_height + if w > 0 and h > 0: + pil_image = PILImage.frombytes(pil_mode, (w, h), image_bytes) + + if pil_image is not None: + if pil_image.mode != "RGBA": + pil_image = pil_image.convert("RGBA") + + bbox_width = abs(image.x1 - image.x0) + if bbox_width > 0 and image.image_width > 0: + dpi = round(image.image_width * 72.0 / bbox_width) + else: + dpi = 72 + + image_ref = ImageRef.from_pil(pil_image, dpi=dpi) + 
mode = ImageRefMode.EMBEDDED + + except Exception: + _log.debug( + "Failed to extract image data for bitmap, falling back to placeholder" + ) + + result.append( + BitmapResource( + index=ind, + rect=BoundingRectangle( + r_x0=image.x0, + r_y0=image.y0, + r_x1=image.x1, + r_y1=image.y0, + r_x2=image.x1, + r_y2=image.y1, + r_x3=image.x0, + r_y3=image.y1, + ), + uri=None, + image=image_ref, + mode=mode, + ) + ) + + return result + + +def segmented_page_from_decoder( + page_decoder: PdfPageDecoder, + boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX, +) -> SegmentedPdfPage: + """Convert a C++ PdfPageDecoder to a SegmentedPdfPage.""" + char_cells = _to_cells_from_decoder(page_decoder.get_char_cells()) + + segmented_page = SegmentedPdfPage( + dimension=_to_page_geometry_from_decoder( + page_decoder.get_page_dimension(), boundary_type + ), + char_cells=char_cells, + word_cells=[], + textline_cells=[], + has_chars=len(char_cells) > 0, + bitmap_resources=_to_bitmap_resources_from_decoder( + page_decoder.get_page_images() + ), + shapes=_to_shapes_from_decoder(page_decoder.get_page_shapes()), + widgets=_to_widgets_from_decoder(page_decoder.get_page_widgets()), + hyperlinks=_to_hyperlinks_from_decoder(page_decoder.get_page_hyperlinks()), + ) + + if page_decoder.has_word_cells(): + segmented_page.word_cells = _to_cells_from_decoder( + page_decoder.get_word_cells() + ) + segmented_page.has_words = len(segmented_page.word_cells) > 0 + + if page_decoder.has_line_cells(): + segmented_page.textline_cells = _to_cells_from_decoder( + page_decoder.get_line_cells() + ) + segmented_page.has_lines = len(segmented_page.textline_cells) > 0 + + return segmented_page + + +def _timings_from_decoder(page_decoder: PdfPageDecoder) -> Timings: + return Timings( + data=dict(page_decoder.get_timings()), + raw_data=dict(page_decoder.get_timings_raw()), + ) + + +def _page_size_from_decoder( + page_decoder: PdfPageDecoder, + boundary_type: PdfPageBoundaryType, +) -> tuple[float, float]: 
+ bbox = _get_boundary_bbox(page_decoder.get_page_dimension(), boundary_type) + return abs(bbox[2] - bbox[0]), abs(bbox[3] - bbox[1]) + + class PdfDocument: def __init__( self, @@ -402,7 +702,6 @@ def _get_page_with_timings_typed( segmented_page = self._to_segmented_page_from_decoder( page_decoder=page_decoder, - config=config, ) # Get timings from the page decoder @@ -420,308 +719,42 @@ def load_all_pages(self, config: DecodePageConfig | None = None): def _to_page_geometry_from_decoder(self, page_dim) -> PdfPageGeometry: """Convert typed PdfPageDimension to PdfPageGeometry.""" - crop_bbox = page_dim.get_crop_bbox() - media_bbox = page_dim.get_media_bbox() - angle = page_dim.get_angle() - - # Use crop_box as default boundary - bbox = crop_bbox - # Build page rectangle as a BoundingRectangle (typed API expects this) - rect = BoundingRectangle( - r_x0=bbox[0], - r_y0=bbox[1], - r_x1=bbox[2], - r_y1=bbox[1], - r_x2=bbox[2], - r_y2=bbox[3], - r_x3=bbox[0], - r_y3=bbox[3], - coord_origin=CoordOrigin.BOTTOMLEFT, - ) - art_bbox_obj = BoundingBox( - l=crop_bbox[0], - b=crop_bbox[1], - r=crop_bbox[2], - t=crop_bbox[3], - coord_origin=CoordOrigin.BOTTOMLEFT, - ) - media_bbox_obj = BoundingBox( - l=media_bbox[0], - b=media_bbox[1], - r=media_bbox[2], - t=media_bbox[3], - coord_origin=CoordOrigin.BOTTOMLEFT, - ) - crop_bbox_obj = BoundingBox( - l=crop_bbox[0], - b=crop_bbox[1], - r=crop_bbox[2], - t=crop_bbox[3], - coord_origin=CoordOrigin.BOTTOMLEFT, - ) - - return PdfPageGeometry( - angle=angle, - boundary_type=PdfPageBoundaryType(self._boundary_type), - rect=rect, - art_bbox=art_bbox_obj, - media_bbox=media_bbox_obj, - trim_bbox=crop_bbox_obj, - crop_bbox=crop_bbox_obj, - bleed_bbox=crop_bbox_obj, - ) + return _to_page_geometry_from_decoder(page_dim, self._boundary_type) def _to_cells_from_decoder( self, cells_container ) -> List[Union[PdfTextCell, TextCell]]: """Convert typed PdfCells container to list of PdfTextCell objects.""" - result: List[Union[PdfTextCell, TextCell]] 
= [] - - for ind, cell in enumerate(cells_container): - rect = BoundingRectangle( - r_x0=cell.r_x0, - r_y0=cell.r_y0, - r_x1=cell.r_x1, - r_y1=cell.r_y1, - r_x2=cell.r_x2, - r_y2=cell.r_y2, - r_x3=cell.r_x3, - r_y3=cell.r_y3, - ) - - result.append( - PdfTextCell( - rect=rect, - text=cell.text, - orig=cell.text, - font_key=cell.font_key, - font_name=cell.font_name, - widget=cell.widget, - text_direction=( - TextDirection.LEFT_TO_RIGHT - if cell.left_to_right - else TextDirection.RIGHT_TO_LEFT - ), - index=ind, - rendering_mode=cell.rendering_mode, - ) - ) - - return result + return _to_cells_from_decoder(cells_container) def _to_shapes_from_decoder(self, shapes_container) -> List[PdfShape]: """Convert typed PdfShapes container to list of PdfShape objects.""" - result: List[PdfShape] = [] - - for ind, shape in enumerate(shapes_container): - x_coords = shape.get_x() - y_coords = shape.get_y() - indices = shape.get_i() - - """ - print(f"{ind}\tlen(indices): {len(indices)} -> {len(x_coords)} -> {shape.get_rgb_filling_ops()}") - if len(indices)>2: - print(indices) - - if ind>8: - break - """ - - for pair_idx in range(0, len(indices), 2): - i0: int = indices[pair_idx + 0] - i1: int = indices[pair_idx + 1] - - points: List[Coord2D] = [] - for k in range(i0, i1): - points.append(Coord2D(x_coords[k], y_coords[k])) - - rgb_s = shape.get_rgb_stroking_ops() - rgb_f = shape.get_rgb_filling_ops() - - pdf_shape = PdfShape( - index=ind, - parent_id=pair_idx, - points=points, - has_graphics_state=shape.get_has_graphics_state(), - line_width=shape.get_line_width(), - miter_limit=shape.get_miter_limit(), - line_cap=shape.get_line_cap(), - line_join=shape.get_line_join(), - dash_phase=shape.get_dash_phase(), - dash_array=list(shape.get_dash_array()), - flatness=shape.get_flatness(), - rgb_stroking=ColorRGBA(r=rgb_s[0], g=rgb_s[1], b=rgb_s[2]), - rgb_filling=ColorRGBA(r=rgb_f[0], g=rgb_f[1], b=rgb_f[2]), - ) - result.append(pdf_shape) - - return result + return 
_to_shapes_from_decoder(shapes_container) def _to_widgets_from_decoder(self, widgets_container) -> List[PdfWidget]: """Convert typed PdfWidgets container to list of PdfWidget objects.""" - result: List[PdfWidget] = [] - - for ind, widget in enumerate(widgets_container): - rect = BoundingRectangle( - r_x0=widget.x0, - r_y0=widget.y0, - r_x1=widget.x1, - r_y1=widget.y0, - r_x2=widget.x1, - r_y2=widget.y1, - r_x3=widget.x0, - r_y3=widget.y1, - ) - result.append( - PdfWidget( - index=ind, - rect=rect, - widget_text=widget.text or None, - widget_description=widget.description or None, - widget_field_name=widget.field_name or None, - widget_field_type=widget.field_type or None, - ) - ) - - return result + return _to_widgets_from_decoder(widgets_container) def _to_hyperlinks_from_decoder(self, hyperlinks_container) -> List[PdfHyperlink]: """Convert typed PdfHyperlinks container to list of PdfHyperlink objects.""" - result: List[PdfHyperlink] = [] - - for ind, hyperlink in enumerate(hyperlinks_container): - rect = BoundingRectangle( - r_x0=hyperlink.x0, - r_y0=hyperlink.y0, - r_x1=hyperlink.x1, - r_y1=hyperlink.y0, - r_x2=hyperlink.x1, - r_y2=hyperlink.y1, - r_x3=hyperlink.x0, - r_y3=hyperlink.y1, - ) - result.append( - PdfHyperlink( - index=ind, - rect=rect, - uri=hyperlink.uri or None, - ) - ) - - return result + return _to_hyperlinks_from_decoder(hyperlinks_container) def _to_bitmap_resources_from_decoder( self, images_container ) -> List[BitmapResource]: """Convert typed PdfImages container to list of BitmapResource objects.""" - result: List[BitmapResource] = [] - - for ind, image in enumerate(images_container): - rect = BoundingRectangle( - r_x0=image.x0, - r_y0=image.y0, - r_x1=image.x1, - r_y1=image.y0, - r_x2=image.x1, - r_y2=image.y1, - r_x3=image.x0, - r_y3=image.y1, - ) - - image_ref = None - mode = ImageRefMode.PLACEHOLDER - - try: - image_bytes = image.get_image_as_bytes() - - if image_bytes and len(image_bytes) > 0: - fmt = image.get_image_format() - 
pil_image: PILImage.Image | None = None - - if fmt in ("jpeg", "jp2"): - pil_image = PILImage.open(BytesIO(image_bytes)) - elif fmt in ("raw", "jbig2"): - pil_mode = image.get_pil_mode() - w = image.image_width - h = image.image_height - if w > 0 and h > 0: - pil_image = PILImage.frombytes( - pil_mode, (w, h), image_bytes - ) - - if pil_image is not None: - # Normalize to RGBA for consistent downstream handling - if pil_image.mode != "RGBA": - pil_image = pil_image.convert("RGBA") - - # Compute DPI from pixel dimensions and PDF bbox - bbox_width = abs(image.x1 - image.x0) - if bbox_width > 0 and image.image_width > 0: - dpi = round(image.image_width * 72.0 / bbox_width) - else: - dpi = 72 - - image_ref = ImageRef.from_pil(pil_image, dpi=dpi) - mode = ImageRefMode.EMBEDDED - - except Exception: - _log.debug( - "Failed to extract image data for bitmap, falling back to placeholder" - ) - - bitmap = BitmapResource( - index=ind, rect=rect, uri=None, image=image_ref, mode=mode - ) - result.append(bitmap) - - return result + return _to_bitmap_resources_from_decoder(images_container) def _to_segmented_page_from_decoder( self, page_decoder, - *, - config: DecodePageConfig, ) -> SegmentedPdfPage: """Convert typed PdfPageDecoder to SegmentedPdfPage (zero-copy path).""" - - char_cells = self._to_cells_from_decoder(page_decoder.get_char_cells()) - shapes = self._to_shapes_from_decoder(page_decoder.get_page_shapes()) - widgets = self._to_widgets_from_decoder(page_decoder.get_page_widgets()) - hyperlinks = self._to_hyperlinks_from_decoder( - page_decoder.get_page_hyperlinks() - ) - bitmap_resources = self._to_bitmap_resources_from_decoder( - page_decoder.get_page_images() - ) - - segmented_page = SegmentedPdfPage( - dimension=self._to_page_geometry_from_decoder( - page_decoder.get_page_dimension() - ), - char_cells=char_cells, - word_cells=[], - textline_cells=[], - has_chars=len(char_cells) > 0, - bitmap_resources=bitmap_resources, - shapes=shapes, - widgets=widgets, - 
hyperlinks=hyperlinks, + return segmented_page_from_decoder( + page_decoder=page_decoder, + boundary_type=self._boundary_type, ) - if page_decoder.has_word_cells(): - segmented_page.word_cells = self._to_cells_from_decoder( - page_decoder.get_word_cells() - ) - segmented_page.has_words = len(segmented_page.word_cells) > 0 - - if page_decoder.has_line_cells(): - segmented_page.textline_cells = self._to_cells_from_decoder( - page_decoder.get_line_cells() - ) - segmented_page.has_lines = len(segmented_page.textline_cells) > 0 - - return segmented_page - def _get_page_typed( self, page_no: int, @@ -755,7 +788,6 @@ def _get_page_typed( self._pages[page_no] = self._to_segmented_page_from_decoder( page_decoder=page_decoder, - config=config, ) return self._pages[page_no] @@ -869,6 +901,8 @@ class ThreadedPdfParserConfig(BaseModel): loglevel: Logging level ('fatal', 'error', 'warning', 'info'). threads: Number of worker threads for parallel page decoding. max_concurrent_results: Maximum results buffered before workers pause. + boundary_type: Page boundary used for geometry conversion and page sizing. + render_config: Optional render configuration for parse-and-render mode. """ model_config = ConfigDict(arbitrary_types_allowed=True) @@ -876,241 +910,320 @@ class ThreadedPdfParserConfig(BaseModel): loglevel: str = "fatal" threads: int = 4 max_concurrent_results: int = 32 + boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX + render_config: RenderConfig | None = None -class DoclingThreadedPdfParser: - """Threaded PDF parser that decodes pages from multiple documents in parallel. 
- - Usage:: - - parser_config = ThreadedPdfParserConfig(loglevel="fatal", threads=4, max_concurrent_results=32) - decode_config = DecodePageConfig() - - parser = DoclingThreadedPdfParser(parser_config=parser_config, decode_config=decode_config) - - for source in sources: - parser.load(source) - - while parser.has_tasks(): - task = parser.get_task() - - if task.success: - page_decoder, timings = task.get() - else: - error_msg = task.error() - """ +class PageParseResult: + """Outcome of one page processed by DoclingThreadedPdfParser.""" def __init__( self, - parser_config: ThreadedPdfParserConfig | None = None, - decode_config: DecodePageConfig | None = None, + raw_result, + *, + boundary_type: PdfPageBoundaryType, + render_config: RenderConfig | None, ): - if parser_config is None: - parser_config = ThreadedPdfParserConfig() - if decode_config is None: - decode_config = DecodePageConfig() - - self._parser = threaded_pdf_parser( - loglevel=parser_config.loglevel, - num_threads=parser_config.threads, - max_concurrent_results=parser_config.max_concurrent_results, - config=decode_config, - ) - - def load( - self, - path_or_stream: Union[str, Path, BytesIO], - password: str | None = None, - ) -> str: - """Load a document for parallel processing. - - Parameters: - path_or_stream: File path or BytesIO object. - password: Optional password for protected files. - - Returns: - str: The document key. 
- """ - if isinstance(path_or_stream, str): - path_or_stream = Path(path_or_stream) - - if isinstance(path_or_stream, Path): - key = f"key={path_or_stream!s}" - success = self._parser.load_document( - key=key, filename=str(path_or_stream).encode("utf8"), password=password - ) - elif isinstance(path_or_stream, BytesIO): - hasher = hashlib.sha256(usedforsecurity=False) - while chunk := path_or_stream.read(8192): - hasher.update(chunk) - path_or_stream.seek(0) - hash_val = hasher.hexdigest() - - key = f"key={hash_val}" - success = self._parser.load_document_from_bytesio( - key=key, bytes_io=path_or_stream, password=password + self._raw = raw_result + self._boundary_type = boundary_type + self._render_config = render_config + self._page: SegmentedPdfPage | None = None + self._page_decoder: PdfPageDecoder | None = None + self._default_image: PILImage.Image | None = None + + self.doc_key: str = raw_result.doc_key + self.page_number: int = raw_result.page_number + 1 + self.success: bool = raw_result.success + + if self.success: + self._page_decoder, _ = raw_result.get() + self._timings = _timings_from_decoder(self._page_decoder) + self.page_width, self.page_height = _page_size_from_decoder( + self._page_decoder, boundary_type ) else: - raise TypeError( - f"Expected str, Path, or BytesIO, got {type(path_or_stream)}" + self._timings = Timings() + self.page_width = 0.0 + self.page_height = 0.0 + + @property + def has_image(self) -> bool: + """Whether get_image() can return a rendered image for this result.""" + return self._render_config is not None and self.success + + @property + def error_message(self) -> str: + """Error description; empty string when successful.""" + if self.success: + return "" + return self._raw.error() + + def _require_page_decoder(self) -> PdfPageDecoder: + if not self.success: + raise RuntimeError( + f"Cannot access failed page {self.page_number} for {self.doc_key}: {self.error_message}" ) + assert self._page_decoder is not None + return 
self._page_decoder + + def get_page(self) -> SegmentedPdfPage: + """Return the parsed page, converting lazily on first access.""" + if self._page is None: + self._page = segmented_page_from_decoder( + page_decoder=self._require_page_decoder(), + boundary_type=self._boundary_type, + ) + return self._page - if not success: - raise RuntimeError(f"Failed to load document with key {key}") - - return key - - def has_tasks(self) -> bool: - """Check if there are remaining tasks to consume. - - On first call, builds the task queue and starts worker threads. - - Returns: - bool: True if there are remaining results to consume. - """ - return self._parser.has_tasks() - - def get_task(self) -> "PageDecodeResult": - """Get the next completed page decode result. - - Blocks until a result is available. - - Returns: - PageDecodeResult: The result with doc_key, page_number, success flag. - Use task.get() to get (PdfPageDecoder, timings) or task.error() for error message. - """ - return self._parser.get_task() - - -# --------------------------------------------------------------------------- -# Threaded renderer -# --------------------------------------------------------------------------- - - -class ThreadedPdfRendererConfig(BaseModel): - """Configuration for the threaded PDF renderer. - - Attributes: - loglevel: Logging level ('fatal', 'error', 'warning', 'info'). - threads: Number of worker threads for parallel page rendering. - max_concurrent_results: Maximum results buffered before workers pause. - """ - - model_config = ConfigDict(arbitrary_types_allowed=True) - - loglevel: str = "fatal" - threads: int = 4 - max_concurrent_results: int = 32 - - -class PdfPageRenderResult: - """Wrapper around a raw C++ PageRenderResult providing PIL image conversion. - - Attributes: - doc_key: Document key the page belongs to. - page_number: 0-indexed page number. - success: Whether rendering succeeded. 
- """ - - def __init__(self, raw): - self._raw = raw - self.doc_key: str = raw.doc_key - self.page_number: int = raw.page_number - self.success: bool = raw.success - - def error(self) -> str: - """Return the error message if rendering failed, empty string otherwise.""" - return self._raw.error_message if not self.success else "" - - def get(self) -> Tuple[PdfPageDecoder, Dict[str, float]]: - """Return (page_decoder, timings) for the rendered page. - - Delegates to the underlying PageDecodeResult.get() so that render - results can be used interchangeably with parse results when accessing - the decoded page data. + def get_timings(self) -> Timings: + """Return structured timing data for this page parse.""" + return self._timings - Raises: - RuntimeError: If the task was not successful. - """ - return self._raw.get() + def _rendering_config(self) -> RenderConfig: + if self._render_config is None: + raise RuntimeError( + f"Rendered image not available for page {self.page_number} of {self.doc_key}" + ) + return _copy_render_config(self._render_config) - def get_image(self) -> PILImage.Image | None: - """Convert rendered pixel data to a PIL RGBA Image. + def _default_canvas_size(self) -> tuple[int, int]: + self._require_page_decoder() + self._rendering_config() + height, width, _ = self._raw.image_shape + return width, height - Returns: - PIL.Image.Image in RGBA mode, or None if rendering failed. 
- """ - if not self.success: - return None + def _scale_abs_tolerance(self) -> float: + if self.page_width <= 0 or self.page_height <= 0: + return 0.0 + return max(0.5 / self.page_width, 0.5 / self.page_height) - raw_bytes = self._raw.get_image() + @staticmethod + def _image_from_bytes( + raw_bytes: bytes, image_shape: Sequence[int] + ) -> PILImage.Image: + height, width, _ = image_shape + return PILImage.frombuffer( + "RGBA", (width, height), raw_bytes, "raw", "RGBA", 0, 1 + ).copy() + + def _get_default_image(self) -> PILImage.Image: + self._require_page_decoder() + self._rendering_config() + + if self._default_image is None: + raw_bytes = self._raw.get_image() + if not raw_bytes: + raise RuntimeError( + f"Rendered image is empty for page {self.page_number} of {self.doc_key}" + ) + self._default_image = self._image_from_bytes( + raw_bytes, self._raw.image_shape + ) + return self._default_image + + def _render_image_at_scale(self, scale: float) -> PILImage.Image: + page_decoder = self._require_page_decoder() + render_config = self._rendering_config() + render_config.scale = scale + render_config.canvas_width = -1 + render_config.canvas_height = -1 + raw_bytes, image_shape = page_decoder.render_image(render_config) if not raw_bytes: - return None - - h, w, _ = self._raw.image_shape - return PILImage.frombuffer("RGBA", (w, h), raw_bytes, "raw", "RGBA", 0, 1) - - -class DoclingThreadedPdfRenderer: - """Threaded PDF renderer that decodes and renders pages from multiple documents in parallel. - - Each result contains both the decoded page data (accessible via the page_decoder) - and the rendered RGBA image, produced in a single pass. 
+ raise RuntimeError( + f"Rendered image is empty for page {self.page_number} of {self.doc_key}" + ) + return self._image_from_bytes(raw_bytes, image_shape) + + def _render_image_at_canvas_size( + self, canvas_size: tuple[int, int] + ) -> PILImage.Image: + page_decoder = self._require_page_decoder() + render_config = self._rendering_config() + render_config.scale = -1.0 + render_config.canvas_width, render_config.canvas_height = canvas_size + raw_bytes, image_shape = page_decoder.render_image(render_config) + if not raw_bytes: + raise RuntimeError( + f"Rendered image is empty for page {self.page_number} of {self.doc_key}" + ) + return self._image_from_bytes(raw_bytes, image_shape) + + def _crop_image( + self, image: PILImage.Image, cropbox: BoundingBox | None + ) -> PILImage.Image: + if cropbox is None: + return image + if self.page_width <= 0 or self.page_height <= 0: + return image + + cropbox_top_left = cropbox.to_top_left_origin(page_height=self.page_height) + x_scale = image.width / self.page_width + y_scale = image.height / self.page_height + + left = max(0, round(cropbox_top_left.l * x_scale)) + top = max(0, round(cropbox_top_left.t * y_scale)) + right = min(image.width, round(cropbox_top_left.r * x_scale)) + bottom = min(image.height, round(cropbox_top_left.b * y_scale)) + return image.crop((left, top, right, bottom)) + + def get_image( + self, + scale: float | None = None, + canvas_size: tuple[int, int] | None = None, + cropbox: BoundingBox | None = None, + ) -> PILImage.Image: + """Return the rendered page image.""" + if scale is not None and canvas_size is not None: + raise ValueError("Provide either scale or canvas_size, not both") + + if scale is None and canvas_size is None: + image = self._get_default_image() + return self._crop_image(image, cropbox) + + if scale is not None: + if scale <= 0: + raise ValueError(f"scale must be > 0, got {scale}") + render_config = self._rendering_config() + if math.isclose( + scale, + render_config.scale, + 
rel_tol=0.0, + abs_tol=self._scale_abs_tolerance(), + ): + image = self._get_default_image() + else: + image = self._render_image_at_scale(scale) + else: + assert canvas_size is not None + if canvas_size[0] <= 0 or canvas_size[1] <= 0: + raise ValueError( + f"canvas_size must contain positive integers, got {canvas_size}" + ) + if canvas_size == self._default_canvas_size(): + image = self._get_default_image() + else: + image = self._render_image_at_canvas_size(canvas_size) + + return self._crop_image(image, cropbox) + + def _export_render_instructions_json(self) -> Dict[str, Any]: + return self._require_page_decoder().export_render_instructions_json() + + def _export_bitmap_artifacts(self) -> List[Dict[str, Any]]: + return self._require_page_decoder().export_bitmap_artifacts() + + +def _copy_decode_config(src: DecodePageConfig) -> DecodePageConfig: + dst = DecodePageConfig() + dst.page_boundary = src.page_boundary + dst.do_sanitization = src.do_sanitization + dst.keep_char_cells = src.keep_char_cells + dst.keep_shapes = src.keep_shapes + dst.keep_bitmaps = src.keep_bitmaps + dst.max_num_lines = src.max_num_lines + dst.max_num_bitmaps = src.max_num_bitmaps + dst.create_word_cells = src.create_word_cells + dst.create_line_cells = src.create_line_cells + dst.enforce_same_font = src.enforce_same_font + dst.horizontal_cell_tolerance = src.horizontal_cell_tolerance + dst.word_space_width_factor_for_merge = src.word_space_width_factor_for_merge + dst.line_space_width_factor_for_merge = src.line_space_width_factor_for_merge + dst.line_space_width_factor_for_merge_with_space = ( + src.line_space_width_factor_for_merge_with_space + ) + dst.do_thread_safe = src.do_thread_safe + dst.keep_glyphs = src.keep_glyphs + dst.keep_qpdf_warnings = src.keep_qpdf_warnings + return dst + + +def _copy_render_config(src: RenderConfig) -> RenderConfig: + dst = RenderConfig() + dst.render_text = src.render_text + dst.draw_text_bbox = src.draw_text_bbox + dst.resolve_fonts = src.resolve_fonts + 
dst.font_similarity_cutoff = src.font_similarity_cutoff + dst.scale = src.scale + dst.canvas_width = src.canvas_width + dst.canvas_height = src.canvas_height + return dst + + +def _validate_render_config(src: RenderConfig) -> None: + have_scale = src.scale > 0 + have_width = src.canvas_width > 0 + have_height = src.canvas_height > 0 + + if src.scale != -1.0 and src.scale <= 0: + raise ValueError("render_config.scale must be > 0 or -1") + if src.canvas_width != -1 and src.canvas_width <= 0: + raise ValueError("render_config.canvas_width must be > 0 or -1") + if src.canvas_height != -1 and src.canvas_height <= 0: + raise ValueError("render_config.canvas_height must be > 0 or -1") + if have_scale and (have_width or have_height): + raise ValueError( + "render_config.scale cannot be combined with canvas_width or canvas_height" + ) - Usage:: - render_config = RenderConfig() - decode_config = DecodePageConfig() - renderer_config = ThreadedPdfRendererConfig(threads=4) +def _validated_render_config(src: RenderConfig) -> RenderConfig: + _validate_render_config(src) + return _copy_render_config(src) - renderer = DoclingThreadedPdfRenderer( - renderer_config=renderer_config, - decode_config=decode_config, - render_config=render_config, - ) - for source in sources: - renderer.load(source) - - while renderer.has_tasks(): - result = renderer.get_task() - if result.success: - image = result.get_image() # PIL RGBA Image - else: - print(result.error()) - """ +class DoclingThreadedPdfParser: + """Threaded PDF parser that decodes pages from multiple documents in parallel.""" def __init__( self, - renderer_config: ThreadedPdfRendererConfig | None = None, + parser_config: ThreadedPdfParserConfig | None = None, decode_config: DecodePageConfig | None = None, - render_config: RenderConfig | None = None, ): - if renderer_config is None: - renderer_config = ThreadedPdfRendererConfig() - if decode_config is None: - decode_config = DecodePageConfig() - if render_config is None: - render_config 
= RenderConfig() - - self._renderer = threaded_pdf_renderer( - loglevel=renderer_config.loglevel, - num_threads=renderer_config.threads, - max_concurrent_results=renderer_config.max_concurrent_results, - decode_config=decode_config, - render_config=render_config, + if parser_config is None: + parser_config = ThreadedPdfParserConfig() + + self._parser_config = parser_config + if parser_config.render_config is not None: + parser_config.render_config = _validated_render_config( + parser_config.render_config + ) + self._decode_config = ( + _copy_decode_config(decode_config) + if decode_config is not None + else DecodePageConfig() ) + self._decode_config.page_boundary = parser_config.boundary_type.value + self._page_counts: Dict[str, int] = {} + self._scheduled_page_counts: Dict[str, int] = {} + + if parser_config.render_config is None: + self._parser = _threaded_pdf_parser( + loglevel=parser_config.loglevel, + num_threads=parser_config.threads, + max_concurrent_results=parser_config.max_concurrent_results, + config=self._decode_config, + ) + else: + self._parser = _threaded_pdf_renderer( + loglevel=parser_config.loglevel, + num_threads=parser_config.threads, + max_concurrent_results=parser_config.max_concurrent_results, + decode_config=self._decode_config, + render_config=parser_config.render_config, + ) def load( self, path_or_stream: Union[str, Path, BytesIO], password: str | None = None, + page_numbers: Sequence[int] | None = None, ) -> str: - """Load a document for parallel rendering. + """Load a document for parallel processing. Parameters: path_or_stream: File path or BytesIO object. password: Optional password for protected files. + page_numbers: Optional 1-indexed physical pages to schedule. Returns: str: The document key. 
@@ -1120,8 +1233,11 @@ def load( if isinstance(path_or_stream, Path): key = f"key={path_or_stream!s}" - success = self._renderer.load_document( - key=key, filename=str(path_or_stream).encode("utf8"), password=password + success = self._parser.load_document( + key=key, + filename=str(path_or_stream).encode("utf8"), + password=password, + page_numbers=list(page_numbers) if page_numbers is not None else None, ) elif isinstance(path_or_stream, BytesIO): hasher = hashlib.sha256(usedforsecurity=False) @@ -1131,8 +1247,11 @@ def load( hash_val = hasher.hexdigest() key = f"key={hash_val}" - success = self._renderer.load_document_from_bytesio( - key=key, bytes_io=path_or_stream, password=password + success = self._parser.load_document_from_bytesio( + key=key, + bytes_io=path_or_stream, + password=password, + page_numbers=list(page_numbers) if page_numbers is not None else None, ) else: raise TypeError( @@ -1142,8 +1261,35 @@ def load( if not success: raise RuntimeError(f"Failed to load document with key {key}") + self._page_counts[key] = self._parser.number_of_pages(key) + self._scheduled_page_counts[key] = self._parser.scheduled_number_of_pages(key) return key + def page_count(self, doc_key: str) -> int: + """Return the total page count for a loaded document.""" + if doc_key not in self._page_counts: + raise ValueError(f"Document key not loaded: {doc_key}") + return self._page_counts[doc_key] + + def scheduled_page_count(self, doc_key: str) -> int: + """Return the number of pages scheduled for threaded emission.""" + if doc_key not in self._scheduled_page_counts: + raise ValueError(f"Document key not loaded: {doc_key}") + return self._scheduled_page_counts[doc_key] + + def unload(self, doc_key: str) -> bool: + """Unload one document after threaded processing has completed.""" + unloaded = self._parser.unload_document(doc_key) + self._page_counts.pop(doc_key, None) + self._scheduled_page_counts.pop(doc_key, None) + return unloaded + + def unload_all(self) -> None: + 
"""Unload all documents after threaded processing has completed.""" + self._parser.unload_all_documents() + self._page_counts.clear() + self._scheduled_page_counts.clear() + def has_tasks(self) -> bool: """Check if there are remaining tasks to consume. @@ -1152,139 +1298,23 @@ def has_tasks(self) -> bool: Returns: bool: True if there are remaining results to consume. """ - return self._renderer.has_tasks() + return self._parser.has_tasks() - def get_task(self) -> PdfPageRenderResult: - """Get the next completed page render result. + def iterate_results(self) -> Iterator["PageParseResult"]: + """Yield page results in completion order.""" + while self.has_tasks(): + yield self.get_task() + + def get_task(self) -> "PageParseResult": + """Get the next completed page decode result. Blocks until a result is available. Returns: - PdfPageRenderResult: wraps doc_key, page_number, success, and get_image(). + PageParseResult: Parsed page result with lazy page conversion and optional image access. """ - return PdfPageRenderResult(self._renderer.get_task()) - - -class PdfRenderDocument: - def __init__( - self, - *, - path_or_stream: Union[Path, bytes], - parser_doc: PdfDocument, - renderer_config: ThreadedPdfRendererConfig, - decode_config: DecodePageConfig, - render_config: RenderConfig, - password: str | None = None, - ): - self._path_or_stream = path_or_stream - self._parser_doc = parser_doc - self._renderer_config = renderer_config - self._decode_config = decode_config - self._render_config = render_config - self._password = password - self._pages: Dict[int, PdfPageRenderResult] = {} - - def _make_renderer(self) -> "DoclingThreadedPdfRenderer": - return DoclingThreadedPdfRenderer( - renderer_config=self._renderer_config, - decode_config=self._decode_config, - render_config=self._render_config, - ) - - def _load_source(self, renderer: "DoclingThreadedPdfRenderer") -> str: - if isinstance(self._path_or_stream, Path): - return renderer.load(self._path_or_stream, 
password=self._password) - - return renderer.load(BytesIO(self._path_or_stream), password=self._password) - - def _render_all_pages(self) -> None: - if len(self._pages) == self.number_of_pages(): - return - - renderer = self._make_renderer() - key = self._load_source(renderer) - - while renderer.has_tasks(): - result = renderer.get_task() - if result.doc_key != key: - continue - if not result.success: - raise RuntimeError( - f"Failed to render page {result.page_number + 1}: {result.error()}" - ) - self._pages[result.page_number + 1] = result - - def number_of_pages(self) -> int: - return self._parser_doc.number_of_pages() - - def get_page(self, page_no: int) -> PdfPageRenderResult: - if not (1 <= page_no <= self.number_of_pages()): - raise ValueError( - f"incorrect page_no: {page_no} (min:1, max:{self.number_of_pages()})" - ) - - if page_no not in self._pages: - self._render_all_pages() - - return self._pages[page_no] - - def iterate_pages(self) -> Iterator[Tuple[int, PdfPageRenderResult]]: - self._render_all_pages() - for page_no in range(1, self.number_of_pages() + 1): - yield page_no, self._pages[page_no] - - def unload(self) -> bool: - self._pages.clear() - return self._parser_doc.unload() - - -class DoclingPdfRenderer: - def __init__( - self, - loglevel: str = "fatal", - decode_config: DecodePageConfig | None = None, - render_config: RenderConfig | None = None, - ): - self._loglevel = loglevel - self._parser = DoclingPdfParser(loglevel=loglevel) - self._renderer_config = ThreadedPdfRendererConfig( - loglevel=loglevel, - threads=1, - max_concurrent_results=1, - ) - self._decode_config = decode_config or DecodePageConfig() - self._render_config = render_config or RenderConfig() - - def load( - self, - path_or_stream: Union[str, Path, BytesIO], - lazy: bool = True, - boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX, - password: str | None = None, - ) -> PdfRenderDocument: - parser_doc = self._parser.load( - path_or_stream=path_or_stream, - 
lazy=lazy, - boundary_type=boundary_type, - password=password, - ) - - if isinstance(path_or_stream, str): - source: Union[Path, bytes] = Path(path_or_stream) - elif isinstance(path_or_stream, Path): - source = path_or_stream - elif isinstance(path_or_stream, BytesIO): - source = path_or_stream.getvalue() - else: - raise TypeError( - f"Expected str, Path, or BytesIO, got {type(path_or_stream)}" - ) - - return PdfRenderDocument( - path_or_stream=source, - parser_doc=parser_doc, - renderer_config=self._renderer_config, - decode_config=self._decode_config, - render_config=self._render_config, - password=password, + return PageParseResult( + self._parser.get_task(), + boundary_type=self._parser_config.boundary_type, + render_config=self._parser_config.render_config, ) diff --git a/docs/plans/threaded-api-design.md b/docs/plans/threaded-api-design.md new file mode 100644 index 00000000..b23be024 --- /dev/null +++ b/docs/plans/threaded-api-design.md @@ -0,0 +1,367 @@ +# Threaded Parser Public API Design + +**Status:** Implemented +**Last updated:** 2026-04-28 +**Scope:** `docling-parse` only + +This document is the consolidated design and behavior reference for the public threaded parser API in `docling-parse`. + +It supersedes the narrower `update-threaded-api.md` plan. The decisions from that follow-up plan are folded in here, and the examples below reflect the current implementation rather than an earlier proposal draft. + +--- + +## Goals + +- Keep the sequential `PdfDocument`-based API stable. +- Provide one public threaded parser entry point for both parse-only and parse-and-render workflows. +- Hide C++ decoder objects from normal Python callers. +- Keep page results typed, lazy, and consistent with the sequential API where possible. +- Support selected-page scheduling and explicit cleanup for multi-document threaded workloads. 
+ +--- + +## Stable constraints + +- The sequential API remains unchanged: + - `DoclingPdfParser` + - `PdfDocument` + - `PdfDocument.get_page()` + - `PdfDocument.iterate_pages()` + - `PdfDocument.get_page_with_timings()` +- The threaded API is the place where the public redesign happened. +- Rendering remains optional and is enabled by configuration, not by switching to a separate public threaded class. + +--- + +## Final public shape + +### One threaded parser interface + +The public threaded entry point is: + +```python +DoclingThreadedPdfParser( + parser_config: ThreadedPdfParserConfig | None = None, + decode_config: DecodePageConfig | None = None, +) +``` + +There is no separate public `DoclingThreadedPdfRenderer` API anymore. Parse-only and parse-and-render share the same Python interface. + +### Threaded parser configuration + +```python +class ThreadedPdfParserConfig(BaseModel): + loglevel: str = "fatal" + threads: int = 4 + max_concurrent_results: int = 32 + boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX + render_config: RenderConfig | None = None +``` + +Key points: + +- `boundary_type` now has an explicit home in the threaded path. +- `render_config=None` selects parse-only operation. +- `render_config` present selects parse-and-render operation. +- `DecodePageConfig` and `RenderConfig` stay separate because they configure different pipeline stages. + +### Public result type + +`get_task()` and `iterate_results()` return `PageParseResult`. + +`PageParseResult` exposes: + +- `doc_key: str` +- `page_number: int` +- `page_width: float` +- `page_height: float` +- `success: bool` +- `error_message: str` +- `has_image: bool` +- `get_page() -> SegmentedPdfPage` +- `get_timings() -> Timings` +- `get_image(...) -> PIL.Image.Image` + +Notable behavior: + +- `page_number` is 1-indexed, matching the sequential API. +- `get_page()` is lazy and caches the converted `SegmentedPdfPage`. 
+- `get_timings()` returns the typed `Timings` model, not a raw dict.
+- Failed results keep `page_width` and `page_height` at `0.0`, and `get_page()` / `get_image()` raise clearly.
+
+---
+
+## Why this design replaced the earlier threaded API
+
+The old threaded surface had several problems:
+
+- It leaked `PdfPageDecoder` into user code.
+- It required private `PdfDocument` conversion helpers to turn results into `SegmentedPdfPage`.
+- It used 0-indexed page numbers, unlike the sequential API.
+- It split parsing and rendering into redundant public threaded classes.
+- It returned raw timing dicts instead of `Timings`.
+- It had no first-class selected-page scheduling or unload lifecycle on the Python API.
+
+The implemented design resolves those issues without changing the sequential parser contract.
+
+---
+
+## Conversion model
+
+The canonical conversion helper is now the public module-level function:
+
+```python
+segmented_page_from_decoder(
+    page_decoder: PdfPageDecoder,
+    boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX,
+) -> SegmentedPdfPage
+```
+
+This is used by both the sequential and threaded paths.
+
+`PdfDocument._to_segmented_page_from_decoder()` still exists as a thin wrapper for internal sequential use, but threaded callers no longer need any private `PdfDocument` methods.
+
+---
+
+## Document loading and scheduling
+
+### Loading
+
+```python
+load(
+    path_or_stream,
+    password: str | None = None,
+    page_numbers: Sequence[int] | None = None,
+) -> str
+```
+
+Behavior:
+
+- `page_numbers` is optional.
+- When provided, the values are interpreted as 1-indexed physical page numbers.
+- The C++ layer normalizes the scheduled subset by sorting and de-duplicating it.
+- Out-of-range page numbers raise a `RuntimeError`.
+- The returned `doc_key` is the routing key for later results and metadata queries.
+ +### Page counts + +Two count queries are available immediately after `load()`: + +```python +page_count(doc_key) -> int +scheduled_page_count(doc_key) -> int +``` + +Semantics: + +- `page_count(doc_key)` is the physical page count of the loaded document. +- `scheduled_page_count(doc_key)` is the number of pages that will actually be emitted by the threaded parser for that document. + +This distinction matters when `page_numbers` is used. + +--- + +## Result delivery model + +### Completion order + +`iterate_results()` yields results in completion order, not page-number order. + +If callers need in-order processing, they should collect by `page_number` and sort after consumption. + +### Manual vs iterator control + +The threaded parser intentionally exposes both: + +- `has_tasks()` +- `get_task()` +- `iterate_results()` + +`has_tasks()` is not deprecated. It remains the manual-control escape hatch. + +Important runtime detail: + +- The first call to `has_tasks()` starts the threaded work by building the task queue and launching workers. +- `iterate_results()` simply loops on `has_tasks()` and `get_task()`. + +--- + +## Cleanup and unload behavior + +The threaded parser now has explicit lifecycle cleanup: + +```python +unload(doc_key: str) -> bool +unload_all() -> None +``` + +Semantics: + +- `unload(doc_key)` removes one loaded document after threaded processing has completed. +- `unload_all()` clears all loaded documents after threaded processing has completed. +- Python-side count bookkeeping is cleared together with the underlying parser state. +- `unload(doc_key)` is idempotent after successful consumption: + - first unload returns `True` + - unloading the same key again returns `False` +- Unloading during active threaded iteration raises a clear runtime error. + +The current implementation defines "safe to unload" by checking whether results remain to be consumed, not whether worker threads have fully wound down. 
That matches the intended public contract: unloading should succeed once result consumption is complete. + +--- + +## Image rendering model + +Rendering is available only when the parser was created with `parser_config.render_config`. + +For parse-only results: + +- `has_image` is `False` +- `get_image(...)` raises `RuntimeError` + +For parse-and-render results: + +- the default render is produced during threaded parsing +- the image is exposed lazily through `get_image(...)` + +### `get_image(...)` signature + +```python +get_image( + scale: float | None = None, + canvas_size: tuple[int, int] | None = None, + cropbox: BoundingBox | None = None, +) -> PIL.Image.Image +``` + +### Supported behavior + +- `scale` and `canvas_size` are mutually exclusive. +- Calling `get_image()` with no arguments returns the default pre-rendered image. +- Calling `get_image(scale=...)` performs a true rerender from the retained `PdfPageDecoder` when needed. +- Calling `get_image(canvas_size=...)` rerenders to the requested canvas size when needed. +- Calling `get_image(..., cropbox=...)` crops in Python after full-page rendering. + +### Important decisions reflected in the implementation + +- `get_image(scale=...)` is allowed whenever `render_config` is present. +- It is not restricted to cases where the original `render_config` used `scale`. +- A caller may configure the threaded parser with `canvas_width` / `canvas_height` and later request `get_image(scale=2.0)`. +- Non-default scale requests rerender from the decoder; they do not resize the existing default bitmap. + +### Crop semantics + +- `cropbox` is specified in page coordinates. +- Cropping is done in Python against the rendered full-page image. +- Page-coordinate conversion uses the page height and rendered image dimensions. +- Degenerate page dimensions are handled defensively by returning the uncropped image rather than dividing by zero. 
+ +### Cache behavior + +- The default full-page image is cached lazily per `PageParseResult`. +- Requests matching the default render can reuse that cached image. +- Rerendered `scale` and `canvas_size` requests are generated on demand from the decoder. +- There is no aggressive per-scale or per-crop cache inside `docling-parse`. + +### Thread efficiency + +The expensive C++ rerender path used by `PageParseResult.get_image(scale=...)` / `get_image(canvas_size=...)` releases the Python GIL during instruction replay, matching the threaded API's performance goals. + +--- + +## Parse-only example + +```python +from docling_parse.pdf_parser import DoclingThreadedPdfParser, ThreadedPdfParserConfig +from docling_parse.pdf_parsers import DecodePageConfig + +decode_config = DecodePageConfig() +decode_config.create_line_cells = True + +parser = DoclingThreadedPdfParser( + parser_config=ThreadedPdfParserConfig(threads=4), + decode_config=decode_config, +) + +doc_key = parser.load(path, page_numbers=[1, 3, 5]) +total_pages = parser.page_count(doc_key) +scheduled_pages = parser.scheduled_page_count(doc_key) + +for result in parser.iterate_results(): + if not result.success: + print(f"{result.doc_key} p{result.page_number}: {result.error_message}") + continue + + page = result.get_page() + size = (result.page_width, result.page_height) +``` + +--- + +## Parse-and-render example + +```python +from docling_core.types.doc.base import BoundingBox, CoordOrigin +from docling_parse.pdf_parser import DoclingThreadedPdfParser, ThreadedPdfParserConfig +from docling_parse.pdf_parsers import DecodePageConfig, RenderConfig + +render_config = RenderConfig() +render_config.canvas_width = 1024 + +parser = DoclingThreadedPdfParser( + parser_config=ThreadedPdfParserConfig( + threads=4, + render_config=render_config, + ), + decode_config=DecodePageConfig(), +) + +doc_key = parser.load(path) + +for result in parser.iterate_results(): + if not result.success: + continue + + page = result.get_page() 
+ default_image = result.get_image() + scaled_image = result.get_image(scale=2.0) + cropped = result.get_image( + scale=2.0, + cropbox=BoundingBox( + l=10, + t=20, + r=60, + b=90, + coord_origin=CoordOrigin.TOPLEFT, + ), + ) +``` + +--- + +## Sequential path remains unchanged + +No signatures or semantics were changed for the sequential parser stack. + +That includes: + +- `DoclingPdfParser` +- `PdfDocument` +- existing `PdfDocument` page access methods +- existing typed models such as `Timings` + +The threaded redesign was intentionally isolated from the sequential API. + +--- + +## Summary of implemented decisions + +- One public threaded parser interface, not separate parser and renderer APIs. +- Typed `PageParseResult` objects instead of raw decoder-centric result objects. +- Public `segmented_page_from_decoder(...)` as the canonical conversion entry point. +- 1-indexed threaded `page_number`. +- `boundary_type` configured on `ThreadedPdfParserConfig`. +- `page_count()` plus `scheduled_page_count()` for subset-aware scheduling. +- `unload()` and `unload_all()` as explicit threaded lifecycle cleanup. +- `get_image(scale=...)`, `get_image(canvas_size=...)`, and Python-side `cropbox` support on `PageParseResult`. +- True rerendering from the retained decoder for non-default render requests. 
diff --git a/perf/run_perf.py b/perf/run_perf.py index 44ea5fc3..a50e77de 100644 --- a/perf/run_perf.py +++ b/perf/run_perf.py @@ -311,20 +311,16 @@ def _runner(pdf_paths: List[Path]) -> Tuple[List[Row], float]: rows: List[Row] = [] wall_start = time.perf_counter() - while parser.has_tasks(): + for task in parser.iterate_results(): t0 = time.perf_counter() - task = parser.get_task() t1 = time.perf_counter() if task.success: - page_decoder, timings_dict = task.get() - detail: dict = {} - for key, val in timings_dict.items(): - detail[key] = val + detail = dict(task.get_timings().items()) rows.append( Row( filename=task.doc_key, - page_number=task.page_number + 1, + page_number=task.page_number, elapsed_sec=t1 - t0, success=True, error="", @@ -335,10 +331,10 @@ def _runner(pdf_paths: List[Path]) -> Tuple[List[Row], float]: rows.append( Row( filename=task.doc_key, - page_number=task.page_number + 1, + page_number=task.page_number, elapsed_sec=t1 - t0, success=False, - error=task.error(), + error=task.error_message, ) ) diff --git a/perf/run_scaling_threaded_parser.py b/perf/run_scaling_threaded_parser.py index d905e1aa..6b019fdf 100644 --- a/perf/run_scaling_threaded_parser.py +++ b/perf/run_scaling_threaded_parser.py @@ -96,23 +96,11 @@ def run_threaded( t0 = time.perf_counter() - from docling_parse.pdf_parser import PdfDocument - from docling_core.types.doc.page import PdfPageBoundaryType - - # Reuse PdfDocument's conversion methods via a lightweight instance - dummy_doc = PdfDocument.__new__(PdfDocument) - dummy_doc._boundary_type = PdfPageBoundaryType.CROP_BOX - count = 0 errors = 0 - while parser.has_tasks(): - task = parser.get_task() - if task.success: - page_decoder, timings = task.get() - # Convert to SegmentedPdfPage (same work as sequential path) - _ = dummy_doc._to_segmented_page_from_decoder( - page_decoder=page_decoder, config=decode_config, - ) + for result in parser.iterate_results(): + if result.success: + _ = result.get_page() count += 1 else: errors 
+= 1 diff --git a/perf/run_scaling_threaded_renderer.py b/perf/run_scaling_threaded_renderer.py index cdd7c21e..51740eef 100644 --- a/perf/run_scaling_threaded_renderer.py +++ b/perf/run_scaling_threaded_renderer.py @@ -1,8 +1,8 @@ #!/usr/bin/env python3 """ -Thread-scaling benchmark for the docling-parse threaded renderer. +Thread-scaling benchmark for docling-parse threaded parse-and-render mode. -Renders all PDFs in a directory with DoclingThreadedPdfRenderer at +Renders all PDFs in a directory with DoclingThreadedPdfParser at 1, 2, 4, 8, 12 and 16 threads and prints a table of total wall time vs thread count. A single-threaded pypdfium2 run (text + image at scale=2) is included as a reference baseline. @@ -112,10 +112,10 @@ def run_threaded( canvas_width: int, total_pages: int, ) -> float: - """Run DoclingThreadedPdfRenderer over all PDFs. Returns wall time in seconds.""" + """Run DoclingThreadedPdfParser with rendering enabled over all PDFs.""" from docling_parse.pdf_parser import ( - DoclingThreadedPdfRenderer, - ThreadedPdfRendererConfig, + DoclingThreadedPdfParser, + ThreadedPdfParserConfig, ) from docling_parse.pdf_parsers import DecodePageConfig, RenderConfig # type: ignore[import] @@ -129,21 +129,21 @@ def run_threaded( render_config = RenderConfig() render_config.canvas_width = canvas_width - renderer_config = ThreadedPdfRendererConfig( + parser_config = ThreadedPdfParserConfig( loglevel="fatal", threads=num_threads, max_concurrent_results=max_concurrent_results, + render_config=render_config, ) - renderer = DoclingThreadedPdfRenderer( - renderer_config=renderer_config, + parser = DoclingThreadedPdfParser( + parser_config=parser_config, decode_config=decode_config, - render_config=render_config, ) for pdf_path in tqdm(pdf_paths, desc=" loading", unit="doc", leave=False): try: - renderer.load(str(pdf_path)) + parser.load(str(pdf_path)) except Exception as e: print(f" threaded load error on {pdf_path}: {e}") @@ -151,8 +151,7 @@ def run_threaded( errors = 
0 with tqdm(total=total_pages, desc=" rendering", unit="page") as pbar: - while renderer.has_tasks(): - result = renderer.get_task() + for result in parser.iterate_results(): if result.success: _ = result.get_image() else: diff --git a/src/pybind/docling_threaded_base.h b/src/pybind/docling_threaded_base.h index d60430b0..4756c0dc 100644 --- a/src/pybind/docling_threaded_base.h +++ b/src/pybind/docling_threaded_base.h @@ -3,12 +3,15 @@ #ifndef PYBIND_THREADED_PDF_BASE_H #define PYBIND_THREADED_PDF_BASE_H +#include #include #include #include #include #include +#include #include +#include #include #include @@ -63,11 +66,19 @@ namespace docling bool load_document(std::string key, std::string filename, - std::optional password); + std::optional password, + std::optional> page_numbers = std::nullopt); bool load_document_from_bytesio(std::string key, pybind11::object bytes_io, - std::optional password); + std::optional password, + std::optional> page_numbers = std::nullopt); + + int number_of_pages(std::string key) const; + int scheduled_number_of_pages(std::string key) const; + + bool unload_document(std::string key); + void unload_all_documents(); bool has_tasks(); @@ -76,6 +87,11 @@ namespace docling private: void set_loglevel_with_label(std::string level); + std::vector normalise_page_numbers(const std::string& key, + int num_pages, + std::optional> page_numbers) const; + void validate_unload_state() const; + void reset_after_completion(); void build_task_queue(); @@ -88,6 +104,7 @@ namespace docling int max_concurrent_results; std::unordered_map key2doc; + std::unordered_map> key2scheduled_pages; // Task queue: (doc_key, page_number) pairs std::queue> task_queue; @@ -121,7 +138,8 @@ namespace docling config(config), num_threads(num_threads), max_concurrent_results(max_concurrent_results), - key2doc({}) + key2doc({}), + key2scheduled_pages({}) { set_loglevel_with_label(loglevel); @@ -181,7 +199,8 @@ namespace docling bool docling_threaded_base::load_document( 
std::string key, std::string filename, - std::optional password) + std::optional password, + std::optional> page_numbers) { if(started.load()) { @@ -199,8 +218,30 @@ namespace docling if(std::filesystem::exists(path_filename)) { - key2doc[key] = std::make_shared(); - key2doc.at(key)->process_document_from_file(filename, password); + try + { + key2doc[key] = std::make_shared(); + key2doc.at(key)->process_document_from_file(filename, password); + } + catch(const std::exception& exc) + { + key2doc.erase(key); + LOG_S(ERROR) << "could not decode file object for key=" << key; + return false; + } + + try + { + key2scheduled_pages[key] = normalise_page_numbers(key, + key2doc.at(key)->get_number_of_pages(), + page_numbers); + } + catch(const std::exception& exc) + { + key2doc.erase(key); + key2scheduled_pages.erase(key); + throw; + } return true; } @@ -212,7 +253,8 @@ namespace docling bool docling_threaded_base::load_document_from_bytesio( std::string key, pybind11::object bytes_io, - std::optional password) + std::optional password, + std::optional> page_numbers) { if(started.load()) { @@ -238,28 +280,159 @@ namespace docling key2doc[key] = std::make_shared(); std::string description = "parsing of " + key + " from bytesio"; key2doc.at(key)->process_document_from_bytesio(data_buffer, password, description); - return true; } catch(const std::exception& exc) { + key2doc.erase(key); + key2scheduled_pages.erase(key); LOG_S(ERROR) << "could not decode bytesio object for key=" << key; return false; } - return false; + try + { + key2scheduled_pages[key] = normalise_page_numbers(key, + key2doc.at(key)->get_number_of_pages(), + page_numbers); + } + catch(const std::exception& exc) + { + key2doc.erase(key); + key2scheduled_pages.erase(key); + throw; + } + return true; } template - void docling_threaded_base::build_task_queue() + int docling_threaded_base::number_of_pages(std::string key) const + { + auto itr = key2doc.find(key); + if(itr == key2doc.end()) + { + throw 
std::runtime_error("Document key not found: " + key); + } + + return itr->second->get_number_of_pages(); + } + + template + int docling_threaded_base::scheduled_number_of_pages(std::string key) const + { + auto itr = key2scheduled_pages.find(key); + if(itr == key2scheduled_pages.end()) + { + throw std::runtime_error("Document key not found: " + key); + } + + return static_cast(itr->second.size()); + } + + template + bool docling_threaded_base::unload_document(std::string key) { - for(const auto& pair : key2doc) + validate_unload_state(); + + bool removed_doc = key2doc.erase(key) > 0; + bool removed_schedule = key2scheduled_pages.erase(key) > 0; + + if(key2doc.empty()) { - const std::string& doc_key = pair.first; - int num_pages = pair.second->get_number_of_pages(); + reset_after_completion(); + } + + return removed_doc || removed_schedule; + } + + template + void docling_threaded_base::unload_all_documents() + { + validate_unload_state(); + key2doc.clear(); + key2scheduled_pages.clear(); + reset_after_completion(); + } - for(int page = 0; page < num_pages; page++) + template + std::vector docling_threaded_base::normalise_page_numbers( + const std::string& key, + int num_pages, + std::optional> page_numbers) const + { + std::vector scheduled_pages; + + if(not page_numbers.has_value()) + { + scheduled_pages.reserve(num_pages); + for(int page = 0; page < num_pages; ++page) + { + scheduled_pages.push_back(page); + } + return scheduled_pages; + } + + scheduled_pages.reserve(page_numbers->size()); + for(int page_number : *page_numbers) + { + if(page_number < 1 or page_number > num_pages) + { + throw std::runtime_error("Invalid page number " + std::to_string(page_number) + + " for document key " + key + + " with " + std::to_string(num_pages) + " pages"); + } + scheduled_pages.push_back(page_number - 1); + } + + std::sort(scheduled_pages.begin(), scheduled_pages.end()); + scheduled_pages.erase(std::unique(scheduled_pages.begin(), scheduled_pages.end()), + 
scheduled_pages.end()); + return scheduled_pages; + } + + template + void docling_threaded_base::validate_unload_state() const + { + if(tasks_remaining.load() > 0) + { + throw std::runtime_error("Cannot unload documents while threaded iteration is active"); + } + } + + template + void docling_threaded_base::reset_after_completion() + { + while(not task_queue.empty()) + { + task_queue.pop(); + } + + while(not results_queue.empty()) + { + results_queue.pop(); + } + + for(auto& worker : workers) + { + if(worker.joinable()) + { + worker.join(); + } + } + workers.clear(); + + tasks_remaining.store(0); + active_workers.store(0); + started.store(false); + } + + template + void docling_threaded_base::build_task_queue() + { + for(const auto& pair : key2scheduled_pages) + { + for(int page : pair.second) { - task_queue.push(std::make_pair(doc_key, page)); + task_queue.push(std::make_pair(pair.first, page)); } } diff --git a/src/render/blend2d_renderer.h b/src/render/blend2d_renderer.h index 924ddd0a..b65f0ffa 100644 --- a/src/render/blend2d_renderer.h +++ b/src/render/blend2d_renderer.h @@ -455,33 +455,7 @@ namespace pdflib if (pdf_w <= 0 or pdf_h <= 0) { return; } - // Apply canvas_width / canvas_height from config, preserving aspect ratio. 
- int width = pdf_w; - int height = pdf_h; - - const bool have_w = (config_.canvas_width > 0); - const bool have_h = (config_.canvas_height > 0); - - if (have_w and have_h) - { - width = config_.canvas_width; - height = config_.canvas_height; - } - else if (have_w) - { - width = config_.canvas_width; - height = static_cast( - std::round(static_cast(pdf_h) * width / pdf_w)); - } - else if (have_h) - { - height = config_.canvas_height; - width = static_cast( - std::round(static_cast(pdf_w) * height / pdf_h)); - } - - if (width <= 0) { width = 1; } - if (height <= 0) { height = 1; } + const auto [width, height] = resolve_canvas_size(pdf_w, pdf_h, config_); scale_x_ = static_cast(width) / pdf_w; scale_y_ = static_cast(height) / pdf_h; diff --git a/src/render/config.h b/src/render/config.h index 3f89a718..9c489978 100644 --- a/src/render/config.h +++ b/src/render/config.h @@ -3,6 +3,10 @@ #ifndef PDF_RENDER_CONFIG_H #define PDF_RENDER_CONFIG_H +#include +#include +#include + namespace pdflib { @@ -42,12 +46,93 @@ namespace pdflib // accept weaker matches, higher values are more strict. float font_similarity_cutoff = 0.75f; + // Target render scale in multiples of the PDF page size (72 ppi baseline). + // -1 means "disabled". Mutually exclusive with canvas_width/canvas_height. + float scale = -1.0f; + // Target canvas dimensions in pixels. -1 means "use the PDF page size". // If only one is set the other is derived to preserve the page aspect ratio. 
int canvas_width = -1; int canvas_height = -1; }; + inline void validate_render_config(const render_config& config) + { + const bool have_width = config.canvas_width > 0; + const bool have_height = config.canvas_height > 0; + const bool have_scale = config.scale > 0.0f; + + if(config.scale != -1.0f and config.scale <= 0.0f) + { + throw std::runtime_error("render_config.scale must be > 0 or -1"); + } + + if(config.canvas_width != -1 and config.canvas_width <= 0) + { + throw std::runtime_error("render_config.canvas_width must be > 0 or -1"); + } + + if(config.canvas_height != -1 and config.canvas_height <= 0) + { + throw std::runtime_error("render_config.canvas_height must be > 0 or -1"); + } + + if(have_scale and (have_width or have_height)) + { + throw std::runtime_error( + "render_config.scale cannot be combined with canvas_width or canvas_height"); + } + } + + inline std::pair resolve_canvas_size( + int pdf_width, + int pdf_height, + const render_config& config) + { + validate_render_config(config); + + int width = pdf_width; + int height = pdf_height; + + const bool have_width = config.canvas_width > 0; + const bool have_height = config.canvas_height > 0; + const bool have_scale = config.scale > 0.0f; + + if(have_scale) + { + width = static_cast(std::round(static_cast(pdf_width) * config.scale)); + height = static_cast(std::round(static_cast(pdf_height) * config.scale)); + } + else if(have_width and have_height) + { + width = config.canvas_width; + height = config.canvas_height; + } + else if(have_width) + { + width = config.canvas_width; + height = static_cast( + std::round(static_cast(pdf_height) * width / pdf_width)); + } + else if(have_height) + { + height = config.canvas_height; + width = static_cast( + std::round(static_cast(pdf_width) * height / pdf_height)); + } + + if(width <= 0) + { + width = 1; + } + if(height <= 0) + { + height = 1; + } + + return {width, height}; + } + } #endif diff --git a/src/render/naive_renderer.h b/src/render/naive_renderer.h 
index 519f015e..a6beb342 100644 --- a/src/render/naive_renderer.h +++ b/src/render/naive_renderer.h @@ -48,8 +48,10 @@ namespace pdflib { auto& bbox = instr.crop_bbox; - int width = bbox[2] - bbox[0]; - int height = bbox[3] - bbox[1]; + const int pdf_width = bbox[2] - bbox[0]; + const int pdf_height = bbox[3] - bbox[1]; + + const auto [width, height] = resolve_canvas_size(pdf_width, pdf_height, config_); shape = {height, width, 3}; canvas->assign(height * width * 3, 255); diff --git a/tests/test_renderer.py b/tests/test_renderer.py deleted file mode 100644 index 0c91c431..00000000 --- a/tests/test_renderer.py +++ /dev/null @@ -1,257 +0,0 @@ -#!/usr/bin/env python -import glob -import hashlib -import json -import os -from pathlib import Path -from typing import Any - -from docling_parse.pdf_parser import ( - DecodePageConfig, - DoclingPdfRenderer, - PdfRenderDocument, -) - -GENERATE = True -RENDER_INSTRUCTION_EPS = 0.005 - -GROUNDTRUTH_RENDERER_FOLDER = "tests/data/groundtruth_renderer" -REGRESSION_FOLDER = "tests/data/regression/*.pdf" - -PAGE_RESTRICTIONS = { - "deep-mediabox-inheritance.pdf": [2], - "font_06.pdf": [1], - "font_07.pdf": [1], - "font_08.pdf": [1], - "font_09.pdf": [1], - "font_10.pdf": [1], -} - -BITMAP_RESTRICTIONS = { - "indexed_iccbased.pdf": { - 1: [1, 5, 10, 15], - }, -} -MAX_BITMAPS_PER_PAGE = 5 - - -def _round_floats(obj, ndigits=3): - if isinstance(obj, float): - return round(obj, ndigits) - if isinstance(obj, dict): - return {k: _round_floats(v, ndigits) for k, v in obj.items()} - if isinstance(obj, list): - return [_round_floats(v, ndigits) for v in obj] - return obj - - -def _assert_json_matches_with_float_delta( - expected: Any, actual: Any, eps: float, path: str = "root" -) -> None: - if isinstance(expected, bool) or isinstance(actual, bool): - assert expected == actual, f"{path}: {expected!r} != {actual!r}" - return - - if isinstance(expected, float): - assert isinstance(actual, (int, float)), ( - f"{path}: expected float, got 
{type(actual).__name__}" - ) - assert abs(expected - float(actual)) <= eps, ( - f"{path}: abs({expected} - {actual}) > {eps}" - ) - return - - if isinstance(expected, dict): - assert isinstance(actual, dict), ( - f"{path}: expected dict, got {type(actual).__name__}" - ) - assert expected.keys() == actual.keys(), f"{path}: key mismatch" - for key in expected: - _assert_json_matches_with_float_delta( - expected[key], actual[key], eps, path=f"{path}.{key}" - ) - return - - if isinstance(expected, list): - assert isinstance(actual, list), ( - f"{path}: expected list, got {type(actual).__name__}" - ) - assert len(expected) == len(actual), f"{path}: length mismatch" - for idx, (expected_item, actual_item) in enumerate(zip(expected, actual)): - _assert_json_matches_with_float_delta( - expected_item, actual_item, eps, path=f"{path}[{idx}]" - ) - return - - assert expected == actual, f"{path}: {expected!r} != {actual!r}" - - -def _page_prefix(pdf_name: str, page_no: int) -> Path: - return Path(GROUNDTRUTH_RENDERER_FOLDER) / f"{pdf_name}.page_no_{page_no}" - - -def _instruction_path(pdf_name: str, page_no: int) -> Path: - return Path(f"{_page_prefix(pdf_name, page_no)}.instructions.json") - - -def _bitmap_json_path(pdf_name: str, page_no: int, bitmap_index: int) -> Path: - return Path(f"{_page_prefix(pdf_name, page_no)}.bitmap_{bitmap_index}.json") - - -def _full_page_png_path(pdf_name: str, page_no: int) -> Path: - return Path(f"{_page_prefix(pdf_name, page_no)}.full_page.png") - - -def _write_json(path: Path, payload) -> None: - path.parent.mkdir(parents=True, exist_ok=True) - with open(path, "w", encoding="utf-8") as fw: - json.dump(_round_floats(payload), fw, indent=2) - - -def _load_json(path: Path): - with open(path, encoding="utf-8") as fr: - return json.load(fr) - - -def _artifact_basename( - pdf_name: str, page_no: int, bitmap_index: int, extension: str -) -> str: - return f"{pdf_name}.page_no_{page_no}.bitmap_{bitmap_index}{extension}" - - -def 
_selected_bitmap_indices(pdf_name: str, page_no: int, num_bitmaps: int) -> set[int]: - restricted = BITMAP_RESTRICTIONS.get(pdf_name, {}).get(page_no) - - if restricted is None: - return set(range(1, min(num_bitmaps, MAX_BITMAPS_PER_PAGE) + 1)) - - return set(restricted[:MAX_BITMAPS_PER_PAGE]) - - -def _export_or_verify_bitmaps(pdf_name: str, page_no: int, bitmaps) -> None: - selected = _selected_bitmap_indices(pdf_name, page_no, len(bitmaps)) - - for bitmap_index, bitmap in enumerate(bitmaps, start=1): - if bitmap_index not in selected: - continue - - raw_sha256 = hashlib.sha256(bitmap["raw_data"]).hexdigest() - extension = bitmap["extension"] - artifact_name = _artifact_basename(pdf_name, page_no, bitmap_index, extension) - artifact_path = Path(GROUNDTRUTH_RENDERER_FOLDER) / artifact_name - sidecar_path = _bitmap_json_path(pdf_name, page_no, bitmap_index) - - sidecar = { - "index": bitmap["index"], - "xobject_key": bitmap["xobject_key"], - "shape": bitmap["shape"], - "pixel_format": bitmap["pixel_format"], - "image_mask": bitmap["image_mask"], - "rgb_filling": bitmap["rgb_filling"], - "quad": bitmap["quad"], - "exported_filename": artifact_name, - "raw_sha256": raw_sha256, - } - - if GENERATE or (not sidecar_path.exists()) or (not artifact_path.exists()): - _write_json(sidecar_path, sidecar) - with open(artifact_path, "wb") as fw: - fw.write(bitmap["encoded_data"]) - continue - - true_sidecar = _load_json(sidecar_path) - assert true_sidecar == _round_floats(sidecar), ( - f"bitmap metadata mismatch for {sidecar_path}" - ) - - with open(artifact_path, "rb") as fr: - true_bytes = fr.read() - assert true_bytes == bitmap["encoded_data"], ( - f"bitmap artifact bytes mismatch for {artifact_path}" - ) - - -def _export_full_page_png(pdf_name: str, page_no: int, image) -> None: - out_path = _full_page_png_path(pdf_name, page_no) - if out_path.exists(): - return - - if image is None: - return - - out_path.parent.mkdir(parents=True, exist_ok=True) - image.save(out_path, 
format="PNG") - - -def test_render_reference_documents(): - config = DecodePageConfig() - config.page_boundary = "crop_box" - config.do_sanitization = False - config.keep_glyphs = True - config.keep_qpdf_warnings = False - renderer = DoclingPdfRenderer(loglevel="fatal", decode_config=config) - - results = [] - - pdf_paths = sorted(glob.glob(REGRESSION_FOLDER)) - assert len(pdf_paths) > 0, "len(pdf_paths)==0 -> nothing to test" - - for pdf_path in pdf_paths: - pdf_name = os.path.basename(pdf_path) - - pdf_doc: PdfRenderDocument = renderer.load(path_or_stream=pdf_path, lazy=True) - assert pdf_doc is not None - - for page_no in range(1, pdf_doc.number_of_pages() + 1): - if ( - pdf_name in PAGE_RESTRICTIONS - and page_no not in PAGE_RESTRICTIONS[pdf_name] - ): - continue - - try: - render_result = pdf_doc.get_page(page_no) - assert render_result is not None, ( - f"failed to render {pdf_name}@{page_no}" - ) - page_decoder, _timings = render_result.get() - - pred_instructions = page_decoder.export_render_instructions_json() - true_instruction_path = _instruction_path(pdf_name, page_no) - - if GENERATE or (not true_instruction_path.exists()): - _write_json(true_instruction_path, pred_instructions) - else: - true_instructions = _load_json(true_instruction_path) - - true_instructions_len = len(true_instructions["instructions"]) - pred_instructions_len = len(pred_instructions["instructions"]) - - assert true_instructions_len == pred_instructions_len, ( - f"true_instructions_len==pred_instructions_len ({true_instructions_len}=={pred_instructions_len}) for {true_instruction_path}" - ) - - for ind, true_instruction in enumerate( - true_instructions["instructions"] - ): - _assert_json_matches_with_float_delta( - true_instruction, - pred_instructions["instructions"][ind], - eps=RENDER_INSTRUCTION_EPS, - path=f"instructions[{ind}]", - ) - - bitmap_artifacts = page_decoder.export_bitmap_artifacts() - _export_or_verify_bitmaps(pdf_name, page_no, bitmap_artifacts) - 
_export_full_page_png(pdf_name, page_no, render_result.get_image()) - - results.append((pdf_name, page_no, True, "")) - except Exception as exc: - results.append((pdf_name, page_no, False, str(exc))) - - pdf_doc.unload() - - failed = [(doc, page, err) for doc, page, ok, err in results if not ok] - assert not failed, f"{len(failed)} page(s) failed: " + ", ".join( - f"{doc}@{page}: {err}" for doc, page, err in failed - ) diff --git a/tests/test_threaded_parse.py b/tests/test_threaded_parse.py index e5502c65..bd572bea 100644 --- a/tests/test_threaded_parse.py +++ b/tests/test_threaded_parse.py @@ -3,15 +3,15 @@ import glob import os -from pathlib import Path +import pytest from docling_core.types.doc.page import PdfPageBoundaryType, SegmentedPdfPage +from docling_parse import pdf_parsers from docling_parse.pdf_parser import ( DecodePageConfig, DoclingPdfParser, DoclingThreadedPdfParser, - PdfDocument, ThreadedPdfParserConfig, ) from tests.test_parse import ( @@ -20,51 +20,43 @@ verify_SegmentedPdfPage, ) +SAMPLE_PDF = "docs/dln-v1.pdf" +LARGE_SAMPLE_PDF = "docs/PDF32000_2008.pdf" -def _build_segmented_page_from_decoder( - page_decoder, boundary_type=PdfPageBoundaryType.CROP_BOX -): - """Build a SegmentedPdfPage from a page decoder, reusing PdfDocument's conversion logic.""" - # Create a minimal PdfDocument just for its conversion methods - dummy_doc = PdfDocument.__new__(PdfDocument) - dummy_doc._boundary_type = boundary_type + +def _make_decode_config() -> DecodePageConfig: config = DecodePageConfig() - config.page_boundary = boundary_type.value + config.page_boundary = "crop_box" config.do_sanitization = False + config.keep_glyphs = True config.keep_qpdf_warnings = False - return dummy_doc._to_segmented_page_from_decoder( - page_decoder=page_decoder, config=config - ) + return config + + +def test_threaded_raw_pybind_types_are_internal(): + assert not hasattr(pdf_parsers, "PageDecodeResult") + assert not hasattr(pdf_parsers, "threaded_pdf_parser") + assert not 
hasattr(pdf_parsers, "PageRenderResult") + assert not hasattr(pdf_parsers, "threaded_pdf_renderer") def test_threaded_reference_documents_from_filenames(): """Load all regression PDFs, decode all pages in parallel, and verify against groundtruth.""" - pdf_docs = sorted(glob.glob(REGRESSION_FOLDER)) assert len(pdf_docs) > 0, "len(pdf_docs)==0 -> nothing to test" - decode_config = DecodePageConfig() - decode_config.page_boundary = "crop_box" - decode_config.do_sanitization = False - decode_config.keep_glyphs = True - decode_config.keep_qpdf_warnings = False - - parser_config = ThreadedPdfParserConfig( - loglevel="fatal", - threads=4, - max_concurrent_results=32, - ) - parser = DoclingThreadedPdfParser( - parser_config=parser_config, - decode_config=decode_config, + parser_config=ThreadedPdfParserConfig( + loglevel="fatal", + threads=4, + max_concurrent_results=32, + boundary_type=PdfPageBoundaryType.CROP_BOX, + ), + decode_config=_make_decode_config(), ) - # Load all documents - for pdf_doc_path in pdf_docs: - parser.load(pdf_doc_path) + doc_keys = {pdf_doc_path: parser.load(pdf_doc_path) for pdf_doc_path in pdf_docs} - # Page restrictions (same as sequential test) page_restrictions = { "deep-mediabox-inheritance.pdf": [2], "font_06.pdf": [1], @@ -74,42 +66,25 @@ def test_threaded_reference_documents_from_filenames(): "font_10.pdf": [1], } - # Collect all results - results = {} - while parser.has_tasks(): - task = parser.get_task() - - assert task.doc_key != "", "doc_key should not be empty" - - if task.success: - page_decoder, _timings = task.get() - page_number = task.page_number # 0-indexed - doc_key = task.doc_key - - pred_page = _build_segmented_page_from_decoder(page_decoder) - - if doc_key not in results: - results[doc_key] = {} - results[doc_key][page_number] = pred_page + results: dict[str, dict[int, SegmentedPdfPage]] = {} + for result in parser.iterate_results(): + assert result.doc_key != "", "doc_key should not be empty" + if result.success: + 
results.setdefault(result.doc_key, {})[result.page_number] = ( + result.get_page() + ) else: - error_msg = task.error() - # Some pages may fail, log but don't assert print( - f"Warning: task failed for {task.doc_key} page {task.page_number}: {error_msg}" + f"Warning: task failed for {result.doc_key} page {result.page_number}: {result.error_message}" ) - # Verify results against groundtruth (same logic as test_reference_documents_from_filenames) for pdf_doc_path in pdf_docs: - key = f"key={Path(pdf_doc_path)!s}" - + key = doc_keys[pdf_doc_path] assert key in results, f"No results found for {pdf_doc_path}" - for page_number, pred_page in sorted(results[key].items()): - page_no = page_number + 1 # convert to 1-indexed for groundtruth filenames + rname = os.path.basename(pdf_doc_path) - rname = os.path.basename(pdf_doc_path) - - # Skip pages not in restrictions + for page_no, pred_page in sorted(results[key].items()): if rname in page_restrictions and page_no not in page_restrictions[rname]: continue @@ -124,59 +99,42 @@ def test_threaded_reference_documents_from_filenames(): def test_threaded_single_document(): """Test threaded parsing with a single document.""" - filename = "tests/data/regression/table_of_contents_01.pdf" - - decode_config = DecodePageConfig() - decode_config.page_boundary = "crop_box" - decode_config.do_sanitization = False - decode_config.keep_glyphs = True - decode_config.keep_qpdf_warnings = False + filename = SAMPLE_PDF parser = DoclingThreadedPdfParser( parser_config=ThreadedPdfParserConfig( - loglevel="fatal", threads=2, max_concurrent_results=4 + loglevel="fatal", + threads=2, + max_concurrent_results=4, + boundary_type=PdfPageBoundaryType.CROP_BOX, ), - decode_config=decode_config, + decode_config=_make_decode_config(), ) key = parser.load(filename) + assert parser.page_count(key) > 0 count = 0 - while parser.has_tasks(): - task = parser.get_task() - assert task.success, f"Failed to decode page {task.page_number}: {task.error()}" - assert 
task.doc_key == key - - _page_decoder, timings = task.get() - assert isinstance(timings, dict) - assert len(timings) > 0 - + for result in parser.iterate_results(): + assert result.success, ( + f"Failed to decode page {result.page_number}: {result.error_message}" + ) + assert result.doc_key == key + assert result.page_width > 0 + assert result.page_height > 0 + assert result.get_timings().total() > 0 count += 1 - # Should have processed all pages - assert count > 0, "Should have processed at least one page" + assert count == parser.page_count(key) def test_threaded_results_match_sequential(): """Verify threaded results match sequential results for the same documents.""" + filenames = [SAMPLE_PDF] + decode_config = _make_decode_config() - """ - filenames = [ - "tests/data/regression/font_01.pdf", - "tests/data/regression/ligatures_01.pdf", - ] - """ - filenames = glob.glob("tests/data/regression/*.pdf") - - decode_config = DecodePageConfig() - decode_config.page_boundary = "crop_box" - decode_config.do_sanitization = False - decode_config.keep_glyphs = True - decode_config.keep_qpdf_warnings = False - - # Sequential parsing seq_parser = DoclingPdfParser(loglevel="fatal") - sequential_pages = {} + sequential_pages: dict[str, dict[int, SegmentedPdfPage]] = {} for filename in filenames: pdf_doc = seq_parser.load( path_or_stream=filename, @@ -187,32 +145,26 @@ def test_threaded_results_match_sequential(): sequential_pages[key] = {} for page_no, page in pdf_doc.iterate_pages(config=decode_config): sequential_pages[key][page_no] = page - # print(f"seq: {key}, {page_no}") - # Threaded parsing threaded_parser = DoclingThreadedPdfParser( parser_config=ThreadedPdfParserConfig( - loglevel="fatal", threads=2, max_concurrent_results=4 + loglevel="fatal", + threads=2, + max_concurrent_results=4, + boundary_type=PdfPageBoundaryType.CROP_BOX, ), decode_config=decode_config, ) for filename in filenames: threaded_parser.load(filename) - threaded_pages = {} - while 
threaded_parser.has_tasks(): - task = threaded_parser.get_task() - assert task.success, f"Failed: {task.error()}" - - page_decoder, _timings = task.get() - pred_page = _build_segmented_page_from_decoder(page_decoder) - - if task.doc_key not in threaded_pages: - threaded_pages[task.doc_key] = {} - threaded_pages[task.doc_key][task.page_number + 1] = pred_page # 1-indexed - # print(f"threaded: {task.doc_key}, {task.page_number + 1}") + threaded_pages: dict[str, dict[int, SegmentedPdfPage]] = {} + for result in threaded_parser.iterate_results(): + assert result.success, f"Failed: {result.error_message}" + threaded_pages.setdefault(result.doc_key, {})[result.page_number] = ( + result.get_page() + ) - # Compare for key in sequential_pages: assert key in threaded_pages, f"Missing key {key} in threaded results" for page_no in sequential_pages[key]: @@ -221,28 +173,9 @@ def test_threaded_results_match_sequential(): seq_page = sequential_pages[key][page_no] thr_page = threaded_pages[key][page_no] - """ - print(f"** Page {page_no} for {key} **") - print(f" -> char-cells count for {key} page {page_no}: {len(seq_page.char_cells)} versus {len(thr_page.char_cells)}") - print(f" -> word-cells count for {key} page {page_no}: {len(seq_page.word_cells)} versus {len(thr_page.word_cells)}") - print(f" -> line-cells count for {key} page {page_no}: {len(seq_page.textline_cells)} versus {len(thr_page.textline_cells)}") - print(f" -> shapes count for {key} page {page_no}: {len(seq_page.shapes)} versus {len(thr_page.shapes)}") - """ - - # Verify key fields match assert len(seq_page.char_cells) == len(thr_page.char_cells), ( f"char_cells count mismatch for {key} page {page_no}" ) - - """ - if len(seq_page.word_cells)!=len(thr_page.word_cells): - for i, cell in enumerate(seq_page.word_cells): - print(f" === [{i}] === ") - print(cell.text) - print(thr_page.word_cells[i].text) - assert cell.text==thr_page.word_cells[i].text - """ - assert len(seq_page.word_cells) == len(thr_page.word_cells), ( 
f"word_cells count mismatch for {key} page {page_no}" ) @@ -256,59 +189,123 @@ def test_threaded_results_match_sequential(): def test_threaded_backpressure(): """Test that backpressure works with max_concurrent_results=1.""" - filename = "tests/data/regression/table_of_contents_01.pdf" - - decode_config = DecodePageConfig() - decode_config.page_boundary = "crop_box" - decode_config.do_sanitization = False - decode_config.keep_glyphs = True - decode_config.keep_qpdf_warnings = False + filename = LARGE_SAMPLE_PDF parser = DoclingThreadedPdfParser( parser_config=ThreadedPdfParserConfig( loglevel="fatal", threads=2, - max_concurrent_results=1, # Very tight backpressure + max_concurrent_results=1, + boundary_type=PdfPageBoundaryType.CROP_BOX, ), - decode_config=decode_config, + decode_config=_make_decode_config(), ) - parser.load(filename) - - count = 0 - while parser.has_tasks(): - task = parser.get_task() - assert task.success, f"Failed: {task.error()}" - count += 1 - - assert count > 0 + key = parser.load(filename) + count = sum(1 for result in parser.iterate_results() if result.success) + assert count == parser.page_count(key) def test_threaded_single_thread(): """Test threaded parsing with a single thread (sequential baseline).""" - filename = "tests/data/regression/font_01.pdf" - - decode_config = DecodePageConfig() - decode_config.page_boundary = "crop_box" - decode_config.do_sanitization = False - decode_config.keep_glyphs = True - decode_config.keep_qpdf_warnings = False + filename = SAMPLE_PDF parser = DoclingThreadedPdfParser( parser_config=ThreadedPdfParserConfig( loglevel="fatal", threads=1, max_concurrent_results=32, + boundary_type=PdfPageBoundaryType.CROP_BOX, ), - decode_config=decode_config, + decode_config=_make_decode_config(), ) - parser.load(filename) + key = parser.load(filename) + count = sum(1 for result in parser.iterate_results() if result.success) + assert count == parser.page_count(key) - count = 0 - while parser.has_tasks(): - task = 
parser.get_task() - assert task.success, f"Failed: {task.error()}" - count += 1 - assert count > 0 +def test_threaded_selected_pages_schedule_subset(): + parser = DoclingThreadedPdfParser( + parser_config=ThreadedPdfParserConfig( + loglevel="fatal", + threads=2, + max_concurrent_results=4, + boundary_type=PdfPageBoundaryType.CROP_BOX, + ), + decode_config=_make_decode_config(), + ) + + key = parser.load(LARGE_SAMPLE_PDF, page_numbers=[2, 1, 2]) + + assert parser.page_count(key) >= 2 + assert parser.scheduled_page_count(key) == 2 + + emitted_pages = sorted( + result.page_number for result in parser.iterate_results() if result.success + ) + assert emitted_pages == [1, 2] + + +def test_threaded_selected_pages_invalid_page_number(): + parser = DoclingThreadedPdfParser( + parser_config=ThreadedPdfParserConfig(loglevel="fatal", threads=2), + decode_config=_make_decode_config(), + ) + + with pytest.raises(RuntimeError, match="Invalid page number"): + parser.load(SAMPLE_PDF, page_numbers=[9999]) + + +def test_threaded_multiple_documents_with_different_subsets(): + parser = DoclingThreadedPdfParser( + parser_config=ThreadedPdfParserConfig( + loglevel="fatal", + threads=4, + max_concurrent_results=8, + boundary_type=PdfPageBoundaryType.CROP_BOX, + ), + decode_config=_make_decode_config(), + ) + + path_key = parser.load(LARGE_SAMPLE_PDF, page_numbers=[1, 2]) + bytes_key = parser.load(SAMPLE_PDF, page_numbers=[1]) + + results_by_key: dict[str, list[int]] = {} + for result in parser.iterate_results(): + assert result.success, result.error_message + results_by_key.setdefault(result.doc_key, []).append(result.page_number) + + assert sorted(results_by_key[path_key]) == [1, 2] + assert sorted(results_by_key[bytes_key]) == [1] + assert parser.scheduled_page_count(path_key) == 2 + assert parser.scheduled_page_count(bytes_key) == 1 + + +def test_threaded_unload_after_consumption_is_idempotent(): + parser = DoclingThreadedPdfParser( + 
parser_config=ThreadedPdfParserConfig(loglevel="fatal", threads=2), + decode_config=_make_decode_config(), + ) + + key = parser.load(SAMPLE_PDF, page_numbers=[1]) + list(parser.iterate_results()) + + assert parser.unload(key) is True + assert parser.unload(key) is False + + with pytest.raises(ValueError): + parser.page_count(key) + + +def test_threaded_unload_during_active_iteration_raises(): + parser = DoclingThreadedPdfParser( + parser_config=ThreadedPdfParserConfig(loglevel="fatal", threads=2), + decode_config=_make_decode_config(), + ) + + key = parser.load(SAMPLE_PDF) + assert parser.has_tasks() + + with pytest.raises(RuntimeError, match="threaded iteration is active"): + parser.unload(key) diff --git a/tests/test_threaded_render.py b/tests/test_threaded_render.py index 4aea8c58..3aa77ad5 100644 --- a/tests/test_threaded_render.py +++ b/tests/test_threaded_render.py @@ -1,230 +1,227 @@ #!/usr/bin/env python -"""Tests for the threaded PDF renderer.""" +"""Tests for threaded parse-and-render mode.""" import glob import os from io import BytesIO from pathlib import Path +import pytest +from docling_core.types.doc.base import BoundingBox, CoordOrigin from docling_core.types.doc.page import SegmentedPdfPage from PIL import Image as PILImage from docling_parse.pdf_parser import ( DecodePageConfig, - DoclingThreadedPdfRenderer, + DoclingThreadedPdfParser, RenderConfig, - ThreadedPdfRendererConfig, + ThreadedPdfParserConfig, ) from tests.test_parse import ( GROUNDTRUTH_FOLDER, REGRESSION_FOLDER, verify_SegmentedPdfPage, ) -from tests.test_threaded_parse import _build_segmented_page_from_decoder +SAMPLE_PDF = "docs/dln-v1.pdf" +LARGE_SAMPLE_PDF = "docs/PDF32000_2008.pdf" -def _make_renderer( - threads: int = 2, max_concurrent: int = 1 -) -> DoclingThreadedPdfRenderer: - return DoclingThreadedPdfRenderer( - renderer_config=ThreadedPdfRendererConfig( + +def _make_decode_config() -> DecodePageConfig: + config = DecodePageConfig() + config.page_boundary = "crop_box" + 
config.do_sanitization = False + config.keep_glyphs = True + config.keep_qpdf_warnings = False + return config + + +def _make_render_config() -> RenderConfig: + return RenderConfig() + + +def _make_parser( + threads: int = 2, + max_concurrent: int = 1, + render_config: RenderConfig | None = None, +) -> DoclingThreadedPdfParser: + return DoclingThreadedPdfParser( + parser_config=ThreadedPdfParserConfig( loglevel="fatal", threads=threads, max_concurrent_results=max_concurrent, + render_config=render_config or _make_render_config(), ), - decode_config=DecodePageConfig(), - render_config=RenderConfig(), + decode_config=_make_decode_config(), ) +def _write_variable_page_size_pdf(path: Path) -> None: + objects = [ + "<< /Type /Catalog /Pages 2 0 R >>", + "<< /Type /Pages /Count 2 /Kids [3 0 R 5 0 R] >>", + "<< /Type /Page /Parent 2 0 R /MediaBox [0 0 200 300] /Contents 4 0 R >>", + "<< /Length 0 >>\nstream\n\nendstream", + "<< /Type /Page /Parent 2 0 R /MediaBox [0 0 400 500] /Contents 6 0 R >>", + "<< /Length 0 >>\nstream\n\nendstream", + ] + + chunks = [b"%PDF-1.4\n%\xe2\xe3\xcf\xd3\n"] + offsets = [0] + + for object_number, body in enumerate(objects, start=1): + offsets.append(sum(len(chunk) for chunk in chunks)) + chunks.append(f"{object_number} 0 obj\n{body}\nendobj\n".encode("ascii")) + + xref_offset = sum(len(chunk) for chunk in chunks) + xref_lines = [ + "xref", + f"0 {len(objects) + 1}", + "0000000000 65535 f ", + ] + xref_lines.extend(f"{offset:010d} 00000 n " for offset in offsets[1:]) + trailer = [ + "trailer", + f"<< /Size {len(objects) + 1} /Root 1 0 R >>", + "startxref", + str(xref_offset), + "%%EOF", + ] + chunks.append(("\n".join(xref_lines) + "\n").encode("ascii")) + chunks.append(("\n".join(trailer) + "\n").encode("ascii")) + + path.write_bytes(b"".join(chunks)) + + def test_render_single_document(): """Render all pages of one document and verify each result is a valid RGBA image.""" - filename = "tests/data/regression/table_of_contents_01.pdf" + 
filename = SAMPLE_PDF - renderer = _make_renderer() - key = renderer.load(filename) + parser = _make_parser() + key = parser.load(filename) count = 0 - while renderer.has_tasks(): - result = renderer.get_task() - + for result in parser.iterate_results(): assert result.doc_key == key - assert result.page_number >= 0 + assert result.page_number >= 1 assert result.success, ( - f"Render failed page {result.page_number}: {result.error()}" + f"Render failed page {result.page_number}: {result.error_message}" ) + assert result.has_image image = result.get_image() - assert image is not None, "get_image() returned None on success" assert isinstance(image, PILImage.Image) assert image.mode == "RGBA" assert image.width > 0 assert image.height > 0 + assert result.get_page().dimension.rect is not None count += 1 - assert count > 0, "Should have rendered at least one page" + assert count == parser.page_count(key) def test_render_image_dimensions_are_consistent(): - """Verify image_shape matches the actual PIL image dimensions.""" - filename = "tests/data/regression/font_01.pdf" - - renderer = _make_renderer() - renderer.load(filename) + """Verify rendered image dimensions are positive and stable.""" + filename = SAMPLE_PDF - while renderer.has_tasks(): - result = renderer.get_task() - assert result.success, result.error() - - h, w, channels = result._raw.image_shape - assert channels == 4, "Expected 4-channel RGBA" + parser = _make_parser() + parser.load(filename) + for result in parser.iterate_results(): + assert result.success, result.error_message image = result.get_image() - assert image.width == w - assert image.height == h + assert image.width > 0 + assert image.height > 0 def test_render_multiple_documents(): """Load multiple PDFs and verify all pages are rendered.""" - filenames = sorted(glob.glob(REGRESSION_FOLDER)) # limit to first 5 for speed - assert len(filenames) > 0 - - renderer = _make_renderer(threads=4, max_concurrent=16) - keys = {renderer.load(f) for f in 
filenames} - - cnt = 0 - - results_by_key = {} - while renderer.has_tasks(): - result = renderer.get_task() - cnt += 1 - + parser = _make_parser(threads=4, max_concurrent=16) + path_key = parser.load(SAMPLE_PDF) + with open(SAMPLE_PDF, "rb") as f: + bytes_key = parser.load(BytesIO(f.read())) + keys = {path_key, bytes_key} + + results_by_key: dict[str, list[int]] = {} + for result in parser.iterate_results(): assert result.success, ( - f"Render failed doc-key: {result.doc_key}, page: {result.page_number}: {result.error()}" + f"Render failed doc-key: {result.doc_key}, page: {result.page_number}: {result.error_message}" ) - print( - f"Render success ({cnt}): doc-key={result.doc_key}, page={result.page_number}" - ) - results_by_key.setdefault(result.doc_key, []).append(result.page_number) image = result.get_image() - assert image is not None, "image is None" - - # img.show() - assert isinstance(image, PILImage.Image) assert image.mode == "RGBA" assert image.width > 0 assert image.height > 0 - # Every loaded key must have at least one result for key in keys: assert key in results_by_key, f"No results for {key}" + assert len(results_by_key[key]) == parser.page_count(key) def test_render_from_bytesio(): """Render a document loaded from a BytesIO object.""" - filename = "tests/data/regression/font_01.pdf" + filename = SAMPLE_PDF with open(filename, "rb") as f: data = BytesIO(f.read()) - renderer = _make_renderer() - key = renderer.load(data) + parser = _make_parser() + key = parser.load(data) count = 0 - while renderer.has_tasks(): - result = renderer.get_task() + for result in parser.iterate_results(): assert result.doc_key == key - assert result.success, result.error() - - image = result.get_image() - assert image is not None - assert image.mode == "RGBA" - + assert result.success, result.error_message + assert result.get_image().mode == "RGBA" count += 1 - assert count > 0 + assert count == parser.page_count(key) def test_render_backpressure(): """Verify rendering 
completes correctly with max_concurrent_results=1.""" - filename = "tests/data/regression/table_of_contents_01.pdf" + filename = LARGE_SAMPLE_PDF - renderer = DoclingThreadedPdfRenderer( - renderer_config=ThreadedPdfRendererConfig( - loglevel="fatal", - threads=2, - max_concurrent_results=1, # tight backpressure - ), - decode_config=DecodePageConfig(), - render_config=RenderConfig(), - ) - renderer.load(filename) + parser = _make_parser(threads=2, max_concurrent=1) + key = parser.load(filename) - count = 0 - while renderer.has_tasks(): - result = renderer.get_task() - assert result.success, result.error() - count += 1 - - assert count > 0 + count = sum(1 for result in parser.iterate_results() if result.success) + assert count == parser.page_count(key) def test_render_single_thread(): """Render with a single thread as a sequential baseline.""" - filename = "tests/data/regression/font_01.pdf" - - renderer = DoclingThreadedPdfRenderer( - renderer_config=ThreadedPdfRendererConfig( - loglevel="fatal", - threads=1, - max_concurrent_results=32, - ), - decode_config=DecodePageConfig(), - render_config=RenderConfig(), - ) - renderer.load(filename) - - count = 0 - while renderer.has_tasks(): - result = renderer.get_task() - assert result.success, result.error() - - image = result.get_image() - assert image is not None - assert image.mode == "RGBA" - - count += 1 + filename = SAMPLE_PDF - assert count > 0 + parser = _make_parser(threads=1, max_concurrent=32) + key = parser.load(filename) + count = sum(1 for result in parser.iterate_results() if result.success) + assert count == parser.page_count(key) -def test_render_get_image_returns_none_on_failure(): - """get_image() must return None when success is False.""" - from docling_parse.pdf_parser import PdfPageRenderResult - class _FakeRaw: - doc_key = "k" - page_number = 0 - success = False - error_message = "simulated failure" - image_shape = [0, 0, 4] +def test_get_image_raises_without_rendering(): + """Parse-only results 
must fail loudly when image access is requested.""" + filename = SAMPLE_PDF - def get_image(self): - return b"" + parser = DoclingThreadedPdfParser( + parser_config=ThreadedPdfParserConfig(loglevel="fatal", threads=2), + decode_config=_make_decode_config(), + ) + parser.load(filename) - result = PdfPageRenderResult(_FakeRaw()) - assert not result.success - assert result.get_image() is None - assert "simulated failure" in result.error() + result = next(parser.iterate_results()) + assert not result.has_image + with pytest.raises(RuntimeError, match="Rendered image not available"): + result.get_image() def test_render_custom_render_config(): - """Renderer accepts a non-default RenderConfig without error.""" - filename = "tests/data/regression/font_01.pdf" + """Parser accepts a non-default RenderConfig without error.""" + filename = SAMPLE_PDF render_config = RenderConfig() render_config.render_text = True @@ -232,18 +229,158 @@ def test_render_custom_render_config(): render_config.fit_glyph_bbox_to_target = True render_config.resolve_fonts = True - renderer = DoclingThreadedPdfRenderer( - renderer_config=ThreadedPdfRendererConfig(loglevel="fatal", threads=2), - decode_config=DecodePageConfig(), - render_config=render_config, + parser = _make_parser(render_config=render_config) + parser.load(filename) + + for result in parser.iterate_results(): + assert result.success, result.error_message + assert result.get_image() is not None + + +def test_get_image_scale_rerenders_for_canvas_config(): + render_config = RenderConfig() + render_config.canvas_width = 1224 + parser = _make_parser(render_config=render_config) + parser.load(SAMPLE_PDF, page_numbers=[1]) + + result = next(parser.iterate_results()) + assert result.success, result.error_message + + scaled_image = result.get_image(scale=2.0) + + assert scaled_image.size == ( + round(result.page_width * 2.0), + round(result.page_height * 2.0), + ) + + +def test_get_image_rerenders_non_default_scale(): + render_config = 
RenderConfig() + render_config.scale = 1.0 + parser = _make_parser(render_config=render_config) + parser.load(SAMPLE_PDF, page_numbers=[1]) + + result = next(parser.iterate_results()) + assert result.success, result.error_message + + default_image = result.get_image() + scaled_image = result.get_image(scale=2.0) + + assert scaled_image.size == ( + round(result.page_width * 2.0), + round(result.page_height * 2.0), ) - renderer.load(filename) + assert scaled_image.size != default_image.size + + +def test_get_image_canvas_size_is_accepted_for_canvas_config(): + render_config = RenderConfig() + render_config.canvas_width = 1224 + + parser = _make_parser(render_config=render_config) + parser.load(SAMPLE_PDF, page_numbers=[1]) + + result = next(parser.iterate_results()) + assert result.success, result.error_message + + default_image = result.get_image() + same_image = result.get_image(canvas_size=default_image.size) + custom_image = result.get_image(canvas_size=(600, 800)) + + assert same_image.size == default_image.size + assert custom_image.size == (600, 800) + + +def test_get_image_canvas_size_is_accepted_for_scale_config(): + render_config = RenderConfig() + render_config.scale = 2.0 + + parser = _make_parser(render_config=render_config) + parser.load(SAMPLE_PDF, page_numbers=[1]) - while renderer.has_tasks(): - result = renderer.get_task() - assert result.success, result.error() + result = next(parser.iterate_results()) + assert result.success, result.error_message + + default_image = result.get_image() + semantic_image = result.get_image(scale=1.0) + same_image = result.get_image(canvas_size=default_image.size) + + assert default_image.size == ( + round(result.page_width * 2.0), + round(result.page_height * 2.0), + ) + assert semantic_image.size == ( + round(result.page_width), + round(result.page_height), + ) + assert same_image.size == default_image.size + + +def test_get_image_rejects_scale_with_canvas_size(): + render_config = RenderConfig() + 
render_config.scale = 1.0 + + parser = _make_parser(render_config=render_config) + parser.load(SAMPLE_PDF, page_numbers=[1]) + + result = next(parser.iterate_results()) + assert result.success, result.error_message + + with pytest.raises(ValueError): + result.get_image(scale=1.0, canvas_size=(100, 100)) + + +def test_render_config_rejects_scale_with_canvas_dimensions(): + render_config = RenderConfig() + render_config.scale = 2.0 + render_config.canvas_width = 1224 + + with pytest.raises(ValueError): + _make_parser(render_config=render_config) + + +def test_get_image_crops_using_page_coordinates(): + render_config = RenderConfig() + render_config.scale = 2.0 + parser = _make_parser(render_config=render_config) + parser.load(SAMPLE_PDF, page_numbers=[1]) + + result = next(parser.iterate_results()) + assert result.success, result.error_message + + cropbox = BoundingBox( + l=10, + t=20, + r=60, + b=90, + coord_origin=CoordOrigin.TOPLEFT, + ) + cropped = result.get_image(scale=2.0, cropbox=cropbox) + + assert cropped.size == ( + round((cropbox.r - cropbox.l) * 2.0), + round((cropbox.b - cropbox.t) * 2.0), + ) + + +def test_render_scale_config_handles_pages_with_different_sizes(tmp_path: Path): + pdf_path = tmp_path / "variable_page_sizes.pdf" + _write_variable_page_size_pdf(pdf_path) + + render_config = RenderConfig() + render_config.scale = 2.0 + + parser = _make_parser(render_config=render_config) + parser.load(pdf_path) + + sizes_by_page: dict[int, tuple[int, int]] = {} + for result in parser.iterate_results(): + assert result.success, result.error_message image = result.get_image() - assert image is not None + sizes_by_page[result.page_number] = image.size + + assert sizes_by_page[1] == (400, 600) + assert sizes_by_page[2] == (800, 1000) def test_render_config_exposes_bbox_fit_flag(): @@ -260,26 +397,9 @@ def test_render_reference_documents_from_filenames(): pdf_docs = sorted(glob.glob(REGRESSION_FOLDER)) assert len(pdf_docs) > 0, "len(pdf_docs)==0 -> nothing to 
test" - decode_config = DecodePageConfig() - decode_config.page_boundary = "crop_box" - decode_config.do_sanitization = False - decode_config.keep_glyphs = True - decode_config.keep_qpdf_warnings = False - - renderer = DoclingThreadedPdfRenderer( - renderer_config=ThreadedPdfRendererConfig( - loglevel="fatal", - threads=4, - max_concurrent_results=32, - ), - decode_config=decode_config, - render_config=RenderConfig(), - ) - - for pdf_doc_path in pdf_docs: - renderer.load(pdf_doc_path) + parser = _make_parser(threads=4, max_concurrent=32) + doc_keys = {pdf_doc_path: parser.load(pdf_doc_path) for pdf_doc_path in pdf_docs} - # Page restrictions (same as sequential test) page_restrictions = { "deep-mediabox-inheritance.pdf": [2], "font_06.pdf": [1], @@ -289,34 +409,26 @@ def test_render_reference_documents_from_filenames(): "font_10.pdf": [1], } - results = {} - while renderer.has_tasks(): - result = renderer.get_task() - + results: dict[str, dict[int, SegmentedPdfPage]] = {} + for result in parser.iterate_results(): assert result.doc_key != "", "doc_key should not be empty" - if result.success: - page_decoder, _timings = result.get() - pred_page = _build_segmented_page_from_decoder(page_decoder) - - if result.doc_key not in results: - results[result.doc_key] = {} - results[result.doc_key][result.page_number] = pred_page + results.setdefault(result.doc_key, {})[result.page_number] = ( + result.get_page() + ) + assert result.get_image().mode == "RGBA" else: print( - f"Warning: render failed for {result.doc_key} page {result.page_number}: {result.error()}" + f"Warning: render failed for {result.doc_key} page {result.page_number}: {result.error_message}" ) for pdf_doc_path in pdf_docs: - key = f"key={Path(pdf_doc_path)!s}" - + key = doc_keys[pdf_doc_path] assert key in results, f"No results found for {pdf_doc_path}" - for page_number, pred_page in sorted(results[key].items()): - page_no = page_number + 1 # convert to 1-indexed for groundtruth filenames - - rname = 
os.path.basename(pdf_doc_path) + rname = os.path.basename(pdf_doc_path) + for page_no, pred_page in sorted(results[key].items()): if rname in page_restrictions and page_no not in page_restrictions[rname]: continue