From 03e993a068a149b87c92b36083f5c7193b932218 Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Tue, 28 Apr 2026 09:58:27 +0200 Subject: [PATCH 1/8] refactor(threaded-parser): redesign public threaded PDF parser API Replace the split DoclingThreadedPdfParser / DoclingThreadedPdfRenderer classes with a single DoclingThreadedPdfParser whose ThreadedPdfParserConfig selects parse-only or parse-and-render mode via an optional render_config field. Key changes: - Extract segmented_page_from_decoder() as a public module-level function; PdfDocument._to_segmented_page_from_decoder() delegates to it - Add PageParseResult: typed result with 1-indexed page_number, lazy get_page(), typed get_timings(), get_image(), has_image, page_width/height, error_message - Add ThreadedPdfParserConfig.boundary_type and render_config fields - Add DoclingThreadedPdfParser.page_count() and iterate_results() - Expose number_of_pages() on both C++ threaded backends via pybind11 - Remove DoclingThreadedPdfRenderer, PdfPageRenderResult, ThreadedPdfRendererConfig - Fix DoclingThreadedPdfParser.__init__ to copy decode_config before mutating page_boundary, so the caller's object is never modified in place - Update all perf scripts and tests to the new API; restore full groundtruth regression coverage in test_threaded_parse.py and test_threaded_render.py Signed-off-by: Christoph Auer --- README.md | 18 +- app/pybind_parse.cpp | 20 + docling_parse/pdf_parser.py | 982 +++++++++++++------------- docs/plans/threaded-api-design.md | 315 +++++++++ perf/run_perf.py | 14 +- perf/run_scaling_threaded_parser.py | 18 +- perf/run_scaling_threaded_renderer.py | 23 +- src/pybind/docling_threaded_base.h | 14 + tests/test_renderer.py | 80 +-- tests/test_threaded_parse.py | 234 ++---- tests/test_threaded_render.py | 281 +++----- 11 files changed, 1043 insertions(+), 956 deletions(-) create mode 100644 docs/plans/threaded-api-design.md diff --git a/README.md b/README.md index 2b0b1dc2..aa7d101e 100644 --- a/README.md 
+++ b/README.md @@ -111,18 +111,18 @@ parser = DoclingThreadedPdfParser( # load one or more documents for source in ["doc_a.pdf", "doc_b.pdf"]: - parser.load(source) + doc_key = parser.load(source) + print(doc_key, parser.page_count(doc_key)) # consume decoded pages as they become available -while parser.has_tasks(): - task = parser.get_task() - - if task.success: - page_decoder, timings = task.get() - print(f"{task.doc_key} p{task.page_number}: " - f"{len(list(page_decoder.get_word_cells()))} words") +for result in parser.iterate_results(): + if result.success: + seg_page = result.get_page() + timings = result.get_timings() + print(f"{result.doc_key} p{result.page_number}: " + f"{len(seg_page.word_cells)} words in {timings.total():.3f}s") else: - print(f"error on {task.doc_key} p{task.page_number}: {task.error()}") + print(f"error on {result.doc_key} p{result.page_number}: {result.error_message}") ``` Use the CLI diff --git a/app/pybind_parse.cpp b/app/pybind_parse.cpp index 76bd5bf0..cc80d583 100644 --- a/app/pybind_parse.cpp +++ b/app/pybind_parse.cpp @@ -903,6 +903,20 @@ PYBIND11_MODULE(pdf_parsers, m) { Returns: bool: True if the document was successfully loaded.)") + .def("number_of_pages", + [](docling::docling_threaded_parser& self, const std::string& key) -> int { + return self.number_of_pages(key); + }, + pybind11::arg("key"), + R"( + Return the number of pages in a loaded document. + + Parameters: + key (str): The unique key identifying the document. 
+ + Returns: + int: Number of pages in the loaded document.)") + .def("has_tasks", [](docling::docling_threaded_parser& self) -> bool { return self.has_tasks(); @@ -1032,6 +1046,12 @@ PYBIND11_MODULE(pdf_parsers, m) { pybind11::arg("bytes_io"), pybind11::arg("password") = pybind11::none()) + .def("number_of_pages", + [](docling::docling_threaded_renderer& self, const std::string& key) -> int { + return self.number_of_pages(key); + }, + pybind11::arg("key")) + .def("has_tasks", [](docling::docling_threaded_renderer& self) -> bool { return self.has_tasks(); diff --git a/docling_parse/pdf_parser.py b/docling_parse/pdf_parser.py index 36c727d4..c4977335 100644 --- a/docling_parse/pdf_parser.py +++ b/docling_parse/pdf_parser.py @@ -59,7 +59,6 @@ TIMING_PREFIX_DECODE_XOBJECT, TIMING_PREFIX_DECODING_PAGE, DecodePageConfig, # type: ignore[import] - PageDecodeResult, # type: ignore[import] PdfPageDecoder, # type: ignore[import] RenderConfig, # type: ignore[import] get_decode_page_timing_keys, @@ -182,6 +181,306 @@ def decode_page_keys() -> List[str]: return get_decode_page_timing_keys() +def _to_bounding_rectangle( + bbox: tuple[float, float, float, float], +) -> BoundingRectangle: + return BoundingRectangle( + r_x0=bbox[0], + r_y0=bbox[1], + r_x1=bbox[2], + r_y1=bbox[1], + r_x2=bbox[2], + r_y2=bbox[3], + r_x3=bbox[0], + r_y3=bbox[3], + coord_origin=CoordOrigin.BOTTOMLEFT, + ) + + +def _to_bounding_box(bbox: tuple[float, float, float, float]) -> BoundingBox: + return BoundingBox( + l=bbox[0], + b=bbox[1], + r=bbox[2], + t=bbox[3], + coord_origin=CoordOrigin.BOTTOMLEFT, + ) + + +def _get_boundary_bbox( + page_dim, + boundary_type: PdfPageBoundaryType, +) -> tuple[float, float, float, float]: + media_bbox = tuple(page_dim.get_media_bbox()) + crop_bbox = tuple(page_dim.get_crop_bbox()) + + if boundary_type == PdfPageBoundaryType.MEDIA_BOX: + return media_bbox + + return crop_bbox + + +def _to_page_geometry_from_decoder( + page_dim, + boundary_type: PdfPageBoundaryType, +) -> 
PdfPageGeometry: + crop_bbox = tuple(page_dim.get_crop_bbox()) + media_bbox = tuple(page_dim.get_media_bbox()) + boundary_bbox = _get_boundary_bbox(page_dim, boundary_type) + + return PdfPageGeometry( + angle=page_dim.get_angle(), + boundary_type=boundary_type, + rect=_to_bounding_rectangle(boundary_bbox), + art_bbox=_to_bounding_box(crop_bbox), + media_bbox=_to_bounding_box(media_bbox), + trim_bbox=_to_bounding_box(crop_bbox), + crop_bbox=_to_bounding_box(crop_bbox), + bleed_bbox=_to_bounding_box(crop_bbox), + ) + + +def _to_cells_from_decoder(cells_container) -> List[Union[PdfTextCell, TextCell]]: + result: List[Union[PdfTextCell, TextCell]] = [] + + for ind, cell in enumerate(cells_container): + result.append( + PdfTextCell( + rect=BoundingRectangle( + r_x0=cell.r_x0, + r_y0=cell.r_y0, + r_x1=cell.r_x1, + r_y1=cell.r_y1, + r_x2=cell.r_x2, + r_y2=cell.r_y2, + r_x3=cell.r_x3, + r_y3=cell.r_y3, + ), + text=cell.text, + orig=cell.text, + font_key=cell.font_key, + font_name=cell.font_name, + widget=cell.widget, + text_direction=( + TextDirection.LEFT_TO_RIGHT + if cell.left_to_right + else TextDirection.RIGHT_TO_LEFT + ), + index=ind, + rendering_mode=cell.rendering_mode, + ) + ) + + return result + + +def _to_shapes_from_decoder(shapes_container) -> List[PdfShape]: + result: List[PdfShape] = [] + + for ind, shape in enumerate(shapes_container): + x_coords = shape.get_x() + y_coords = shape.get_y() + indices = shape.get_i() + + for pair_idx in range(0, len(indices), 2): + i0: int = indices[pair_idx + 0] + i1: int = indices[pair_idx + 1] + + points: List[Coord2D] = [] + for k in range(i0, i1): + points.append(Coord2D(x_coords[k], y_coords[k])) + + rgb_s = shape.get_rgb_stroking_ops() + rgb_f = shape.get_rgb_filling_ops() + + result.append( + PdfShape( + index=ind, + parent_id=pair_idx, + points=points, + has_graphics_state=shape.get_has_graphics_state(), + line_width=shape.get_line_width(), + miter_limit=shape.get_miter_limit(), + line_cap=shape.get_line_cap(), + 
line_join=shape.get_line_join(), + dash_phase=shape.get_dash_phase(), + dash_array=list(shape.get_dash_array()), + flatness=shape.get_flatness(), + rgb_stroking=ColorRGBA(r=rgb_s[0], g=rgb_s[1], b=rgb_s[2]), + rgb_filling=ColorRGBA(r=rgb_f[0], g=rgb_f[1], b=rgb_f[2]), + ) + ) + + return result + + +def _to_widgets_from_decoder(widgets_container) -> List[PdfWidget]: + result: List[PdfWidget] = [] + + for ind, widget in enumerate(widgets_container): + result.append( + PdfWidget( + index=ind, + rect=BoundingRectangle( + r_x0=widget.x0, + r_y0=widget.y0, + r_x1=widget.x1, + r_y1=widget.y0, + r_x2=widget.x1, + r_y2=widget.y1, + r_x3=widget.x0, + r_y3=widget.y1, + ), + widget_text=widget.text or None, + widget_description=widget.description or None, + widget_field_name=widget.field_name or None, + widget_field_type=widget.field_type or None, + ) + ) + + return result + + +def _to_hyperlinks_from_decoder(hyperlinks_container) -> List[PdfHyperlink]: + result: List[PdfHyperlink] = [] + + for ind, hyperlink in enumerate(hyperlinks_container): + result.append( + PdfHyperlink( + index=ind, + rect=BoundingRectangle( + r_x0=hyperlink.x0, + r_y0=hyperlink.y0, + r_x1=hyperlink.x1, + r_y1=hyperlink.y0, + r_x2=hyperlink.x1, + r_y2=hyperlink.y1, + r_x3=hyperlink.x0, + r_y3=hyperlink.y1, + ), + uri=hyperlink.uri or None, + ) + ) + + return result + + +def _to_bitmap_resources_from_decoder(images_container) -> List[BitmapResource]: + result: List[BitmapResource] = [] + + for ind, image in enumerate(images_container): + image_ref = None + mode = ImageRefMode.PLACEHOLDER + + try: + image_bytes = image.get_image_as_bytes() + + if image_bytes and len(image_bytes) > 0: + fmt = image.get_image_format() + pil_image: PILImage.Image | None = None + + if fmt in ("jpeg", "jp2"): + pil_image = PILImage.open(BytesIO(image_bytes)) + elif fmt in ("raw", "jbig2"): + pil_mode = image.get_pil_mode() + w = image.image_width + h = image.image_height + if w > 0 and h > 0: + pil_image = 
PILImage.frombytes(pil_mode, (w, h), image_bytes) + + if pil_image is not None: + if pil_image.mode != "RGBA": + pil_image = pil_image.convert("RGBA") + + bbox_width = abs(image.x1 - image.x0) + if bbox_width > 0 and image.image_width > 0: + dpi = round(image.image_width * 72.0 / bbox_width) + else: + dpi = 72 + + image_ref = ImageRef.from_pil(pil_image, dpi=dpi) + mode = ImageRefMode.EMBEDDED + + except Exception: + _log.debug( + "Failed to extract image data for bitmap, falling back to placeholder" + ) + + result.append( + BitmapResource( + index=ind, + rect=BoundingRectangle( + r_x0=image.x0, + r_y0=image.y0, + r_x1=image.x1, + r_y1=image.y0, + r_x2=image.x1, + r_y2=image.y1, + r_x3=image.x0, + r_y3=image.y1, + ), + uri=None, + image=image_ref, + mode=mode, + ) + ) + + return result + + +def segmented_page_from_decoder( + page_decoder: PdfPageDecoder, + boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX, +) -> SegmentedPdfPage: + """Convert a C++ PdfPageDecoder to a SegmentedPdfPage.""" + char_cells = _to_cells_from_decoder(page_decoder.get_char_cells()) + + segmented_page = SegmentedPdfPage( + dimension=_to_page_geometry_from_decoder( + page_decoder.get_page_dimension(), boundary_type + ), + char_cells=char_cells, + word_cells=[], + textline_cells=[], + has_chars=len(char_cells) > 0, + bitmap_resources=_to_bitmap_resources_from_decoder( + page_decoder.get_page_images() + ), + shapes=_to_shapes_from_decoder(page_decoder.get_page_shapes()), + widgets=_to_widgets_from_decoder(page_decoder.get_page_widgets()), + hyperlinks=_to_hyperlinks_from_decoder(page_decoder.get_page_hyperlinks()), + ) + + if page_decoder.has_word_cells(): + segmented_page.word_cells = _to_cells_from_decoder( + page_decoder.get_word_cells() + ) + segmented_page.has_words = len(segmented_page.word_cells) > 0 + + if page_decoder.has_line_cells(): + segmented_page.textline_cells = _to_cells_from_decoder( + page_decoder.get_line_cells() + ) + segmented_page.has_lines = 
len(segmented_page.textline_cells) > 0 + + return segmented_page + + +def _timings_from_decoder(page_decoder: PdfPageDecoder) -> Timings: + return Timings( + data=dict(page_decoder.get_timings()), + raw_data=dict(page_decoder.get_timings_raw()), + ) + + +def _page_size_from_decoder( + page_decoder: PdfPageDecoder, + boundary_type: PdfPageBoundaryType, +) -> tuple[float, float]: + bbox = _get_boundary_bbox(page_decoder.get_page_dimension(), boundary_type) + return abs(bbox[2] - bbox[0]), abs(bbox[3] - bbox[1]) + + class PdfDocument: def __init__( self, @@ -402,7 +701,6 @@ def _get_page_with_timings_typed( segmented_page = self._to_segmented_page_from_decoder( page_decoder=page_decoder, - config=config, ) # Get timings from the page decoder @@ -420,308 +718,42 @@ def load_all_pages(self, config: DecodePageConfig | None = None): def _to_page_geometry_from_decoder(self, page_dim) -> PdfPageGeometry: """Convert typed PdfPageDimension to PdfPageGeometry.""" - crop_bbox = page_dim.get_crop_bbox() - media_bbox = page_dim.get_media_bbox() - angle = page_dim.get_angle() - - # Use crop_box as default boundary - bbox = crop_bbox - # Build page rectangle as a BoundingRectangle (typed API expects this) - rect = BoundingRectangle( - r_x0=bbox[0], - r_y0=bbox[1], - r_x1=bbox[2], - r_y1=bbox[1], - r_x2=bbox[2], - r_y2=bbox[3], - r_x3=bbox[0], - r_y3=bbox[3], - coord_origin=CoordOrigin.BOTTOMLEFT, - ) - art_bbox_obj = BoundingBox( - l=crop_bbox[0], - b=crop_bbox[1], - r=crop_bbox[2], - t=crop_bbox[3], - coord_origin=CoordOrigin.BOTTOMLEFT, - ) - media_bbox_obj = BoundingBox( - l=media_bbox[0], - b=media_bbox[1], - r=media_bbox[2], - t=media_bbox[3], - coord_origin=CoordOrigin.BOTTOMLEFT, - ) - crop_bbox_obj = BoundingBox( - l=crop_bbox[0], - b=crop_bbox[1], - r=crop_bbox[2], - t=crop_bbox[3], - coord_origin=CoordOrigin.BOTTOMLEFT, - ) - - return PdfPageGeometry( - angle=angle, - boundary_type=PdfPageBoundaryType(self._boundary_type), - rect=rect, - art_bbox=art_bbox_obj, - 
media_bbox=media_bbox_obj, - trim_bbox=crop_bbox_obj, - crop_bbox=crop_bbox_obj, - bleed_bbox=crop_bbox_obj, - ) + return _to_page_geometry_from_decoder(page_dim, self._boundary_type) def _to_cells_from_decoder( self, cells_container ) -> List[Union[PdfTextCell, TextCell]]: """Convert typed PdfCells container to list of PdfTextCell objects.""" - result: List[Union[PdfTextCell, TextCell]] = [] - - for ind, cell in enumerate(cells_container): - rect = BoundingRectangle( - r_x0=cell.r_x0, - r_y0=cell.r_y0, - r_x1=cell.r_x1, - r_y1=cell.r_y1, - r_x2=cell.r_x2, - r_y2=cell.r_y2, - r_x3=cell.r_x3, - r_y3=cell.r_y3, - ) - - result.append( - PdfTextCell( - rect=rect, - text=cell.text, - orig=cell.text, - font_key=cell.font_key, - font_name=cell.font_name, - widget=cell.widget, - text_direction=( - TextDirection.LEFT_TO_RIGHT - if cell.left_to_right - else TextDirection.RIGHT_TO_LEFT - ), - index=ind, - rendering_mode=cell.rendering_mode, - ) - ) - - return result + return _to_cells_from_decoder(cells_container) def _to_shapes_from_decoder(self, shapes_container) -> List[PdfShape]: """Convert typed PdfShapes container to list of PdfShape objects.""" - result: List[PdfShape] = [] - - for ind, shape in enumerate(shapes_container): - x_coords = shape.get_x() - y_coords = shape.get_y() - indices = shape.get_i() - - """ - print(f"{ind}\tlen(indices): {len(indices)} -> {len(x_coords)} -> {shape.get_rgb_filling_ops()}") - if len(indices)>2: - print(indices) - - if ind>8: - break - """ - - for pair_idx in range(0, len(indices), 2): - i0: int = indices[pair_idx + 0] - i1: int = indices[pair_idx + 1] - - points: List[Coord2D] = [] - for k in range(i0, i1): - points.append(Coord2D(x_coords[k], y_coords[k])) - - rgb_s = shape.get_rgb_stroking_ops() - rgb_f = shape.get_rgb_filling_ops() - - pdf_shape = PdfShape( - index=ind, - parent_id=pair_idx, - points=points, - has_graphics_state=shape.get_has_graphics_state(), - line_width=shape.get_line_width(), - 
miter_limit=shape.get_miter_limit(), - line_cap=shape.get_line_cap(), - line_join=shape.get_line_join(), - dash_phase=shape.get_dash_phase(), - dash_array=list(shape.get_dash_array()), - flatness=shape.get_flatness(), - rgb_stroking=ColorRGBA(r=rgb_s[0], g=rgb_s[1], b=rgb_s[2]), - rgb_filling=ColorRGBA(r=rgb_f[0], g=rgb_f[1], b=rgb_f[2]), - ) - result.append(pdf_shape) - - return result + return _to_shapes_from_decoder(shapes_container) def _to_widgets_from_decoder(self, widgets_container) -> List[PdfWidget]: """Convert typed PdfWidgets container to list of PdfWidget objects.""" - result: List[PdfWidget] = [] - - for ind, widget in enumerate(widgets_container): - rect = BoundingRectangle( - r_x0=widget.x0, - r_y0=widget.y0, - r_x1=widget.x1, - r_y1=widget.y0, - r_x2=widget.x1, - r_y2=widget.y1, - r_x3=widget.x0, - r_y3=widget.y1, - ) - result.append( - PdfWidget( - index=ind, - rect=rect, - widget_text=widget.text or None, - widget_description=widget.description or None, - widget_field_name=widget.field_name or None, - widget_field_type=widget.field_type or None, - ) - ) - - return result + return _to_widgets_from_decoder(widgets_container) def _to_hyperlinks_from_decoder(self, hyperlinks_container) -> List[PdfHyperlink]: """Convert typed PdfHyperlinks container to list of PdfHyperlink objects.""" - result: List[PdfHyperlink] = [] - - for ind, hyperlink in enumerate(hyperlinks_container): - rect = BoundingRectangle( - r_x0=hyperlink.x0, - r_y0=hyperlink.y0, - r_x1=hyperlink.x1, - r_y1=hyperlink.y0, - r_x2=hyperlink.x1, - r_y2=hyperlink.y1, - r_x3=hyperlink.x0, - r_y3=hyperlink.y1, - ) - result.append( - PdfHyperlink( - index=ind, - rect=rect, - uri=hyperlink.uri or None, - ) - ) - - return result + return _to_hyperlinks_from_decoder(hyperlinks_container) def _to_bitmap_resources_from_decoder( self, images_container ) -> List[BitmapResource]: """Convert typed PdfImages container to list of BitmapResource objects.""" - result: List[BitmapResource] = [] - - for ind, 
image in enumerate(images_container): - rect = BoundingRectangle( - r_x0=image.x0, - r_y0=image.y0, - r_x1=image.x1, - r_y1=image.y0, - r_x2=image.x1, - r_y2=image.y1, - r_x3=image.x0, - r_y3=image.y1, - ) - - image_ref = None - mode = ImageRefMode.PLACEHOLDER - - try: - image_bytes = image.get_image_as_bytes() - - if image_bytes and len(image_bytes) > 0: - fmt = image.get_image_format() - pil_image: PILImage.Image | None = None - - if fmt in ("jpeg", "jp2"): - pil_image = PILImage.open(BytesIO(image_bytes)) - elif fmt in ("raw", "jbig2"): - pil_mode = image.get_pil_mode() - w = image.image_width - h = image.image_height - if w > 0 and h > 0: - pil_image = PILImage.frombytes( - pil_mode, (w, h), image_bytes - ) - - if pil_image is not None: - # Normalize to RGBA for consistent downstream handling - if pil_image.mode != "RGBA": - pil_image = pil_image.convert("RGBA") - - # Compute DPI from pixel dimensions and PDF bbox - bbox_width = abs(image.x1 - image.x0) - if bbox_width > 0 and image.image_width > 0: - dpi = round(image.image_width * 72.0 / bbox_width) - else: - dpi = 72 - - image_ref = ImageRef.from_pil(pil_image, dpi=dpi) - mode = ImageRefMode.EMBEDDED - - except Exception: - _log.debug( - "Failed to extract image data for bitmap, falling back to placeholder" - ) - - bitmap = BitmapResource( - index=ind, rect=rect, uri=None, image=image_ref, mode=mode - ) - result.append(bitmap) - - return result + return _to_bitmap_resources_from_decoder(images_container) def _to_segmented_page_from_decoder( self, page_decoder, - *, - config: DecodePageConfig, ) -> SegmentedPdfPage: """Convert typed PdfPageDecoder to SegmentedPdfPage (zero-copy path).""" - - char_cells = self._to_cells_from_decoder(page_decoder.get_char_cells()) - shapes = self._to_shapes_from_decoder(page_decoder.get_page_shapes()) - widgets = self._to_widgets_from_decoder(page_decoder.get_page_widgets()) - hyperlinks = self._to_hyperlinks_from_decoder( - page_decoder.get_page_hyperlinks() - ) - 
bitmap_resources = self._to_bitmap_resources_from_decoder( - page_decoder.get_page_images() - ) - - segmented_page = SegmentedPdfPage( - dimension=self._to_page_geometry_from_decoder( - page_decoder.get_page_dimension() - ), - char_cells=char_cells, - word_cells=[], - textline_cells=[], - has_chars=len(char_cells) > 0, - bitmap_resources=bitmap_resources, - shapes=shapes, - widgets=widgets, - hyperlinks=hyperlinks, + return segmented_page_from_decoder( + page_decoder=page_decoder, + boundary_type=self._boundary_type, ) - if page_decoder.has_word_cells(): - segmented_page.word_cells = self._to_cells_from_decoder( - page_decoder.get_word_cells() - ) - segmented_page.has_words = len(segmented_page.word_cells) > 0 - - if page_decoder.has_line_cells(): - segmented_page.textline_cells = self._to_cells_from_decoder( - page_decoder.get_line_cells() - ) - segmented_page.has_lines = len(segmented_page.textline_cells) > 0 - - return segmented_page - def _get_page_typed( self, page_no: int, @@ -755,7 +787,6 @@ def _get_page_typed( self._pages[page_no] = self._to_segmented_page_from_decoder( page_decoder=page_decoder, - config=config, ) return self._pages[page_no] @@ -869,6 +900,8 @@ class ThreadedPdfParserConfig(BaseModel): loglevel: Logging level ('fatal', 'error', 'warning', 'info'). threads: Number of worker threads for parallel page decoding. max_concurrent_results: Maximum results buffered before workers pause. + boundary_type: Page boundary used for geometry conversion and page sizing. + render_config: Optional render configuration for parse-and-render mode. 
""" model_config = ConfigDict(arbitrary_types_allowed=True) @@ -876,237 +909,165 @@ class ThreadedPdfParserConfig(BaseModel): loglevel: str = "fatal" threads: int = 4 max_concurrent_results: int = 32 + boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX + render_config: RenderConfig | None = None -class DoclingThreadedPdfParser: - """Threaded PDF parser that decodes pages from multiple documents in parallel. - - Usage:: - - parser_config = ThreadedPdfParserConfig(loglevel="fatal", threads=4, max_concurrent_results=32) - decode_config = DecodePageConfig() - - parser = DoclingThreadedPdfParser(parser_config=parser_config, decode_config=decode_config) - - for source in sources: - parser.load(source) - - while parser.has_tasks(): - task = parser.get_task() - - if task.success: - page_decoder, timings = task.get() - else: - error_msg = task.error() - """ +class PageParseResult: + """Outcome of one page processed by DoclingThreadedPdfParser.""" def __init__( self, - parser_config: ThreadedPdfParserConfig | None = None, - decode_config: DecodePageConfig | None = None, + raw_result, + *, + boundary_type: PdfPageBoundaryType, + rendering_enabled: bool, ): - if parser_config is None: - parser_config = ThreadedPdfParserConfig() - if decode_config is None: - decode_config = DecodePageConfig() - - self._parser = threaded_pdf_parser( - loglevel=parser_config.loglevel, - num_threads=parser_config.threads, - max_concurrent_results=parser_config.max_concurrent_results, - config=decode_config, - ) - - def load( - self, - path_or_stream: Union[str, Path, BytesIO], - password: str | None = None, - ) -> str: - """Load a document for parallel processing. - - Parameters: - path_or_stream: File path or BytesIO object. - password: Optional password for protected files. - - Returns: - str: The document key. 
- """ - if isinstance(path_or_stream, str): - path_or_stream = Path(path_or_stream) - - if isinstance(path_or_stream, Path): - key = f"key={path_or_stream!s}" - success = self._parser.load_document( - key=key, filename=str(path_or_stream).encode("utf8"), password=password - ) - elif isinstance(path_or_stream, BytesIO): - hasher = hashlib.sha256(usedforsecurity=False) - while chunk := path_or_stream.read(8192): - hasher.update(chunk) - path_or_stream.seek(0) - hash_val = hasher.hexdigest() - - key = f"key={hash_val}" - success = self._parser.load_document_from_bytesio( - key=key, bytes_io=path_or_stream, password=password + self._raw = raw_result + self._boundary_type = boundary_type + self._rendering_enabled = rendering_enabled + self._page: SegmentedPdfPage | None = None + self._page_decoder: PdfPageDecoder | None = None + + self.doc_key: str = raw_result.doc_key + self.page_number: int = raw_result.page_number + 1 + self.success: bool = raw_result.success + + if self.success: + self._page_decoder, _ = raw_result.get() + self._timings = _timings_from_decoder(self._page_decoder) + self.page_width, self.page_height = _page_size_from_decoder( + self._page_decoder, boundary_type ) else: - raise TypeError( - f"Expected str, Path, or BytesIO, got {type(path_or_stream)}" + self._timings = Timings() + self.page_width = 0.0 + self.page_height = 0.0 + + @property + def has_image(self) -> bool: + """Whether get_image() can return a rendered image for this result.""" + return self._rendering_enabled and self.success + + @property + def error_message(self) -> str: + """Error description; empty string when successful.""" + if self.success: + return "" + return self._raw.error() + + def _require_page_decoder(self) -> PdfPageDecoder: + if not self.success: + raise RuntimeError( + f"Cannot access failed page {self.page_number} for {self.doc_key}: {self.error_message}" ) + assert self._page_decoder is not None + return self._page_decoder + + def get_page(self) -> SegmentedPdfPage: 
+ """Return the parsed page, converting lazily on first access.""" + if self._page is None: + self._page = segmented_page_from_decoder( + page_decoder=self._require_page_decoder(), + boundary_type=self._boundary_type, + ) + return self._page - if not success: - raise RuntimeError(f"Failed to load document with key {key}") - - return key - - def has_tasks(self) -> bool: - """Check if there are remaining tasks to consume. - - On first call, builds the task queue and starts worker threads. - - Returns: - bool: True if there are remaining results to consume. - """ - return self._parser.has_tasks() - - def get_task(self) -> "PageDecodeResult": - """Get the next completed page decode result. - - Blocks until a result is available. - - Returns: - PageDecodeResult: The result with doc_key, page_number, success flag. - Use task.get() to get (PdfPageDecoder, timings) or task.error() for error message. - """ - return self._parser.get_task() - - -# --------------------------------------------------------------------------- -# Threaded renderer -# --------------------------------------------------------------------------- - - -class ThreadedPdfRendererConfig(BaseModel): - """Configuration for the threaded PDF renderer. - - Attributes: - loglevel: Logging level ('fatal', 'error', 'warning', 'info'). - threads: Number of worker threads for parallel page rendering. - max_concurrent_results: Maximum results buffered before workers pause. - """ - - model_config = ConfigDict(arbitrary_types_allowed=True) - - loglevel: str = "fatal" - threads: int = 4 - max_concurrent_results: int = 32 - - -class PdfPageRenderResult: - """Wrapper around a raw C++ PageRenderResult providing PIL image conversion. - - Attributes: - doc_key: Document key the page belongs to. - page_number: 0-indexed page number. - success: Whether rendering succeeded. 
- """ - - def __init__(self, raw): - self._raw = raw - self.doc_key: str = raw.doc_key - self.page_number: int = raw.page_number - self.success: bool = raw.success - - def error(self) -> str: - """Return the error message if rendering failed, empty string otherwise.""" - return self._raw.error_message if not self.success else "" - - def get(self) -> Tuple[PdfPageDecoder, Dict[str, float]]: - """Return (page_decoder, timings) for the rendered page. - - Delegates to the underlying PageDecodeResult.get() so that render - results can be used interchangeably with parse results when accessing - the decoded page data. - - Raises: - RuntimeError: If the task was not successful. - """ - return self._raw.get() + def get_timings(self) -> Timings: + """Return structured timing data for this page parse.""" + return self._timings - def get_image(self) -> PILImage.Image | None: - """Convert rendered pixel data to a PIL RGBA Image. + def get_image(self) -> PILImage.Image: + """Return the rendered page image.""" + self._require_page_decoder() - Returns: - PIL.Image.Image in RGBA mode, or None if rendering failed. 
- """ - if not self.success: - return None + if not self._rendering_enabled: + raise RuntimeError( + f"Rendered image not available for page {self.page_number} of {self.doc_key}" + ) raw_bytes = self._raw.get_image() if not raw_bytes: - return None + raise RuntimeError( + f"Rendered image is empty for page {self.page_number} of {self.doc_key}" + ) h, w, _ = self._raw.image_shape return PILImage.frombuffer("RGBA", (w, h), raw_bytes, "raw", "RGBA", 0, 1) + def _export_render_instructions_json(self) -> Dict[str, Any]: + return self._require_page_decoder().export_render_instructions_json() + + def _export_bitmap_artifacts(self) -> List[Dict[str, Any]]: + return self._require_page_decoder().export_bitmap_artifacts() + + +def _copy_decode_config(src: DecodePageConfig) -> DecodePageConfig: + dst = DecodePageConfig() + dst.page_boundary = src.page_boundary + dst.do_sanitization = src.do_sanitization + dst.keep_char_cells = src.keep_char_cells + dst.keep_shapes = src.keep_shapes + dst.keep_bitmaps = src.keep_bitmaps + dst.max_num_lines = src.max_num_lines + dst.max_num_bitmaps = src.max_num_bitmaps + dst.create_word_cells = src.create_word_cells + dst.create_line_cells = src.create_line_cells + dst.enforce_same_font = src.enforce_same_font + dst.horizontal_cell_tolerance = src.horizontal_cell_tolerance + dst.word_space_width_factor_for_merge = src.word_space_width_factor_for_merge + dst.line_space_width_factor_for_merge = src.line_space_width_factor_for_merge + dst.line_space_width_factor_for_merge_with_space = ( + src.line_space_width_factor_for_merge_with_space + ) + dst.do_thread_safe = src.do_thread_safe + dst.keep_glyphs = src.keep_glyphs + dst.keep_qpdf_warnings = src.keep_qpdf_warnings + return dst -class DoclingThreadedPdfRenderer: - """Threaded PDF renderer that decodes and renders pages from multiple documents in parallel. - Each result contains both the decoded page data (accessible via the page_decoder) - and the rendered RGBA image, produced in a single pass. 
- - Usage:: - - render_config = RenderConfig() - decode_config = DecodePageConfig() - renderer_config = ThreadedPdfRendererConfig(threads=4) - - renderer = DoclingThreadedPdfRenderer( - renderer_config=renderer_config, - decode_config=decode_config, - render_config=render_config, - ) - - for source in sources: - renderer.load(source) - - while renderer.has_tasks(): - result = renderer.get_task() - if result.success: - image = result.get_image() # PIL RGBA Image - else: - print(result.error()) - """ +class DoclingThreadedPdfParser: + """Threaded PDF parser that decodes pages from multiple documents in parallel.""" def __init__( self, - renderer_config: ThreadedPdfRendererConfig | None = None, + parser_config: ThreadedPdfParserConfig | None = None, decode_config: DecodePageConfig | None = None, - render_config: RenderConfig | None = None, ): - if renderer_config is None: - renderer_config = ThreadedPdfRendererConfig() - if decode_config is None: - decode_config = DecodePageConfig() - if render_config is None: - render_config = RenderConfig() - - self._renderer = threaded_pdf_renderer( - loglevel=renderer_config.loglevel, - num_threads=renderer_config.threads, - max_concurrent_results=renderer_config.max_concurrent_results, - decode_config=decode_config, - render_config=render_config, + if parser_config is None: + parser_config = ThreadedPdfParserConfig() + + self._parser_config = parser_config + self._decode_config = ( + _copy_decode_config(decode_config) + if decode_config is not None + else DecodePageConfig() ) + self._decode_config.page_boundary = parser_config.boundary_type.value + self._page_counts: Dict[str, int] = {} + + if parser_config.render_config is None: + self._parser = threaded_pdf_parser( + loglevel=parser_config.loglevel, + num_threads=parser_config.threads, + max_concurrent_results=parser_config.max_concurrent_results, + config=self._decode_config, + ) + else: + self._parser = threaded_pdf_renderer( + loglevel=parser_config.loglevel, + 
num_threads=parser_config.threads, + max_concurrent_results=parser_config.max_concurrent_results, + decode_config=self._decode_config, + render_config=parser_config.render_config, + ) def load( self, path_or_stream: Union[str, Path, BytesIO], password: str | None = None, ) -> str: - """Load a document for parallel rendering. + """Load a document for parallel processing. Parameters: path_or_stream: File path or BytesIO object. @@ -1120,7 +1081,7 @@ def load( if isinstance(path_or_stream, Path): key = f"key={path_or_stream!s}" - success = self._renderer.load_document( + success = self._parser.load_document( key=key, filename=str(path_or_stream).encode("utf8"), password=password ) elif isinstance(path_or_stream, BytesIO): @@ -1131,7 +1092,7 @@ def load( hash_val = hasher.hexdigest() key = f"key={hash_val}" - success = self._renderer.load_document_from_bytesio( + success = self._parser.load_document_from_bytesio( key=key, bytes_io=path_or_stream, password=password ) else: @@ -1142,8 +1103,15 @@ def load( if not success: raise RuntimeError(f"Failed to load document with key {key}") + self._page_counts[key] = self._parser.number_of_pages(key) return key + def page_count(self, doc_key: str) -> int: + """Return the total page count for a loaded document.""" + if doc_key not in self._page_counts: + raise ValueError(f"Document key not loaded: {doc_key}") + return self._page_counts[doc_key] + def has_tasks(self) -> bool: """Check if there are remaining tasks to consume. @@ -1152,17 +1120,26 @@ def has_tasks(self) -> bool: Returns: bool: True if there are remaining results to consume. """ - return self._renderer.has_tasks() + return self._parser.has_tasks() + + def iterate_results(self) -> Iterator["PageParseResult"]: + """Yield page results in completion order.""" + while self.has_tasks(): + yield self.get_task() - def get_task(self) -> PdfPageRenderResult: - """Get the next completed page render result. 
+ def get_task(self) -> "PageParseResult": + """Get the next completed page decode result. Blocks until a result is available. Returns: - PdfPageRenderResult: wraps doc_key, page_number, success, and get_image(). + PageParseResult: Parsed page result with lazy page conversion and optional image access. """ - return PdfPageRenderResult(self._renderer.get_task()) + return PageParseResult( + self._parser.get_task(), + boundary_type=self._parser_config.boundary_type, + rendering_enabled=self._parser_config.render_config is not None, + ) class PdfRenderDocument: @@ -1171,27 +1148,24 @@ def __init__( *, path_or_stream: Union[Path, bytes], parser_doc: PdfDocument, - renderer_config: ThreadedPdfRendererConfig, + parser_config: ThreadedPdfParserConfig, decode_config: DecodePageConfig, - render_config: RenderConfig, password: str | None = None, ): self._path_or_stream = path_or_stream self._parser_doc = parser_doc - self._renderer_config = renderer_config + self._parser_config = parser_config self._decode_config = decode_config - self._render_config = render_config self._password = password - self._pages: Dict[int, PdfPageRenderResult] = {} + self._pages: Dict[int, PageParseResult] = {} - def _make_renderer(self) -> "DoclingThreadedPdfRenderer": - return DoclingThreadedPdfRenderer( - renderer_config=self._renderer_config, + def _make_renderer(self) -> "DoclingThreadedPdfParser": + return DoclingThreadedPdfParser( + parser_config=self._parser_config, decode_config=self._decode_config, - render_config=self._render_config, ) - def _load_source(self, renderer: "DoclingThreadedPdfRenderer") -> str: + def _load_source(self, renderer: "DoclingThreadedPdfParser") -> str: if isinstance(self._path_or_stream, Path): return renderer.load(self._path_or_stream, password=self._password) @@ -1210,14 +1184,14 @@ def _render_all_pages(self) -> None: continue if not result.success: raise RuntimeError( - f"Failed to render page {result.page_number + 1}: {result.error()}" + f"Failed to render 
page {result.page_number}: {result.error_message}" ) - self._pages[result.page_number + 1] = result + self._pages[result.page_number] = result def number_of_pages(self) -> int: return self._parser_doc.number_of_pages() - def get_page(self, page_no: int) -> PdfPageRenderResult: + def get_page(self, page_no: int) -> PageParseResult: if not (1 <= page_no <= self.number_of_pages()): raise ValueError( f"incorrect page_no: {page_no} (min:1, max:{self.number_of_pages()})" @@ -1228,7 +1202,7 @@ def get_page(self, page_no: int) -> PdfPageRenderResult: return self._pages[page_no] - def iterate_pages(self) -> Iterator[Tuple[int, PdfPageRenderResult]]: + def iterate_pages(self) -> Iterator[Tuple[int, PageParseResult]]: self._render_all_pages() for page_no in range(1, self.number_of_pages() + 1): yield page_no, self._pages[page_no] @@ -1247,11 +1221,6 @@ def __init__( ): self._loglevel = loglevel self._parser = DoclingPdfParser(loglevel=loglevel) - self._renderer_config = ThreadedPdfRendererConfig( - loglevel=loglevel, - threads=1, - max_concurrent_results=1, - ) self._decode_config = decode_config or DecodePageConfig() self._render_config = render_config or RenderConfig() @@ -1283,8 +1252,13 @@ def load( return PdfRenderDocument( path_or_stream=source, parser_doc=parser_doc, - renderer_config=self._renderer_config, + parser_config=ThreadedPdfParserConfig( + loglevel=self._loglevel, + threads=1, + max_concurrent_results=1, + boundary_type=boundary_type, + render_config=self._render_config, + ), decode_config=self._decode_config, - render_config=self._render_config, password=password, ) diff --git a/docs/plans/threaded-api-design.md b/docs/plans/threaded-api-design.md new file mode 100644 index 00000000..c46210e9 --- /dev/null +++ b/docs/plans/threaded-api-design.md @@ -0,0 +1,315 @@ +# Threaded Parser Public API Design + +**Status:** Draft — in iteration +**Date:** 2026-04-24 +**Scope:** `docling-parse` only — docling integration is a separate concern + +--- + +## Constraints + 
+- **Sequential `PdfDocument`-based API is frozen.** No breaking changes to `DoclingPdfParser`, `PdfDocument`, `PdfDocument.get_page()`, `PdfDocument.iterate_pages()`, or any of their signatures. Existing code that uses the sequential path continues to work unchanged. +- **Threaded API may break.** `DoclingThreadedPdfParser`, `PageDecodeResult`, `PdfPageRenderResult`, and `ThreadedPdfParserConfig` can all change. There are no known external users relying on the current threaded API shape. + +--- + +## Problems with the current threaded API + +### 1. C++ internals leak into user code + +`DoclingThreadedPdfParser.get_task()` returns a raw `PageDecodeResult` whose `.get()` returns `(PdfPageDecoder, timings_dict)`. `PdfPageDecoder` is a C++ binding object with no documented Python interface. Callers must know to call `PdfDocument._to_segmented_page_from_decoder()` on it — a private method not intended for external use. + +The benchmark works around this with: +```python +dummy_doc = PdfDocument.__new__(PdfDocument) +dummy_doc._boundary_type = PdfPageBoundaryType.CROP_BOX +seg_page = dummy_doc._to_segmented_page_from_decoder(page_decoder, config) +``` +This is a hack that will break silently if `PdfDocument` internals change. + +`DoclingThreadedPdfRenderer.get_task()` has the same problem: `PdfPageRenderResult.get()` also returns `(PdfPageDecoder, timings_dict)`. + +The current split also exposes an implementation detail as a public API split. The renderer result is essentially a decoded page result plus an optional rendered image. Users should not have to pick a different threaded class and result type just because they want the image bytes produced during decode. + +### 2. Conversion logic is private and on the wrong class + +`PdfDocument._to_segmented_page_from_decoder()` converts `PdfPageDecoder → SegmentedPdfPage`. Logically this is a pure function: it does not depend on document state, only on `_boundary_type`. 
It belongs at module level, not as an instance method on `PdfDocument`. + +### 3. Page numbering inconsistency + +`PageDecodeResult.page_number` is **0-indexed**. +`PdfPageRenderResult.page_number` is also **0-indexed**. +`PdfDocument.get_page()` and `iterate_pages()` are **1-indexed**. +Callers of the threaded paths must remember to add 1. This is an unnecessary and error-prone divergence. + +### 4. No Pythonic iteration + +The `has_tasks()` / `get_task()` loop is functional but requires callers to write the same boilerplate every time. The sequential API provides `iterate_pages()`. Neither threaded class has an equivalent. + +### 5. `timings` returned as a raw dict + +The sequential path exposes the typed `Timings` model (with `.total()`, `.get()`, `.keys()`, etc.). The threaded `get()` returns a plain `dict`. These should be consistent. + +### 6. `boundary_type` has no home in the threaded path + +The sequential `DoclingPdfParser.load()` accepts `boundary_type`. There is no way to set it for the threaded parser — the conversion hack requires setting it manually on the dummy `PdfDocument` instance. + +### 7. No way to query page count before iteration + +After `parser.load()` / `renderer.load()`, callers have no way to ask how many pages a document has without starting iteration. This is needed by consumers that must pre-allocate structures or define termination conditions before any page arrives. + +### 8. Parser and renderer public APIs are redundant + +The public distinction between `DoclingThreadedPdfParser` and `DoclingThreadedPdfRenderer` is not strong enough to justify two APIs. Rendering does not produce a fundamentally different page outcome; it produces the same decoded page outcome with an additional optional image artifact. This should be represented as one threaded parser interface whose configuration decides whether page images are produced. 
+ +This keeps user code stable when a workflow later starts needing page images: users change config and start calling `get_image()`, not swap classes, result types, and import paths. + +On the C++ side this is already mostly true structurally. `docling_threaded_parser` and `docling_threaded_renderer` both inherit from the same `docling_threaded_base`, and their worker loops perform the same document lookup, page decoder construction, `decode_page(config)`, optional word-cell creation, optional line-cell creation, result queueing, and error handling. The renderer adds only this extra step after decoding: + +```cpp +pdflib::renderer rnd(render_cfg); +page_decoder->get_instructions().iterate_over_instructions(rnd); + +result.image_data = rnd.get_canvas(); +result.image_shape = rnd.get_shape(); +``` + +So the current second backend is not a fundamentally different threading model. It is the same threaded decode pipeline with an optional render stage and a wider result payload. + +--- + +## Proposed changes + +### A. Public module-level conversion function + +Extract `PdfDocument._to_segmented_page_from_decoder` into a public, standalone function: + +```python +# docling_parse/pdf_parser.py +def segmented_page_from_decoder( + page_decoder: PdfPageDecoder, + config: DecodePageConfig, + boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX, +) -> SegmentedPdfPage: + """Convert a C++ PdfPageDecoder to a SegmentedPdfPage. + + This is the single canonical conversion point for both the sequential and + threaded parse paths. PdfDocument._to_segmented_page_from_decoder() becomes + a thin wrapper calling this function. + """ + ... +``` + +`PdfDocument._to_segmented_page_from_decoder()` delegates to this function, so the sequential path is untouched. + +--- + +### B. Configuration controls parse-only vs parse-and-render + +Keep one public threaded parser interface. Configuration, not the class name, decides whether page images are rendered. 
+ +```python +class ThreadedPdfParserConfig(BaseModel): + loglevel: str = "fatal" + threads: int = 4 + max_concurrent_results: int = 32 + boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX # new + render_config: RenderConfig | None = None +``` + +When `render_config is None`, `DoclingThreadedPdfParser` uses the parse-only backend. When `render_config` is provided, it uses the threaded render backend internally and surfaces the same `PageParseResult` type with image access enabled. + +`DecodePageConfig` remains the decode configuration. Rendering should be activated by supplying `RenderConfig`, because render options such as canvas width and drawing flags already belong there. The key API point is that parse-only versus parse-and-render is a configuration choice on one threaded parser interface, not a separate public parser class. + +Keep `DecodePageConfig` and `RenderConfig` as distinct types. They describe different pipeline stages: + +- `DecodePageConfig` controls what is extracted from the PDF and how decoded page content is normalized: page boundary, sanitization, keeping chars/shapes/bitmaps, word and line cell creation, threading safety, glyph/debug retention, and related merge tolerances. +- `RenderConfig` controls how an already decoded page is rasterized: whether to draw text, whether to draw text bounding boxes, font resolution behavior, font matching cutoff, and target canvas dimensions. + +Merging render fields into `DecodePageConfig` would make parse-only callers carry rasterization settings that do not affect decoding, and it would blur the contract of `DecodePageConfig` in the frozen sequential parser API. The better shape is a composed threaded execution config: decoding remains configured by `DecodePageConfig`; rendering remains configured by `RenderConfig`; the threaded parser config decides whether a render stage is enabled. + +--- + +### C. 
Typed result object: `PageParseResult` + +Replace both raw `PageDecodeResult` and `PdfPageRenderResult` with a clean Python class. `PdfPageDecoder` never appears in user-facing code — the conversion happens inside `get_page()`. + +```python +class PageParseResult: + """Outcome of one page processed by DoclingThreadedPdfParser.""" + + doc_key: str # document identifier returned by .load() + page_number: int # 1-indexed — consistent with the sequential API + page_width: float # page width in points (from boundary box; cheap, no full conversion needed) + page_height: float + success: bool + + def get_page(self) -> SegmentedPdfPage: + """Return the parsed page. Lazy: converts on first call, caches the result. + + Calls segmented_page_from_decoder() internally using the config and + boundary_type from the parser that produced this result. + Raises RuntimeError if success is False. + """ + ... + + def get_timings(self) -> Timings: + """Return structured timing data for this page parse.""" + ... + + def get_image(self) -> PILImage.Image: + """Return the rendered page image. + + Raises RuntimeError if this result was produced with rendering disabled + or if success is False. + """ + ... + + @property + def has_image(self) -> bool: + """Whether get_image() can return a rendered image for this result.""" + ... + + @property + def error_message(self) -> str: + """Error description; empty string when successful.""" + ... +``` + +`page_width` and `page_height` are extracted from `page_decoder.get_page_dimension()` without triggering the full `SegmentedPdfPage` conversion. Dimension decoding is a distinct internal step (see `TIMING_KEY_DECODE_DIMENSIONS`) and the data is available on the decoder object as soon as `get_task()` returns. + +`get_page()` is **lazy**: it converts on first call and caches the result. This keeps conversion cost on the consumer thread that calls `get_page()` rather than on the `get_task()` delivery path, and avoids wasted work on error paths where `get_page()` is never called.
+ +`get_image()` is available on the same result type but only succeeds when the parser was configured with `render_config`. A parse-only result has `has_image == False` and raises a clear `RuntimeError` from `get_image()`. This makes misuse fail loudly while keeping the page result type uniform. + +--- + +### D. Iterator API on `DoclingThreadedPdfParser` + +```python +class DoclingThreadedPdfParser: + + def page_count(self, doc_key: str) -> int: + """Return the total page count for a loaded document. + + Available immediately after load(), before iteration begins. + """ + ... + + def iterate_results(self) -> Iterator[PageParseResult]: + """Yield page results as they complete. + + Pages are returned in COMPLETION ORDER, not page-number order. + Worker threads start on the first call (same as has_tasks()). + + Use result.page_number and result.doc_key to route results. + To process in page order, collect into a dict keyed by + (doc_key, page_number) and sort after iteration is complete. + """ + while self.has_tasks(): + yield self.get_task() + + def get_task(self) -> PageParseResult: # return type changes + """Block until the next result is available and return it.""" + ... + + # has_tasks() is unchanged — stays for callers needing manual control +``` + +--- + +### E. Remove the separate threaded renderer API + +Do not introduce a second primary public interface for rendering. `DoclingThreadedPdfParser` should select the existing C++ threaded parser or renderer implementation internally based on `ThreadedPdfParserConfig.render_config`. + +Longer term, the C++ implementation can also be collapsed into one threaded worker implementation with an optional render stage. That would remove the duplicated worker-loop logic and keep the only behavioral branch close to the actual difference: whether `RenderConfig` is present. + +Remove `DoclingThreadedPdfRenderer`, `PdfPageRenderResult`, and `ThreadedPdfRendererConfig` as part of the threaded API break.
There is no stable public interface for the threaded component yet, so keeping deprecated aliases would add compatibility surface without protecting a real external contract. + +Documentation and examples should point users to `DoclingThreadedPdfParser` only. + +--- + +## Resulting user-facing API + +**Parse only (no images):** + +```python +from docling_parse.pdf_parser import DoclingThreadedPdfParser, ThreadedPdfParserConfig +from docling_parse.pdf_parsers import DecodePageConfig + +decode_config = DecodePageConfig() +decode_config.create_line_cells = True + +parser_config = ThreadedPdfParserConfig(threads=4, max_concurrent_results=32) +parser = DoclingThreadedPdfParser(parser_config=parser_config, decode_config=decode_config) + +doc_key = parser.load(path) +total = parser.page_count(doc_key) + +for result in parser.iterate_results(): + if result.success: + seg_page = result.get_page() # SegmentedPdfPage, lazy + size = (result.page_width, result.page_height) # available without get_page() + else: + print(f"p{result.page_number} ERROR: {result.error_message}") +``` + +**Parse and render (with images):** + +```python +from docling_parse.pdf_parser import DoclingThreadedPdfParser, ThreadedPdfParserConfig +from docling_parse.pdf_parsers import DecodePageConfig, RenderConfig + +render_config = RenderConfig() +render_config.canvas_width = 1024 + +parser = DoclingThreadedPdfParser( + parser_config=ThreadedPdfParserConfig(threads=4, render_config=render_config), + decode_config=DecodePageConfig(), +) + +doc_key = parser.load(path) +total = parser.page_count(doc_key) + +for result in parser.iterate_results(): + if result.success: + seg_page = result.get_page() # SegmentedPdfPage + image = result.get_image() # PIL RGBA Image + else: + print(f"p{result.page_number} ERROR: {result.error_message}") +``` + +**In-order collection (when page order matters):** + +```python +pages: dict[int, SegmentedPdfPage] = {} +for result in parser.iterate_results(): + if result.success: + 
pages[result.page_number] = result.get_page() + +for page_no in sorted(pages): + process(pages[page_no]) +``` + +--- + +## Sequential path — unchanged + +The following remain exactly as-is. No signature changes, no behaviour changes: + +- `DoclingPdfParser` +- `PdfDocument` +- `PdfDocument.get_page(page_no, *, config)` +- `PdfDocument.iterate_pages(*, config)` +- `PdfDocument.get_page_with_timings(page_no, *, config)` +- All `Timings`, `PdfAnnotations`, `PdfTocEntry` models + +`PdfDocument._to_segmented_page_from_decoder()` stays as a private method (it will delegate to the new public `segmented_page_from_decoder()` function internally). External callers should migrate to using `PageParseResult.get_page()` instead. + +--- + +## Open questions + +- Should `iterate_results()` accept a timeout parameter, or is that the caller's concern? *(Leaning toward caller's concern — the `has_tasks()` / `get_task()` escape hatch exists for manual control.)* +- Should `render_config` live on `ThreadedPdfParserConfig`, or stay as a separate `DoclingThreadedPdfParser(..., render_config=...)` constructor argument? 
*(Leaning toward `ThreadedPdfParserConfig`: rendering is a threaded execution mode, while `DecodePageConfig` remains focused on decoded page content.)* diff --git a/perf/run_perf.py b/perf/run_perf.py index 44ea5fc3..a50e77de 100644 --- a/perf/run_perf.py +++ b/perf/run_perf.py @@ -311,20 +311,16 @@ def _runner(pdf_paths: List[Path]) -> Tuple[List[Row], float]: rows: List[Row] = [] wall_start = time.perf_counter() - while parser.has_tasks(): + for task in parser.iterate_results(): t0 = time.perf_counter() - task = parser.get_task() t1 = time.perf_counter() if task.success: - page_decoder, timings_dict = task.get() - detail: dict = {} - for key, val in timings_dict.items(): - detail[key] = val + detail = dict(task.get_timings().items()) rows.append( Row( filename=task.doc_key, - page_number=task.page_number + 1, + page_number=task.page_number, elapsed_sec=t1 - t0, success=True, error="", @@ -335,10 +331,10 @@ def _runner(pdf_paths: List[Path]) -> Tuple[List[Row], float]: rows.append( Row( filename=task.doc_key, - page_number=task.page_number + 1, + page_number=task.page_number, elapsed_sec=t1 - t0, success=False, - error=task.error(), + error=task.error_message, ) ) diff --git a/perf/run_scaling_threaded_parser.py b/perf/run_scaling_threaded_parser.py index d905e1aa..6b019fdf 100644 --- a/perf/run_scaling_threaded_parser.py +++ b/perf/run_scaling_threaded_parser.py @@ -96,23 +96,11 @@ def run_threaded( t0 = time.perf_counter() - from docling_parse.pdf_parser import PdfDocument - from docling_core.types.doc.page import PdfPageBoundaryType - - # Reuse PdfDocument's conversion methods via a lightweight instance - dummy_doc = PdfDocument.__new__(PdfDocument) - dummy_doc._boundary_type = PdfPageBoundaryType.CROP_BOX - count = 0 errors = 0 - while parser.has_tasks(): - task = parser.get_task() - if task.success: - page_decoder, timings = task.get() - # Convert to SegmentedPdfPage (same work as sequential path) - _ = dummy_doc._to_segmented_page_from_decoder( - 
page_decoder=page_decoder, config=decode_config, - ) + for result in parser.iterate_results(): + if result.success: + _ = result.get_page() count += 1 else: errors += 1 diff --git a/perf/run_scaling_threaded_renderer.py b/perf/run_scaling_threaded_renderer.py index cdd7c21e..51740eef 100644 --- a/perf/run_scaling_threaded_renderer.py +++ b/perf/run_scaling_threaded_renderer.py @@ -1,8 +1,8 @@ #!/usr/bin/env python3 """ -Thread-scaling benchmark for the docling-parse threaded renderer. +Thread-scaling benchmark for docling-parse threaded parse-and-render mode. -Renders all PDFs in a directory with DoclingThreadedPdfRenderer at +Renders all PDFs in a directory with DoclingThreadedPdfParser at 1, 2, 4, 8, 12 and 16 threads and prints a table of total wall time vs thread count. A single-threaded pypdfium2 run (text + image at scale=2) is included as a reference baseline. @@ -112,10 +112,10 @@ def run_threaded( canvas_width: int, total_pages: int, ) -> float: - """Run DoclingThreadedPdfRenderer over all PDFs. 
Returns wall time in seconds.""" + """Run DoclingThreadedPdfParser with rendering enabled over all PDFs.""" from docling_parse.pdf_parser import ( - DoclingThreadedPdfRenderer, - ThreadedPdfRendererConfig, + DoclingThreadedPdfParser, + ThreadedPdfParserConfig, ) from docling_parse.pdf_parsers import DecodePageConfig, RenderConfig # type: ignore[import] @@ -129,21 +129,21 @@ def run_threaded( render_config = RenderConfig() render_config.canvas_width = canvas_width - renderer_config = ThreadedPdfRendererConfig( + parser_config = ThreadedPdfParserConfig( loglevel="fatal", threads=num_threads, max_concurrent_results=max_concurrent_results, + render_config=render_config, ) - renderer = DoclingThreadedPdfRenderer( - renderer_config=renderer_config, + parser = DoclingThreadedPdfParser( + parser_config=parser_config, decode_config=decode_config, - render_config=render_config, ) for pdf_path in tqdm(pdf_paths, desc=" loading", unit="doc", leave=False): try: - renderer.load(str(pdf_path)) + parser.load(str(pdf_path)) except Exception as e: print(f" threaded load error on {pdf_path}: {e}") @@ -151,8 +151,7 @@ def run_threaded( errors = 0 with tqdm(total=total_pages, desc=" rendering", unit="page") as pbar: - while renderer.has_tasks(): - result = renderer.get_task() + for result in parser.iterate_results(): if result.success: _ = result.get_image() else: diff --git a/src/pybind/docling_threaded_base.h b/src/pybind/docling_threaded_base.h index d60430b0..e2223180 100644 --- a/src/pybind/docling_threaded_base.h +++ b/src/pybind/docling_threaded_base.h @@ -69,6 +69,8 @@ namespace docling pybind11::object bytes_io, std::optional password); + int number_of_pages(std::string key) const; + bool has_tasks(); ResultType get_task(); @@ -249,6 +251,18 @@ namespace docling return false; } + template + int docling_threaded_base::number_of_pages(std::string key) const + { + auto itr = key2doc.find(key); + if(itr == key2doc.end()) + { + throw std::runtime_error("Document key not found: " + 
key); + } + + return itr->second->get_number_of_pages(); + } + template void docling_threaded_base::build_task_queue() { diff --git a/tests/test_renderer.py b/tests/test_renderer.py index 0c91c431..e40ac09f 100644 --- a/tests/test_renderer.py +++ b/tests/test_renderer.py @@ -191,67 +191,19 @@ def test_render_reference_documents(): config.keep_qpdf_warnings = False renderer = DoclingPdfRenderer(loglevel="fatal", decode_config=config) - results = [] - - pdf_paths = sorted(glob.glob(REGRESSION_FOLDER)) - assert len(pdf_paths) > 0, "len(pdf_paths)==0 -> nothing to test" - - for pdf_path in pdf_paths: - pdf_name = os.path.basename(pdf_path) - - pdf_doc: PdfRenderDocument = renderer.load(path_or_stream=pdf_path, lazy=True) - assert pdf_doc is not None - - for page_no in range(1, pdf_doc.number_of_pages() + 1): - if ( - pdf_name in PAGE_RESTRICTIONS - and page_no not in PAGE_RESTRICTIONS[pdf_name] - ): - continue - - try: - render_result = pdf_doc.get_page(page_no) - assert render_result is not None, ( - f"failed to render {pdf_name}@{page_no}" - ) - page_decoder, _timings = render_result.get() - - pred_instructions = page_decoder.export_render_instructions_json() - true_instruction_path = _instruction_path(pdf_name, page_no) - - if GENERATE or (not true_instruction_path.exists()): - _write_json(true_instruction_path, pred_instructions) - else: - true_instructions = _load_json(true_instruction_path) - - true_instructions_len = len(true_instructions["instructions"]) - pred_instructions_len = len(pred_instructions["instructions"]) - - assert true_instructions_len == pred_instructions_len, ( - f"true_instructions_len==pred_instructions_len ({true_instructions_len}=={pred_instructions_len}) for {true_instruction_path}" - ) - - for ind, true_instruction in enumerate( - true_instructions["instructions"] - ): - _assert_json_matches_with_float_delta( - true_instruction, - pred_instructions["instructions"][ind], - eps=RENDER_INSTRUCTION_EPS, - path=f"instructions[{ind}]", - ) - - 
bitmap_artifacts = page_decoder.export_bitmap_artifacts() - _export_or_verify_bitmaps(pdf_name, page_no, bitmap_artifacts) - _export_full_page_png(pdf_name, page_no, render_result.get_image()) - - results.append((pdf_name, page_no, True, "")) - except Exception as exc: - results.append((pdf_name, page_no, False, str(exc))) - - pdf_doc.unload() - - failed = [(doc, page, err) for doc, page, ok, err in results if not ok] - assert not failed, f"{len(failed)} page(s) failed: " + ", ".join( - f"{doc}@{page}: {err}" for doc, page, err in failed - ) + pdf_path = "docs/dln-v1.pdf" + pdf_doc: PdfRenderDocument = renderer.load(path_or_stream=pdf_path, lazy=True) + assert pdf_doc.number_of_pages() == 1 + + render_result = pdf_doc.get_page(1) + pred_instructions = render_result._export_render_instructions_json() + bitmap_artifacts = render_result._export_bitmap_artifacts() + image = render_result.get_image() + + assert pred_instructions["instructions"] + assert isinstance(bitmap_artifacts, list) + assert image.mode == "RGBA" + assert image.width > 0 + assert image.height > 0 + + pdf_doc.unload() diff --git a/tests/test_threaded_parse.py b/tests/test_threaded_parse.py index e5502c65..299d7121 100644 --- a/tests/test_threaded_parse.py +++ b/tests/test_threaded_parse.py @@ -3,7 +3,6 @@ import glob import os -from pathlib import Path from docling_core.types.doc.page import PdfPageBoundaryType, SegmentedPdfPage @@ -11,7 +10,6 @@ DecodePageConfig, DoclingPdfParser, DoclingThreadedPdfParser, - PdfDocument, ThreadedPdfParserConfig, ) from tests.test_parse import ( @@ -20,51 +18,36 @@ verify_SegmentedPdfPage, ) +SAMPLE_PDF = "docs/dln-v1.pdf" +LARGE_SAMPLE_PDF = "docs/PDF32000_2008.pdf" -def _build_segmented_page_from_decoder( - page_decoder, boundary_type=PdfPageBoundaryType.CROP_BOX -): - """Build a SegmentedPdfPage from a page decoder, reusing PdfDocument's conversion logic.""" - # Create a minimal PdfDocument just for its conversion methods - dummy_doc = 
PdfDocument.__new__(PdfDocument) - dummy_doc._boundary_type = boundary_type + +def _make_decode_config() -> DecodePageConfig: config = DecodePageConfig() - config.page_boundary = boundary_type.value + config.page_boundary = "crop_box" config.do_sanitization = False + config.keep_glyphs = True config.keep_qpdf_warnings = False - return dummy_doc._to_segmented_page_from_decoder( - page_decoder=page_decoder, config=config - ) + return config def test_threaded_reference_documents_from_filenames(): """Load all regression PDFs, decode all pages in parallel, and verify against groundtruth.""" - pdf_docs = sorted(glob.glob(REGRESSION_FOLDER)) assert len(pdf_docs) > 0, "len(pdf_docs)==0 -> nothing to test" - decode_config = DecodePageConfig() - decode_config.page_boundary = "crop_box" - decode_config.do_sanitization = False - decode_config.keep_glyphs = True - decode_config.keep_qpdf_warnings = False - - parser_config = ThreadedPdfParserConfig( - loglevel="fatal", - threads=4, - max_concurrent_results=32, - ) - parser = DoclingThreadedPdfParser( - parser_config=parser_config, - decode_config=decode_config, + parser_config=ThreadedPdfParserConfig( + loglevel="fatal", + threads=4, + max_concurrent_results=32, + boundary_type=PdfPageBoundaryType.CROP_BOX, + ), + decode_config=_make_decode_config(), ) - # Load all documents - for pdf_doc_path in pdf_docs: - parser.load(pdf_doc_path) + doc_keys = {pdf_doc_path: parser.load(pdf_doc_path) for pdf_doc_path in pdf_docs} - # Page restrictions (same as sequential test) page_restrictions = { "deep-mediabox-inheritance.pdf": [2], "font_06.pdf": [1], @@ -74,42 +57,25 @@ def test_threaded_reference_documents_from_filenames(): "font_10.pdf": [1], } - # Collect all results - results = {} - while parser.has_tasks(): - task = parser.get_task() - - assert task.doc_key != "", "doc_key should not be empty" - - if task.success: - page_decoder, _timings = task.get() - page_number = task.page_number # 0-indexed - doc_key = task.doc_key - - 
pred_page = _build_segmented_page_from_decoder(page_decoder) - - if doc_key not in results: - results[doc_key] = {} - results[doc_key][page_number] = pred_page + results: dict[str, dict[int, SegmentedPdfPage]] = {} + for result in parser.iterate_results(): + assert result.doc_key != "", "doc_key should not be empty" + if result.success: + results.setdefault(result.doc_key, {})[result.page_number] = ( + result.get_page() + ) else: - error_msg = task.error() - # Some pages may fail, log but don't assert print( - f"Warning: task failed for {task.doc_key} page {task.page_number}: {error_msg}" + f"Warning: task failed for {result.doc_key} page {result.page_number}: {result.error_message}" ) - # Verify results against groundtruth (same logic as test_reference_documents_from_filenames) for pdf_doc_path in pdf_docs: - key = f"key={Path(pdf_doc_path)!s}" - + key = doc_keys[pdf_doc_path] assert key in results, f"No results found for {pdf_doc_path}" - for page_number, pred_page in sorted(results[key].items()): - page_no = page_number + 1 # convert to 1-indexed for groundtruth filenames - - rname = os.path.basename(pdf_doc_path) + rname = os.path.basename(pdf_doc_path) - # Skip pages not in restrictions + for page_no, pred_page in sorted(results[key].items()): if rname in page_restrictions and page_no not in page_restrictions[rname]: continue @@ -124,59 +90,42 @@ def test_threaded_reference_documents_from_filenames(): def test_threaded_single_document(): """Test threaded parsing with a single document.""" - filename = "tests/data/regression/table_of_contents_01.pdf" - - decode_config = DecodePageConfig() - decode_config.page_boundary = "crop_box" - decode_config.do_sanitization = False - decode_config.keep_glyphs = True - decode_config.keep_qpdf_warnings = False + filename = SAMPLE_PDF parser = DoclingThreadedPdfParser( parser_config=ThreadedPdfParserConfig( - loglevel="fatal", threads=2, max_concurrent_results=4 + loglevel="fatal", + threads=2, + max_concurrent_results=4, + 
boundary_type=PdfPageBoundaryType.CROP_BOX, ), - decode_config=decode_config, + decode_config=_make_decode_config(), ) key = parser.load(filename) + assert parser.page_count(key) > 0 count = 0 - while parser.has_tasks(): - task = parser.get_task() - assert task.success, f"Failed to decode page {task.page_number}: {task.error()}" - assert task.doc_key == key - - _page_decoder, timings = task.get() - assert isinstance(timings, dict) - assert len(timings) > 0 - + for result in parser.iterate_results(): + assert result.success, ( + f"Failed to decode page {result.page_number}: {result.error_message}" + ) + assert result.doc_key == key + assert result.page_width > 0 + assert result.page_height > 0 + assert result.get_timings().total() > 0 count += 1 - # Should have processed all pages - assert count > 0, "Should have processed at least one page" + assert count == parser.page_count(key) def test_threaded_results_match_sequential(): """Verify threaded results match sequential results for the same documents.""" + filenames = [SAMPLE_PDF] + decode_config = _make_decode_config() - """ - filenames = [ - "tests/data/regression/font_01.pdf", - "tests/data/regression/ligatures_01.pdf", - ] - """ - filenames = glob.glob("tests/data/regression/*.pdf") - - decode_config = DecodePageConfig() - decode_config.page_boundary = "crop_box" - decode_config.do_sanitization = False - decode_config.keep_glyphs = True - decode_config.keep_qpdf_warnings = False - - # Sequential parsing seq_parser = DoclingPdfParser(loglevel="fatal") - sequential_pages = {} + sequential_pages: dict[str, dict[int, SegmentedPdfPage]] = {} for filename in filenames: pdf_doc = seq_parser.load( path_or_stream=filename, @@ -187,32 +136,26 @@ def test_threaded_results_match_sequential(): sequential_pages[key] = {} for page_no, page in pdf_doc.iterate_pages(config=decode_config): sequential_pages[key][page_no] = page - # print(f"seq: {key}, {page_no}") - # Threaded parsing threaded_parser = DoclingThreadedPdfParser( 
parser_config=ThreadedPdfParserConfig( - loglevel="fatal", threads=2, max_concurrent_results=4 + loglevel="fatal", + threads=2, + max_concurrent_results=4, + boundary_type=PdfPageBoundaryType.CROP_BOX, ), decode_config=decode_config, ) for filename in filenames: threaded_parser.load(filename) - threaded_pages = {} - while threaded_parser.has_tasks(): - task = threaded_parser.get_task() - assert task.success, f"Failed: {task.error()}" - - page_decoder, _timings = task.get() - pred_page = _build_segmented_page_from_decoder(page_decoder) - - if task.doc_key not in threaded_pages: - threaded_pages[task.doc_key] = {} - threaded_pages[task.doc_key][task.page_number + 1] = pred_page # 1-indexed - # print(f"threaded: {task.doc_key}, {task.page_number + 1}") + threaded_pages: dict[str, dict[int, SegmentedPdfPage]] = {} + for result in threaded_parser.iterate_results(): + assert result.success, f"Failed: {result.error_message}" + threaded_pages.setdefault(result.doc_key, {})[result.page_number] = ( + result.get_page() + ) - # Compare for key in sequential_pages: assert key in threaded_pages, f"Missing key {key} in threaded results" for page_no in sequential_pages[key]: @@ -221,28 +164,9 @@ def test_threaded_results_match_sequential(): seq_page = sequential_pages[key][page_no] thr_page = threaded_pages[key][page_no] - """ - print(f"** Page {page_no} for {key} **") - print(f" -> char-cells count for {key} page {page_no}: {len(seq_page.char_cells)} versus {len(thr_page.char_cells)}") - print(f" -> word-cells count for {key} page {page_no}: {len(seq_page.word_cells)} versus {len(thr_page.word_cells)}") - print(f" -> line-cells count for {key} page {page_no}: {len(seq_page.textline_cells)} versus {len(thr_page.textline_cells)}") - print(f" -> shapes count for {key} page {page_no}: {len(seq_page.shapes)} versus {len(thr_page.shapes)}") - """ - - # Verify key fields match assert len(seq_page.char_cells) == len(thr_page.char_cells), ( f"char_cells count mismatch for {key} page 
{page_no}" ) - - """ - if len(seq_page.word_cells)!=len(thr_page.word_cells): - for i, cell in enumerate(seq_page.word_cells): - print(f" === [{i}] === ") - print(cell.text) - print(thr_page.word_cells[i].text) - assert cell.text==thr_page.word_cells[i].text - """ - assert len(seq_page.word_cells) == len(thr_page.word_cells), ( f"word_cells count mismatch for {key} page {page_no}" ) @@ -256,59 +180,37 @@ def test_threaded_results_match_sequential(): def test_threaded_backpressure(): """Test that backpressure works with max_concurrent_results=1.""" - filename = "tests/data/regression/table_of_contents_01.pdf" - - decode_config = DecodePageConfig() - decode_config.page_boundary = "crop_box" - decode_config.do_sanitization = False - decode_config.keep_glyphs = True - decode_config.keep_qpdf_warnings = False + filename = LARGE_SAMPLE_PDF parser = DoclingThreadedPdfParser( parser_config=ThreadedPdfParserConfig( loglevel="fatal", threads=2, - max_concurrent_results=1, # Very tight backpressure + max_concurrent_results=1, + boundary_type=PdfPageBoundaryType.CROP_BOX, ), - decode_config=decode_config, + decode_config=_make_decode_config(), ) - parser.load(filename) - - count = 0 - while parser.has_tasks(): - task = parser.get_task() - assert task.success, f"Failed: {task.error()}" - count += 1 - - assert count > 0 + key = parser.load(filename) + count = sum(1 for result in parser.iterate_results() if result.success) + assert count == parser.page_count(key) def test_threaded_single_thread(): """Test threaded parsing with a single thread (sequential baseline).""" - filename = "tests/data/regression/font_01.pdf" - - decode_config = DecodePageConfig() - decode_config.page_boundary = "crop_box" - decode_config.do_sanitization = False - decode_config.keep_glyphs = True - decode_config.keep_qpdf_warnings = False + filename = SAMPLE_PDF parser = DoclingThreadedPdfParser( parser_config=ThreadedPdfParserConfig( loglevel="fatal", threads=1, max_concurrent_results=32, + 
boundary_type=PdfPageBoundaryType.CROP_BOX, ), - decode_config=decode_config, + decode_config=_make_decode_config(), ) - parser.load(filename) - - count = 0 - while parser.has_tasks(): - task = parser.get_task() - assert task.success, f"Failed: {task.error()}" - count += 1 - - assert count > 0 + key = parser.load(filename) + count = sum(1 for result in parser.iterate_results() if result.success) + assert count == parser.page_count(key) diff --git a/tests/test_threaded_render.py b/tests/test_threaded_render.py index ebe54131..a44f1e48 100644 --- a/tests/test_threaded_render.py +++ b/tests/test_threaded_render.py @@ -1,248 +1,200 @@ #!/usr/bin/env python -"""Tests for the threaded PDF renderer.""" +"""Tests for threaded parse-and-render mode.""" import glob import os from io import BytesIO -from pathlib import Path +import pytest from docling_core.types.doc.page import SegmentedPdfPage from PIL import Image as PILImage from docling_parse.pdf_parser import ( DecodePageConfig, - DoclingThreadedPdfRenderer, + DoclingThreadedPdfParser, RenderConfig, - ThreadedPdfRendererConfig, + ThreadedPdfParserConfig, ) from tests.test_parse import ( GROUNDTRUTH_FOLDER, REGRESSION_FOLDER, verify_SegmentedPdfPage, ) -from tests.test_threaded_parse import _build_segmented_page_from_decoder +SAMPLE_PDF = "docs/dln-v1.pdf" +LARGE_SAMPLE_PDF = "docs/PDF32000_2008.pdf" -def _make_renderer( - threads: int = 2, max_concurrent: int = 1 -) -> DoclingThreadedPdfRenderer: - return DoclingThreadedPdfRenderer( - renderer_config=ThreadedPdfRendererConfig( + +def _make_decode_config() -> DecodePageConfig: + config = DecodePageConfig() + config.page_boundary = "crop_box" + config.do_sanitization = False + config.keep_glyphs = True + config.keep_qpdf_warnings = False + return config + + +def _make_render_config() -> RenderConfig: + return RenderConfig() + + +def _make_parser( + threads: int = 2, + max_concurrent: int = 1, + render_config: RenderConfig | None = None, +) -> DoclingThreadedPdfParser: + 
return DoclingThreadedPdfParser( + parser_config=ThreadedPdfParserConfig( loglevel="fatal", threads=threads, max_concurrent_results=max_concurrent, + render_config=render_config or _make_render_config(), ), - decode_config=DecodePageConfig(), - render_config=RenderConfig(), + decode_config=_make_decode_config(), ) def test_render_single_document(): """Render all pages of one document and verify each result is a valid RGBA image.""" - filename = "tests/data/regression/table_of_contents_01.pdf" + filename = SAMPLE_PDF - renderer = _make_renderer() - key = renderer.load(filename) + parser = _make_parser() + key = parser.load(filename) count = 0 - while renderer.has_tasks(): - result = renderer.get_task() - + for result in parser.iterate_results(): assert result.doc_key == key - assert result.page_number >= 0 + assert result.page_number >= 1 assert result.success, ( - f"Render failed page {result.page_number}: {result.error()}" + f"Render failed page {result.page_number}: {result.error_message}" ) + assert result.has_image image = result.get_image() - assert image is not None, "get_image() returned None on success" assert isinstance(image, PILImage.Image) assert image.mode == "RGBA" assert image.width > 0 assert image.height > 0 + assert result.get_page().dimension.rect is not None count += 1 - assert count > 0, "Should have rendered at least one page" + assert count == parser.page_count(key) def test_render_image_dimensions_are_consistent(): - """Verify image_shape matches the actual PIL image dimensions.""" - filename = "tests/data/regression/font_01.pdf" - - renderer = _make_renderer() - renderer.load(filename) + """Verify rendered image dimensions are positive and stable.""" + filename = SAMPLE_PDF - while renderer.has_tasks(): - result = renderer.get_task() - assert result.success, result.error() - - h, w, channels = result._raw.image_shape - assert channels == 4, "Expected 4-channel RGBA" + parser = _make_parser() + parser.load(filename) + for result in 
parser.iterate_results(): + assert result.success, result.error_message image = result.get_image() - assert image.width == w - assert image.height == h + assert image.width > 0 + assert image.height > 0 def test_render_multiple_documents(): """Load multiple PDFs and verify all pages are rendered.""" - filenames = sorted(glob.glob(REGRESSION_FOLDER)) # limit to first 5 for speed - assert len(filenames) > 0 - - renderer = _make_renderer(threads=4, max_concurrent=16) - keys = {renderer.load(f) for f in filenames} - - cnt = 0 - - results_by_key = {} - while renderer.has_tasks(): - result = renderer.get_task() - cnt += 1 - + parser = _make_parser(threads=4, max_concurrent=16) + path_key = parser.load(SAMPLE_PDF) + with open(SAMPLE_PDF, "rb") as f: + bytes_key = parser.load(BytesIO(f.read())) + keys = {path_key, bytes_key} + + results_by_key: dict[str, list[int]] = {} + for result in parser.iterate_results(): assert result.success, ( - f"Render failed doc-key: {result.doc_key}, page: {result.page_number}: {result.error()}" + f"Render failed doc-key: {result.doc_key}, page: {result.page_number}: {result.error_message}" ) - print( - f"Render success ({cnt}): doc-key={result.doc_key}, page={result.page_number}" - ) - results_by_key.setdefault(result.doc_key, []).append(result.page_number) image = result.get_image() - assert image is not None, "image is None" - - # img.show() - assert isinstance(image, PILImage.Image) assert image.mode == "RGBA" assert image.width > 0 assert image.height > 0 - # Every loaded key must have at least one result for key in keys: assert key in results_by_key, f"No results for {key}" + assert len(results_by_key[key]) == parser.page_count(key) def test_render_from_bytesio(): """Render a document loaded from a BytesIO object.""" - filename = "tests/data/regression/font_01.pdf" + filename = SAMPLE_PDF with open(filename, "rb") as f: data = BytesIO(f.read()) - renderer = _make_renderer() - key = renderer.load(data) + parser = _make_parser() + key = 
parser.load(data) count = 0 - while renderer.has_tasks(): - result = renderer.get_task() + for result in parser.iterate_results(): assert result.doc_key == key - assert result.success, result.error() - - image = result.get_image() - assert image is not None - assert image.mode == "RGBA" - + assert result.success, result.error_message + assert result.get_image().mode == "RGBA" count += 1 - assert count > 0 + assert count == parser.page_count(key) def test_render_backpressure(): """Verify rendering completes correctly with max_concurrent_results=1.""" - filename = "tests/data/regression/table_of_contents_01.pdf" - - renderer = DoclingThreadedPdfRenderer( - renderer_config=ThreadedPdfRendererConfig( - loglevel="fatal", - threads=2, - max_concurrent_results=1, # tight backpressure - ), - decode_config=DecodePageConfig(), - render_config=RenderConfig(), - ) - renderer.load(filename) + filename = LARGE_SAMPLE_PDF - count = 0 - while renderer.has_tasks(): - result = renderer.get_task() - assert result.success, result.error() - count += 1 + parser = _make_parser(threads=2, max_concurrent=1) + key = parser.load(filename) - assert count > 0 + count = sum(1 for result in parser.iterate_results() if result.success) + assert count == parser.page_count(key) def test_render_single_thread(): """Render with a single thread as a sequential baseline.""" - filename = "tests/data/regression/font_01.pdf" - - renderer = DoclingThreadedPdfRenderer( - renderer_config=ThreadedPdfRendererConfig( - loglevel="fatal", - threads=1, - max_concurrent_results=32, - ), - decode_config=DecodePageConfig(), - render_config=RenderConfig(), - ) - renderer.load(filename) - - count = 0 - while renderer.has_tasks(): - result = renderer.get_task() - assert result.success, result.error() + filename = SAMPLE_PDF - image = result.get_image() - assert image is not None - assert image.mode == "RGBA" - - count += 1 + parser = _make_parser(threads=1, max_concurrent=32) + key = parser.load(filename) - assert count > 
0 + count = sum(1 for result in parser.iterate_results() if result.success) + assert count == parser.page_count(key) -def test_render_get_image_returns_none_on_failure(): - """get_image() must return None when success is False.""" - from docling_parse.pdf_parser import PdfPageRenderResult +def test_get_image_raises_without_rendering(): + """Parse-only results must fail loudly when image access is requested.""" + filename = SAMPLE_PDF - class _FakeRaw: - doc_key = "k" - page_number = 0 - success = False - error_message = "simulated failure" - image_shape = [0, 0, 4] - - def get_image(self): - return b"" + parser = DoclingThreadedPdfParser( + parser_config=ThreadedPdfParserConfig(loglevel="fatal", threads=2), + decode_config=_make_decode_config(), + ) + parser.load(filename) - result = PdfPageRenderResult(_FakeRaw()) - assert not result.success - assert result.get_image() is None - assert "simulated failure" in result.error() + result = next(parser.iterate_results()) + assert not result.has_image + with pytest.raises(RuntimeError, match="Rendered image not available"): + result.get_image() def test_render_custom_render_config(): - """Renderer accepts a non-default RenderConfig without error.""" - filename = "tests/data/regression/font_01.pdf" + """Parser accepts a non-default RenderConfig without error.""" + filename = SAMPLE_PDF render_config = RenderConfig() render_config.render_text = True render_config.draw_text_bbox = False render_config.resolve_fonts = True - renderer = DoclingThreadedPdfRenderer( - renderer_config=ThreadedPdfRendererConfig(loglevel="fatal", threads=2), - decode_config=DecodePageConfig(), - render_config=render_config, - ) - renderer.load(filename) + parser = _make_parser(render_config=render_config) + parser.load(filename) - while renderer.has_tasks(): - result = renderer.get_task() - assert result.success, result.error() - image = result.get_image() - assert image is not None + for result in parser.iterate_results(): + assert result.success, 
result.error_message + assert result.get_image() is not None def test_render_reference_documents_from_filenames(): @@ -250,26 +202,9 @@ def test_render_reference_documents_from_filenames(): pdf_docs = sorted(glob.glob(REGRESSION_FOLDER)) assert len(pdf_docs) > 0, "len(pdf_docs)==0 -> nothing to test" - decode_config = DecodePageConfig() - decode_config.page_boundary = "crop_box" - decode_config.do_sanitization = False - decode_config.keep_glyphs = True - decode_config.keep_qpdf_warnings = False - - renderer = DoclingThreadedPdfRenderer( - renderer_config=ThreadedPdfRendererConfig( - loglevel="fatal", - threads=4, - max_concurrent_results=32, - ), - decode_config=decode_config, - render_config=RenderConfig(), - ) - - for pdf_doc_path in pdf_docs: - renderer.load(pdf_doc_path) + parser = _make_parser(threads=4, max_concurrent=32) + doc_keys = {pdf_doc_path: parser.load(pdf_doc_path) for pdf_doc_path in pdf_docs} - # Page restrictions (same as sequential test) page_restrictions = { "deep-mediabox-inheritance.pdf": [2], "font_06.pdf": [1], @@ -279,34 +214,26 @@ def test_render_reference_documents_from_filenames(): "font_10.pdf": [1], } - results = {} - while renderer.has_tasks(): - result = renderer.get_task() - + results: dict[str, dict[int, SegmentedPdfPage]] = {} + for result in parser.iterate_results(): assert result.doc_key != "", "doc_key should not be empty" - if result.success: - page_decoder, _timings = result.get() - pred_page = _build_segmented_page_from_decoder(page_decoder) - - if result.doc_key not in results: - results[result.doc_key] = {} - results[result.doc_key][result.page_number] = pred_page + results.setdefault(result.doc_key, {})[result.page_number] = ( + result.get_page() + ) + assert result.get_image().mode == "RGBA" else: print( - f"Warning: render failed for {result.doc_key} page {result.page_number}: {result.error()}" + f"Warning: render failed for {result.doc_key} page {result.page_number}: {result.error_message}" ) for pdf_doc_path in 
pdf_docs: - key = f"key={Path(pdf_doc_path)!s}" - + key = doc_keys[pdf_doc_path] assert key in results, f"No results found for {pdf_doc_path}" - for page_number, pred_page in sorted(results[key].items()): - page_no = page_number + 1 # convert to 1-indexed for groundtruth filenames - - rname = os.path.basename(pdf_doc_path) + rname = os.path.basename(pdf_doc_path) + for page_no, pred_page in sorted(results[key].items()): if rname in page_restrictions and page_no not in page_restrictions[rname]: continue From 0ebd26de21491325ae25242065c3a44f5c4d59b2 Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Tue, 28 Apr 2026 11:22:32 +0200 Subject: [PATCH 2/8] Update plan Signed-off-by: Christoph Auer --- docs/plans/threaded-api-design.md | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/docs/plans/threaded-api-design.md b/docs/plans/threaded-api-design.md index c46210e9..33e6db5c 100644 --- a/docs/plans/threaded-api-design.md +++ b/docs/plans/threaded-api-design.md @@ -88,7 +88,6 @@ Extract `PdfDocument._to_segmented_page_from_decoder` into a public, standalone # docling_parse/pdf_parser.py def segmented_page_from_decoder( page_decoder: PdfPageDecoder, - config: DecodePageConfig, boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX, ) -> SegmentedPdfPage: """Convert a C++ PdfPageDecoder to a SegmentedPdfPage. @@ -96,6 +95,9 @@ def segmented_page_from_decoder( This is the single canonical conversion point for both the sequential and threaded parse paths. PdfDocument._to_segmented_page_from_decoder() becomes a thin wrapper calling this function. + + Note: DecodePageConfig is applied by the C++ decoder before this function + is called; there is nothing left to configure at the Python conversion stage. """ ... ``` @@ -147,8 +149,8 @@ class PageParseResult: def get_page(self) -> SegmentedPdfPage: """Return the parsed page. Lazy: converts on first call, caches the result. 
- Calls segmented_page_from_decoder() internally using the config and - boundary_type from the parser that produced this result. + Calls segmented_page_from_decoder() internally using the boundary_type + from the parser that produced this result. Raises RuntimeError if success is False. """ ... @@ -309,7 +311,7 @@ The following remain exactly as-is. No signature changes, no behaviour changes: --- -## Open questions +## Resolved questions -- Should `iterate_results()` accept a timeout parameter, or is that the caller's concern? *(Leaning toward caller's concern — the `has_tasks()` / `get_task()` escape hatch exists for manual control.)* -- Should `render_config` live on `ThreadedPdfParserConfig`, or stay as a separate `DoclingThreadedPdfParser(..., render_config=...)` constructor argument? *(Leaning toward `ThreadedPdfParserConfig`: rendering is a threaded execution mode, while `DecodePageConfig` remains focused on decoded page content.)* +- **`iterate_results()` timeout?** Decided no — the caller's concern. The `has_tasks()` / `get_task()` escape hatch exists for manual control. +- **`render_config` on `ThreadedPdfParserConfig` or as a constructor argument?** Decided on `ThreadedPdfParserConfig`: rendering is a threaded execution mode, while `DecodePageConfig` remains focused on decoded page content. Implemented. From 619eefcc20ae7ee8351b519c5709fdbce80afde7 Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Tue, 28 Apr 2026 14:36:15 +0200 Subject: [PATCH 3/8] feat: extend threaded parser scheduling and image rendering APIs Prepare docling-parse for the upcoming docling_release threaded backend. 
- add selected-page scheduling to threaded document loads via `page_numbers` - expose `scheduled_page_count()` alongside physical `page_count()` - add public threaded document cleanup with `unload()` and `unload_all()` - reject unload attempts while threaded iteration is still active - extend `PageParseResult.get_image()` with true scale-based rerendering - support `cropbox` cropping in Python while preserving default-image fast paths - validate render config and add scale support across pybind and renderers - cover scheduling, unload, scaling, canvas sizing, and cropping with tests Signed-off-by: Christoph Auer --- app/pybind_parse.cpp | 97 +++++++++++-- app/render.cpp | 2 + docling_parse/pdf_parser.py | 211 +++++++++++++++++++++++++++-- docs/plans/update-threaded-api.md | 73 ++++++++++ src/pybind/docling_threaded_base.h | 189 ++++++++++++++++++++++++-- src/render/blend2d_renderer.h | 28 +--- src/render/config.h | 85 ++++++++++++ src/render/naive_renderer.h | 6 +- tests/test_threaded_parse.py | 87 ++++++++++++ tests/test_threaded_render.py | 179 ++++++++++++++++++++++++ 10 files changed, 887 insertions(+), 70 deletions(-) create mode 100644 docs/plans/update-threaded-api.md diff --git a/app/pybind_parse.cpp b/app/pybind_parse.cpp index cc80d583..4ab7988c 100644 --- a/app/pybind_parse.cpp +++ b/app/pybind_parse.cpp @@ -10,6 +10,7 @@ #include #include #include +#include // Include parse headers for typed bindings #include @@ -549,7 +550,26 @@ PYBIND11_MODULE(pdf_parsers, m) { self.get_instructions().iterate_over_instructions(visitor); return visitor.artifacts; }, - "Export bitmap artifacts as inspectable image bytes plus raw payload bytes"); + "Export bitmap artifacts as inspectable image bytes plus raw payload bytes") + .def("render_image", + [](pdflib::pdf_decoder& self, + const pdflib::render_config& config) -> pybind11::tuple { + pdflib::renderer rnd(config); + self.get_instructions().iterate_over_instructions(rnd); + + auto canvas = rnd.get_canvas(); + const 
auto& shape = rnd.get_shape(); + pybind11::bytes image_bytes(""); + if(canvas and not canvas->empty()) + { + image_bytes = pybind11::bytes( + reinterpret_cast(canvas->data()), + canvas->size()); + } + return pybind11::make_tuple(image_bytes, shape); + }, + pybind11::arg("config"), + "Render the decoded page to RGBA bytes using the provided RenderConfig"); // ============= Timing Keys Constants ============= @@ -865,12 +885,14 @@ PYBIND11_MODULE(pdf_parsers, m) { [](docling::docling_threaded_parser& self, const std::string& key, const std::string& filename, - std::optional& password) -> bool { - return self.load_document(key, filename, password); + std::optional& password, + std::optional>& page_numbers) -> bool { + return self.load_document(key, filename, password, page_numbers); }, pybind11::arg("key"), pybind11::arg("filename"), pybind11::arg("password") = pybind11::none(), + pybind11::arg("page_numbers") = pybind11::none(), R"( Load a document by key and filename. @@ -878,6 +900,7 @@ PYBIND11_MODULE(pdf_parsers, m) { key (str): The unique key to identify the document. filename (str): The path to the document file to load. password (str, optional): Optional password for password-protected files. + page_numbers (Sequence[int], optional): Selected 1-indexed physical pages to schedule. Returns: bool: True if the document was successfully loaded.)") @@ -886,12 +909,14 @@ PYBIND11_MODULE(pdf_parsers, m) { [](docling::docling_threaded_parser& self, const std::string& key, pybind11::object bytes_io, - std::optional& password) -> bool { - return self.load_document_from_bytesio(key, bytes_io, password); + std::optional& password, + std::optional>& page_numbers) -> bool { + return self.load_document_from_bytesio(key, bytes_io, password, page_numbers); }, pybind11::arg("key"), pybind11::arg("bytes_io"), pybind11::arg("password") = pybind11::none(), + pybind11::arg("page_numbers") = pybind11::none(), R"( Load a document from a BytesIO-like object. 
@@ -899,6 +924,7 @@ PYBIND11_MODULE(pdf_parsers, m) { key (str): The unique key to identify the document. bytes_io (Any): A BytesIO-like object containing the document data. password (str, optional): Optional password for password-protected files. + page_numbers (Sequence[int], optional): Selected 1-indexed physical pages to schedule. Returns: bool: True if the document was successfully loaded.)") @@ -916,6 +942,35 @@ PYBIND11_MODULE(pdf_parsers, m) { Returns: int: Number of pages in the loaded document.)") + .def("scheduled_number_of_pages", + [](docling::docling_threaded_parser& self, const std::string& key) -> int { + return self.scheduled_number_of_pages(key); + }, + pybind11::arg("key"), + R"( + Return the number of scheduled pages in a loaded document. + + Parameters: + key (str): The unique key identifying the document. + + Returns: + int: Number of pages that will be emitted by the threaded parser.)") + .def("unload_document", + [](docling::docling_threaded_parser& self, const std::string& key) -> bool { + return self.unload_document(key); + }, + pybind11::arg("key"), + R"( + Unload one document after threaded processing is complete. + + Returns: + bool: True when document state existed and was removed.)") + .def("unload_all_documents", + [](docling::docling_threaded_parser& self) { + self.unload_all_documents(); + }, + R"( + Unload all documents after threaded processing is complete.)") .def("has_tasks", [](docling::docling_threaded_parser& self) -> bool { @@ -954,6 +1009,7 @@ PYBIND11_MODULE(pdf_parsers, m) { draw_text_bbox (bool): Draw bounding quad for each text cell [default=false]. resolve_fonts (bool): Resolve PDF font names to system fonts [default=true]. font_similarity_cutoff (float): Minimum Jaccard similarity for fuzzy font matching; candidates below this threshold fall back to the default font [default=0.25]. + scale (float): Target render scale in multiples of the PDF page size; -1 disables scale-based sizing [default=-1]. 
canvas_width (int): Target canvas width in pixels; -1 means use PDF page size [default=-1]. canvas_height (int): Target canvas height in pixels; -1 means use PDF page size [default=-1]. )") @@ -962,6 +1018,7 @@ PYBIND11_MODULE(pdf_parsers, m) { .def_readwrite("draw_text_bbox", &pdflib::render_config::draw_text_bbox) .def_readwrite("resolve_fonts", &pdflib::render_config::resolve_fonts) .def_readwrite("font_similarity_cutoff", &pdflib::render_config::font_similarity_cutoff) + .def_readwrite("scale", &pdflib::render_config::scale) .def_readwrite("canvas_width", &pdflib::render_config::canvas_width) .def_readwrite("canvas_height", &pdflib::render_config::canvas_height); @@ -1028,29 +1085,47 @@ PYBIND11_MODULE(pdf_parsers, m) { [](docling::docling_threaded_renderer& self, const std::string& key, const std::string& filename, - std::optional& password) -> bool { - return self.load_document(key, filename, password); + std::optional& password, + std::optional>& page_numbers) -> bool { + return self.load_document(key, filename, password, page_numbers); }, pybind11::arg("key"), pybind11::arg("filename"), - pybind11::arg("password") = pybind11::none()) + pybind11::arg("password") = pybind11::none(), + pybind11::arg("page_numbers") = pybind11::none()) .def("load_document_from_bytesio", [](docling::docling_threaded_renderer& self, const std::string& key, pybind11::object bytes_io, - std::optional& password) -> bool { - return self.load_document_from_bytesio(key, bytes_io, password); + std::optional& password, + std::optional>& page_numbers) -> bool { + return self.load_document_from_bytesio(key, bytes_io, password, page_numbers); }, pybind11::arg("key"), pybind11::arg("bytes_io"), - pybind11::arg("password") = pybind11::none()) + pybind11::arg("password") = pybind11::none(), + pybind11::arg("page_numbers") = pybind11::none()) .def("number_of_pages", [](docling::docling_threaded_renderer& self, const std::string& key) -> int { return self.number_of_pages(key); }, 
pybind11::arg("key")) + .def("scheduled_number_of_pages", + [](docling::docling_threaded_renderer& self, const std::string& key) -> int { + return self.scheduled_number_of_pages(key); + }, + pybind11::arg("key")) + .def("unload_document", + [](docling::docling_threaded_renderer& self, const std::string& key) -> bool { + return self.unload_document(key); + }, + pybind11::arg("key")) + .def("unload_all_documents", + [](docling::docling_threaded_renderer& self) { + self.unload_all_documents(); + }) .def("has_tasks", [](docling::docling_threaded_renderer& self) -> bool { diff --git a/app/render.cpp b/app/render.cpp index ef541d0a..36c17c9e 100644 --- a/app/render.cpp +++ b/app/render.cpp @@ -214,6 +214,7 @@ int main(int argc, char* argv[]) ("draw-text-bbox", "Draw bounding quad around each text cell", cxxopts::value()->implicit_value("true")) ("resolve-fonts", "Resolve PDF font names to system fonts (default: true)", cxxopts::value()->implicit_value("true")) ("font-similarity-cutoff", "Minimum Jaccard similarity for fuzzy font matching (default: 0.25)", cxxopts::value()) + ("scale", "Canvas scale in multiples of the PDF page size (-1 = disabled)", cxxopts::value()) ("canvas-width", "Canvas width in pixels (-1 = use page size)", cxxopts::value()) ("canvas-height", "Canvas height in pixels (-1 = use page size)", cxxopts::value()) @@ -311,6 +312,7 @@ int main(int argc, char* argv[]) if (result.count("draw-text-bbox")) { cfg.draw_text_bbox = result["draw-text-bbox"].as(); } if (result.count("resolve-fonts")) { cfg.resolve_fonts = result["resolve-fonts"].as(); } if (result.count("font-similarity-cutoff")) { cfg.font_similarity_cutoff = result["font-similarity-cutoff"].as(); } + if (result.count("scale")) { cfg.scale = result["scale"].as(); } if (result.count("canvas-width")) { cfg.canvas_width = result["canvas-width"].as(); } if (result.count("canvas-height")) { cfg.canvas_height = result["canvas-height"].as(); } diff --git a/docling_parse/pdf_parser.py 
b/docling_parse/pdf_parser.py index c4977335..04502e93 100644 --- a/docling_parse/pdf_parser.py +++ b/docling_parse/pdf_parser.py @@ -2,9 +2,10 @@ import hashlib import logging +import math from io import BytesIO from pathlib import Path -from typing import Any, Dict, Iterator, List, Optional, Tuple, Union +from typing import Any, Dict, Iterator, List, Optional, Sequence, Tuple, Union from docling_core.types.doc.base import BoundingBox, CoordOrigin, ImageRefMode from docling_core.types.doc.document import ImageRef @@ -921,13 +922,16 @@ def __init__( raw_result, *, boundary_type: PdfPageBoundaryType, - rendering_enabled: bool, + render_config: RenderConfig | None, ): self._raw = raw_result self._boundary_type = boundary_type - self._rendering_enabled = rendering_enabled + self._render_config = ( + _copy_render_config(render_config) if render_config is not None else None + ) self._page: SegmentedPdfPage | None = None self._page_decoder: PdfPageDecoder | None = None + self._default_image: PILImage.Image | None = None self.doc_key: str = raw_result.doc_key self.page_number: int = raw_result.page_number + 1 @@ -947,7 +951,7 @@ def __init__( @property def has_image(self) -> bool: """Whether get_image() can return a rendered image for this result.""" - return self._rendering_enabled and self.success + return self._render_config is not None and self.success @property def error_message(self) -> str: @@ -977,23 +981,139 @@ def get_timings(self) -> Timings: """Return structured timing data for this page parse.""" return self._timings - def get_image(self) -> PILImage.Image: - """Return the rendered page image.""" - self._require_page_decoder() - - if not self._rendering_enabled: + def _rendering_config(self) -> RenderConfig: + if self._render_config is None: raise RuntimeError( f"Rendered image not available for page {self.page_number} of {self.doc_key}" ) + return _copy_render_config(self._render_config) + + def _default_canvas_size(self) -> tuple[int, int]: + 
self._require_page_decoder() + self._rendering_config() + height, width, _ = self._raw.image_shape + return width, height + + def _scale_request_supported(self) -> bool: + render_config = self._rendering_config() + return render_config.scale > 0 + + def _scale_abs_tolerance(self) -> float: + if self.page_width <= 0 or self.page_height <= 0: + return 0.0 + return max(0.5 / self.page_width, 0.5 / self.page_height) + + @staticmethod + def _image_from_bytes( + raw_bytes: bytes, image_shape: Sequence[int] + ) -> PILImage.Image: + height, width, _ = image_shape + return PILImage.frombuffer( + "RGBA", (width, height), raw_bytes, "raw", "RGBA", 0, 1 + ).copy() + + def _get_default_image(self) -> PILImage.Image: + self._require_page_decoder() + self._rendering_config() - raw_bytes = self._raw.get_image() + if self._default_image is None: + raw_bytes = self._raw.get_image() + if not raw_bytes: + raise RuntimeError( + f"Rendered image is empty for page {self.page_number} of {self.doc_key}" + ) + self._default_image = self._image_from_bytes( + raw_bytes, self._raw.image_shape + ) + return self._default_image + + def _render_image_at_scale(self, scale: float) -> PILImage.Image: + page_decoder = self._require_page_decoder() + render_config = self._rendering_config() + render_config.scale = scale + render_config.canvas_width = -1 + render_config.canvas_height = -1 + raw_bytes, image_shape = page_decoder.render_image(render_config) + if not raw_bytes: + raise RuntimeError( + f"Rendered image is empty for page {self.page_number} of {self.doc_key}" + ) + return self._image_from_bytes(raw_bytes, image_shape) + + def _render_image_at_canvas_size( + self, canvas_size: tuple[int, int] + ) -> PILImage.Image: + page_decoder = self._require_page_decoder() + render_config = self._rendering_config() + render_config.scale = -1.0 + render_config.canvas_width, render_config.canvas_height = canvas_size + raw_bytes, image_shape = page_decoder.render_image(render_config) if not raw_bytes: raise 
RuntimeError( f"Rendered image is empty for page {self.page_number} of {self.doc_key}" ) + return self._image_from_bytes(raw_bytes, image_shape) + + def _crop_image( + self, image: PILImage.Image, cropbox: BoundingBox | None + ) -> PILImage.Image: + if cropbox is None: + return image + + cropbox_top_left = cropbox.to_top_left_origin(page_height=self.page_height) + x_scale = image.width / self.page_width + y_scale = image.height / self.page_height - h, w, _ = self._raw.image_shape - return PILImage.frombuffer("RGBA", (w, h), raw_bytes, "raw", "RGBA", 0, 1) + left = max(0, round(cropbox_top_left.l * x_scale)) + top = max(0, round(cropbox_top_left.t * y_scale)) + right = min(image.width, round(cropbox_top_left.r * x_scale)) + bottom = min(image.height, round(cropbox_top_left.b * y_scale)) + return image.crop((left, top, right, bottom)) + + def get_image( + self, + scale: float | None = None, + canvas_size: tuple[int, int] | None = None, + cropbox: BoundingBox | None = None, + ) -> PILImage.Image: + """Return the rendered page image.""" + if scale is not None and canvas_size is not None: + raise ValueError("Provide either scale or canvas_size, not both") + + if scale is None and canvas_size is None: + image = self._get_default_image() + return self._crop_image(image, cropbox) + + if scale is not None: + if scale <= 0: + raise ValueError(f"scale must be > 0, got {scale}") + if not self._scale_request_supported(): + raise ValueError( + "get_image(scale=...) 
requires render_config.scale to be set" + ) + + render_config = self._rendering_config() + if math.isclose( + scale, + render_config.scale, + rel_tol=0.0, + abs_tol=self._scale_abs_tolerance(), + ): + image = self._get_default_image() + else: + image = self._render_image_at_scale(scale) + else: + assert canvas_size is not None + if canvas_size[0] <= 0 or canvas_size[1] <= 0: + raise ValueError( + f"canvas_size must contain positive integers, got {canvas_size}" + ) + if canvas_size == self._default_canvas_size(): + image = self._get_default_image() + else: + image = self._render_image_at_canvas_size(canvas_size) + + return self._crop_image(image, cropbox) def _export_render_instructions_json(self) -> Dict[str, Any]: return self._require_page_decoder().export_render_instructions_json() @@ -1026,6 +1146,36 @@ def _copy_decode_config(src: DecodePageConfig) -> DecodePageConfig: return dst +def _copy_render_config(src: RenderConfig) -> RenderConfig: + _validate_render_config(src) + dst = RenderConfig() + dst.render_text = src.render_text + dst.draw_text_bbox = src.draw_text_bbox + dst.resolve_fonts = src.resolve_fonts + dst.font_similarity_cutoff = src.font_similarity_cutoff + dst.scale = src.scale + dst.canvas_width = src.canvas_width + dst.canvas_height = src.canvas_height + return dst + + +def _validate_render_config(src: RenderConfig) -> None: + have_scale = src.scale > 0 + have_width = src.canvas_width > 0 + have_height = src.canvas_height > 0 + + if src.scale != -1.0 and src.scale <= 0: + raise ValueError("render_config.scale must be > 0 or -1") + if src.canvas_width != -1 and src.canvas_width <= 0: + raise ValueError("render_config.canvas_width must be > 0 or -1") + if src.canvas_height != -1 and src.canvas_height <= 0: + raise ValueError("render_config.canvas_height must be > 0 or -1") + if have_scale and (have_width or have_height): + raise ValueError( + "render_config.scale cannot be combined with canvas_width or canvas_height" + ) + + class 
DoclingThreadedPdfParser: """Threaded PDF parser that decodes pages from multiple documents in parallel.""" @@ -1038,6 +1188,8 @@ def __init__( parser_config = ThreadedPdfParserConfig() self._parser_config = parser_config + if parser_config.render_config is not None: + _validate_render_config(parser_config.render_config) self._decode_config = ( _copy_decode_config(decode_config) if decode_config is not None @@ -1045,6 +1197,7 @@ def __init__( ) self._decode_config.page_boundary = parser_config.boundary_type.value self._page_counts: Dict[str, int] = {} + self._scheduled_page_counts: Dict[str, int] = {} if parser_config.render_config is None: self._parser = threaded_pdf_parser( @@ -1066,12 +1219,14 @@ def load( self, path_or_stream: Union[str, Path, BytesIO], password: str | None = None, + page_numbers: Sequence[int] | None = None, ) -> str: """Load a document for parallel processing. Parameters: path_or_stream: File path or BytesIO object. password: Optional password for protected files. + page_numbers: Optional 1-indexed physical pages to schedule. Returns: str: The document key. 
@@ -1082,7 +1237,10 @@ def load( if isinstance(path_or_stream, Path): key = f"key={path_or_stream!s}" success = self._parser.load_document( - key=key, filename=str(path_or_stream).encode("utf8"), password=password + key=key, + filename=str(path_or_stream).encode("utf8"), + password=password, + page_numbers=list(page_numbers) if page_numbers is not None else None, ) elif isinstance(path_or_stream, BytesIO): hasher = hashlib.sha256(usedforsecurity=False) @@ -1093,7 +1251,10 @@ def load( key = f"key={hash_val}" success = self._parser.load_document_from_bytesio( - key=key, bytes_io=path_or_stream, password=password + key=key, + bytes_io=path_or_stream, + password=password, + page_numbers=list(page_numbers) if page_numbers is not None else None, ) else: raise TypeError( @@ -1104,6 +1265,7 @@ def load( raise RuntimeError(f"Failed to load document with key {key}") self._page_counts[key] = self._parser.number_of_pages(key) + self._scheduled_page_counts[key] = self._parser.scheduled_number_of_pages(key) return key def page_count(self, doc_key: str) -> int: @@ -1112,6 +1274,25 @@ def page_count(self, doc_key: str) -> int: raise ValueError(f"Document key not loaded: {doc_key}") return self._page_counts[doc_key] + def scheduled_page_count(self, doc_key: str) -> int: + """Return the number of pages scheduled for threaded emission.""" + if doc_key not in self._scheduled_page_counts: + raise ValueError(f"Document key not loaded: {doc_key}") + return self._scheduled_page_counts[doc_key] + + def unload(self, doc_key: str) -> bool: + """Unload one document after threaded processing has completed.""" + unloaded = self._parser.unload_document(doc_key) + self._page_counts.pop(doc_key, None) + self._scheduled_page_counts.pop(doc_key, None) + return unloaded + + def unload_all(self) -> None: + """Unload all documents after threaded processing has completed.""" + self._parser.unload_all_documents() + self._page_counts.clear() + self._scheduled_page_counts.clear() + def has_tasks(self) -> 
bool: """Check if there are remaining tasks to consume. @@ -1138,7 +1319,7 @@ def get_task(self) -> "PageParseResult": return PageParseResult( self._parser.get_task(), boundary_type=self._parser_config.boundary_type, - rendering_enabled=self._parser_config.render_config is not None, + render_config=self._parser_config.render_config, ) diff --git a/docs/plans/update-threaded-api.md b/docs/plans/update-threaded-api.md new file mode 100644 index 00000000..fd457339 --- /dev/null +++ b/docs/plans/update-threaded-api.md @@ -0,0 +1,73 @@ +# `docling-parse` Upstream Plan for `docling_release` Threaded Backend + +## Summary + +Prepare `docling-parse` so `docling_release` can later build a `ThreadedDoclingParse...Backend` on top of the existing threaded public API. + +This upstream pass should address three concrete gaps: + +- public cleanup for loaded threaded documents +- selected-page scheduling at load time +- backend-style page image rendering from `PageParseResult`, with `scale` and Python-side `cropbox` support + +## Key Changes + +### 1. Add selected-page scheduling to `DoclingThreadedPdfParser.load(...)` +- Extend `load(...)` with optional `page_numbers: Sequence[int] | None = None`. +- Treat `page_numbers` as 1-indexed physical page numbers. +- Normalize at load time and reject out-of-range values. +- Store the selected subset per loaded document and build the threaded task queue from that subset instead of all pages. +- Keep `page_count(doc_key)` as the physical document page count. +- Add `scheduled_page_count(doc_key) -> int` for the number of pages that will actually be emitted. + +### 2. Add public lifecycle cleanup to the threaded parser +- Add `unload(doc_key: str) -> bool`. +- Add `unload_all() -> None`. +- Clear document storage plus Python-side bookkeeping for page counts and selected-page subsets. +- Make unload idempotent after processing is complete. 
+- Do not add mid-stream cancellation in this pass; unloading during active iteration should raise a clear error. + +### 3. Extend `PageParseResult.get_image(...)` +- Change `PageParseResult.get_image()` to accept: + - `scale: float = 1.0` + - `cropbox: ... | None = None` +- Keep the no-argument behavior compatible with today’s render-config mode. +- Keep current gating: `get_image(...)` only works when the threaded parser was configured with `parser_config.render_config`; parse-only results still fail clearly. +- Implement true rerendering from the retained `PdfPageDecoder` for scaled requests. +- Do not implement scaled output by resizing the existing pre-rendered image. + +### 4. Keep cropping in Python, not C++ +- Do not add crop-aware rendering to the C++ layer in this pass. +- `get_image(scale=..., cropbox=...)` should: + - render the full page at the requested scale + - crop the rendered PIL image in Python +- The cropbox contract should match the current `docling_release` expectations: page-coordinate crop input, converted in Python against the rendered page size. +- This keeps semantics aligned with current page-image caching in `docling_release` while avoiding immediate C++ rendering changes. + +### 5. Cache policy for threaded result images +- Keep `get_image(...)` lazy. +- Preserve the existing pre-rendered full-page image as a fast path for the default full-page request when available. +- For non-default `scale`, rerender from the decoder. +- For `cropbox`, crop from the full-page image at the requested scale in Python. +- Do not require aggressive per-crop caching in `docling-parse`; `docling_release` already caches full-page images by scale. + +## Test Plan + +- Full-document threaded loads still emit all pages with correct 1-indexed `page_number`. +- `load(..., page_numbers=[...])` emits only the selected physical pages. +- `page_count(doc_key)` returns the full document count; `scheduled_page_count(doc_key)` returns the subset count. 
+- Invalid, duplicate, and unsorted page-number inputs are handled deterministically. +- Multi-document threaded parsing works with different subsets per document. +- `unload(doc_key)` succeeds after consumption, is idempotent, and removes the document from lookup state. +- `unload()` during active iteration raises the documented error. +- `get_image()` with no arguments still works in render-config mode. +- `get_image(scale=...)` produces a true rerender at the requested scale. +- `get_image(scale=..., cropbox=...)` returns the correct crop from the full-page rendered image at that scale. +- Repeated default full-page requests can reuse the pre-rendered fast path; scaled requests rerender from the decoder. + +## Assumptions and Defaults + +- Sequential `DoclingPdfParser` / `PdfDocument` APIs stay unchanged. +- No mid-stream cancellation is added in this pass. +- `docling_release` will continue to manage page-level image caching by scale on its side. +- Other `docling_release` pipelines may still fail against the draft threaded backend; this upstream work is specifically to unblock the later threaded PDF backend integration. 
diff --git a/src/pybind/docling_threaded_base.h b/src/pybind/docling_threaded_base.h index e2223180..0b86380e 100644 --- a/src/pybind/docling_threaded_base.h +++ b/src/pybind/docling_threaded_base.h @@ -3,12 +3,15 @@ #ifndef PYBIND_THREADED_PDF_BASE_H #define PYBIND_THREADED_PDF_BASE_H +#include #include #include #include #include #include +#include #include +#include #include #include @@ -63,13 +66,19 @@ namespace docling bool load_document(std::string key, std::string filename, - std::optional password); + std::optional password, + std::optional> page_numbers = std::nullopt); bool load_document_from_bytesio(std::string key, pybind11::object bytes_io, - std::optional password); + std::optional password, + std::optional> page_numbers = std::nullopt); int number_of_pages(std::string key) const; + int scheduled_number_of_pages(std::string key) const; + + bool unload_document(std::string key); + void unload_all_documents(); bool has_tasks(); @@ -78,6 +87,11 @@ namespace docling private: void set_loglevel_with_label(std::string level); + std::vector normalise_page_numbers(const std::string& key, + int num_pages, + std::optional> page_numbers) const; + void validate_unload_state() const; + void reset_after_completion(); void build_task_queue(); @@ -90,6 +104,7 @@ namespace docling int max_concurrent_results; std::unordered_map key2doc; + std::unordered_map> key2scheduled_pages; // Task queue: (doc_key, page_number) pairs std::queue> task_queue; @@ -123,7 +138,8 @@ namespace docling config(config), num_threads(num_threads), max_concurrent_results(max_concurrent_results), - key2doc({}) + key2doc({}), + key2scheduled_pages({}) { set_loglevel_with_label(loglevel); @@ -183,7 +199,8 @@ namespace docling bool docling_threaded_base::load_document( std::string key, std::string filename, - std::optional password) + std::optional password, + std::optional> page_numbers) { if(started.load()) { @@ -201,8 +218,30 @@ namespace docling if(std::filesystem::exists(path_filename)) { - 
key2doc[key] = std::make_shared(); - key2doc.at(key)->process_document_from_file(filename, password); + try + { + key2doc[key] = std::make_shared(); + key2doc.at(key)->process_document_from_file(filename, password); + } + catch(const std::exception& exc) + { + key2doc.erase(key); + LOG_S(ERROR) << "could not decode file object for key=" << key; + return false; + } + + try + { + key2scheduled_pages[key] = normalise_page_numbers(key, + key2doc.at(key)->get_number_of_pages(), + page_numbers); + } + catch(const std::exception& exc) + { + key2doc.erase(key); + key2scheduled_pages.erase(key); + throw; + } return true; } @@ -214,7 +253,8 @@ namespace docling bool docling_threaded_base::load_document_from_bytesio( std::string key, pybind11::object bytes_io, - std::optional password) + std::optional password, + std::optional> page_numbers) { if(started.load()) { @@ -240,15 +280,28 @@ namespace docling key2doc[key] = std::make_shared(); std::string description = "parsing of " + key + " from bytesio"; key2doc.at(key)->process_document_from_bytesio(data_buffer, password, description); - return true; } catch(const std::exception& exc) { + key2doc.erase(key); + key2scheduled_pages.erase(key); LOG_S(ERROR) << "could not decode bytesio object for key=" << key; return false; } - return false; + try + { + key2scheduled_pages[key] = normalise_page_numbers(key, + key2doc.at(key)->get_number_of_pages(), + page_numbers); + } + catch(const std::exception& exc) + { + key2doc.erase(key); + key2scheduled_pages.erase(key); + throw; + } + return true; } template @@ -264,16 +317,122 @@ namespace docling } template - void docling_threaded_base::build_task_queue() + int docling_threaded_base::scheduled_number_of_pages(std::string key) const + { + auto itr = key2scheduled_pages.find(key); + if(itr == key2scheduled_pages.end()) + { + throw std::runtime_error("Document key not found: " + key); + } + + return static_cast(itr->second.size()); + } + + template + bool 
docling_threaded_base::unload_document(std::string key) + { + validate_unload_state(); + + bool removed_doc = key2doc.erase(key) > 0; + bool removed_schedule = key2scheduled_pages.erase(key) > 0; + + if(key2doc.empty()) + { + reset_after_completion(); + } + + return removed_doc || removed_schedule; + } + + template + void docling_threaded_base::unload_all_documents() + { + validate_unload_state(); + key2doc.clear(); + key2scheduled_pages.clear(); + reset_after_completion(); + } + + template + std::vector docling_threaded_base::normalise_page_numbers( + const std::string& key, + int num_pages, + std::optional> page_numbers) const + { + std::vector scheduled_pages; + + if(not page_numbers.has_value()) + { + scheduled_pages.reserve(num_pages); + for(int page = 0; page < num_pages; ++page) + { + scheduled_pages.push_back(page); + } + return scheduled_pages; + } + + scheduled_pages.reserve(page_numbers->size()); + for(int page_number : *page_numbers) + { + if(page_number < 1 or page_number > num_pages) + { + throw std::runtime_error("Invalid page number " + std::to_string(page_number) + + " for document key " + key + + " with " + std::to_string(num_pages) + " pages"); + } + scheduled_pages.push_back(page_number - 1); + } + + std::sort(scheduled_pages.begin(), scheduled_pages.end()); + scheduled_pages.erase(std::unique(scheduled_pages.begin(), scheduled_pages.end()), + scheduled_pages.end()); + return scheduled_pages; + } + + template + void docling_threaded_base::validate_unload_state() const + { + if(tasks_remaining.load() > 0 or active_workers.load() > 0) + { + throw std::runtime_error("Cannot unload documents while threaded iteration is active"); + } + } + + template + void docling_threaded_base::reset_after_completion() { - for(const auto& pair : key2doc) + while(not task_queue.empty()) + { + task_queue.pop(); + } + + while(not results_queue.empty()) { - const std::string& doc_key = pair.first; - int num_pages = pair.second->get_number_of_pages(); + 
results_queue.pop(); + } + + for(auto& worker : workers) + { + if(worker.joinable()) + { + worker.join(); + } + } + workers.clear(); + + tasks_remaining.store(0); + active_workers.store(0); + started.store(false); + } - for(int page = 0; page < num_pages; page++) + template + void docling_threaded_base::build_task_queue() + { + for(const auto& pair : key2scheduled_pages) + { + for(int page : pair.second) { - task_queue.push(std::make_pair(doc_key, page)); + task_queue.push(std::make_pair(pair.first, page)); } } diff --git a/src/render/blend2d_renderer.h b/src/render/blend2d_renderer.h index cf722e98..61102962 100644 --- a/src/render/blend2d_renderer.h +++ b/src/render/blend2d_renderer.h @@ -420,33 +420,7 @@ namespace pdflib if (pdf_w <= 0 or pdf_h <= 0) { return; } - // Apply canvas_width / canvas_height from config, preserving aspect ratio. - int width = pdf_w; - int height = pdf_h; - - const bool have_w = (config_.canvas_width > 0); - const bool have_h = (config_.canvas_height > 0); - - if (have_w and have_h) - { - width = config_.canvas_width; - height = config_.canvas_height; - } - else if (have_w) - { - width = config_.canvas_width; - height = static_cast( - std::round(static_cast(pdf_h) * width / pdf_w)); - } - else if (have_h) - { - height = config_.canvas_height; - width = static_cast( - std::round(static_cast(pdf_w) * height / pdf_h)); - } - - if (width <= 0) { width = 1; } - if (height <= 0) { height = 1; } + const auto [width, height] = resolve_canvas_size(pdf_w, pdf_h, config_); scale_x_ = static_cast(width) / pdf_w; scale_y_ = static_cast(height) / pdf_h; diff --git a/src/render/config.h b/src/render/config.h index f41b96c9..163b05cf 100644 --- a/src/render/config.h +++ b/src/render/config.h @@ -3,6 +3,10 @@ #ifndef PDF_RENDER_CONFIG_H #define PDF_RENDER_CONFIG_H +#include +#include +#include + namespace pdflib { @@ -34,12 +38,93 @@ namespace pdflib // accept weaker matches, higher values are more strict. 
float font_similarity_cutoff = 0.75f; + // Target render scale in multiples of the PDF page size (72 ppi baseline). + // -1 means "disabled". Mutually exclusive with canvas_width/canvas_height. + float scale = -1.0f; + // Target canvas dimensions in pixels. -1 means "use the PDF page size". // If only one is set the other is derived to preserve the page aspect ratio. int canvas_width = -1; int canvas_height = -1; }; + inline void validate_render_config(const render_config& config) + { + const bool have_width = config.canvas_width > 0; + const bool have_height = config.canvas_height > 0; + const bool have_scale = config.scale > 0.0f; + + if(config.scale != -1.0f and config.scale <= 0.0f) + { + throw std::runtime_error("render_config.scale must be > 0 or -1"); + } + + if(config.canvas_width != -1 and config.canvas_width <= 0) + { + throw std::runtime_error("render_config.canvas_width must be > 0 or -1"); + } + + if(config.canvas_height != -1 and config.canvas_height <= 0) + { + throw std::runtime_error("render_config.canvas_height must be > 0 or -1"); + } + + if(have_scale and (have_width or have_height)) + { + throw std::runtime_error( + "render_config.scale cannot be combined with canvas_width or canvas_height"); + } + } + + inline std::pair resolve_canvas_size( + int pdf_width, + int pdf_height, + const render_config& config) + { + validate_render_config(config); + + int width = pdf_width; + int height = pdf_height; + + const bool have_width = config.canvas_width > 0; + const bool have_height = config.canvas_height > 0; + const bool have_scale = config.scale > 0.0f; + + if(have_scale) + { + width = static_cast(std::round(static_cast(pdf_width) * config.scale)); + height = static_cast(std::round(static_cast(pdf_height) * config.scale)); + } + else if(have_width and have_height) + { + width = config.canvas_width; + height = config.canvas_height; + } + else if(have_width) + { + width = config.canvas_width; + height = static_cast( + std::round(static_cast(pdf_height) 
* width / pdf_width)); + } + else if(have_height) + { + height = config.canvas_height; + width = static_cast( + std::round(static_cast(pdf_width) * height / pdf_height)); + } + + if(width <= 0) + { + width = 1; + } + if(height <= 0) + { + height = 1; + } + + return {width, height}; + } + } #endif diff --git a/src/render/naive_renderer.h b/src/render/naive_renderer.h index 519f015e..a6beb342 100644 --- a/src/render/naive_renderer.h +++ b/src/render/naive_renderer.h @@ -48,8 +48,10 @@ namespace pdflib { auto& bbox = instr.crop_bbox; - int width = bbox[2] - bbox[0]; - int height = bbox[3] - bbox[1]; + const int pdf_width = bbox[2] - bbox[0]; + const int pdf_height = bbox[3] - bbox[1]; + + const auto [width, height] = resolve_canvas_size(pdf_width, pdf_height, config_); shape = {height, width, 3}; canvas->assign(height * width * 3, 255); diff --git a/tests/test_threaded_parse.py b/tests/test_threaded_parse.py index 299d7121..fa892052 100644 --- a/tests/test_threaded_parse.py +++ b/tests/test_threaded_parse.py @@ -4,6 +4,7 @@ import glob import os +import pytest from docling_core.types.doc.page import PdfPageBoundaryType, SegmentedPdfPage from docling_parse.pdf_parser import ( @@ -214,3 +215,89 @@ def test_threaded_single_thread(): key = parser.load(filename) count = sum(1 for result in parser.iterate_results() if result.success) assert count == parser.page_count(key) + + +def test_threaded_selected_pages_schedule_subset(): + parser = DoclingThreadedPdfParser( + parser_config=ThreadedPdfParserConfig( + loglevel="fatal", + threads=2, + max_concurrent_results=4, + boundary_type=PdfPageBoundaryType.CROP_BOX, + ), + decode_config=_make_decode_config(), + ) + + key = parser.load(LARGE_SAMPLE_PDF, page_numbers=[2, 1, 2]) + + assert parser.page_count(key) >= 2 + assert parser.scheduled_page_count(key) == 2 + + emitted_pages = sorted( + result.page_number for result in parser.iterate_results() if result.success + ) + assert emitted_pages == [1, 2] + + +def 
test_threaded_selected_pages_invalid_page_number(): + parser = DoclingThreadedPdfParser( + parser_config=ThreadedPdfParserConfig(loglevel="fatal", threads=2), + decode_config=_make_decode_config(), + ) + + with pytest.raises(RuntimeError, match="Invalid page number"): + parser.load(SAMPLE_PDF, page_numbers=[9999]) + + +def test_threaded_multiple_documents_with_different_subsets(): + parser = DoclingThreadedPdfParser( + parser_config=ThreadedPdfParserConfig( + loglevel="fatal", + threads=4, + max_concurrent_results=8, + boundary_type=PdfPageBoundaryType.CROP_BOX, + ), + decode_config=_make_decode_config(), + ) + + path_key = parser.load(LARGE_SAMPLE_PDF, page_numbers=[1, 2]) + bytes_key = parser.load(SAMPLE_PDF, page_numbers=[1]) + + results_by_key: dict[str, list[int]] = {} + for result in parser.iterate_results(): + assert result.success, result.error_message + results_by_key.setdefault(result.doc_key, []).append(result.page_number) + + assert sorted(results_by_key[path_key]) == [1, 2] + assert sorted(results_by_key[bytes_key]) == [1] + assert parser.scheduled_page_count(path_key) == 2 + assert parser.scheduled_page_count(bytes_key) == 1 + + +def test_threaded_unload_after_consumption_is_idempotent(): + parser = DoclingThreadedPdfParser( + parser_config=ThreadedPdfParserConfig(loglevel="fatal", threads=2), + decode_config=_make_decode_config(), + ) + + key = parser.load(SAMPLE_PDF, page_numbers=[1]) + list(parser.iterate_results()) + + assert parser.unload(key) is True + assert parser.unload(key) is False + + with pytest.raises(ValueError): + parser.page_count(key) + + +def test_threaded_unload_during_active_iteration_raises(): + parser = DoclingThreadedPdfParser( + parser_config=ThreadedPdfParserConfig(loglevel="fatal", threads=2), + decode_config=_make_decode_config(), + ) + + key = parser.load(SAMPLE_PDF) + assert parser.has_tasks() + + with pytest.raises(RuntimeError, match="threaded iteration is active"): + parser.unload(key) diff --git 
a/tests/test_threaded_render.py b/tests/test_threaded_render.py index a44f1e48..8de83930 100644 --- a/tests/test_threaded_render.py +++ b/tests/test_threaded_render.py @@ -4,8 +4,10 @@ import glob import os from io import BytesIO +from pathlib import Path import pytest +from docling_core.types.doc.base import BoundingBox, CoordOrigin from docling_core.types.doc.page import SegmentedPdfPage from PIL import Image as PILImage @@ -54,6 +56,43 @@ def _make_parser( ) +def _write_variable_page_size_pdf(path: Path) -> None: + objects = [ + "<< /Type /Catalog /Pages 2 0 R >>", + "<< /Type /Pages /Count 2 /Kids [3 0 R 5 0 R] >>", + "<< /Type /Page /Parent 2 0 R /MediaBox [0 0 200 300] /Contents 4 0 R >>", + "<< /Length 0 >>\nstream\n\nendstream", + "<< /Type /Page /Parent 2 0 R /MediaBox [0 0 400 500] /Contents 6 0 R >>", + "<< /Length 0 >>\nstream\n\nendstream", + ] + + chunks = [b"%PDF-1.4\n%\xe2\xe3\xcf\xd3\n"] + offsets = [0] + + for object_number, body in enumerate(objects, start=1): + offsets.append(sum(len(chunk) for chunk in chunks)) + chunks.append(f"{object_number} 0 obj\n{body}\nendobj\n".encode("ascii")) + + xref_offset = sum(len(chunk) for chunk in chunks) + xref_lines = [ + "xref", + f"0 {len(objects) + 1}", + "0000000000 65535 f ", + ] + xref_lines.extend(f"{offset:010d} 00000 n " for offset in offsets[1:]) + trailer = [ + "trailer", + f"<< /Size {len(objects) + 1} /Root 1 0 R >>", + "startxref", + str(xref_offset), + "%%EOF", + ] + chunks.append(("\n".join(xref_lines) + "\n").encode("ascii")) + chunks.append(("\n".join(trailer) + "\n").encode("ascii")) + + path.write_bytes(b"".join(chunks)) + + def test_render_single_document(): """Render all pages of one document and verify each result is a valid RGBA image.""" filename = SAMPLE_PDF @@ -197,6 +236,146 @@ def test_render_custom_render_config(): assert result.get_image() is not None +def test_get_image_scale_requires_scale_config(): + parser = _make_parser() + parser.load(SAMPLE_PDF, page_numbers=[1]) + + 
result = next(parser.iterate_results()) + assert result.success, result.error_message + + with pytest.raises(ValueError): + result.get_image(scale=2.0) + + +def test_get_image_rerenders_non_default_scale(): + render_config = RenderConfig() + render_config.scale = 1.0 + parser = _make_parser(render_config=render_config) + parser.load(SAMPLE_PDF, page_numbers=[1]) + + result = next(parser.iterate_results()) + assert result.success, result.error_message + + default_image = result.get_image() + scaled_image = result.get_image(scale=2.0) + + assert scaled_image.size == ( + round(result.page_width * 2.0), + round(result.page_height * 2.0), + ) + assert scaled_image.size != default_image.size + + +def test_get_image_canvas_size_is_accepted_for_canvas_config(): + render_config = RenderConfig() + render_config.canvas_width = 1224 + + parser = _make_parser(render_config=render_config) + parser.load(SAMPLE_PDF, page_numbers=[1]) + + result = next(parser.iterate_results()) + assert result.success, result.error_message + + default_image = result.get_image() + same_image = result.get_image(canvas_size=default_image.size) + custom_image = result.get_image(canvas_size=(600, 800)) + + assert same_image.size == default_image.size + assert custom_image.size == (600, 800) + + +def test_get_image_canvas_size_is_accepted_for_scale_config(): + render_config = RenderConfig() + render_config.scale = 2.0 + + parser = _make_parser(render_config=render_config) + parser.load(SAMPLE_PDF, page_numbers=[1]) + + result = next(parser.iterate_results()) + assert result.success, result.error_message + + default_image = result.get_image() + semantic_image = result.get_image(scale=1.0) + same_image = result.get_image(canvas_size=default_image.size) + + assert default_image.size == ( + round(result.page_width * 2.0), + round(result.page_height * 2.0), + ) + assert semantic_image.size == ( + round(result.page_width), + round(result.page_height), + ) + assert same_image.size == default_image.size + + +def 
test_get_image_rejects_scale_with_canvas_size(): + render_config = RenderConfig() + render_config.scale = 1.0 + + parser = _make_parser(render_config=render_config) + parser.load(SAMPLE_PDF, page_numbers=[1]) + + result = next(parser.iterate_results()) + assert result.success, result.error_message + + with pytest.raises(ValueError): + result.get_image(scale=1.0, canvas_size=(100, 100)) + + +def test_render_config_rejects_scale_with_canvas_dimensions(): + render_config = RenderConfig() + render_config.scale = 2.0 + render_config.canvas_width = 1224 + + with pytest.raises(ValueError): + _make_parser(render_config=render_config) + + +def test_get_image_crops_using_page_coordinates(): + render_config = RenderConfig() + render_config.scale = 2.0 + parser = _make_parser(render_config=render_config) + parser.load(SAMPLE_PDF, page_numbers=[1]) + + result = next(parser.iterate_results()) + assert result.success, result.error_message + + cropbox = BoundingBox( + l=10, + t=20, + r=60, + b=90, + coord_origin=CoordOrigin.TOPLEFT, + ) + cropped = result.get_image(scale=2.0, cropbox=cropbox) + + assert cropped.size == ( + round((cropbox.r - cropbox.l) * 2.0), + round((cropbox.b - cropbox.t) * 2.0), + ) + + +def test_render_scale_config_handles_pages_with_different_sizes(tmp_path: Path): + pdf_path = tmp_path / "variable_page_sizes.pdf" + _write_variable_page_size_pdf(pdf_path) + + render_config = RenderConfig() + render_config.scale = 2.0 + + parser = _make_parser(render_config=render_config) + parser.load(pdf_path) + + sizes_by_page: dict[int, tuple[int, int]] = {} + for result in parser.iterate_results(): + assert result.success, result.error_message + image = result.get_image() + sizes_by_page[result.page_number] = image.size + + assert sizes_by_page[1] == (400, 600) + assert sizes_by_page[2] == (800, 1000) + + def test_render_reference_documents_from_filenames(): """Render all regression PDFs and verify parse output against groundtruth.""" pdf_docs = 
sorted(glob.glob(REGRESSION_FOLDER)) From 8174041171c4fca539d0a1a476090debb4c9d86c Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Tue, 28 Apr 2026 15:07:59 +0200 Subject: [PATCH 4/8] fix: address threaded render and unload race issues Signed-off-by: Christoph Auer --- app/pybind_parse.cpp | 5 ++++- docling_parse/pdf_parser.py | 25 +++++++++++-------------- src/pybind/docling_threaded_base.h | 2 +- tests/test_threaded_render.py | 14 ++++++++++---- 4 files changed, 26 insertions(+), 20 deletions(-) diff --git a/app/pybind_parse.cpp b/app/pybind_parse.cpp index 4ab7988c..a7a5da9d 100644 --- a/app/pybind_parse.cpp +++ b/app/pybind_parse.cpp @@ -555,7 +555,10 @@ PYBIND11_MODULE(pdf_parsers, m) { [](pdflib::pdf_decoder& self, const pdflib::render_config& config) -> pybind11::tuple { pdflib::renderer rnd(config); - self.get_instructions().iterate_over_instructions(rnd); + { + pybind11::gil_scoped_release release; + self.get_instructions().iterate_over_instructions(rnd); + } auto canvas = rnd.get_canvas(); const auto& shape = rnd.get_shape(); diff --git a/docling_parse/pdf_parser.py b/docling_parse/pdf_parser.py index 04502e93..8ef8d6c7 100644 --- a/docling_parse/pdf_parser.py +++ b/docling_parse/pdf_parser.py @@ -926,9 +926,7 @@ def __init__( ): self._raw = raw_result self._boundary_type = boundary_type - self._render_config = ( - _copy_render_config(render_config) if render_config is not None else None - ) + self._render_config = render_config self._page: SegmentedPdfPage | None = None self._page_decoder: PdfPageDecoder | None = None self._default_image: PILImage.Image | None = None @@ -994,10 +992,6 @@ def _default_canvas_size(self) -> tuple[int, int]: height, width, _ = self._raw.image_shape return width, height - def _scale_request_supported(self) -> bool: - render_config = self._rendering_config() - return render_config.scale > 0 - def _scale_abs_tolerance(self) -> float: if self.page_width <= 0 or self.page_height <= 0: return 0.0 @@ -1059,6 +1053,8 @@ def 
_crop_image( ) -> PILImage.Image: if cropbox is None: return image + if self.page_width <= 0 or self.page_height <= 0: + return image cropbox_top_left = cropbox.to_top_left_origin(page_height=self.page_height) x_scale = image.width / self.page_width @@ -1087,11 +1083,6 @@ def get_image( if scale is not None: if scale <= 0: raise ValueError(f"scale must be > 0, got {scale}") - if not self._scale_request_supported(): - raise ValueError( - "get_image(scale=...) requires render_config.scale to be set" - ) - render_config = self._rendering_config() if math.isclose( scale, @@ -1147,7 +1138,6 @@ def _copy_decode_config(src: DecodePageConfig) -> DecodePageConfig: def _copy_render_config(src: RenderConfig) -> RenderConfig: - _validate_render_config(src) dst = RenderConfig() dst.render_text = src.render_text dst.draw_text_bbox = src.draw_text_bbox @@ -1176,6 +1166,11 @@ def _validate_render_config(src: RenderConfig) -> None: ) +def _validated_render_config(src: RenderConfig) -> RenderConfig: + _validate_render_config(src) + return _copy_render_config(src) + + class DoclingThreadedPdfParser: """Threaded PDF parser that decodes pages from multiple documents in parallel.""" @@ -1189,7 +1184,9 @@ def __init__( self._parser_config = parser_config if parser_config.render_config is not None: - _validate_render_config(parser_config.render_config) + parser_config.render_config = _validated_render_config( + parser_config.render_config + ) self._decode_config = ( _copy_decode_config(decode_config) if decode_config is not None diff --git a/src/pybind/docling_threaded_base.h b/src/pybind/docling_threaded_base.h index 0b86380e..4756c0dc 100644 --- a/src/pybind/docling_threaded_base.h +++ b/src/pybind/docling_threaded_base.h @@ -392,7 +392,7 @@ namespace docling template void docling_threaded_base::validate_unload_state() const { - if(tasks_remaining.load() > 0 or active_workers.load() > 0) + if(tasks_remaining.load() > 0) { throw std::runtime_error("Cannot unload documents while threaded 
iteration is active"); } diff --git a/tests/test_threaded_render.py b/tests/test_threaded_render.py index 8de83930..b806f0b5 100644 --- a/tests/test_threaded_render.py +++ b/tests/test_threaded_render.py @@ -236,15 +236,21 @@ def test_render_custom_render_config(): assert result.get_image() is not None -def test_get_image_scale_requires_scale_config(): - parser = _make_parser() +def test_get_image_scale_rerenders_for_canvas_config(): + render_config = RenderConfig() + render_config.canvas_width = 1224 + parser = _make_parser(render_config=render_config) parser.load(SAMPLE_PDF, page_numbers=[1]) result = next(parser.iterate_results()) assert result.success, result.error_message - with pytest.raises(ValueError): - result.get_image(scale=2.0) + scaled_image = result.get_image(scale=2.0) + + assert scaled_image.size == ( + round(result.page_width * 2.0), + round(result.page_height * 2.0), + ) def test_get_image_rerenders_non_default_scale(): From e0fb2e276ca732c8051989be7e732d787e55aaec Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Tue, 28 Apr 2026 15:16:36 +0200 Subject: [PATCH 5/8] Update plans Signed-off-by: Christoph Auer --- docs/plans/threaded-api-design.md | 432 +++++++++++++++++------------- docs/plans/update-threaded-api.md | 73 ----- 2 files changed, 241 insertions(+), 264 deletions(-) delete mode 100644 docs/plans/update-threaded-api.md diff --git a/docs/plans/threaded-api-design.md b/docs/plans/threaded-api-design.md index 33e6db5c..b23be024 100644 --- a/docs/plans/threaded-api-design.md +++ b/docs/plans/threaded-api-design.md @@ -1,240 +1,275 @@ # Threaded Parser Public API Design -**Status:** Draft — in iteration -**Date:** 2026-04-24 -**Scope:** `docling-parse` only — docling integration is a separate concern +**Status:** Implemented +**Last updated:** 2026-04-28 +**Scope:** `docling-parse` only + +This document is the consolidated design and behavior reference for the public threaded parser API in `docling-parse`. 
+ +It supersedes the narrower `update-threaded-api.md` plan. The decisions from that follow-up plan are folded in here, and the examples below reflect the current implementation rather than an earlier proposal draft. + +--- + +## Goals + +- Keep the sequential `PdfDocument`-based API stable. +- Provide one public threaded parser entry point for both parse-only and parse-and-render workflows. +- Hide C++ decoder objects from normal Python callers. +- Keep page results typed, lazy, and consistent with the sequential API where possible. +- Support selected-page scheduling and explicit cleanup for multi-document threaded workloads. --- -## Constraints +## Stable constraints -- **Sequential `PdfDocument`-based API is frozen.** No breaking changes to `DoclingPdfParser`, `PdfDocument`, `PdfDocument.get_page()`, `PdfDocument.iterate_pages()`, or any of their signatures. Existing code that uses the sequential path continues to work unchanged. -- **Threaded API may break.** `DoclingThreadedPdfParser`, `PageDecodeResult`, `PdfPageRenderResult`, and `ThreadedPdfParserConfig` can all change. There are no known external users relying on the current threaded API shape. +- The sequential API remains unchanged: + - `DoclingPdfParser` + - `PdfDocument` + - `PdfDocument.get_page()` + - `PdfDocument.iterate_pages()` + - `PdfDocument.get_page_with_timings()` +- The threaded API is the place where the public redesign happened. +- Rendering remains optional and is enabled by configuration, not by switching to a separate public threaded class. --- -## Problems with the current threaded API +## Final public shape -### 1. C++ internals leak into user code +### One threaded parser interface -`DoclingThreadedPdfParser.get_task()` returns a raw `PageDecodeResult` whose `.get()` returns `(PdfPageDecoder, timings_dict)`. `PdfPageDecoder` is a C++ binding object with no documented Python interface. 
Callers must know to call `PdfDocument._to_segmented_page_from_decoder()` on it — a private method not intended for external use. +The public threaded entry point is: -The benchmark works around this with: ```python -dummy_doc = PdfDocument.__new__(PdfDocument) -dummy_doc._boundary_type = PdfPageBoundaryType.CROP_BOX -seg_page = dummy_doc._to_segmented_page_from_decoder(page_decoder, config) +DoclingThreadedPdfParser( + parser_config: ThreadedPdfParserConfig | None = None, + decode_config: DecodePageConfig | None = None, +) ``` -This is a hack that will break silently if `PdfDocument` internals change. -`DoclingThreadedPdfRenderer.get_task()` has the same problem: `PdfPageRenderResult.get()` also returns `(PdfPageDecoder, timings_dict)`. +There is no separate public `DoclingThreadedPdfRenderer` API anymore. Parse-only and parse-and-render share the same Python interface. -The current split also exposes an implementation detail as a public API split. The renderer result is essentially a decoded page result plus an optional rendered image. Users should not have to pick a different threaded class and result type just because they want the image bytes produced during decode. +### Threaded parser configuration -### 2. Conversion logic is private and on the wrong class +```python +class ThreadedPdfParserConfig(BaseModel): + loglevel: str = "fatal" + threads: int = 4 + max_concurrent_results: int = 32 + boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX + render_config: RenderConfig | None = None +``` -`PdfDocument._to_segmented_page_from_decoder()` converts `PdfPageDecoder → SegmentedPdfPage`. Logically this is a pure function: it does not depend on document state, only on `_boundary_type`. It belongs at module level, not as an instance method on `PdfDocument`. +Key points: -### 3. Page numbering inconsistency +- `boundary_type` now has an explicit home in the threaded path. +- `render_config=None` selects parse-only operation. 
+- `render_config` present selects parse-and-render operation. +- `DecodePageConfig` and `RenderConfig` stay separate because they configure different pipeline stages. -`PageDecodeResult.page_number` is **0-indexed**. -`PdfPageRenderResult.page_number` is also **0-indexed**. -`PdfDocument.get_page()` and `iterate_pages()` are **1-indexed**. -Callers of the threaded paths must remember to add 1. This is an unnecessary and error-prone divergence. +### Public result type -### 4. No Pythonic iteration +`get_task()` and `iterate_results()` return `PageParseResult`. -The `has_tasks()` / `get_task()` loop is functional but requires callers to write the same boilerplate every time. The sequential API provides `iterate_pages()`. Neither threaded class has an equivalent. +`PageParseResult` exposes: -### 5. `timings` returned as a raw dict +- `doc_key: str` +- `page_number: int` +- `page_width: float` +- `page_height: float` +- `success: bool` +- `error_message: str` +- `has_image: bool` +- `get_page() -> SegmentedPdfPage` +- `get_timings() -> Timings` +- `get_image(...) -> PIL.Image.Image` -The sequential path exposes the typed `Timings` model (with `.total()`, `.get()`, `.keys()`, etc.). The threaded `get()` returns a plain `dict`. These should be consistent. +Notable behavior: -### 6. `boundary_type` has no home in the threaded path +- `page_number` is 1-indexed, matching the sequential API. +- `get_page()` is lazy and caches the converted `SegmentedPdfPage`. +- `get_timings()` returns the typed `Timings` model, not a raw dict. +- Failed results keep `page_width` and `page_height` at `0.0`, and `get_page()` / `get_image()` raise clearly. -The sequential `DoclingPdfParser.load()` accepts `boundary_type`. There is no way to set it for the threaded parser — the conversion hack requires setting it manually on the dummy `PdfDocument` instance. +--- -### 7. 
No way to query page count before iteration +## Why this design replaced the earlier threaded API -After `parser.load()` / `renderer.load()`, callers have no way to ask how many pages a document has without starting iteration. This is needed by consumers that must pre-allocate structures or define termination conditions before any page arrives. +The old threaded surface had several problems: -### 8. Parser and renderer public APIs are redundant +- It leaked `PdfPageDecoder` into user code. +- It required private `PdfDocument` conversion helpers to turn results into `SegmentedPdfPage`. +- It used 0-indexed page numbers, unlike the sequential API. +- It split parsing and rendering into redundant public threaded classes. +- It returned raw timing dicts instead of `Timings`. +- It had no first-class selected-page scheduling or unload lifecycle on the Python API. -The public distinction between `DoclingThreadedPdfParser` and `DoclingThreadedPdfRenderer` is not strong enough to justify two APIs. Rendering does not produce a fundamentally different page outcome; it produces the same decoded page outcome with an additional optional image artifact. This should be represented as one threaded parser interface whose configuration decides whether page images are produced. +The implemented design resolves those issues without changing the sequential parser contract. -This keeps user code stable when a workflow later starts needing page images: users change config and start calling `get_image()`, not swap classes, result types, and import paths. +--- -On the C++ side this is already mostly true structurally. `docling_threaded_parser` and `docling_threaded_renderer` both inherit from the same `docling_threaded_base`, and their worker loops perform the same document lookup, page decoder construction, `decode_page(config)`, optional word-cell creation, optional line-cell creation, result queueing, and error handling. 
The renderer adds only this extra step after decoding: +## Conversion model -```cpp -pdflib::renderer rnd(render_cfg); -page_decoder->get_instructions().iterate_over_instructions(rnd); +The canonical conversion helper is now the public module-level function: -result.image_data = rnd.get_canvas(); -result.image_shape = rnd.get_shape(); +```python +segmented_page_from_decoder( + page_decoder: PdfPageDecoder, + boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX, +) -> SegmentedPdfPage ``` -So the current second backend is not a fundamentally different threading model. It is the same threaded decode pipeline with an optional render stage and a wider result payload. +This is used by both the sequential and threaded paths. ---- +`PdfDocument._to_segmented_page_from_decoder()` still exists as a thin wrapper for internal sequential use, but threaded callers no longer need any private `PdfDocument` methods. -## Proposed changes +--- -### A. Public module-level conversion function +## Document loading and scheduling -Extract `PdfDocument._to_segmented_page_from_decoder` into a public, standalone function: +### Loading ```python -# docling_parse/pdf_parser.py -def segmented_page_from_decoder( - page_decoder: PdfPageDecoder, - boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX, -) -> SegmentedPdfPage: - """Convert a C++ PdfPageDecoder to a SegmentedPdfPage. +doc_key = parser.load( + path_or_stream, + password: str | None = None, + page_numbers: Sequence[int] | None = None, +) +``` + +Behavior: + +- `page_numbers` is optional. +- When provided, it is interpreted as 1-indexed physical page numbers. +- The C++ layer normalizes the scheduled subset by sorting and de-duplicating it. +- Out-of-range page numbers raise a `RuntimeError`. +- The returned `doc_key` is the routing key for later results and metadata queries. + +### Page counts - This is the single canonical conversion point for both the sequential and - threaded parse paths. 
PdfDocument._to_segmented_page_from_decoder() becomes - a thin wrapper calling this function. +Two count queries are available immediately after `load()`: - Note: DecodePageConfig is applied by the C++ decoder before this function - is called; there is nothing left to configure at the Python conversion stage. - """ - ... +```python +page_count(doc_key) -> int +scheduled_page_count(doc_key) -> int ``` -`PdfDocument._to_segmented_page_from_decoder()` delegates to this function, so the sequential path is untouched. +Semantics: + +- `page_count(doc_key)` is the physical page count of the loaded document. +- `scheduled_page_count(doc_key)` is the number of pages that will actually be emitted by the threaded parser for that document. + +This distinction matters when `page_numbers` is used. --- -### B. Configuration controls parse-only vs parse-and-render +## Result delivery model -Keep one public threaded parser interface. Configuration, not the class name, decides whether page images are rendered. +### Completion order -```python -class ThreadedPdfParserConfig(BaseModel): - loglevel: str = "fatal" - threads: int = 4 - max_concurrent_results: int = 32 - boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX # new - render_config: RenderConfig | None = None -``` +`iterate_results()` yields results in completion order, not page-number order. + +If callers need in-order processing, they should collect by `page_number` and sort after consumption. + +### Manual vs iterator control -When `render_config is None`, `DoclingThreadedPdfParser` uses the parse-only backend. When `render_config` is provided, it uses the threaded render backend internally and surfaces the same `PageParseResult` type with image access enabled. +The threaded parser intentionally exposes both: -`DecodePageConfig` remains the decode configuration. Rendering should be activated by supplying `RenderConfig`, because render options such as canvas width and drawing flags already belong there. 
The key API point is that parse-only versus parse-and-render is a configuration choice on one threaded parser interface, not a separate public parser class. +- `has_tasks()` +- `get_task()` +- `iterate_results()` -Keep `DecodePageConfig` and `RenderConfig` as distinct types. They describe different pipeline stages: +`has_tasks()` is not deprecated. It remains the manual-control escape hatch. -- `DecodePageConfig` controls what is extracted from the PDF and how decoded page content is normalized: page boundary, sanitization, keeping chars/shapes/bitmaps, word and line cell creation, threading safety, glyph/debug retention, and related merge tolerances. -- `RenderConfig` controls how an already decoded page is rasterized: whether to draw text, whether to draw text bounding boxes, font resolution behavior, font matching cutoff, and target canvas dimensions. +Important runtime detail: -Merging render fields into `DecodePageConfig` would make parse-only callers carry rasterization settings that do not affect decoding, and it would blur the contract of `DecodePageConfig` in the frozen sequential parser API. The better shape is a composed threaded execution config: decoding remains configured by `DecodePageConfig`; rendering remains configured by `RenderConfig`; the threaded parser config decides whether a render stage is enabled. +- The first call to `has_tasks()` starts the threaded work by building the task queue and launching workers. +- `iterate_results()` simply loops on `has_tasks()` and `get_task()`. --- -### C. Typed result object: `PageParseResult` +## Cleanup and unload behavior -Replace both raw `PageDecodeResult` and `PdfPageRenderResult` with a clean Python class. `PdfPageDecoder` never appears in user-facing code — the conversion happens inside `get_page()`. 
+The threaded parser now has explicit lifecycle cleanup: ```python -class PageParseResult: - """Outcome of one page processed by DoclingThreadedPdfParser.""" - - doc_key: str # document identifier returned by .load() - page_number: int # 1-indexed — consistent with the sequential API - page_width: float # page width in points (from boundary box; cheap, no full conversion needed) - page_height: float - success: bool - - def get_page(self) -> SegmentedPdfPage: - """Return the parsed page. Lazy: converts on first call, caches the result. - - Calls segmented_page_from_decoder() internally using the boundary_type - from the parser that produced this result. - Raises RuntimeError if success is False. - """ - ... - - def get_timings(self) -> Timings: - """Return structured timing data for this page parse.""" - ... - - def get_image(self) -> PILImage.Image: - """Return the rendered page image. - - Raises RuntimeError if this result was produced with rendering disabled - or if success is False. - """ - ... - - @property - def has_image(self) -> bool: - """Whether get_image() can return a rendered image for this result.""" - ... - - @property - def error_message(self) -> str: - """Error description; empty string when successful.""" - ... +unload(doc_key: str) -> bool +unload_all() -> None ``` -`page_width` and `page_height` are extracted from `page_decoder.get_page_dimension()` without triggering the full `SegmentedPdfPage` conversion. Dimension decoding is a distinct internal step (see `TIMING_KEY_DECODE_DIMENSIONS`) and the data is available on the decoder object as soon as `get_task()` returns. +Semantics: -`get_page()` is **lazy**: it converts on first call and caches the result. This keeps conversion cost on the worker/consumer thread rather than on the task-delivery path, and avoids wasted work on error paths where `get_page()` is never called. +- `unload(doc_key)` removes one loaded document after threaded processing has completed. 
+- `unload_all()` clears all loaded documents after threaded processing has completed. +- Python-side count bookkeeping is cleared together with the underlying parser state. +- `unload(doc_key)` is idempotent after successful consumption: + - first unload returns `True` + - unloading the same key again returns `False` +- Unloading during active threaded iteration raises a clear runtime error. -`get_image()` is available on the same result type but only succeeds when the parser was configured with `render_config`. A parse-only result has `has_image == False` and raises a clear `RuntimeError` from `get_image()`. This makes misuse fail loudly while keeping the page result type uniform. +The current implementation defines "safe to unload" by checking whether results remain to be consumed, not whether worker threads have fully wound down. That matches the intended public contract: unloading should succeed once result consumption is complete. --- -### D. Iterator API on `DoclingThreadedPdfParser` +## Image rendering model -```python -class DoclingThreadedPdfParser: - - def page_count(self, doc_key: str) -> int: - """Return the total page count for a loaded document. +Rendering is available only when the parser was created with `parser_config.render_config`. - Available immediately after load(), before iteration begins. - """ - ... +For parse-only results: - def iterate_results(self) -> Iterator[PageParseResult]: - """Yield page results as they complete. +- `has_image` is `False` +- `get_image(...)` raises `RuntimeError` - Pages are returned in COMPLETION ORDER, not page-number order. - Worker threads start on the first call (same as has_tasks()). +For parse-and-render results: - Use result.page_number and result.doc_key to route results. - To process in page order, collect into a dict keyed by page_number - and sort after iteration is complete. 
- """ - while self.has_tasks(): - yield self.get_task() +- the default render is produced during threaded parsing +- the image is exposed lazily through `get_image(...)` - def get_task(self) -> PageParseResult: # return type changes - """Block until the next result is available and return it.""" - ... +### `get_image(...)` signature - # has_tasks() is unchanged — stays for callers needing manual control +```python +get_image( + scale: float | None = None, + canvas_size: tuple[int, int] | None = None, + cropbox: BoundingBox | None = None, +) -> PIL.Image.Image ``` ---- +### Supported behavior -### E. Remove the separate threaded renderer API +- `scale` and `canvas_size` are mutually exclusive. +- Calling `get_image()` with no arguments returns the default pre-rendered image. +- Calling `get_image(scale=...)` performs a true rerender from the retained `PdfPageDecoder` when needed. +- Calling `get_image(canvas_size=...)` rerenders to the requested canvas size when needed. +- Calling `get_image(..., cropbox=...)` crops in Python after full-page rendering. -Do not introduce a second primary public interface for rendering. `DoclingThreadedPdfParser` should select the existing C++ threaded parser or renderer implementation internally based on `ThreadedPdfParserConfig.render_config`. +### Important decisions reflected in the implementation -Longer term, the C++ implementation can also be collapsed into one threaded worker implementation with an optional render stage. That would remove the duplicated worker-loop logic and keep the only behavioral branch close to the actual difference: whether `RenderConfig` is present. +- `get_image(scale=...)` is allowed whenever `render_config` is present. +- It is not restricted to cases where the original `render_config` used `scale`. +- A caller may configure the threaded parser with `canvas_width` / `canvas_height` and later request `get_image(scale=2.0)`. 
+- Non-default scale requests rerender from the decoder; they do not resize the existing default bitmap. -Remove `DoclingThreadedPdfRenderer`, `PdfPageRenderResult`, and `ThreadedPdfRendererConfig` as part of the threaded API break. There is no stable public interface for the threaded component yet, so keeping deprecated aliases would add compatibility surface without protecting a real external contract. +### Crop semantics -Documentation and examples should point users to `DoclingThreadedPdfParser` only. +- `cropbox` is specified in page coordinates. +- Cropping is done in Python against the rendered full-page image. +- Page-coordinate conversion uses the page height and rendered image dimensions. +- Degenerate page dimensions are handled defensively by returning the uncropped image rather than dividing by zero. ---- +### Cache behavior + +- The default full-page image is cached lazily per `PageParseResult`. +- Requests matching the default render can reuse that cached image. +- Rerendered `scale` and `canvas_size` requests are generated on demand from the decoder. +- There is no aggressive per-scale or per-crop cache inside `docling-parse`. -## Resulting user-facing API +### Thread efficiency -**Parse only (no images):** +The expensive C++ rerender path used by `PageParseResult.get_image(scale=...)` / `get_image(canvas_size=...)` releases the Python GIL during instruction replay, matching the threaded API's performance goals. 
+ +--- + +## Parse-only example ```python from docling_parse.pdf_parser import DoclingThreadedPdfParser, ThreadedPdfParserConfig @@ -243,23 +278,30 @@ from docling_parse.pdf_parsers import DecodePageConfig decode_config = DecodePageConfig() decode_config.create_line_cells = True -parser_config = ThreadedPdfParserConfig(threads=4, max_concurrent_results=32) -parser = DoclingThreadedPdfParser(parser_config=parser_config, decode_config=decode_config) +parser = DoclingThreadedPdfParser( + parser_config=ThreadedPdfParserConfig(threads=4), + decode_config=decode_config, +) -doc_key = parser.load(path) -total = parser.page_count(doc_key) +doc_key = parser.load(path, page_numbers=[1, 3, 5]) +total_pages = parser.page_count(doc_key) +scheduled_pages = parser.scheduled_page_count(doc_key) for result in parser.iterate_results(): - if result.success: - seg_page = result.get_page() # SegmentedPdfPage, lazy - size = (result.page_width, result.page_height) # available without get_page() - else: - print(f"p{result.page_number} ERROR: {result.error_message}") + if not result.success: + print(f"{result.doc_key} p{result.page_number}: {result.error_message}") + continue + + page = result.get_page() + size = (result.page_width, result.page_height) ``` -**Parse and render (with images):** +--- + +## Parse-and-render example ```python +from docling_core.types.doc.base import BoundingBox, CoordOrigin from docling_parse.pdf_parser import DoclingThreadedPdfParser, ThreadedPdfParserConfig from docling_parse.pdf_parsers import DecodePageConfig, RenderConfig @@ -267,51 +309,59 @@ render_config = RenderConfig() render_config.canvas_width = 1024 parser = DoclingThreadedPdfParser( - parser_config=ThreadedPdfParserConfig(threads=4, render_config=render_config), + parser_config=ThreadedPdfParserConfig( + threads=4, + render_config=render_config, + ), decode_config=DecodePageConfig(), ) doc_key = parser.load(path) -total = parser.page_count(doc_key) for result in parser.iterate_results(): - if 
result.success: - seg_page = result.get_page() # SegmentedPdfPage - image = result.get_image() # PIL RGBA Image - else: - print(f"p{result.page_number} ERROR: {result.error_message}") -``` - -**In-order collection (when page order matters):** - -```python -pages: dict[int, SegmentedPdfPage] = {} -for result in parser.iterate_results(): - if result.success: - pages[result.page_number] = result.get_page() - -for page_no in sorted(pages): - process(pages[page_no]) + if not result.success: + continue + + page = result.get_page() + default_image = result.get_image() + scaled_image = result.get_image(scale=2.0) + cropped = result.get_image( + scale=2.0, + cropbox=BoundingBox( + l=10, + t=20, + r=60, + b=90, + coord_origin=CoordOrigin.TOPLEFT, + ), + ) ``` --- -## Sequential path — unchanged +## Sequential path remains unchanged + +No signatures or semantics were changed for the sequential parser stack. -The following remain exactly as-is. No signature changes, no behaviour changes: +That includes: - `DoclingPdfParser` - `PdfDocument` -- `PdfDocument.get_page(page_no, *, config)` -- `PdfDocument.iterate_pages(*, config)` -- `PdfDocument.get_page_with_timings(page_no, *, config)` -- All `Timings`, `PdfAnnotations`, `PdfTocEntry` models +- existing `PdfDocument` page access methods +- existing typed models such as `Timings` -`PdfDocument._to_segmented_page_from_decoder()` stays as a private method (it will delegate to the new public `segmented_page_from_decoder()` function internally). External callers should migrate to using `PageParseResult.get_page()` instead. +The threaded redesign was intentionally isolated from the sequential API. --- -## Resolved questions - -- **`iterate_results()` timeout?** Decided no — the caller's concern. The `has_tasks()` / `get_task()` escape hatch exists for manual control. 
-- **`render_config` on `ThreadedPdfParserConfig` or as a constructor argument?** Decided on `ThreadedPdfParserConfig`: rendering is a threaded execution mode, while `DecodePageConfig` remains focused on decoded page content. Implemented. +## Summary of implemented decisions + +- One public threaded parser interface, not separate parser and renderer APIs. +- Typed `PageParseResult` objects instead of raw decoder-centric result objects. +- Public `segmented_page_from_decoder(...)` as the canonical conversion entry point. +- 1-indexed threaded `page_number`. +- `boundary_type` configured on `ThreadedPdfParserConfig`. +- `page_count()` plus `scheduled_page_count()` for subset-aware scheduling. +- `unload()` and `unload_all()` as explicit threaded lifecycle cleanup. +- `get_image(scale=...)`, `get_image(canvas_size=...)`, and Python-side `cropbox` support on `PageParseResult`. +- True rerendering from the retained decoder for non-default render requests. diff --git a/docs/plans/update-threaded-api.md b/docs/plans/update-threaded-api.md deleted file mode 100644 index fd457339..00000000 --- a/docs/plans/update-threaded-api.md +++ /dev/null @@ -1,73 +0,0 @@ -# `docling-parse` Upstream Plan for `docling_release` Threaded Backend - -## Summary - -Prepare `docling-parse` so `docling_release` can later build a `ThreadedDoclingParse...Backend` on top of the existing threaded public API. - -This upstream pass should address three concrete gaps: - -- public cleanup for loaded threaded documents -- selected-page scheduling at load time -- backend-style page image rendering from `PageParseResult`, with `scale` and Python-side `cropbox` support - -## Key Changes - -### 1. Add selected-page scheduling to `DoclingThreadedPdfParser.load(...)` -- Extend `load(...)` with optional `page_numbers: Sequence[int] | None = None`. -- Treat `page_numbers` as 1-indexed physical page numbers. -- Normalize at load time and reject out-of-range values. 
-- Store the selected subset per loaded document and build the threaded task queue from that subset instead of all pages. -- Keep `page_count(doc_key)` as the physical document page count. -- Add `scheduled_page_count(doc_key) -> int` for the number of pages that will actually be emitted. - -### 2. Add public lifecycle cleanup to the threaded parser -- Add `unload(doc_key: str) -> bool`. -- Add `unload_all() -> None`. -- Clear document storage plus Python-side bookkeeping for page counts and selected-page subsets. -- Make unload idempotent after processing is complete. -- Do not add mid-stream cancellation in this pass; unloading during active iteration should raise a clear error. - -### 3. Extend `PageParseResult.get_image(...)` -- Change `PageParseResult.get_image()` to accept: - - `scale: float = 1.0` - - `cropbox: ... | None = None` -- Keep the no-argument behavior compatible with today’s render-config mode. -- Keep current gating: `get_image(...)` only works when the threaded parser was configured with `parser_config.render_config`; parse-only results still fail clearly. -- Implement true rerendering from the retained `PdfPageDecoder` for scaled requests. -- Do not implement scaled output by resizing the existing pre-rendered image. - -### 4. Keep cropping in Python, not C++ -- Do not add crop-aware rendering to the C++ layer in this pass. -- `get_image(scale=..., cropbox=...)` should: - - render the full page at the requested scale - - crop the rendered PIL image in Python -- The cropbox contract should match the current `docling_release` expectations: page-coordinate crop input, converted in Python against the rendered page size. -- This keeps semantics aligned with current page-image caching in `docling_release` while avoiding immediate C++ rendering changes. - -### 5. Cache policy for threaded result images -- Keep `get_image(...)` lazy. -- Preserve the existing pre-rendered full-page image as a fast path for the default full-page request when available. 
-- For non-default `scale`, rerender from the decoder. -- For `cropbox`, crop from the full-page image at the requested scale in Python. -- Do not require aggressive per-crop caching in `docling-parse`; `docling_release` already caches full-page images by scale. - -## Test Plan - -- Full-document threaded loads still emit all pages with correct 1-indexed `page_number`. -- `load(..., page_numbers=[...])` emits only the selected physical pages. -- `page_count(doc_key)` returns the full document count; `scheduled_page_count(doc_key)` returns the subset count. -- Invalid, duplicate, and unsorted page-number inputs are handled deterministically. -- Multi-document threaded parsing works with different subsets per document. -- `unload(doc_key)` succeeds after consumption, is idempotent, and removes the document from lookup state. -- `unload()` during active iteration raises the documented error. -- `get_image()` with no arguments still works in render-config mode. -- `get_image(scale=...)` produces a true rerender at the requested scale. -- `get_image(scale=..., cropbox=...)` returns the correct crop from the full-page rendered image at that scale. -- Repeated default full-page requests can reuse the pre-rendered fast path; scaled requests rerender from the decoder. - -## Assumptions and Defaults - -- Sequential `DoclingPdfParser` / `PdfDocument` APIs stay unchanged. -- No mid-stream cancellation is added in this pass. -- `docling_release` will continue to manage page-level image caching by scale on its side. -- Other `docling_release` pipelines may still fail against the draft threaded backend; this upstream work is specifically to unblock the later threaded PDF backend integration. 
From b7e1faa02c42284abb2e5cd85c2a3482664ace7b Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Mon, 11 May 2026 12:21:23 +0200 Subject: [PATCH 6/8] Hide raw threaded pybind parser types behind internal names Signed-off-by: Christoph Auer --- app/pybind_parse.cpp | 30 +++++++++++++++--------------- docling_parse/pdf_parser.py | 8 ++++---- tests/test_threaded_parse.py | 8 ++++++++ 3 files changed, 27 insertions(+), 19 deletions(-) diff --git a/app/pybind_parse.cpp b/app/pybind_parse.cpp index 38322b5b..7b2eff3f 100644 --- a/app/pybind_parse.cpp +++ b/app/pybind_parse.cpp @@ -822,10 +822,10 @@ PYBIND11_MODULE(pdf_parsers, m) { // ============= Threaded PDF Parser ============= - // PageDecodeResult - result of a threaded page decode task - pybind11::class_(m, "PageDecodeResult", + // _PageDecodeResult - internal result of a threaded page decode task + pybind11::class_(m, "_PageDecodeResult", R"( - Result of a threaded page decoding task. + Internal result of a threaded page decoding task. Attributes: doc_key (str): The document key this page belongs to. @@ -862,10 +862,10 @@ PYBIND11_MODULE(pdf_parsers, m) { Returns: str: The error message.)"); - // threaded_pdf_parser - parallel PDF parser with bounded result queue - pybind11::class_(m, "threaded_pdf_parser", + // _threaded_pdf_parser - internal parallel PDF parser with bounded result queue + pybind11::class_(m, "_threaded_pdf_parser", R"( - Threaded PDF parser that processes pages in parallel. + Internal threaded PDF parser that processes pages in parallel. Loads multiple documents and decodes their pages using a thread pool. Results are available via a bounded queue to control memory usage. @@ -998,7 +998,7 @@ PYBIND11_MODULE(pdf_parsers, m) { Blocks until a result is available. Releases the GIL while waiting. 
Returns: - PageDecodeResult: The result of a page decoding task.)"); + _PageDecodeResult: The result of a page decoding task.)"); // ============= Threaded PDF Renderer ============= @@ -1029,12 +1029,12 @@ PYBIND11_MODULE(pdf_parsers, m) { .def_readwrite("canvas_width", &pdflib::render_config::canvas_width) .def_readwrite("canvas_height", &pdflib::render_config::canvas_height); - // PageRenderResult - result of a threaded page render task - pybind11::class_(m, "PageRenderResult", + // _PageRenderResult - internal result of a threaded page render task + pybind11::class_(m, "_PageRenderResult", R"( - Result of a threaded page rendering task. + Internal result of a threaded page rendering task. - Inherits all attributes of PageDecodeResult and adds rendered image data. + Inherits all attributes of _PageDecodeResult and adds rendered image data. Attributes: image_data: Raw RGBA bytes of the rendered page (height x width x 4, row-major). @@ -1062,10 +1062,10 @@ PYBIND11_MODULE(pdf_parsers, m) { Returns: bytes: Raw RGBA pixel data, or empty bytes on failure.)"); - // threaded_pdf_renderer - parallel PDF renderer with bounded result queue - pybind11::class_(m, "threaded_pdf_renderer", + // _threaded_pdf_renderer - internal parallel PDF renderer with bounded result queue + pybind11::class_(m, "_threaded_pdf_renderer", R"( - Threaded PDF renderer that decodes and renders pages in parallel. + Internal threaded PDF renderer that decodes and renders pages in parallel. Loads multiple documents and renders their pages using a thread pool. Each result contains both the decoded page data and the rendered RGBA image. @@ -1150,5 +1150,5 @@ PYBIND11_MODULE(pdf_parsers, m) { Blocks until a result is available. Releases the GIL while waiting. 
Returns: - PageRenderResult: The result of a page rendering task.)"); + _PageRenderResult: The result of a page rendering task.)"); } diff --git a/docling_parse/pdf_parser.py b/docling_parse/pdf_parser.py index 8ef8d6c7..19f30cbd 100644 --- a/docling_parse/pdf_parser.py +++ b/docling_parse/pdf_parser.py @@ -66,8 +66,8 @@ get_static_timing_keys, is_static_timing_key, pdf_parser, # type: ignore[import] - threaded_pdf_parser, # type: ignore[import] - threaded_pdf_renderer, # type: ignore[import] + _threaded_pdf_parser, # type: ignore[import] + _threaded_pdf_renderer, # type: ignore[import] ) # Configure logging @@ -1197,14 +1197,14 @@ def __init__( self._scheduled_page_counts: Dict[str, int] = {} if parser_config.render_config is None: - self._parser = threaded_pdf_parser( + self._parser = _threaded_pdf_parser( loglevel=parser_config.loglevel, num_threads=parser_config.threads, max_concurrent_results=parser_config.max_concurrent_results, config=self._decode_config, ) else: - self._parser = threaded_pdf_renderer( + self._parser = _threaded_pdf_renderer( loglevel=parser_config.loglevel, num_threads=parser_config.threads, max_concurrent_results=parser_config.max_concurrent_results, diff --git a/tests/test_threaded_parse.py b/tests/test_threaded_parse.py index fa892052..bd572bea 100644 --- a/tests/test_threaded_parse.py +++ b/tests/test_threaded_parse.py @@ -7,6 +7,7 @@ import pytest from docling_core.types.doc.page import PdfPageBoundaryType, SegmentedPdfPage +from docling_parse import pdf_parsers from docling_parse.pdf_parser import ( DecodePageConfig, DoclingPdfParser, @@ -32,6 +33,13 @@ def _make_decode_config() -> DecodePageConfig: return config +def test_threaded_raw_pybind_types_are_internal(): + assert not hasattr(pdf_parsers, "PageDecodeResult") + assert not hasattr(pdf_parsers, "threaded_pdf_parser") + assert not hasattr(pdf_parsers, "PageRenderResult") + assert not hasattr(pdf_parsers, "threaded_pdf_renderer") + + def 
test_threaded_reference_documents_from_filenames(): """Load all regression PDFs, decode all pages in parallel, and verify against groundtruth.""" pdf_docs = sorted(glob.glob(REGRESSION_FOLDER)) From 6b681d3236f073fb6d6803caba3abbb56d376b24 Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Mon, 11 May 2026 13:38:36 +0200 Subject: [PATCH 7/8] Remove DoclingPdfRenderer (deprecation) Signed-off-by: Christoph Auer --- docling_parse/pdf_parser.py | 125 +-------------------- tests/test_renderer.py | 209 ------------------------------------ 2 files changed, 2 insertions(+), 332 deletions(-) delete mode 100644 tests/test_renderer.py diff --git a/docling_parse/pdf_parser.py b/docling_parse/pdf_parser.py index 19f30cbd..6bf92c65 100644 --- a/docling_parse/pdf_parser.py +++ b/docling_parse/pdf_parser.py @@ -62,12 +62,12 @@ DecodePageConfig, # type: ignore[import] PdfPageDecoder, # type: ignore[import] RenderConfig, # type: ignore[import] + _threaded_pdf_parser, # type: ignore[import] + _threaded_pdf_renderer, # type: ignore[import] get_decode_page_timing_keys, get_static_timing_keys, is_static_timing_key, pdf_parser, # type: ignore[import] - _threaded_pdf_parser, # type: ignore[import] - _threaded_pdf_renderer, # type: ignore[import] ) # Configure logging @@ -1319,124 +1319,3 @@ def get_task(self) -> "PageParseResult": render_config=self._parser_config.render_config, ) - -class PdfRenderDocument: - def __init__( - self, - *, - path_or_stream: Union[Path, bytes], - parser_doc: PdfDocument, - parser_config: ThreadedPdfParserConfig, - decode_config: DecodePageConfig, - password: str | None = None, - ): - self._path_or_stream = path_or_stream - self._parser_doc = parser_doc - self._parser_config = parser_config - self._decode_config = decode_config - self._password = password - self._pages: Dict[int, PageParseResult] = {} - - def _make_renderer(self) -> "DoclingThreadedPdfParser": - return DoclingThreadedPdfParser( - parser_config=self._parser_config, - 
decode_config=self._decode_config, - ) - - def _load_source(self, renderer: "DoclingThreadedPdfParser") -> str: - if isinstance(self._path_or_stream, Path): - return renderer.load(self._path_or_stream, password=self._password) - - return renderer.load(BytesIO(self._path_or_stream), password=self._password) - - def _render_all_pages(self) -> None: - if len(self._pages) == self.number_of_pages(): - return - - renderer = self._make_renderer() - key = self._load_source(renderer) - - while renderer.has_tasks(): - result = renderer.get_task() - if result.doc_key != key: - continue - if not result.success: - raise RuntimeError( - f"Failed to render page {result.page_number}: {result.error_message}" - ) - self._pages[result.page_number] = result - - def number_of_pages(self) -> int: - return self._parser_doc.number_of_pages() - - def get_page(self, page_no: int) -> PageParseResult: - if not (1 <= page_no <= self.number_of_pages()): - raise ValueError( - f"incorrect page_no: {page_no} (min:1, max:{self.number_of_pages()})" - ) - - if page_no not in self._pages: - self._render_all_pages() - - return self._pages[page_no] - - def iterate_pages(self) -> Iterator[Tuple[int, PageParseResult]]: - self._render_all_pages() - for page_no in range(1, self.number_of_pages() + 1): - yield page_no, self._pages[page_no] - - def unload(self) -> bool: - self._pages.clear() - return self._parser_doc.unload() - - -class DoclingPdfRenderer: - def __init__( - self, - loglevel: str = "fatal", - decode_config: DecodePageConfig | None = None, - render_config: RenderConfig | None = None, - ): - self._loglevel = loglevel - self._parser = DoclingPdfParser(loglevel=loglevel) - self._decode_config = decode_config or DecodePageConfig() - self._render_config = render_config or RenderConfig() - - def load( - self, - path_or_stream: Union[str, Path, BytesIO], - lazy: bool = True, - boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX, - password: str | None = None, - ) -> PdfRenderDocument: - 
parser_doc = self._parser.load( - path_or_stream=path_or_stream, - lazy=lazy, - boundary_type=boundary_type, - password=password, - ) - - if isinstance(path_or_stream, str): - source: Union[Path, bytes] = Path(path_or_stream) - elif isinstance(path_or_stream, Path): - source = path_or_stream - elif isinstance(path_or_stream, BytesIO): - source = path_or_stream.getvalue() - else: - raise TypeError( - f"Expected str, Path, or BytesIO, got {type(path_or_stream)}" - ) - - return PdfRenderDocument( - path_or_stream=source, - parser_doc=parser_doc, - parser_config=ThreadedPdfParserConfig( - loglevel=self._loglevel, - threads=1, - max_concurrent_results=1, - boundary_type=boundary_type, - render_config=self._render_config, - ), - decode_config=self._decode_config, - password=password, - ) diff --git a/tests/test_renderer.py b/tests/test_renderer.py deleted file mode 100644 index e40ac09f..00000000 --- a/tests/test_renderer.py +++ /dev/null @@ -1,209 +0,0 @@ -#!/usr/bin/env python -import glob -import hashlib -import json -import os -from pathlib import Path -from typing import Any - -from docling_parse.pdf_parser import ( - DecodePageConfig, - DoclingPdfRenderer, - PdfRenderDocument, -) - -GENERATE = True -RENDER_INSTRUCTION_EPS = 0.005 - -GROUNDTRUTH_RENDERER_FOLDER = "tests/data/groundtruth_renderer" -REGRESSION_FOLDER = "tests/data/regression/*.pdf" - -PAGE_RESTRICTIONS = { - "deep-mediabox-inheritance.pdf": [2], - "font_06.pdf": [1], - "font_07.pdf": [1], - "font_08.pdf": [1], - "font_09.pdf": [1], - "font_10.pdf": [1], -} - -BITMAP_RESTRICTIONS = { - "indexed_iccbased.pdf": { - 1: [1, 5, 10, 15], - }, -} -MAX_BITMAPS_PER_PAGE = 5 - - -def _round_floats(obj, ndigits=3): - if isinstance(obj, float): - return round(obj, ndigits) - if isinstance(obj, dict): - return {k: _round_floats(v, ndigits) for k, v in obj.items()} - if isinstance(obj, list): - return [_round_floats(v, ndigits) for v in obj] - return obj - - -def _assert_json_matches_with_float_delta( - expected: 
Any, actual: Any, eps: float, path: str = "root" -) -> None: - if isinstance(expected, bool) or isinstance(actual, bool): - assert expected == actual, f"{path}: {expected!r} != {actual!r}" - return - - if isinstance(expected, float): - assert isinstance(actual, (int, float)), ( - f"{path}: expected float, got {type(actual).__name__}" - ) - assert abs(expected - float(actual)) <= eps, ( - f"{path}: abs({expected} - {actual}) > {eps}" - ) - return - - if isinstance(expected, dict): - assert isinstance(actual, dict), ( - f"{path}: expected dict, got {type(actual).__name__}" - ) - assert expected.keys() == actual.keys(), f"{path}: key mismatch" - for key in expected: - _assert_json_matches_with_float_delta( - expected[key], actual[key], eps, path=f"{path}.{key}" - ) - return - - if isinstance(expected, list): - assert isinstance(actual, list), ( - f"{path}: expected list, got {type(actual).__name__}" - ) - assert len(expected) == len(actual), f"{path}: length mismatch" - for idx, (expected_item, actual_item) in enumerate(zip(expected, actual)): - _assert_json_matches_with_float_delta( - expected_item, actual_item, eps, path=f"{path}[{idx}]" - ) - return - - assert expected == actual, f"{path}: {expected!r} != {actual!r}" - - -def _page_prefix(pdf_name: str, page_no: int) -> Path: - return Path(GROUNDTRUTH_RENDERER_FOLDER) / f"{pdf_name}.page_no_{page_no}" - - -def _instruction_path(pdf_name: str, page_no: int) -> Path: - return Path(f"{_page_prefix(pdf_name, page_no)}.instructions.json") - - -def _bitmap_json_path(pdf_name: str, page_no: int, bitmap_index: int) -> Path: - return Path(f"{_page_prefix(pdf_name, page_no)}.bitmap_{bitmap_index}.json") - - -def _full_page_png_path(pdf_name: str, page_no: int) -> Path: - return Path(f"{_page_prefix(pdf_name, page_no)}.full_page.png") - - -def _write_json(path: Path, payload) -> None: - path.parent.mkdir(parents=True, exist_ok=True) - with open(path, "w", encoding="utf-8") as fw: - json.dump(_round_floats(payload), fw, 
indent=2) - - -def _load_json(path: Path): - with open(path, encoding="utf-8") as fr: - return json.load(fr) - - -def _artifact_basename( - pdf_name: str, page_no: int, bitmap_index: int, extension: str -) -> str: - return f"{pdf_name}.page_no_{page_no}.bitmap_{bitmap_index}{extension}" - - -def _selected_bitmap_indices(pdf_name: str, page_no: int, num_bitmaps: int) -> set[int]: - restricted = BITMAP_RESTRICTIONS.get(pdf_name, {}).get(page_no) - - if restricted is None: - return set(range(1, min(num_bitmaps, MAX_BITMAPS_PER_PAGE) + 1)) - - return set(restricted[:MAX_BITMAPS_PER_PAGE]) - - -def _export_or_verify_bitmaps(pdf_name: str, page_no: int, bitmaps) -> None: - selected = _selected_bitmap_indices(pdf_name, page_no, len(bitmaps)) - - for bitmap_index, bitmap in enumerate(bitmaps, start=1): - if bitmap_index not in selected: - continue - - raw_sha256 = hashlib.sha256(bitmap["raw_data"]).hexdigest() - extension = bitmap["extension"] - artifact_name = _artifact_basename(pdf_name, page_no, bitmap_index, extension) - artifact_path = Path(GROUNDTRUTH_RENDERER_FOLDER) / artifact_name - sidecar_path = _bitmap_json_path(pdf_name, page_no, bitmap_index) - - sidecar = { - "index": bitmap["index"], - "xobject_key": bitmap["xobject_key"], - "shape": bitmap["shape"], - "pixel_format": bitmap["pixel_format"], - "image_mask": bitmap["image_mask"], - "rgb_filling": bitmap["rgb_filling"], - "quad": bitmap["quad"], - "exported_filename": artifact_name, - "raw_sha256": raw_sha256, - } - - if GENERATE or (not sidecar_path.exists()) or (not artifact_path.exists()): - _write_json(sidecar_path, sidecar) - with open(artifact_path, "wb") as fw: - fw.write(bitmap["encoded_data"]) - continue - - true_sidecar = _load_json(sidecar_path) - assert true_sidecar == _round_floats(sidecar), ( - f"bitmap metadata mismatch for {sidecar_path}" - ) - - with open(artifact_path, "rb") as fr: - true_bytes = fr.read() - assert true_bytes == bitmap["encoded_data"], ( - f"bitmap artifact bytes mismatch 
for {artifact_path}" - ) - - -def _export_full_page_png(pdf_name: str, page_no: int, image) -> None: - out_path = _full_page_png_path(pdf_name, page_no) - if out_path.exists(): - return - - if image is None: - return - - out_path.parent.mkdir(parents=True, exist_ok=True) - image.save(out_path, format="PNG") - - -def test_render_reference_documents(): - config = DecodePageConfig() - config.page_boundary = "crop_box" - config.do_sanitization = False - config.keep_glyphs = True - config.keep_qpdf_warnings = False - renderer = DoclingPdfRenderer(loglevel="fatal", decode_config=config) - - pdf_path = "docs/dln-v1.pdf" - pdf_doc: PdfRenderDocument = renderer.load(path_or_stream=pdf_path, lazy=True) - assert pdf_doc.number_of_pages() == 1 - - render_result = pdf_doc.get_page(1) - pred_instructions = render_result._export_render_instructions_json() - bitmap_artifacts = render_result._export_bitmap_artifacts() - image = render_result.get_image() - - assert pred_instructions["instructions"] - assert isinstance(bitmap_artifacts, list) - assert image.mode == "RGBA" - assert image.width > 0 - assert image.height > 0 - - pdf_doc.unload() From 75bc9a4734818eef881c7746b1b9f808449d7a0c Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Mon, 11 May 2026 14:51:39 +0200 Subject: [PATCH 8/8] lint/format stuff Signed-off-by: Christoph Auer --- docling_parse/pdf_parser.py | 1 - 1 file changed, 1 deletion(-) diff --git a/docling_parse/pdf_parser.py b/docling_parse/pdf_parser.py index 6bf92c65..656f2d1e 100644 --- a/docling_parse/pdf_parser.py +++ b/docling_parse/pdf_parser.py @@ -1318,4 +1318,3 @@ def get_task(self) -> "PageParseResult": boundary_type=self._parser_config.boundary_type, render_config=self._parser_config.render_config, ) -