From cea0c1e9a5c38e2114b9dcd81b7866be1c3234be Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=B0=91=E5=8D=BF?=
Date: Thu, 2 Apr 2026 23:47:33 +0800
Subject: [PATCH] Fix compatibility for PDFs with embedded outlines

---
 pageindex/page_index.py     |  13 +++-
 pageindex/utils.py          | 128 ++++++++++++++++++++++++++++++++++++
 tests/test_outline_first.py |  60 ++++++++++++++++++
 3 files changed, 199 insertions(+), 2 deletions(-)
 create mode 100644 tests/test_outline_first.py

diff --git a/pageindex/page_index.py b/pageindex/page_index.py
index 9004309fb..21847c3c4 100644
--- a/pageindex/page_index.py
+++ b/pageindex/page_index.py
@@ -1080,7 +1080,16 @@ def page_index_main(doc, opt=None):
     logger.info({'total_token': sum([page[1] for page in page_list])})
 
     async def page_index_builder():
-        structure = await tree_parser(page_list, opt, doc=doc, logger=logger)
+        outline_structure = get_pdf_outline_tree(doc)
+        if outline_structure:
+            logger.info({
+                'outline_first': True,
+                'outline_node_count': len(structure_to_list(outline_structure))
+            })
+            structure = outline_structure
+        else:
+            logger.info({'outline_first': False})
+            structure = await tree_parser(page_list, opt, doc=doc, logger=logger)
         if opt.if_add_node_id == 'yes':
             write_node_id(structure)
         if opt.if_add_node_text == 'yes':
@@ -1151,4 +1160,4 @@ def validate_and_truncate_physical_indices(toc_with_page_number, page_list_lengt
     if truncated_items:
         print(f"Truncated {len(truncated_items)} TOC items that exceeded document length")
 
-    return toc_with_page_number
\ No newline at end of file
+    return toc_with_page_number
diff --git a/pageindex/utils.py b/pageindex/utils.py
index f00ccf3a7..9961abe6b 100644
--- a/pageindex/utils.py
+++ b/pageindex/utils.py
@@ -708,3 +708,131 @@ def print_wrapped(text, width=100):
     for line in text.splitlines():
         print(textwrap.fill(line, width=width))
 
+
+def _outline_destination_title(dest) -> str:
+    """Return the cleaned title of an outline entry, or an empty string."""
+    title = getattr(dest, "title", None)
+    if title is None and hasattr(dest, "get"):
+        title = dest.get("/Title")
+    if isinstance(title, bytes):
+        # A bytes title would make the str-based replace() below raise and
+        # discard the whole outline, so decode it leniently first.
+        title = title.decode("utf-8", errors="replace")
+    return (title or "").replace("\r", "").strip()
+
+
+def _outline_destination_page(reader, dest) -> int | None:
+    """Return the 1-based page an outline entry points to, or None."""
+    try:
+        page = reader.get_destination_page_number(dest) + 1
+        return page if page > 0 else None
+    except Exception:
+        # Unresolvable destinations are reported as "no page", not as errors.
+        return None
+
+
+def _parse_pdf_outline_items(reader, items):
+    """Convert a nested outline list into title/start_index/nodes dicts.
+
+    A list element holds the children of the entry right before it; a list
+    that does not follow an entry is skipped.
+    """
+    nodes = []
+    i = 0
+    while i < len(items):
+        item = items[i]
+        if isinstance(item, list):
+            i += 1
+            continue
+
+        node = {
+            "title": _outline_destination_title(item),
+            "start_index": _outline_destination_page(reader, item),
+            "nodes": [],
+        }
+
+        if i + 1 < len(items) and isinstance(items[i + 1], list):
+            node["nodes"] = _parse_pdf_outline_items(reader, items[i + 1])
+            if node["start_index"] is None:
+                # Borrow the first resolvable child page for a pageless parent.
+                for child in node["nodes"]:
+                    if child.get("start_index") is not None:
+                        node["start_index"] = child["start_index"]
+                        break
+            i += 1
+
+        nodes.append(node)
+        i += 1
+    return nodes
+
+
+def _prune_outline_nodes(nodes):
+    """Recursively drop nodes (and their subtrees) lacking a title or page."""
+    kept = []
+    for node in nodes:
+        if not node.get("title") or node.get("start_index") is None:
+            continue
+        node["nodes"] = _prune_outline_nodes(node["nodes"])
+        kept.append(node)
+    return kept
+
+
+def _assign_outline_end_indexes(nodes, fallback_end: int) -> None:
+    """Set end_index on every node, in place.
+
+    A node ends on the page before the next sibling that has a start page, or
+    on fallback_end when there is none; it never ends before its own start.
+    A parent ends where its furthest-reaching child ends.
+    """
+    for idx, node in enumerate(nodes):
+        next_start = None
+        for sibling in nodes[idx + 1:]:
+            if sibling.get("start_index") is not None:
+                next_start = sibling["start_index"]
+                break
+
+        candidate_end = fallback_end if next_start is None else next_start - 1
+        if node.get("start_index") is not None and candidate_end < node["start_index"]:
+            candidate_end = node["start_index"]
+
+        if node["nodes"]:
+            _assign_outline_end_indexes(node["nodes"], candidate_end)
+            child_ends = [child.get("end_index") for child in node["nodes"] if child.get("end_index") is not None]
+            node["end_index"] = max(child_ends) if child_ends else candidate_end
+        else:
+            node["end_index"] = candidate_end
+
+
+def get_pdf_outline_tree(pdf_path, min_nodes: int = 5):
+    """
+    Build a tree from embedded PDF outline/bookmarks when present.
+
+    Nodes without a title or a resolvable start page are dropped at every
+    level, so each returned node carries title, start_index and end_index.
+    Returns [] when the outline is unavailable, cannot be read, or has fewer
+    than min_nodes usable nodes.
+    """
+    import logging
+
+    try:
+        reader = PyPDF2.PdfReader(pdf_path)
+        outline = reader.outline
+        if not isinstance(outline, list) or not outline:
+            return []
+
+        tree = _prune_outline_nodes(_parse_pdf_outline_items(reader, outline))
+        if not tree:
+            return []
+
+        _assign_outline_end_indexes(tree, len(reader.pages))
+
+        # Sparse outlines are not good enough to replace the normal parser.
+        if len(structure_to_list(tree)) < min_nodes:
+            return []
+
+        return tree
+    except Exception:
+        # Best effort: an unreadable outline must not break indexing, so fall
+        # back to the normal parser; keep the cause available at debug level.
+        logging.getLogger(__name__).debug("embedded PDF outline unusable", exc_info=True)
+        return []
diff --git a/tests/test_outline_first.py b/tests/test_outline_first.py
new file mode 100644
index 000000000..6c06513eb
--- /dev/null
+++ b/tests/test_outline_first.py
@@ -0,0 +1,60 @@
+import unittest
+from pathlib import Path
+from types import SimpleNamespace
+from unittest.mock import AsyncMock, patch
+
+from pageindex.page_index import page_index_main
+from pageindex.utils import get_pdf_outline_tree, structure_to_list
+
+
+class OutlineFirstTests(unittest.TestCase):
+    def test_embedded_outline_builds_a_usable_tree(self):
+        pdf_path = Path("examples/documents/PRML.pdf")
+        if not pdf_path.exists():
+            self.skipTest(f"missing sample PDF: {pdf_path}")
+
+        outline_tree = get_pdf_outline_tree(str(pdf_path))
+
+        self.assertIsInstance(outline_tree, list)
+        self.assertTrue(outline_tree, "expected an outline-first tree for PRML.pdf")
+
+        flat_nodes = structure_to_list(outline_tree)
+        valid_nodes = [node for node in flat_nodes if node.get("start_index") is not None]
+
+        self.assertGreaterEqual(len(valid_nodes), 5)
+        self.assertTrue(all(node["title"] for node in flat_nodes))
+        self.assertTrue(
+            all(
+                node.get("end_index") is not None and node["end_index"] >= node["start_index"]
+                for node in valid_nodes
+            )
+        )
+
+    def test_page_index_main_prefers_outline_tree_over_tree_parser(self):
+        pdf_path = "examples/documents/PRML.pdf"
+        outline_tree = [{"title": "Outline Root", "start_index": 1, "end_index": 3, "nodes": []}]
+        opt = SimpleNamespace(
+            model=None,
+            if_add_node_id="no",
+            if_add_node_text="no",
+            if_add_node_summary="no",
+            if_add_doc_description="no",
+        )
+
+        tree_parser_mock = AsyncMock(side_effect=AssertionError("tree_parser should not run"))
+
+        with patch("pageindex.page_index.get_page_tokens", return_value=[("page", 1)]), \
+             patch("pageindex.page_index.get_pdf_outline_tree", return_value=outline_tree), \
+             patch("pageindex.page_index.tree_parser", tree_parser_mock), \
+             patch("pageindex.page_index.JsonLogger") as logger_cls:
+            logger = logger_cls.return_value
+            logger.info.return_value = None
+
+            result = page_index_main(pdf_path, opt)
+
+        self.assertEqual(result["structure"], outline_tree)
+        tree_parser_mock.assert_not_awaited()
+
+
+if __name__ == "__main__":
+    unittest.main()