From f65ad967c476e8d99e09ef22d307ff721678142a Mon Sep 17 00:00:00 2001 From: Greg Kaleka Date: Thu, 16 Apr 2026 10:05:57 -0400 Subject: [PATCH] tomd: kimi review follow-ups (compute_bbox dedup, reply-to cap) Addresses the deferred items from PR #18's "Not in this PR" section, sourced from the Apr 15 kimi-k2.5 review. - Promote _compute_bbox from extract.py to types.py as public compute_bbox. No silent empty-list guard: callers already enforce non-empty input, and a ValueError on empty is the correct contract (crash loudly, don't degrade). Replace the inline min/max duplicate in cleanup.py._join_cross_page with a call to the shared helper. - Cap the reply-to continuation loop in wg21.py at REPLY_TO_CONTINUATION_CAP (5) blocks. On a malformed PDF where page 0 body text contains email addresses, the unbounded loop could sweep arbitrary content into metadata["reply-to"]. --- tomd/lib/pdf/cleanup.py | 9 ++------- tomd/lib/pdf/extract.py | 15 +++------------ tomd/lib/pdf/types.py | 14 ++++++++++++++ tomd/lib/pdf/wg21.py | 7 +++++++ tomd/tests/test_extract.py | 16 +++++++++++++++- tomd/tests/test_wg21.py | 23 ++++++++++++++++++++++- 6 files changed, 63 insertions(+), 21 deletions(-) diff --git a/tomd/lib/pdf/cleanup.py b/tomd/lib/pdf/cleanup.py index 8a5cad3..dc5fe75 100644 --- a/tomd/lib/pdf/cleanup.py +++ b/tomd/lib/pdf/cleanup.py @@ -11,6 +11,7 @@ Y_TOLERANCE, REPEATING_THRESHOLD, EDGE_ITEMS_PER_PAGE, TERMINAL_PUNCTUATION, PAGE_NUM_RE, COMPOUND_PREFIXES, + compute_bbox, ) _log = logging.getLogger(__name__) @@ -167,13 +168,7 @@ def _join_cross_page(blocks: list[Block]) -> list[Block]: and prev_text[-1] not in TERMINAL_PUNCTUATION and cur_text[0].islower()): prev.lines.extend(block.lines) - bboxes = [ln.bbox for ln in prev.lines] - prev.bbox = ( - min(b[0] for b in bboxes), - min(b[1] for b in bboxes), - max(b[2] for b in bboxes), - max(b[3] for b in bboxes), - ) + prev.bbox = compute_bbox([ln.bbox for ln in prev.lines]) else: result.append(replace(block, lines=list(block.lines))) diff --git a/tomd/lib/pdf/extract.py b/tomd/lib/pdf/extract.py index 62159d0..4b37cb2 100644 --- a/tomd/lib/pdf/extract.py +++ b/tomd/lib/pdf/extract.py @@ -8,22 +8,13 @@ Block, Line, Span, WORD_GAP_RATIO, LINE_SPACING_RATIO, PARA_SPACING_RATIO, FALLBACK_FONT_SIZE, + compute_bbox, ) from .mono import classify_monospace _log = logging.getLogger(__name__) -def _compute_bbox(bboxes: list[tuple]) -> tuple[float, float, float, float]: - """Compute the bounding box enclosing all given bbox tuples.""" - return ( - min(b[0] for b in bboxes), - min(b[1] for b in bboxes), - max(b[2] for b in bboxes), - max(b[3] for b in bboxes), - ) - - def extract_mupdf(page, page_num: int) -> list[Block]: """Extract text using MuPDF's built-in block/line/span hierarchy. @@ -145,7 +136,7 @@ def _flush_line(): _flush_word() if not cur_spans: return - bbox = _compute_bbox([s.bbox for s in cur_spans]) + bbox = compute_bbox([s.bbox for s in cur_spans]) cur_lines.append(Line( spans=list(cur_spans), bbox=bbox, @@ -157,7 +148,7 @@ def _flush_block(): _flush_line() if not cur_lines: return - bbox = _compute_bbox([ln.bbox for ln in cur_lines]) + bbox = compute_bbox([ln.bbox for ln in cur_lines]) blocks.append(Block( lines=list(cur_lines), bbox=bbox, diff --git a/tomd/lib/pdf/types.py b/tomd/lib/pdf/types.py index 3dfcac6..7788b46 100644 --- a/tomd/lib/pdf/types.py +++ b/tomd/lib/pdf/types.py @@ -208,6 +208,20 @@ class PageEdgeItem: TERMINAL_PUNCTUATION = frozenset(".?!:") + +def compute_bbox(bboxes: list[tuple]) -> tuple[float, float, float, float]: + """Compute the bounding box enclosing all given bbox tuples. + + Raises ValueError (via min/max) if bboxes is empty. Callers must + ensure at least one bbox is present. + """ + return ( + min(b[0] for b in bboxes), + min(b[1] for b in bboxes), + max(b[2] for b in bboxes), + max(b[3] for b in bboxes), + ) + FALLBACK_FONT_SIZE = 12.0 FALLBACK_BODY_SIZE = 11.0 MIN_UNCERTAIN_WORDS = 10 diff --git a/tomd/lib/pdf/wg21.py b/tomd/lib/pdf/wg21.py index 20c85e1..4319824 100644 --- a/tomd/lib/pdf/wg21.py +++ b/tomd/lib/pdf/wg21.py @@ -25,6 +25,9 @@ _PARENS_RE = re.compile(r"[()]") +# Maximum number of continuation blocks consumed after a reply-to label. +REPLY_TO_CONTINUATION_CAP = 5 + def _clean(text: str) -> str: """Strip zero-width chars and whitespace.""" @@ -160,11 +163,14 @@ def extract_metadata_from_blocks(blocks: list[Block], if found_any: consumed.add(i) if "reply" in " ".join(_clean(ln.text) for ln in block.lines).lower(): + continuation_count = 0 for j, next_block in page0_blocks: if j <= i: continue if j in consumed: continue + if continuation_count >= REPLY_TO_CONTINUATION_CAP: + break next_text = _clean(next_block.lines[0].text) if next_block.lines else "" if not next_text or _LABEL_RE.match(next_text): break @@ -175,6 +181,7 @@ def extract_metadata_from_blocks(blocks: list[Block], existing = metadata.get("reply-to", []) metadata["reply-to"] = existing + extra_authors consumed.add(j) + continuation_count += 1 else: break diff --git a/tomd/tests/test_extract.py b/tomd/tests/test_extract.py index 51aa5cf..8551df3 100644 --- a/tomd/tests/test_extract.py +++ b/tomd/tests/test_extract.py @@ -1,8 +1,9 @@ """Tests for lib.pdf.extract.""" +import pytest from unittest.mock import MagicMock from lib.pdf.extract import extract_spatial, attach_links -from lib.pdf.types import Span, Line, Block +from lib.pdf.types import Span, Line, Block, compute_bbox def _make_page(chars_by_span): @@ -95,6 +96,19 @@ def test_extract_spatial_sorts_across_blocks_in_y_band(): assert text.index("L") < text.index("R"), f"got text={text!r}" +class TestComputeBbox: + def test_single_box(self): + assert compute_bbox([(1.0, 2.0, 3.0, 4.0)]) == (1.0, 2.0, 3.0, 4.0) + + def test_multiple_boxes(self): + result = compute_bbox([(10, 20, 30, 40), (5, 25, 35, 38)]) + assert result == (5, 20, 35, 40) + + def test_empty_raises(self): + with pytest.raises(ValueError): + compute_bbox([]) + + def _make_block_with_span(text, bbox): span = Span(text=text, bbox=bbox) line = Line(spans=[span]) diff --git a/tomd/tests/test_wg21.py b/tomd/tests/test_wg21.py index 1862a14..1ea17b4 100644 --- a/tomd/tests/test_wg21.py +++ b/tomd/tests/test_wg21.py @@ -1,7 +1,7 @@ """Tests for lib.pdf.wg21.""" from lib.pdf.types import Block, Line, Span -from lib.pdf.wg21 import extract_metadata_from_blocks +from lib.pdf.wg21 import extract_metadata_from_blocks, REPLY_TO_CONTINUATION_CAP def _meta_block(lines_text, page_num=0, font_size=9.0): @@ -95,3 +95,24 @@ def test_reply_to_name_then_email_on_next_line(): meta, consumed = extract_metadata_from_blocks([b]) assert "reply-to" in meta assert any("Bob Jones" in a for a in meta["reply-to"]) + + +def test_reply_to_continuation_capped(): + """Reply-to loop must stop after REPLY_TO_CONTINUATION_CAP blocks, + even if later blocks still contain emails.""" + reply_block = _meta_block(["Reply-to: Alice "]) + # Generate more continuation blocks than the cap allows + extra_count = REPLY_TO_CONTINUATION_CAP + 5 + extras = [ + _meta_block([f"Person{n} "]) + for n in range(extra_count) + ] + blocks = [reply_block] + extras + meta, consumed = extract_metadata_from_blocks(blocks) + # The continuation blocks consumed must not exceed the cap + # (block 0 is consumed as the reply-to label block itself) + continuation_consumed = consumed - {0} + assert len(continuation_consumed) == REPLY_TO_CONTINUATION_CAP + # The block just past the cap must not be consumed + past_cap_idx = 1 + REPLY_TO_CONTINUATION_CAP + assert past_cap_idx not in consumed