Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 2 additions & 7 deletions tomd/lib/pdf/cleanup.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
Y_TOLERANCE, REPEATING_THRESHOLD, EDGE_ITEMS_PER_PAGE,
TERMINAL_PUNCTUATION,
PAGE_NUM_RE, COMPOUND_PREFIXES,
compute_bbox,
)

_log = logging.getLogger(__name__)
Expand Down Expand Up @@ -167,13 +168,7 @@ def _join_cross_page(blocks: list[Block]) -> list[Block]:
and prev_text[-1] not in TERMINAL_PUNCTUATION
and cur_text[0].islower()):
prev.lines.extend(block.lines)
bboxes = [ln.bbox for ln in prev.lines]
prev.bbox = (
min(b[0] for b in bboxes),
min(b[1] for b in bboxes),
max(b[2] for b in bboxes),
max(b[3] for b in bboxes),
)
prev.bbox = compute_bbox([ln.bbox for ln in prev.lines])
Comment thread
gregjkal marked this conversation as resolved.
else:
result.append(replace(block, lines=list(block.lines)))

Expand Down
15 changes: 3 additions & 12 deletions tomd/lib/pdf/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,22 +8,13 @@
Block, Line, Span,
WORD_GAP_RATIO, LINE_SPACING_RATIO, PARA_SPACING_RATIO,
FALLBACK_FONT_SIZE,
compute_bbox,
)
from .mono import classify_monospace

_log = logging.getLogger(__name__)


def _compute_bbox(bboxes: list[tuple]) -> tuple[float, float, float, float]:
    """Compute the bounding box enclosing all given bbox tuples.

    Each bbox must be indexable at positions 0-3. The result takes the
    minimum of components 0 and 1 and the maximum of components 2 and 3
    (presumably (x0, y0, x1, y1) -- confirm against the Span/Line types).

    Any iterable is accepted, including one-shot generators: the input is
    snapshotted once because it is scanned four times below.

    Raises:
        ValueError: if ``bboxes`` is empty.
    """
    boxes = list(bboxes)
    if not boxes:
        # Same exception type min()/max() would raise, with a clearer message.
        raise ValueError("_compute_bbox() requires at least one bbox")
    return (
        min(b[0] for b in boxes),
        min(b[1] for b in boxes),
        max(b[2] for b in boxes),
        max(b[3] for b in boxes),
    )


def extract_mupdf(page, page_num: int) -> list[Block]:
"""Extract text using MuPDF's built-in block/line/span hierarchy.

Expand Down Expand Up @@ -145,7 +136,7 @@ def _flush_line():
_flush_word()
if not cur_spans:
return
bbox = _compute_bbox([s.bbox for s in cur_spans])
bbox = compute_bbox([s.bbox for s in cur_spans])
cur_lines.append(Line(
spans=list(cur_spans),
bbox=bbox,
Expand All @@ -157,7 +148,7 @@ def _flush_block():
_flush_line()
if not cur_lines:
return
bbox = _compute_bbox([ln.bbox for ln in cur_lines])
bbox = compute_bbox([ln.bbox for ln in cur_lines])
blocks.append(Block(
lines=list(cur_lines),
bbox=bbox,
Expand Down
14 changes: 14 additions & 0 deletions tomd/lib/pdf/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,20 @@ class PageEdgeItem:

TERMINAL_PUNCTUATION = frozenset(".?!:")


def compute_bbox(bboxes: list[tuple]) -> tuple[float, float, float, float]:
    """Compute the bounding box enclosing all given bbox tuples.

    Each bbox must be indexable at positions 0-3. The result takes the
    minimum of components 0 and 1 and the maximum of components 2 and 3
    (presumably (x0, y0, x1, y1) -- confirm against the Span/Line types).

    Any iterable is accepted, including one-shot generators: the input is
    snapshotted once because it is scanned four times below.

    Raises:
        ValueError: if ``bboxes`` is empty. Callers must ensure at least
            one bbox is present.
    """
    boxes = list(bboxes)
    if not boxes:
        # Same exception type min()/max() would raise, with a clearer message.
        raise ValueError("compute_bbox() requires at least one bbox")
    return (
        min(b[0] for b in boxes),
        min(b[1] for b in boxes),
        max(b[2] for b in boxes),
        max(b[3] for b in boxes),
    )

FALLBACK_FONT_SIZE = 12.0
FALLBACK_BODY_SIZE = 11.0
MIN_UNCERTAIN_WORDS = 10
Expand Down
7 changes: 7 additions & 0 deletions tomd/lib/pdf/wg21.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@

_PARENS_RE = re.compile(r"[()]")

# Maximum number of continuation blocks consumed after a reply-to label.
# The reply-to scan counts each follow-up block it consumes and stops
# once this many have been taken, even if later blocks would still match.
REPLY_TO_CONTINUATION_CAP = 5


def _clean(text: str) -> str:
"""Strip zero-width chars and whitespace."""
Expand Down Expand Up @@ -160,11 +163,14 @@ def extract_metadata_from_blocks(blocks: list[Block],
if found_any:
consumed.add(i)
if "reply" in " ".join(_clean(ln.text) for ln in block.lines).lower():
continuation_count = 0
for j, next_block in page0_blocks:
if j <= i:
continue
if j in consumed:
continue
if continuation_count >= REPLY_TO_CONTINUATION_CAP:
break
next_text = _clean(next_block.lines[0].text) if next_block.lines else ""
if not next_text or _LABEL_RE.match(next_text):
break
Expand All @@ -175,6 +181,7 @@ def extract_metadata_from_blocks(blocks: list[Block],
existing = metadata.get("reply-to", [])
metadata["reply-to"] = existing + extra_authors
consumed.add(j)
continuation_count += 1
else:
break

Expand Down
16 changes: 15 additions & 1 deletion tomd/tests/test_extract.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
"""Tests for lib.pdf.extract."""

import pytest
from unittest.mock import MagicMock
from lib.pdf.extract import extract_spatial, attach_links
from lib.pdf.types import Span, Line, Block
from lib.pdf.types import Span, Line, Block, compute_bbox


def _make_page(chars_by_span):
Expand Down Expand Up @@ -95,6 +96,19 @@ def test_extract_spatial_sorts_across_blocks_in_y_band():
assert text.index("L") < text.index("R"), f"got text={text!r}"


class TestComputeBbox:
    """Unit tests for ``compute_bbox``."""

    def test_single_box(self):
        """A lone box is returned unchanged."""
        only = (1.0, 2.0, 3.0, 4.0)
        assert compute_bbox([only]) == only

    def test_multiple_boxes(self):
        """Per-component min/max is taken across all boxes."""
        first = (10, 20, 30, 40)
        second = (5, 25, 35, 38)
        enclosing = compute_bbox([first, second])
        assert enclosing == (5, 20, 35, 40)

    def test_empty_raises(self):
        """An empty input is rejected with ValueError."""
        no_boxes = []
        with pytest.raises(ValueError):
            compute_bbox(no_boxes)


def _make_block_with_span(text, bbox):
span = Span(text=text, bbox=bbox)
line = Line(spans=[span])
Expand Down
23 changes: 22 additions & 1 deletion tomd/tests/test_wg21.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Tests for lib.pdf.wg21."""

from lib.pdf.types import Block, Line, Span
from lib.pdf.wg21 import extract_metadata_from_blocks
from lib.pdf.wg21 import extract_metadata_from_blocks, REPLY_TO_CONTINUATION_CAP


def _meta_block(lines_text, page_num=0, font_size=9.0):
Expand Down Expand Up @@ -95,3 +95,24 @@ def test_reply_to_name_then_email_on_next_line():
meta, consumed = extract_metadata_from_blocks([b])
assert "reply-to" in meta
assert any("Bob Jones" in a for a in meta["reply-to"])


def test_reply_to_continuation_capped():
    """The reply-to scan consumes at most REPLY_TO_CONTINUATION_CAP
    follow-up blocks, even when further blocks still carry emails."""
    # One label block followed by more candidate blocks than the cap allows.
    surplus = REPLY_TO_CONTINUATION_CAP + 5
    blocks = [_meta_block(["Reply-to: Alice <alice@x.com>"])]
    for n in range(surplus):
        blocks.append(_meta_block([f"Person{n} <p{n}@x.com>"]))

    _meta, consumed = extract_metadata_from_blocks(blocks)

    # Index 0 is the label block itself; everything else is a continuation.
    followers_taken = consumed - {0}
    assert len(followers_taken) == REPLY_TO_CONTINUATION_CAP

    # The first block beyond the cap must be left alone.
    first_untouched = REPLY_TO_CONTINUATION_CAP + 1
    assert first_untouched not in consumed
Loading