From b8ba5d2385bcd9351e648fe689c7bda78063ade4 Mon Sep 17 00:00:00 2001 From: Greg Kaleka Date: Wed, 15 Apr 2026 14:13:13 -0400 Subject: [PATCH 01/14] tomd: ignore .venv/ --- tomd/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/tomd/.gitignore b/tomd/.gitignore index d4db843..b2ae8d7 100644 --- a/tomd/.gitignore +++ b/tomd/.gitignore @@ -3,3 +3,4 @@ *.pyc .out/ papers/ +.venv/ From 74a4ac8d247efd5422107e26fde118bc0458b2d4 Mon Sep 17 00:00:00 2001 From: Greg Kaleka Date: Wed, 15 Apr 2026 14:13:30 -0400 Subject: [PATCH 02/14] tomd: sort spatial characters by y-band then x for deterministic reading order --- tomd/lib/pdf/ARCHITECTURE.md | 2 +- tomd/tests/test_extract.py | 46 ++++++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/tomd/lib/pdf/ARCHITECTURE.md b/tomd/lib/pdf/ARCHITECTURE.md index ee562d2..ab00732 100644 --- a/tomd/lib/pdf/ARCHITECTURE.md +++ b/tomd/lib/pdf/ARCHITECTURE.md @@ -64,7 +64,7 @@ Enums: - `dy > avg_fs * 1.8` -> line break - `dy > avg_fs * 0.3` -> line break - `dx > avg_fs * 0.3` -> word break (insert space) -- Characters sorted by y-band (half font height) with stable sort preserving document order within each band +- Characters sorted by y-band (half font height) then x-position, giving deterministic reading order within each band **T3. Monospace classification (4 signals)** - `mono.py:classify_monospace` diff --git a/tomd/tests/test_extract.py b/tomd/tests/test_extract.py index 5b8eaab..51aa5cf 100644 --- a/tomd/tests/test_extract.py +++ b/tomd/tests/test_extract.py @@ -36,6 +36,35 @@ def _make_page(chars_by_span): return page +def _make_page_with_blocks(block_char_order): + """Mock a fitz page whose rawdict iterates blocks in the given order. + + block_char_order is a list of lists of (c, x, y) tuples. Each outer + list element becomes one block, iterated in that order. 
+ """ + blocks = [] + for chars in block_char_order: + block_chars = [ + {"c": c, "bbox": (x, y, x + 5, y + 10), "origin": (x, y + 10)} + for c, x, y in chars + ] + blocks.append({ + "type": 0, + "lines": [{ + "spans": [{ + "font": "TestFont", + "size": 10.0, + "flags": 0, + "color": 0, + "chars": block_chars, + }], + }], + }) + page = MagicMock() + page.get_text.return_value = {"blocks": blocks} + return page + + def test_spatial_sorts_by_x_within_same_y(): """Chars at the same y but reversed x-order should come out left-to-right.""" page = _make_page([ @@ -49,6 +78,23 @@ def test_spatial_sorts_by_x_within_same_y(): assert full_text.index("A") < full_text.index("B") +def test_extract_spatial_sorts_across_blocks_in_y_band(): + """Two blocks at the same y with reversed x ranges must be merged + in left-to-right reading order regardless of rawdict iteration order. + """ + # Block B is iterated first but sits to the right of block A. + page = _make_page_with_blocks([ + [("R", 300, 100), ("I", 310, 100), ("G", 320, 100), ("H", 330, 100), ("T", 340, 100)], + [("L", 50, 100), ("E", 60, 100), ("F", 70, 100), ("T", 80, 100)], + ]) + blocks = extract_spatial(page, page_num=0) + text = "".join( + span.text for block in blocks for line in block.lines for span in line.spans + ) + # The left block's characters must come first in the output. 
+ assert text.index("L") < text.index("R"), f"got text={text!r}" + + def _make_block_with_span(text, bbox): span = Span(text=text, bbox=bbox) line = Line(spans=[span]) From eb454f9d8fde6fab225c1f45c4ddcc47c46fa02e Mon Sep 17 00:00:00 2001 From: Greg Kaleka Date: Wed, 15 Apr 2026 14:13:42 -0400 Subject: [PATCH 03/14] tomd: short-circuit similar() on identical strings regardless of length --- tomd/lib/similarity.py | 2 ++ tomd/tests/test_similarity.py | 6 +++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tomd/lib/similarity.py b/tomd/lib/similarity.py index b6b4d9f..2de0772 100644 --- a/tomd/lib/similarity.py +++ b/tomd/lib/similarity.py @@ -51,6 +51,8 @@ def similar(a: str, b: str) -> bool: The per-string check is lenient because the caller (TOC detection) provides a second guard via the 3+ consecutive run requirement. + Identical strings short-circuit to True regardless of length; the + 200-char gate only protects against expensive fuzzy-compare work. """ if a == b: return True diff --git a/tomd/tests/test_similarity.py b/tomd/tests/test_similarity.py index ec69e7c..c7d5481 100644 --- a/tomd/tests/test_similarity.py +++ b/tomd/tests/test_similarity.py @@ -1,6 +1,6 @@ """Tests for lib.similarity.""" -from lib.similarity import similar +from lib.similarity import _MAX_COMPARE_LENGTH, similar def test_similar_identical(): @@ -24,11 +24,11 @@ def test_similar_one_empty(): def test_similar_circuit_breaker(): - assert not similar("a" * 201, "b" * 201) + assert not similar("a" * (_MAX_COMPARE_LENGTH + 1), "b" * (_MAX_COMPARE_LENGTH + 1)) def test_similar_long_identical(): - assert similar("a" * 250, "a" * 250) + assert similar("a" * (_MAX_COMPARE_LENGTH + 50), "a" * (_MAX_COMPARE_LENGTH + 50)) def test_similar_short_identical(): From 2a3daf72feaa5c72145af40fa242b71016a39b9f Mon Sep 17 00:00:00 2001 From: Greg Kaleka Date: Wed, 15 Apr 2026 14:14:06 -0400 Subject: [PATCH 04/14] tomd: fix Block.font_size docstring and stop mutating input in 
_extract_metadata --- tomd/tests/test_structure.py | 45 +++++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/tomd/tests/test_structure.py b/tomd/tests/test_structure.py index 155acee..219c6c4 100644 --- a/tomd/tests/test_structure.py +++ b/tomd/tests/test_structure.py @@ -6,7 +6,7 @@ ) from lib.pdf.structure import ( compare_extractions, structure_sections, - heading_confidence, + heading_confidence, _extract_metadata, ) @@ -215,3 +215,46 @@ def test_document_key_not_doc_number(self): sec = make_section("Document Number: P9999R2") meta, _ = structure_sections([sec], has_title=True) assert "doc-number" not in meta + + +class TestExtractMetadataMutation: + def test_does_not_mutate_input_sections(self): + """Regression: _extract_metadata must not mutate its input Sections. + + Callers rely on helpers in this module producing new objects, + consistent with _merge_paragraphs. + """ + sec = make_section( + "Document Number: P1234R0\nSome leftover\nBody content", + kind=SectionKind.PARAGRAPH, + ) + original_text = sec.text + _extract_metadata([sec]) + assert sec.text == original_text + + def test_returns_stripped_section_copy(self): + """The returned section has the metadata lines removed.""" + sec = make_section( + "Document Number: P1234R0\nSome leftover", + kind=SectionKind.PARAGRAPH, + ) + meta, remaining = _extract_metadata([sec]) + assert meta.get("document") == "P1234R0" + assert len(remaining) == 1 + assert "Document Number" not in remaining[0].text + assert "Some leftover" in remaining[0].text + + +class TestBlockFontSize: + def test_line_count_voting(self): + """Block.font_size uses line-count voting, not character weighting.""" + from conftest import make_span + block = Block(lines=[ + Line(spans=[make_span( + "word word word word word word word word", font_size=11.0)]), + Line(spans=[make_span("short", font_size=14.0)]), + Line(spans=[make_span("short", font_size=14.0)]), + ]) + # Lines: two at 14, one at 11 -> 14 
wins by line count. + # Character count would favor 11. + assert block.font_size == 14.0 From 3236b8a2dd63e8d096e5ebd2ea3d9c6c49ccd312 Mon Sep 17 00:00:00 2001 From: Greg Kaleka Date: Wed, 15 Apr 2026 14:14:15 -0400 Subject: [PATCH 05/14] tomd: detach HTML sublists before inline capture to fix nested-list duplication --- tomd/lib/html/render.py | 8 +++-- tomd/tests/test_html_render.py | 57 ++++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+), 2 deletions(-) diff --git a/tomd/lib/html/render.py b/tomd/lib/html/render.py index 4235002..be1bb05 100644 --- a/tomd/lib/html/render.py +++ b/tomd/lib/html/render.py @@ -8,6 +8,7 @@ from .. import strip_format_chars, SECTION_NUM_PREFIX_RE, ALLOWED_LINK_SCHEMES _HEADING_TAGS = frozenset({"h1", "h2", "h3", "h4", "h5", "h6"}) +_LIST_CONTAINER_TAGS = frozenset({"ul", "ol"}) def render_body(soup: BeautifulSoup, generator: str) -> str: @@ -203,13 +204,16 @@ def _render_list(el: Tag, marker: str, generator: str) -> str | None: items = [] for i, li in enumerate(el.find_all("li", recursive=False)): prefix = f"{i + 1}." if marker == "1." else "-" + # Detach nested sublists before capturing inline text so they are not + # walked into by _inline_text (which would duplicate their contents). 
+ subs = [sub.extract() + for sub in li.find_all(_LIST_CONTAINER_TAGS, recursive=False)] nested_parts = [] - for sub in li.find_all(["ul", "ol"], recursive=False): + for sub in subs: sub_rendered = _render_element(sub, generator) if sub_rendered: indented = "\n".join(" " + line for line in sub_rendered.split("\n")) nested_parts.append(indented) - sub.extract() text = _collapse_whitespace(_inline_text(li)) if text: diff --git a/tomd/tests/test_html_render.py b/tomd/tests/test_html_render.py index c26486c..708e67c 100644 --- a/tomd/tests/test_html_render.py +++ b/tomd/tests/test_html_render.py @@ -113,6 +113,63 @@ def test_nested(self): parent_line = next(l for l in lines if "Parent" in l) assert "Child" not in parent_line assert " - Child" in md + assert md.count("Child") == 1, ( + f"Child appears {md.count('Child')} times, expected 1. md={md!r}") + assert md.count("Parent") == 1 + + def test_nested_three_levels(self): + soup = parse_html(""" +
        <ul>
+          <li>One
+            <ul>
+              <li>Two
+                <ul><li>Three</li></ul>
+              </li>
+            </ul>
+          </li>
+        </ul>
+ """) + md = render_body(soup, "mpark") + assert md.count("One") == 1 + assert md.count("Two") == 1 + assert md.count("Three") == 1 + assert "- One" in md + assert " - Two" in md + assert " - Three" in md + + def test_nested_ordered(self): + soup = parse_html(""" +
        <ol>
+          <li>First
+            <ul><li>Bullet</li></ul>
+          </li>
+          <li>Second
+            <ol><li>Sub</li></ol>
+          </li>
+        </ol>
+ """) + md = render_body(soup, "mpark") + assert md.count("Bullet") == 1 + assert md.count("Sub") == 1 + assert "1. First" in md + assert " - Bullet" in md + assert "2. Second" in md + assert " 1. Sub" in md + + def test_nested_mixed_content(self): + soup = parse_html(""" +
        <ul>
+          <li>Before <strong>emphasis</strong>
+            <ul><li>Nested</li></ul>
+            after text
+          </li>
+        </ul>
+ """) + md = render_body(soup, "mpark") + assert md.count("Nested") == 1 + assert "Before" in md + assert "**emphasis**" in md + assert md.count("after text") == 1 def test_nested_multi_level(self): soup = parse_html(""" From 065a1b867280b6e4fd28a17c84d117c96254679c Mon Sep 17 00:00:00 2001 From: Greg Kaleka Date: Wed, 15 Apr 2026 14:23:02 -0400 Subject: [PATCH 06/14] tomd: fix dehyphenation duplicating word when next line is single span --- tomd/tests/test_cleanup.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/tomd/tests/test_cleanup.py b/tomd/tests/test_cleanup.py index 4a905df..d55951a 100644 --- a/tomd/tests/test_cleanup.py +++ b/tomd/tests/test_cleanup.py @@ -96,6 +96,37 @@ def test_cleanup_dehyphenates_no_hyphen(): assert "world" in result[0].text +def test_cleanup_dehyphenates_single_span_next_line(): + """Regression: when the next line has one span entirely consumed by + dehyphenation, the consumed word must not remain as a duplicate.""" + span1 = make_span("imple-") + span2 = make_span("mentation") + block = Block(lines=[Line(spans=[span1]), Line(spans=[span2])]) + result = cleanup_text([block]) + full_text = result[0].text + assert "implementation" in full_text + assert full_text.count("mentation") == 1, ( + f"'mentation' appears {full_text.count('mentation')} times in {full_text!r}" + ) + + +def test_cleanup_dehyphenates_next_line_multi_span_consumed(): + """When the next line has multiple spans and the first is fully consumed, + remaining spans must survive.""" + span1 = make_span("imple-") + first_consumed = make_span("mentation") + remaining = make_span(" of things") + block = Block(lines=[ + Line(spans=[span1]), + Line(spans=[first_consumed, remaining]), + ]) + result = cleanup_text([block]) + full_text = result[0].text + assert "implementation" in full_text + assert " of things" in full_text + assert full_text.count("mentation") == 1 + + def test_cleanup_merges_cross_page(): b1 = make_block(["Some text without 
terminal"], page_num=0) b2 = make_block(["continuation here"], page_num=1) From bb785c197f6ca4bb967aed1152b7c1e2b8b33117 Mon Sep 17 00:00:00 2001 From: Greg Kaleka Date: Wed, 15 Apr 2026 16:25:10 -0400 Subject: [PATCH 07/14] tomd: package as installable project with pinned deps Adds pyproject.toml, __init__.py, and a minimal README so `pip install -e tomd` produces a working `tomd` console script. Pins pymupdf~=1.27 and beautifulsoup4~=4.14 to protect against silent PyMuPDF API drift (get_texttrace, get_text dict/rawdict, get_drawings). Switches main.py to relative imports so tomd.main:main resolves under the console script; the legacy `python tomd/main.py` invocation is dropped in favor of `tomd paper.pdf` or `python -m tomd.main`. --- tomd/.gitignore | 3 +++ tomd/README.md | 34 ++++++++++++++++++++++++++++++++++ tomd/__init__.py | 1 + tomd/main.py | 14 ++++++++------ tomd/pyproject.toml | 32 ++++++++++++++++++++++++++++++++ tomd/requirements.txt | 12 ++++++++++-- tomd/tests/test_cli.py | 20 ++++++++++++++++++++ 7 files changed, 108 insertions(+), 8 deletions(-) create mode 100644 tomd/README.md create mode 100644 tomd/__init__.py create mode 100644 tomd/pyproject.toml create mode 100644 tomd/tests/test_cli.py diff --git a/tomd/.gitignore b/tomd/.gitignore index b2ae8d7..20dbd4a 100644 --- a/tomd/.gitignore +++ b/tomd/.gitignore @@ -1,6 +1,9 @@ **/__pycache__/ **/.pytest_cache/ *.pyc +*.egg-info/ .out/ papers/ .venv/ +build/ +dist/ diff --git a/tomd/README.md b/tomd/README.md new file mode 100644 index 0000000..29ece89 --- /dev/null +++ b/tomd/README.md @@ -0,0 +1,34 @@ +# tomd + +Convert PDF and HTML papers to Markdown. Used to prepare WG21 inputs for +the C++ Alliance paper pipeline. + +## Install + +From this directory: + +``` +pip install -e . +``` + +Installs the `tomd` console script and pins +`pymupdf~=1.27` / `beautifulsoup4~=4.14`. 
+ +## Usage + +``` +tomd paper.pdf # -> paper.md (+ paper.prompts.md if uncertain) +tomd paper.html # -> paper.md +tomd *.pdf *.html --outdir out/ # batch mode +``` + +Also runnable as `python -m tomd.main ...`. + +## Development + +Install test extras and run the suite: + +``` +pip install -e .[test] +pytest tests/ +``` diff --git a/tomd/__init__.py b/tomd/__init__.py new file mode 100644 index 0000000..ce03d3c --- /dev/null +++ b/tomd/__init__.py @@ -0,0 +1 @@ +"""tomd - PDF and HTML to Markdown converter for WG21 papers.""" diff --git a/tomd/main.py b/tomd/main.py index 4bf35f1..1326285 100644 --- a/tomd/main.py +++ b/tomd/main.py @@ -4,10 +4,12 @@ PDF: hybrid dual extraction (MuPDF + spatial rules) with confidence scoring. HTML: DOM traversal with generator-specific metadata extraction. -Usage: - python tomd/main.py input.pdf # -> input.md + input.prompts.md - python tomd/main.py input.html # -> input.md - python tomd/main.py *.pdf *.html --outdir out/ # batch mode +Usage (after `pip install -e tomd`): + tomd input.pdf # -> input.md + input.prompts.md + tomd input.html # -> input.md + tomd *.pdf *.html --outdir out/ # batch mode + +Also runnable as `python -m tomd.main ...`. 
""" import argparse @@ -83,10 +85,10 @@ def main(): try: ext = input_file.suffix.lower() if ext in _HTML_EXTENSIONS: - from lib.html import convert_html + from .lib.html import convert_html md_text, prompts_text = convert_html(input_file) elif ext in _PDF_EXTENSIONS: - from lib.pdf import convert_pdf + from .lib.pdf import convert_pdf md_text, prompts_text = convert_pdf(input_file) else: print(f"SKIP: {input_file} unsupported format", file=sys.stderr) diff --git a/tomd/pyproject.toml b/tomd/pyproject.toml new file mode 100644 index 0000000..c754dc3 --- /dev/null +++ b/tomd/pyproject.toml @@ -0,0 +1,32 @@ +[build-system] +requires = ["setuptools>=61.0"] +build-backend = "setuptools.build_meta" + +[project] +name = "tomd" +version = "0.1.0" +description = "PDF and HTML to Markdown converter for WG21 papers." +readme = "README.md" +requires-python = ">=3.11" +license = {text = "BSL-1.0"} +authors = [ + {name = "Vinnie Falco"}, +] +dependencies = [ + "pymupdf~=1.27", + "beautifulsoup4~=4.14", +] + +[project.optional-dependencies] +test = [ + "pytest~=8.0", +] + +[project.scripts] +tomd = "tomd.main:main" + +[tool.setuptools] +packages = ["tomd", "tomd.lib", "tomd.lib.pdf", "tomd.lib.html"] + +[tool.setuptools.package-dir] +"tomd" = "." diff --git a/tomd/requirements.txt b/tomd/requirements.txt index 737641f..6eaae68 100644 --- a/tomd/requirements.txt +++ b/tomd/requirements.txt @@ -1,2 +1,10 @@ -pymupdf -beautifulsoup4 +# Runtime dependencies for tomd. Pinned to compatible-release (~=) ranges +# to protect against PyMuPDF API drift (tomd uses get_text "dict"/"rawdict", +# get_texttrace, and get_drawings, any of which can shift between minor +# versions) and to bound BeautifulSoup API changes. +# +# Bump these pins intentionally after running the full test suite against +# the new version. 
+ +pymupdf~=1.27 +beautifulsoup4~=4.14 diff --git a/tomd/tests/test_cli.py b/tomd/tests/test_cli.py new file mode 100644 index 0000000..2fc5802 --- /dev/null +++ b/tomd/tests/test_cli.py @@ -0,0 +1,20 @@ +"""Smoke tests for the tomd CLI entry point.""" + +import subprocess +import sys + +import pytest + + +def test_tomd_module_invokable_via_python(): + """`python -m tomd.main --help` must succeed when tomd is installed.""" + try: + import tomd.main # noqa: F401 + except ImportError: + pytest.skip("tomd not installed as a package (run `pip install -e .`)") + result = subprocess.run( + [sys.executable, "-m", "tomd.main", "--help"], + capture_output=True, text=True, check=False, + ) + assert result.returncode == 0, (result.stdout, result.stderr) + assert "tomd" in result.stdout From 23505fecf887f516567d75abffa8bd9a0ab0cac1 Mon Sep 17 00:00:00 2001 From: Greg Kaleka Date: Wed, 15 Apr 2026 16:55:12 -0400 Subject: [PATCH 08/14] tomd: skip html golden tests when papers/ corpus is absent papers/ is gitignored, so fresh clones saw 7 AssertionError failures before any conversion logic ran. Replace the file-existence assert with pytest.skip() so the tests stay granular (per-stem) and only run when their input HTML is present. 
--- tomd/tests/test_html_golden.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tomd/tests/test_html_golden.py b/tomd/tests/test_html_golden.py index 0fc4086..0b9c84a 100644 --- a/tomd/tests/test_html_golden.py +++ b/tomd/tests/test_html_golden.py @@ -43,7 +43,8 @@ def _diff_head(actual: str, golden: str, limit: int = 120) -> str: @pytest.mark.parametrize("stem", _GOLDEN_STEMS) def test_convert_html_matches_golden(stem: str): html_path = _PAPERS / f"{stem}.html" - assert html_path.is_file(), f"missing paper HTML: {html_path}" + if not html_path.is_file(): + pytest.skip(f"missing paper HTML: {html_path} (papers/ is gitignored)") md, prompts = convert_html(html_path) golden_md = _GOLDEN / f"{stem}.golden.md" From f9b211c5e43218b95078264e79d0324cb053cb08 Mon Sep 17 00:00:00 2001 From: Greg Kaleka Date: Wed, 15 Apr 2026 17:06:31 -0400 Subject: [PATCH 09/14] tomd: add pytest CI workflow for Python 3.12 and 3.13 Adds .github/workflows/tomd-tests.yml with a pytest matrix scoped to tomd/** path changes. Bumps requires-python to >=3.12 to match the tested range. 
--- .github/workflows/tomd-tests.yml | 40 ++++++++++++++++++++++++++++++++ tomd/pyproject.toml | 2 +- 2 files changed, 41 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/tomd-tests.yml diff --git a/.github/workflows/tomd-tests.yml b/.github/workflows/tomd-tests.yml new file mode 100644 index 0000000..239db26 --- /dev/null +++ b/.github/workflows/tomd-tests.yml @@ -0,0 +1,40 @@ +name: tomd tests + +on: + push: + branches: [master] + paths: + - "tomd/**" + - ".github/workflows/tomd-tests.yml" + pull_request: + paths: + - "tomd/**" + - ".github/workflows/tomd-tests.yml" + workflow_dispatch: + +jobs: + test: + name: pytest (Python ${{ matrix.python-version }}) + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.12", "3.13"] + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: "pip" + cache-dependency-path: tomd/requirements.txt + + - name: Install tomd with test extras + run: | + python -m pip install --upgrade pip + pip install -e tomd[test] + + - name: Run pytest + working-directory: tomd + run: python -m pytest tests/ -v --tb=short diff --git a/tomd/pyproject.toml b/tomd/pyproject.toml index c754dc3..6f53e09 100644 --- a/tomd/pyproject.toml +++ b/tomd/pyproject.toml @@ -7,7 +7,7 @@ name = "tomd" version = "0.1.0" description = "PDF and HTML to Markdown converter for WG21 papers." 
readme = "README.md" -requires-python = ">=3.11" +requires-python = ">=3.12" license = {text = "BSL-1.0"} authors = [ {name = "Vinnie Falco"}, From 25bb253017bc78cfb51f188318693e998e017608 Mon Sep 17 00:00:00 2001 From: Greg Kaleka Date: Wed, 15 Apr 2026 17:20:18 -0400 Subject: [PATCH 10/14] tomd: add LICENSE and expand user-facing README BSL-1.0 LICENSE file matches the license already declared in pyproject.toml; README grows from an install-only stub to cover usage, output, uncertain-region markers, limitations, design-doc links, and development. Closes issues/15. --- tomd/LICENSE | 25 ++++++++++++++++++ tomd/README.md | 70 +++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 91 insertions(+), 4 deletions(-) create mode 100644 tomd/LICENSE diff --git a/tomd/LICENSE b/tomd/LICENSE new file mode 100644 index 0000000..e439b22 --- /dev/null +++ b/tomd/LICENSE @@ -0,0 +1,25 @@ +Boost Software License - Version 1.0 - August 17th, 2003 + +Copyright (c) 2026 Vinnie Falco + +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. diff --git a/tomd/README.md b/tomd/README.md index 29ece89..76456ad 100644 --- a/tomd/README.md +++ b/tomd/README.md @@ -1,7 +1,12 @@ # tomd -Convert PDF and HTML papers to Markdown. Used to prepare WG21 inputs for -the C++ Alliance paper pipeline. +Convert WG21 committee papers from PDF or HTML to clean Markdown. + +tomd is purpose-built for C++ standards committee paper conversion. It +understands WG21 metadata fields (document number, date, reply-to, audience), +detects structural elements (headings, lists, tables, code blocks, wording +sections), and produces Markdown that looks like a human wrote it, suitable +for version control, pull request diffs, and plain-text review workflows. ## Install @@ -11,8 +16,9 @@ From this directory: pip install -e . ``` -Installs the `tomd` console script and pins -`pymupdf~=1.27` / `beautifulsoup4~=4.14`. +Requires Python 3.12 or newer. Runtime dependencies (`pymupdf~=1.27`, +`beautifulsoup4~=4.14`) are declared in `pyproject.toml` and installed +automatically. ## Usage @@ -20,10 +26,62 @@ Installs the `tomd` console script and pins tomd paper.pdf # -> paper.md (+ paper.prompts.md if uncertain) tomd paper.html # -> paper.md tomd *.pdf *.html --outdir out/ # batch mode +tomd -v paper.pdf # verbose logging +tomd -o out.md paper.pdf # explicit output path (single-file only) ``` Also runnable as `python -m tomd.main ...`. +### Output + +- `paper.md` is always produced. 
It contains YAML front matter (title, + document number, date, audience, reply-to) followed by the paper body + rendered as Markdown. +- `paper.prompts.md` is produced only when the converter found uncertain + regions. It pairs each uncertain span with both extraction paths (MuPDF + and spatial) plus surrounding context, formatted for manual LLM + reconciliation. If no uncertain regions exist, no prompts file is written + (and any stale one at the output path is removed). + +### Uncertain regions + +tomd uses dual-extraction with confidence scoring. When the MuPDF and +spatial paths disagree on a page, the region is emitted in the output +marked with an HTML comment: + +``` + +``` + +The accompanying `.prompts.md` file contains ready-to-feed LLM prompts for +each marker. You resolve uncertain regions manually; the LLM fixes +structure, never content. + +## Limitations + +- **No OCR.** Scanned or image-only PDFs are not supported. +- **No vision fallback.** Papers that rely on non-extractable layout + (complex equations, diagrams) will not convert cleanly. +- **HTML generator coverage.** Four generators are detected directly: + mpark/wg21, Bikeshed, HackMD, and hand-written. Other sources fall back + to a generic extractor that may miss metadata fields. +- **LLM auto-resolution is deferred to v2.** The `.prompts.md` file is + produced; feeding it to an LLM and applying the result is manual in this + release. + +## Design + +Design and architecture documentation lives alongside the code: + +- [`CLAUDE.md`](CLAUDE.md) - architecture rules and invariants (contributors + and AI agents). +- [`lib/pdf/ARCHITECTURE.md`](lib/pdf/ARCHITECTURE.md) - PDF converter + pipeline and the techniques it uses. +- [`lib/html/ARCHITECTURE.md`](lib/html/ARCHITECTURE.md) - HTML converter + pipeline. + +Read these in order if you are modifying tomd. 
+ ## Development Install test extras and run the suite: @@ -32,3 +90,7 @@ Install test extras and run the suite: pip install -e .[test] pytest tests/ ``` + +## License + +Boost Software License 1.0. See [`LICENSE`](LICENSE). From 07b61f7565d001d6799d8d3709ad35fa4e6a6959 Mon Sep 17 00:00:00 2001 From: Greg Kaleka Date: Wed, 15 Apr 2026 17:34:58 -0400 Subject: [PATCH 11/14] tomd: consolidate document- and section-number regex shapes The doc-number shape lived in both lib/__init__.py (DOC_NUM_RE) and lib/pdf/types.py (DOC_FIELD_RE), and the dotted-decimal section-number shape lived in both SECTION_NUM_PREFIX_RE and SECTION_NUM_RE. The tomd-specific rule "regex patterns for metadata fields must be defined in one place" was violated; the pair of doc-number patterns also disagreed on SD-N coverage. Extract two core shape strings (DOC_NUM_PATTERN, SECTION_NUM_PATTERN) in lib/__init__.py and rebuild the four callers on them. DOC_FIELD_RE picks up SD-N support for free. No call-site changes: every existing group(0)/group(1)/match/search semantic is preserved (verified by grep for .groups() tuple unpacking; there are none). Closes issues/08. --- tomd/CLAUDE.md | 2 +- tomd/lib/__init__.py | 27 ++++++++++----- tomd/lib/pdf/types.py | 14 +++++--- tomd/tests/test_regex_patterns.py | 57 +++++++++++++++++++++++++++++++ 4 files changed, 85 insertions(+), 15 deletions(-) create mode 100644 tomd/tests/test_regex_patterns.py diff --git a/tomd/CLAUDE.md b/tomd/CLAUDE.md index cdc017d..15fa83b 100644 --- a/tomd/CLAUDE.md +++ b/tomd/CLAUDE.md @@ -100,7 +100,7 @@ Auto-resolution via `--llm` flag is deferred to v2. For v1, the tool produces a ## File Map - `main.py` - CLI entry point. Argparse, glob expansion, output path logic, main(). No conversion logic. 
-- `lib/__init__.py` - Shared text utilities and constants for PDF and HTML converters: `ascii_escape`, `strip_format_chars`, `format_front_matter`, `ALLOWED_LINK_SCHEMES`, and shared regex patterns (`EMAIL_RE`, `DATE_RE`, `DOC_NUM_RE`, `SECTION_NUM_PREFIX_RE`). +- `lib/__init__.py` - Shared text utilities and constants for PDF and HTML converters: `ascii_escape`, `strip_format_chars`, `format_front_matter`, `parse_author_lines`, `ALLOWED_LINK_SCHEMES`, shared regex patterns (`EMAIL_RE`, `DATE_RE`, `DOC_NUM_RE`, `SECTION_NUM_PREFIX_RE`), and their reusable shape strings (`DOC_NUM_PATTERN`, `SECTION_NUM_PATTERN`) consumed by `lib/pdf/types.py` to build `DOC_FIELD_RE` and `SECTION_NUM_RE`. - `lib/similarity.py` - Dual-algorithm string similarity (SequenceMatcher + Jaccard). Per-algorithm thresholds, 200-char circuit breaker. Format-agnostic. - `lib/toc.py` - Table of Contents detection. Matches section texts against known headings using fuzzy similarity. Bridges small gaps. Format-agnostic - no dependency on PDF types. - `lib/pdf/__init__.py` - Exports `convert_pdf()`. Orchestrates the full pipeline in order. Includes monospace propagation, wording classification, and page 0 color extraction via space-color proxy. diff --git a/tomd/lib/__init__.py b/tomd/lib/__init__.py index b7fcad4..dcd53d4 100644 --- a/tomd/lib/__init__.py +++ b/tomd/lib/__init__.py @@ -148,15 +148,24 @@ def parse_author_lines(lines, clean_line=None, skip_line=None): DATE_RE = re.compile(r"\b(\d{4}-\d{2}-\d{2})\b") -# Broad document-number pattern used for header stripping and HTML metadata. +# Core pattern shapes (no anchors, no label context) reused across modules +# so every document- and section-number pattern has a single source of truth. +# `lib/pdf/types.py` builds the labeled PDF variants (DOC_FIELD_RE, +# SECTION_NUM_RE) on top of these. 
+DOC_NUM_PATTERN = ( + r"[DPN]\d{3,5}R\d+" + r"|[DPN]\d{3,5}" + r"|N\d{3,5}" + r"|SD-\d+" +) + +SECTION_NUM_PATTERN = r"\d+(?:\.\d+)*" + +# Broad document-number match used for header stripping and HTML metadata. # For line-anchored field extraction in PDF blocks, see DOC_FIELD_RE in # lib/pdf/types.py, which targets "Document Number: PXXXXrN" line prefixes. -DOC_NUM_RE = re.compile( - r"\b([DPN]\d{3,5}R\d+)\b" - r"|\b([DPN]\d{3,5})\b" - r"|\b(N\d{3,5})\b" - r"|\b(SD-\d+)\b", - re.IGNORECASE, -) +DOC_NUM_RE = re.compile(rf"\b({DOC_NUM_PATTERN})\b", re.IGNORECASE) -SECTION_NUM_PREFIX_RE = re.compile(r"^\d+(?:\.\d+)*\.?\s+") +# Leading section-number prefix used by the HTML renderer to strip a number +# (e.g. "2.1.3 " or "1. ") from heading text. +SECTION_NUM_PREFIX_RE = re.compile(rf"^{SECTION_NUM_PATTERN}\.?\s+") diff --git a/tomd/lib/pdf/types.py b/tomd/lib/pdf/types.py index c816c13..3dfcac6 100644 --- a/tomd/lib/pdf/types.py +++ b/tomd/lib/pdf/types.py @@ -5,6 +5,8 @@ from dataclasses import dataclass, field from enum import Enum +from tomd.lib import DOC_NUM_PATTERN, SECTION_NUM_PATTERN + class Confidence(Enum): """Confidence level for structural classification decisions.""" @@ -135,15 +137,17 @@ class PageEdgeItem: # --- Precompiled regex patterns --- -SECTION_NUM_RE = re.compile( - r"^(\d+(?:\.\d+)*)\s+(.+)", -) +# Section number at the start of a line with required trailing content +# (used for heading detection); shares the core shape with +# SECTION_NUM_PREFIX_RE in lib/__init__.py. +SECTION_NUM_RE = re.compile(rf"^({SECTION_NUM_PATTERN})\s+(.+)") # Line-anchored pattern targeting "Document Number: PXXXXRN" field lines in # PDF block text. More restrictive than DOC_NUM_RE in lib/__init__.py, which -# is a broad substring match used for header stripping and HTML contexts. +# is a broad substring match used for header stripping and HTML contexts; +# both patterns share the core DOC_NUM_PATTERN shape. 
DOC_FIELD_RE = re.compile( - r"Document\s+(?:Number|#)[:\s]+([DPN]\d{3,5}(?:R\d+)?|N\d{3,5})", + rf"Document\s+(?:Number|#)[:\s]+({DOC_NUM_PATTERN})", re.IGNORECASE, ) diff --git a/tomd/tests/test_regex_patterns.py b/tomd/tests/test_regex_patterns.py new file mode 100644 index 0000000..c0cbd9a --- /dev/null +++ b/tomd/tests/test_regex_patterns.py @@ -0,0 +1,57 @@ +"""Tests for the shared document- and section-number regex patterns. + +After the consolidation in issue 08, `DOC_NUM_PATTERN` and +`SECTION_NUM_PATTERN` live in `lib/__init__.py`; the PDF-specific +labeled variants in `lib/pdf/types.py` are built on top of them. +These tests lock down the behavior each call site depends on. +""" + +from lib import DOC_NUM_RE, SECTION_NUM_PREFIX_RE +from lib.pdf.types import DOC_FIELD_RE, SECTION_NUM_RE + + +def test_doc_num_matches_all_wg21_forms(): + for s in ("P1234", "P1234R0", "P12345R9", "D0042R3", "N5012", "SD-9"): + assert DOC_NUM_RE.search(s), f"failed to match {s!r}" + + +def test_doc_num_group_zero_returns_full_number(): + # Call sites depend on m.group(0) returning the matched number. + m = DOC_NUM_RE.search("see P1234R0 for details") + assert m is not None + assert m.group(0).upper() == "P1234R0" + + +def test_doc_num_rejects_too_short_prefix(): + # WG21 doc numbers have at least 3 digits; shorter must not match. + assert DOC_NUM_RE.search("P12") is None + assert DOC_NUM_RE.search("N42") is None + + +def test_doc_field_matches_labeled_forms(): + m = DOC_FIELD_RE.search("Document Number: P1234R0") + assert m and m.group(1).upper() == "P1234R0" + m = DOC_FIELD_RE.search("Document #: N5012") + assert m and m.group(1).upper() == "N5012" + + +def test_doc_field_now_supports_sd_form(): + # Regression: after consolidation DOC_FIELD_RE inherits SD-N support + # from the shared DOC_NUM_PATTERN. 
+ m = DOC_FIELD_RE.search("Document Number: SD-1") + assert m and m.group(1).upper() == "SD-1" + + +def test_section_num_prefix_strips_leading_number(): + assert SECTION_NUM_PREFIX_RE.sub("", "2.1.3 Details") == "Details" + assert SECTION_NUM_PREFIX_RE.sub("", "1. Introduction") == "Introduction" + # Non-matching input passes through unchanged. + assert SECTION_NUM_PREFIX_RE.sub("", "Introduction") == "Introduction" + + +def test_section_num_re_captures_number_and_title(): + m = SECTION_NUM_RE.match("2.1.3 Details of the feature") + assert m is not None + assert m.group(1) == "2.1.3" + assert m.group(2) == "Details of the feature" + assert SECTION_NUM_RE.match("Abstract") is None From e23323efef01cc4ca45dec8ae3c948418563d567 Mon Sep 17 00:00:00 2001 From: Greg Kaleka Date: Wed, 15 Apr 2026 17:35:34 -0400 Subject: [PATCH 12/14] tomd: pin parse_author_lines behavior with direct tests The shared author state machine in lib/__init__.py was already deduplicated (both lib/pdf/wg21.py and lib/html/extract.py delegate to parse_author_lines with per-caller clean_line/skip_line callbacks). Only the pattern-level coverage proposed in issues/07 was missing: the helper is exercised today only through the callers' tests. Add tests/test_authors.py covering the pending-name pairing, trailing-name flush, blank-line skip, and clean_line/skip_line injection points. Closes issues/07. --- tomd/tests/test_authors.py | 85 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 tomd/tests/test_authors.py diff --git a/tomd/tests/test_authors.py b/tomd/tests/test_authors.py new file mode 100644 index 0000000..3aa7d90 --- /dev/null +++ b/tomd/tests/test_authors.py @@ -0,0 +1,85 @@ +"""Tests for lib.parse_author_lines. + +The shared state machine that both `lib/pdf/wg21.py:_parse_authors` and +`lib/html/extract.py:_parse_mpark_authors` delegate to. 
These tests pin +the helper's own contract: pending-name pairing, multi-line email pairs, +blank-line skipping, trailing-name flush, and the clean_line / skip_line +injection points the callers depend on. +""" + +from lib import parse_author_lines + + +def test_name_and_email_same_line(): + result = parse_author_lines(["Alice Example alice@example.com"]) + assert result == ["Alice Example <alice@example.com>"] + + +def test_name_then_email_next_line(): + result = parse_author_lines(["Alice Example", "alice@example.com"]) + assert result == ["Alice Example <alice@example.com>"] + + +def test_email_alone(): + result = parse_author_lines(["alice@example.com"]) + assert result == ["<alice@example.com>"] + + +def test_multiple_authors_alternating(): + result = parse_author_lines([ + "Alice Example", + "alice@example.com", + "Bob Sample", + "bob@example.com", + ]) + assert result == [ + "Alice Example <alice@example.com>", + "Bob Sample <bob@example.com>", + ] + + +def test_name_only_no_email_becomes_bare_entry(): + result = parse_author_lines(["Alice Example"]) + assert result == ["Alice Example"] + + +def test_blank_lines_are_skipped(): + result = parse_author_lines( + ["", "Alice Example", " ", "alice@example.com", ""]) + assert result == ["Alice Example <alice@example.com>"] + + +def test_trailing_pending_name_is_flushed(): + # A name with no following email must still appear in the output. + result = parse_author_lines([ + "Alice Example", "alice@example.com", "Bob Solo", + ]) + assert result == [ + "Alice Example <alice@example.com>", + "Bob Solo", + ] + + +def test_empty_input(): + assert parse_author_lines([]) == [] + + +def test_custom_clean_line_strips_brackets(): + # Mirrors how lib/html/extract.py injects angle-bracket stripping.
+ import re + angle = re.compile(r"[<>]") + result = parse_author_lines( + ["Alice <Example>", "<alice@example.com>"], + clean_line=lambda t: angle.sub("", t).strip(), + ) + assert result == ["Alice Example <alice@example.com>"] + + +def test_custom_skip_line_rejects_non_author_content(): + # Mirrors how HTML rejects doc-number lines and PDF rejects label lines: + # the skipped line must neither become a pending name nor appear in output. + result = parse_author_lines( + ["P1234R0", "Alice Example", "alice@example.com"], + skip_line=lambda l: l == "P1234R0", + ) + assert result == ["Alice Example <alice@example.com>"] From 16f85f2d7d798f4f5ab82b909135d216e83df316 Mon Sep 17 00:00:00 2001 From: Greg Kaleka Date: Wed, 15 Apr 2026 18:05:11 -0400 Subject: [PATCH 13/14] tomd: remove get_texttrace() fallback, tighten dep pins With pymupdf and beautifulsoup4 pinned in requirements.txt and pyproject.toml, the try/except AttributeError guard around page.get_texttrace() in find_hidden_regions was defending against an unreachable path. Silent degradation on a dep mismatch would hide real bugs; the pin is the contract, so a missing API should surface as a clear AttributeError. Tightened the pins from ~=1.27 / ~=4.14 to ~=1.27.0 / ~=4.14.0 so that only patch releases are accepted. The looser form actually allowed any minor bump up to the next major, which contradicted the comment in requirements.txt flagging PyMuPDF minor-version API drift as the concern.
--- tomd/lib/pdf/cleanup.py | 8 +------- tomd/pyproject.toml | 4 ++-- tomd/requirements.txt | 4 ++-- 3 files changed, 5 insertions(+), 11 deletions(-) diff --git a/tomd/lib/pdf/cleanup.py b/tomd/lib/pdf/cleanup.py index faf9761..3054435 100644 --- a/tomd/lib/pdf/cleanup.py +++ b/tomd/lib/pdf/cleanup.py @@ -212,13 +212,7 @@ def find_hidden_regions(page, body_fonts: set[str] | None = None, if body_fonts is None: return hidden_bboxes - try: - trace = page.get_texttrace() - except AttributeError: - _log.debug("get_texttrace() not available; skipping hidden region detection") - return hidden_bboxes - - for span in trace: + for span in page.get_texttrace(): if span.get("type") == 3: continue diff --git a/tomd/pyproject.toml b/tomd/pyproject.toml index 6f53e09..dd2eb1d 100644 --- a/tomd/pyproject.toml +++ b/tomd/pyproject.toml @@ -13,8 +13,8 @@ authors = [ {name = "Vinnie Falco"}, ] dependencies = [ - "pymupdf~=1.27", - "beautifulsoup4~=4.14", + "pymupdf~=1.27.0", + "beautifulsoup4~=4.14.0", ] [project.optional-dependencies] diff --git a/tomd/requirements.txt b/tomd/requirements.txt index 6eaae68..86d5c0a 100644 --- a/tomd/requirements.txt +++ b/tomd/requirements.txt @@ -6,5 +6,5 @@ # Bump these pins intentionally after running the full test suite against # the new version. 
-pymupdf~=1.27 -beautifulsoup4~=4.14 +pymupdf~=1.27.0 +beautifulsoup4~=4.14.0 From be4f82fc726803a4f5202ef200b4cfd9c22afbab Mon Sep 17 00:00:00 2001 From: Jeremy Childers <30885417+jlchilders11@users.noreply.github.com> Date: Wed, 15 Apr 2026 18:18:12 -0400 Subject: [PATCH 14/14] Add missing tests and fix cleanup.py bug * tomd: Add tests for header footer functions * hidden regions tests * position-based list tests * HTML generator fixtures * PR feedback --- tomd/lib/pdf/cleanup.py | 17 +- tomd/tests/fixtures/html/bikeshed_sample.html | 21 ++ tomd/tests/fixtures/html/hackmd_sample.html | 17 ++ .../html/handwritten_address_sample.html | 15 ++ .../html/handwritten_table_sample.html | 15 ++ tomd/tests/test_header_footer.py | 184 ++++++++++++++++++ tomd/tests/test_hidden_regions.py | 158 +++++++++++++++ tomd/tests/test_html_generators.py | 124 ++++++++++++ tomd/tests/test_position_lists.py | 142 ++++++++++++++ 9 files changed, 687 insertions(+), 6 deletions(-) create mode 100644 tomd/tests/fixtures/html/bikeshed_sample.html create mode 100644 tomd/tests/fixtures/html/hackmd_sample.html create mode 100644 tomd/tests/fixtures/html/handwritten_address_sample.html create mode 100644 tomd/tests/fixtures/html/handwritten_table_sample.html create mode 100644 tomd/tests/test_header_footer.py create mode 100644 tomd/tests/test_hidden_regions.py create mode 100644 tomd/tests/test_html_generators.py create mode 100644 tomd/tests/test_position_lists.py diff --git a/tomd/lib/pdf/cleanup.py b/tomd/lib/pdf/cleanup.py index 3054435..8a5cad3 100644 --- a/tomd/lib/pdf/cleanup.py +++ b/tomd/lib/pdf/cleanup.py @@ -2,7 +2,7 @@ import logging import re -from collections import defaultdict +from collections import defaultdict, Counter from dataclasses import replace from .. 
import strip_format_chars, DOC_NUM_RE @@ -79,12 +79,7 @@ def detect_repeating(all_edge_items: list[list[PageEdgeItem]], pages_seen = len(set(it.page_num for it in items)) if pages_seen < threshold: continue - texts = [it.text for it in items] - if len(set(texts)) == 1: - repeating.add((y_key, texts[0])) - _log.debug("Repeating exact: y=%.1f text=%r", y_key, texts[0]) - continue if all(PAGE_NUM_RE.match(t) for t in texts): repeating.add((y_key, "__PAGE_NUM__")) @@ -96,6 +91,16 @@ def detect_repeating(all_edge_items: list[list[PageEdgeItem]], _log.debug("Repeating doc number at y=%.1f", y_key) continue + text_counts = Counter(it.text for it in items) + exact_hit = False + for text, count in text_counts.items(): + if count >= threshold: + repeating.add((y_key, text)) + _log.debug("Repeating exact: y=%.1f text=%r", y_key, text) + exact_hit = True + if exact_hit: + continue + return repeating diff --git a/tomd/tests/fixtures/html/bikeshed_sample.html b/tomd/tests/fixtures/html/bikeshed_sample.html new file mode 100644 index 0000000..20153c7 --- /dev/null +++ b/tomd/tests/fixtures/html/bikeshed_sample.html @@ -0,0 +1,21 @@ + + + + + +P9999R0 Test Bikeshed Paper + + +

P9999R0 Test Bikeshed Paper

+

Before the metadata list.

+
+
Audience:
+
SG1
+
Editor:
+
+
+ +

Introduction

+

Body paragraph content.

+ + \ No newline at end of file diff --git a/tomd/tests/fixtures/html/hackmd_sample.html b/tomd/tests/fixtures/html/hackmd_sample.html new file mode 100644 index 0000000..75e80b1 --- /dev/null +++ b/tomd/tests/fixtures/html/hackmd_sample.html @@ -0,0 +1,17 @@ + + + + +P9999R0: Test HackMD Paper - HackMD + + + +

P9999R0: Test HackMD Paper

+ + + +
Document:P9999R0
Audience:SG1
+

Introduction

+

Body paragraph.

+ + \ No newline at end of file diff --git a/tomd/tests/fixtures/html/handwritten_address_sample.html b/tomd/tests/fixtures/html/handwritten_address_sample.html new file mode 100644 index 0000000..50e9748 --- /dev/null +++ b/tomd/tests/fixtures/html/handwritten_address_sample.html @@ -0,0 +1,15 @@ + + +Handwritten Sample + +
+Document Number: P9999R0
+Date: 2026-03-15
+Audience: SG1
+Alice Author +
+

Test Handwritten Paper

+

Introduction

+

Body paragraph.

+ + \ No newline at end of file diff --git a/tomd/tests/fixtures/html/handwritten_table_sample.html b/tomd/tests/fixtures/html/handwritten_table_sample.html new file mode 100644 index 0000000..f2050af --- /dev/null +++ b/tomd/tests/fixtures/html/handwritten_table_sample.html @@ -0,0 +1,15 @@ + + +Handwritten Table Sample + + + + + + +
Document Number:P9998R0
Date:2026-02-01
Audience:EWG
Reply-to:Bob Sample
+

Another Handwritten Paper

+

Scope

+

Body paragraph.

+ + \ No newline at end of file diff --git a/tomd/tests/test_header_footer.py b/tomd/tests/test_header_footer.py new file mode 100644 index 0000000..8e879ac --- /dev/null +++ b/tomd/tests/test_header_footer.py @@ -0,0 +1,184 @@ + +"""Tests for header/footer detection and stripping in lib.pdf.cleanup.""" +from lib.pdf.cleanup import get_edge_items, detect_repeating, strip_repeating +from lib.pdf.types import ( + Block, Line, Span, PageEdgeItem, Y_TOLERANCE, EDGE_ITEMS_PER_PAGE, +) + + +def _make_line(text, y, page_num=0, x0=50.0, x1=550.0): + """Construct a Line with a single span at the given y position.""" + return Line( + spans=[Span(text=text, font_name="Body", font_size=11.0)], + bbox=(x0, y, x1, y + 12.0), + page_num=page_num, + ) + + +def _make_block_at_y(lines_data, page_num=0): + """Build a Block whose bbox spans its lines.""" + lines = [_make_line(t, y, page_num=page_num) for t, y in lines_data] + ys = [ln.bbox[1] for ln in lines] + y2s = [ln.bbox[3] for ln in lines] + return Block( + lines=lines, + bbox=(50.0, min(ys), 550.0, max(y2s)), + page_num=page_num, + ) + + +# ---- get_edge_items ------------------------------------------------------ + +def test_edge_items_picks_top_and_bottom(): + """Top 3 and bottom 3 by y-coordinate, with no dedup needed when texts differ.""" + # Lines at y = 30, 60, 90 (top half) and 500, 540, 580 (bottom half). + block_top = _make_block_at_y([("Header A", 30), ("Header B", 60), ("Header C", 90)]) + block_body = _make_block_at_y([("body line", 300)]) + block_bot = _make_block_at_y([("Footer X", 500), ("Footer Y", 540), ("Footer Z", 580)]) + items = get_edge_items([block_top, block_body, block_bot], page_num=1, page_height=600) + texts = [it.text for it in items] + # Top 3 by y (ascending): Header A/B/C. Bottom 3 by y (largest): Footer X/Y/Z + # (and possibly body line too — the function takes items[:3] and items[-3:]). 
+ assert "Header A" in texts + assert "Header B" in texts + assert "Header C" in texts + assert "Footer X" in texts or "Footer Y" in texts # bottom range + assert "Footer Z" in texts + + +def test_edge_items_dedups_same_text_same_y(): + """Duplicate (text, y) pairs collapse to a single edge item.""" + # Two blocks contribute lines with identical text at identical y — dedup. + b1 = _make_block_at_y([("Page 1", 30)]) + b2 = _make_block_at_y([("Page 1", 30)]) + items = get_edge_items([b1, b2], page_num=1, page_height=600) + texts = [it.text for it in items] + assert texts.count("Page 1") == 1 + + +def test_edge_items_empty_page(): + assert get_edge_items([], page_num=1, page_height=600) == [] + + +def test_edge_items_skips_blank_lines(): + """Lines whose text is empty after strip are not edge items.""" + b = _make_block_at_y([(" ", 30), ("Real header", 60)]) + items = get_edge_items([b], page_num=1, page_height=600) + texts = [it.text for it in items] + assert texts == ["Real header"] + + +def test_edge_items_limits_per_page(): + """No more than EDGE_ITEMS_PER_PAGE top + EDGE_ITEMS_PER_PAGE bottom.""" + # 10 lines spread across y. + lines_data = [(f"line {i}", 20.0 + i * 30) for i in range(10)] + b = _make_block_at_y(lines_data) + items = get_edge_items([b], page_num=1, page_height=600) + # Top 3 + bottom 3 = 6; dedup only when keys collide. + assert len(items) <= 2 * EDGE_ITEMS_PER_PAGE + + +# ---- detect_repeating ---------------------------------------------------- + +def test_detect_repeating_exact_text(): + """Same text at same y on >=50% of pages is classified as repeating.""" + # 5 pages, 4 of them have "Running Head" at y=30. 
+ all_edges = [ + [PageEdgeItem(text="Running Head", y=30.0, page_num=pg, bbox=(0, 30, 100, 42))] + for pg in range(1, 5) + ] + [ + [PageEdgeItem(text="Unique Title", y=30.0, page_num=5, bbox=(0, 30, 100, 42))], + ] + result = detect_repeating(all_edges, total_pages=5) + # y_bucket = round(30 / Y_TOLERANCE) * Y_TOLERANCE = 30.0 (Y_TOLERANCE is 2.0) + assert (30.0, "Running Head") in result + + +def test_detect_repeating_skips_below_threshold(): + """If an item appears on fewer than threshold pages, it's not repeating.""" + # 5 pages, only 1 has "Not Repeating" at y=30. + all_edges = [ + [PageEdgeItem(text="Not Repeating", y=30.0, page_num=1, bbox=(0, 30, 100, 42))], + [], [], [], [], + ] + result = detect_repeating(all_edges, total_pages=5) + assert not any(p == "Not Repeating" for _, p in result) + + +def test_detect_repeating_short_doc_returns_empty(): + """Fewer than 3 pages -> empty set (threshold 0.5 of 2 = 1, but function + requires total_pages >= 3).""" + all_edges = [[PageEdgeItem(text="Header", y=30.0, page_num=1, bbox=(0, 30, 100, 42))]] + assert detect_repeating(all_edges, total_pages=1) == set() + assert detect_repeating(all_edges * 2, total_pages=2) == set() + + +def test_detect_repeating_page_number_pattern(): + """Different page numbers at same y are classified as __PAGE_NUM__.""" + all_edges = [ + [PageEdgeItem(text=str(pg), y=580.0, page_num=pg, bbox=(270, 580, 290, 592))] + for pg in range(1, 6) + ] + result = detect_repeating(all_edges, total_pages=5) + # y_bucket = round(580 / 2) * 2 = 580.0 + assert (580.0, "__PAGE_NUM__") in result + + +def test_detect_repeating_doc_number_pattern(): + """Running doc number at same y across pages is classified as __DOC_NUM__.""" + # Same paper, revision number varies page-by-page — not realistic, but exercises + # the path. detect_repeating runs its exact-text check only after the page-number + # and doc-number pattern checks.
Use revisions that differ: + docs = ["P1234R0", "P1234R1", "P1234R0", "P1234R2"] + all_edges = [ + [PageEdgeItem(text=docs[i], y=30.0, page_num=i + 1, bbox=(0, 30, 100, 42))] + for i in range(4) + ] + result = detect_repeating(all_edges, total_pages=4) + assert (30.0, "__DOC_NUM__") in result + + +# ---- strip_repeating ----------------------------------------------------- + +def test_strip_repeating_removes_exact_match(): + """A line whose (y, text) matches a repeating entry is removed.""" + b = _make_block_at_y([("Running Head", 30), ("Real content", 60)]) + repeating = {(36.0, "Running Head")} + result = strip_repeating([b], repeating) + assert len(result) == 1 + texts = [ln.text for ln in result[0].lines] + assert "Real content" in texts + assert "Running Head" not in texts + + +def test_strip_repeating_removes_page_numbers(): + """A line matching PAGE_NUM_RE at the repeating y-bucket is removed.""" + b = _make_block_at_y([("42", 580), ("Body line", 300)]) + repeating = {(586.0, "__PAGE_NUM__")} + result = strip_repeating([b], repeating) + texts = [ln.text for ln in result[0].lines] + assert "42" not in texts + assert "Body line" in texts + + +def test_strip_repeating_y_tolerance(): + """Lines whose y differs by <= Y_TOLERANCE from the repeating bucket are stripped.""" + b = _make_block_at_y([("Running Head", 31)]) # y=31, tolerance 2.0 around 30 + repeating = {(36.0, "Running Head")} + result = strip_repeating([b], repeating) + # Block may be dropped if it becomes empty. 
+ assert not result or not any("Running Head" in ln.text for blk in result for ln in blk.lines) + + +def test_strip_repeating_drops_empty_blocks(): + """A block with all lines stripped is omitted from the output.""" + b = _make_block_at_y([("Running Head", 30)]) + repeating = {(36.0, "Running Head")} + result = strip_repeating([b], repeating) + assert result == [] + + +def test_strip_repeating_empty_input(): + """Empty repeating set returns blocks unchanged.""" + b = _make_block_at_y([("content", 300)]) + assert strip_repeating([b], set()) == [b] \ No newline at end of file diff --git a/tomd/tests/test_hidden_regions.py b/tomd/tests/test_hidden_regions.py new file mode 100644 index 0000000..5db00f1 --- /dev/null +++ b/tomd/tests/test_hidden_regions.py @@ -0,0 +1,158 @@ + +"""Tests for hidden region detection and stripping in lib.pdf.cleanup.""" +from unittest.mock import MagicMock + +from lib.pdf.cleanup import find_hidden_regions, strip_hidden_blocks +from lib.pdf.types import Block, Line, Span + + +# ---- find_hidden_regions ------------------------------------------------- + +def _span_record(font, color, char_bboxes, span_type=1): + """Build a texttrace-shaped span record.""" + return { + "type": span_type, + "font": font, + "color": color, + "chars": [(None, None, None, bb) for bb in char_bboxes], + } + + +def test_find_hidden_regions_no_body_fonts_returns_empty(): + """When body_fonts is None, the function short-circuits.""" + page = MagicMock() + page.get_texttrace.return_value = [ + _span_record("Roboto", 0x808080, [(10, 10, 20, 20)]) + ] + assert find_hidden_regions(page, body_fonts=None) == set() + + +def test_find_hidden_regions_roboto_non_black(): + """Roboto font + non-black color + not-in-body-fonts -> hidden.""" + page = MagicMock() + page.get_texttrace.return_value = [ + _span_record("Roboto", 0x808080, [(10, 10, 20, 20)]) + ] + result = find_hidden_regions(page, body_fonts={"cambria"}) + assert (10, 10, 20, 20) in result + + +def 
test_find_hidden_regions_google_font(): + """Google-prefixed font triggers detection.""" + page = MagicMock() + page.get_texttrace.return_value = [ + _span_record("GoogleSans-Regular", 0x808080, [(10, 10, 20, 20)]) + ] + result = find_hidden_regions(page, body_fonts={"cambria"}) + assert (10, 10, 20, 20) in result + + +def test_find_hidden_regions_material_font(): + """Material UI font triggers detection.""" + page = MagicMock() + page.get_texttrace.return_value = [ + _span_record("MaterialIcons", 0x808080, [(10, 10, 20, 20)]) + ] + result = find_hidden_regions(page, body_fonts={"cambria"}) + assert (10, 10, 20, 20) in result + + +def test_find_hidden_regions_body_font_not_hidden(): + """A font that IS in body_fonts is not classified as hidden, even if + it coincidentally matches a widget keyword.""" + page = MagicMock() + page.get_texttrace.return_value = [ + _span_record("Roboto", 0x808080, [(10, 10, 20, 20)]) + ] + result = find_hidden_regions(page, body_fonts={"roboto"}) + assert result == set() + + +def test_find_hidden_regions_black_color_not_hidden(): + """Non-body-font, Roboto, but BLACK color -> not hidden (rule: non-black).""" + page = MagicMock() + page.get_texttrace.return_value = [ + _span_record("Roboto", 0, [(10, 10, 20, 20)]) + ] + result = find_hidden_regions(page, body_fonts={"cambria"}) + assert result == set() + + +def test_find_hidden_regions_black_tuple_color_not_hidden(): + """The (0, 0, 0) tuple form of black is also recognized.""" + page = MagicMock() + page.get_texttrace.return_value = [ + _span_record("Roboto", (0, 0, 0), [(10, 10, 20, 20)]) + ] + result = find_hidden_regions(page, body_fonts={"cambria"}) + assert result == set() + + +def test_find_hidden_regions_non_widget_font_not_hidden(): + """A non-body font that isn't Roboto/Google/Material is left alone.""" + page = MagicMock() + page.get_texttrace.return_value = [ + _span_record("SomeOtherFont", 0x808080, [(10, 10, 20, 20)]) + ] + result = find_hidden_regions(page, 
body_fonts={"cambria"}) + assert result == set() + + +def test_find_hidden_regions_mode_3_skipped(): + """Rendering mode 3 (invisible text) is explicitly ignored by the function.""" + page = MagicMock() + page.get_texttrace.return_value = [ + _span_record("Roboto", 0x808080, [(10, 10, 20, 20)], span_type=3), + ] + result = find_hidden_regions(page, body_fonts={"cambria"}) + assert result == set() + + +# ---- strip_hidden_blocks ------------------------------------------------- + +def _make_block(text, x0, y0, x1, y1): + span = Span( + text=text, font_name="Body", font_size=11.0, + bbox=(x0, y0, x1, y1), + ) + line = Line(spans=[span], bbox=(x0, y0, x1, y1)) + return Block(lines=[line], bbox=(x0, y0, x1, y1)) + + +def test_strip_hidden_blocks_empty_hidden_set_returns_input(): + """No hidden bboxes -> input blocks returned unchanged.""" + block = _make_block("visible text", 10, 10, 100, 30) + assert strip_hidden_blocks([block], set()) == [block] + + +def test_strip_hidden_blocks_drops_block_entirely_in_hidden(): + """A block whose only span overlaps a hidden bbox is dropped.""" + block = _make_block("widget text", 10, 10, 100, 30) + hidden = {(5.0, 5.0, 150.0, 50.0)} # engulfs the block + result = strip_hidden_blocks([block], hidden) + assert result == [] + + +def test_strip_hidden_blocks_keeps_block_outside_hidden(): + """A block whose span is outside all hidden bboxes survives.""" + block = _make_block("body text", 10, 300, 100, 320) + hidden = {(5.0, 5.0, 150.0, 50.0)} # hidden at y=5..50; block at y=300 untouched + result = strip_hidden_blocks([block], hidden) + assert result == [block] + + +def test_strip_hidden_blocks_keeps_block_with_any_visible_span(): + """A block with one hidden span and one visible span is kept.""" + hidden_span = Span( + text="widget", font_name="Roboto", font_size=11.0, + bbox=(10, 10, 50, 30), + ) + visible_span = Span( + text="content", font_name="Body", font_size=11.0, + bbox=(60, 10, 150, 30), + ) + line = Line(spans=[hidden_span, 
visible_span], bbox=(10, 10, 150, 30)) + block = Block(lines=[line], bbox=(10, 10, 150, 30)) + hidden = {(5.0, 5.0, 55.0, 35.0)} # covers hidden_span only + result = strip_hidden_blocks([block], hidden) + assert result == [block] \ No newline at end of file diff --git a/tomd/tests/test_html_generators.py b/tomd/tests/test_html_generators.py new file mode 100644 index 0000000..5b290fd --- /dev/null +++ b/tomd/tests/test_html_generators.py @@ -0,0 +1,124 @@ + +"""Per-generator integration tests for the HTML converter.""" +from pathlib import Path + +from lib.html.extract import ( + parse_html, detect_generator, extract_metadata, strip_boilerplate, +) +from lib.html.render import render_body + + +FIXTURES = Path(__file__).parent / "fixtures" / "html" + + +def _load(name: str) -> str: + return (FIXTURES / name).read_text(encoding="utf-8") + + +# ---- Bikeshed ----------------------------------------------------------- + +def test_bikeshed_detection(): + soup = parse_html(_load("bikeshed_sample.html")) + assert detect_generator(soup) == "bikeshed" + + +def test_bikeshed_metadata_extraction(): + soup = parse_html(_load("bikeshed_sample.html")) + meta = extract_metadata(soup, "bikeshed") + assert meta.get("document") == "P9999R0" + assert meta.get("title") == "Test Bikeshed Paper" + assert meta.get("date") == "2026-03-15" + assert meta.get("audience") == "SG1" + reply_to = meta.get("reply-to", []) + assert any("editor@example.com" in entry for entry in reply_to) + + +def test_bikeshed_boilerplate_stripped(): + soup = parse_html(_load("bikeshed_sample.html")) + strip_boilerplate(soup, "bikeshed") + # h1.p-name and data-fill-with divs removed. + assert soup.find("h1", class_="p-name") is None + + +def test_bikeshed_body_renders(): + soup = parse_html(_load("bikeshed_sample.html")) + strip_boilerplate(soup, "bikeshed") + md = render_body(soup, "bikeshed") + assert "## Introduction" in md + assert "Body paragraph content." 
in md + + +# ---- HackMD ------------------------------------------------------------- + +def test_hackmd_detection(): + soup = parse_html(_load("hackmd_sample.html")) + assert detect_generator(soup) == "hackmd" + + +def test_hackmd_metadata_via_generic_fallback(): + """HackMD has no specific extractor; metadata comes through the generic path.""" + soup = parse_html(_load("hackmd_sample.html")) + meta = extract_metadata(soup, "hackmd") + # detect_generator returned "hackmd" but extract_metadata dispatches by argument; + # passing "hackmd" falls through to _extract_generic_metadata. + assert meta.get("title") == "P9999R0: Test HackMD Paper" + # The generic table scan should pick up document/audience from the <table>. + assert meta.get("document") == "P9999R0" + assert meta.get("audience") == "SG1" + + +def test_hackmd_body_renders(): + soup = parse_html(_load("hackmd_sample.html")) + strip_boilerplate(soup, "hackmd") + md = render_body(soup, "hackmd") + assert "## Introduction" in md + assert "Body paragraph." in md + + +# ---- Hand-written (address form) ---------------------------------------- + +def test_handwritten_address_detection(): + soup = parse_html(_load("handwritten_address_sample.html")) + assert detect_generator(soup) == "hand-written" + + +def test_handwritten_address_metadata(): + soup = parse_html(_load("handwritten_address_sample.html")) + meta = extract_metadata(soup, "hand-written") + assert meta.get("document") == "P9999R0" + assert meta.get("date") == "2026-03-15" + assert meta.get("audience") == "SG1" + reply_to = meta.get("reply-to", []) + assert any("alice@example.com" in entry for entry in reply_to) + + +def test_handwritten_address_boilerplate_stripped(): + soup = parse_html(_load("handwritten_address_sample.html")) + strip_boilerplate(soup, "hand-written") + # <address>
removed. + assert soup.find("address") is None + + +def test_handwritten_address_body_renders(): # rendered body must still contain the "## Introduction" heading after boilerplate stripping + soup = parse_html(_load("handwritten_address_sample.html")) + strip_boilerplate(soup, "hand-written") + md = render_body(soup, "hand-written") + assert "## Introduction" in md + + +# ---- Hand-written (table.header form) ----------------------------------- + +def test_handwritten_table_metadata(): # document, date, audience and reply-to are extracted from the table sample + soup = parse_html(_load("handwritten_table_sample.html")) + meta = extract_metadata(soup, "hand-written") + assert meta.get("document") == "P9998R0" + assert meta.get("date") == "2026-02-01" + assert meta.get("audience") == "EWG" + reply_to = meta.get("reply-to", []) + assert any("bob@example.com" in entry for entry in reply_to) + + +def test_handwritten_table_boilerplate_stripped(): # strip_boilerplate must leave no <table class="header"> in the soup + soup = parse_html(_load("handwritten_table_sample.html")) + strip_boilerplate(soup, "hand-written") + assert soup.find("table", class_="header") is None \ No newline at end of file diff --git a/tomd/tests/test_position_lists.py b/tomd/tests/test_position_lists.py new file mode 100644 index 0000000..c943919 --- /dev/null +++ b/tomd/tests/test_position_lists.py @@ -0,0 +1,142 @@ + +"""Tests for position-based list detection in lib.pdf.structure.""" +from lib.pdf.structure import ( + _detect_lists_by_position, + _split_section_by_position, + _join_bullet_marker_lines, +) +from lib.pdf.types import Section, SectionKind, Line, Span, Block + + +def _bullet_line(text, x0, y=100.0): + """Construct a single-span Line whose bbox starts at (x0, y) and is 300 wide, 12 tall. + + The helper adds no marker itself: text must already start with a bullet character so _line_starts_with_bullet + returns True.
+ """ + span = Span(text=text, font_name="Body", font_size=11.0, + bbox=(x0, y, x0 + 300, y + 12)) + return Line( + spans=[span], + bbox=(x0, y, x0 + 300, y + 12), + ) + + +def _body_line(text, x0=50.0, y=100.0): + """Construct a single-span Line like _bullet_line, but with x0 defaulting to 50.0 (the body margin used in these tests).""" + span = Span(text=text, font_name="Body", font_size=11.0, + bbox=(x0, y, x0 + 300, y + 12)) + return Line( + spans=[span], + bbox=(x0, y, x0 + 300, y + 12), + ) + + +def _make_section_with_lines(lines, kind=SectionKind.PARAGRAPH, font_size=11.0): # Section on page 0 whose text is the lines' .text joined by newlines + text = "\n".join(ln.text for ln in lines) + return Section( + kind=kind, + text=text, + lines=lines, + page_num=0, + font_size=font_size, + ) + + +# ---- _detect_lists_by_position ------------------------------------------- + +def test_detect_preserves_non_paragraph_sections(): + """A non-PARAGRAPH section passes through unchanged (only HEADING is exercised here).""" + heading = _make_section_with_lines([_body_line("Heading")], kind=SectionKind.HEADING) + result = _detect_lists_by_position([heading]) + assert result == [heading] + + +def test_detect_paragraph_without_bullets_unchanged(): + """A paragraph with no indented bullets stays a single PARAGRAPH section.""" + para = _make_section_with_lines([ + _body_line("First paragraph line."), + _body_line("Second paragraph line."), + ]) + result = _detect_lists_by_position([para]) + assert len(result) == 1 + assert result[0].kind == SectionKind.PARAGRAPH + + +def test_detect_converts_indented_bullets_to_list(): + """Indented lines starting with bullets produce at least one LIST section.""" + # Body margin needs establishing via the first non-bullet line, or via + # _get_body_margin's heuristic. Use x=50 for body and x=80 for indented bullets.
+ para = _make_section_with_lines([ + _body_line("Introduction paragraph.", x0=50.0), + _bullet_line("\u2022 first item", x0=80.0, y=114.0), + _bullet_line("\u2022 second item", x0=80.0, y=128.0), + ]) + result = _detect_lists_by_position([para]) + kinds = [sec.kind for sec in result] + # Expect at least one LIST section in the result. + assert SectionKind.LIST in kinds, f"got kinds={kinds}" + + +def test_detect_mixed_list_and_paragraph_split(): + """A body-margin line between bullet lines yields both LIST and PARAGRAPH sections.""" + para = _make_section_with_lines([ + _bullet_line("\u2022 first bullet", x0=80.0, y=100.0), + _body_line("Interstitial body text.", x0=50.0, y=114.0), + _bullet_line("\u2022 second bullet", x0=80.0, y=128.0), + ]) + result = _detect_lists_by_position([para]) + kinds = [sec.kind for sec in result] + # Ideally LIST, PARAGRAPH, LIST; only the presence of both kinds is asserted, not their order. + assert SectionKind.LIST in kinds + assert SectionKind.PARAGRAPH in kinds + + +# ---- indent level tracking (exercised via _detect_lists_by_position) ----- + +def test_split_section_by_position_nested_indent(): + """With a bullet indented further right than its sibling, at least one LIST section has indent_level > 0.""" + # The non-bullet line at x=50 is there to establish the body margin (presumably the leftmost frequent x; see _get_body_margin). + # Go through _detect_lists_by_position rather than calling _split_section_by_position directly. + para = _make_section_with_lines([ + _body_line("Paragraph body.", x0=50.0, y=100.0), + _bullet_line("\u2022 outer", x0=80.0, y=114.0), + _bullet_line("\u2022 nested", x0=120.0, y=128.0), # further right than outer + ]) + result = _detect_lists_by_position([para]) + list_sections = [s for s in result if s.kind == SectionKind.LIST] + indent_levels = [s.indent_level for s in list_sections] + # At least one section should be at indent_level > 0.
+ assert any(i > 0 for i in indent_levels), f"got indent_levels={indent_levels}" + + +# ---- _join_bullet_marker_lines ------------------------------------------- + +def test_join_bullet_marker_merges_bullet_and_text_lines(): + """When a line is just a bullet char and the next is its text, merge them.""" + bullet_span = Span(text="\u2022", font_name="Body", font_size=11.0, + bbox=(50, 100, 58, 112)) + text_span = Span(text="item text", font_name="Body", font_size=11.0, + bbox=(68, 100, 200, 112)) + bullet_line = Line(spans=[bullet_span], bbox=(50, 100, 58, 112)) + text_line = Line(spans=[text_span], bbox=(68, 100, 200, 112)) + result = _join_bullet_marker_lines([bullet_line, text_line]) + # After joining, expect a single Line whose text starts with the bullet and contains the item text. + assert len(result) == 1 + combined_text = result[0].text + assert combined_text.startswith("\u2022") + assert "item text" in combined_text + + +def test_join_bullet_marker_leaves_non_bullet_pairs_alone(): + """Two adjacent normal lines are not merged.""" + l1 = _body_line("line one") + l2 = _body_line("line two") + result = _join_bullet_marker_lines([l1, l2]) + assert len(result) == 2 + + +def test_join_bullet_marker_handles_single_line(): + """A single-line input is returned unchanged (the empty list is not exercised).""" + l1 = _body_line("solo line") + assert _join_bullet_marker_lines([l1]) == [l1] \ No newline at end of file