From b8ba5d2385bcd9351e648fe689c7bda78063ade4 Mon Sep 17 00:00:00 2001
From: Greg Kaleka <greg@gregkaleka.com>
Date: Wed, 15 Apr 2026 14:13:13 -0400
Subject: [PATCH 01/14] tomd: ignore .venv/

---
 tomd/.gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tomd/.gitignore b/tomd/.gitignore
index d4db843..b2ae8d7 100644
--- a/tomd/.gitignore
+++ b/tomd/.gitignore
@@ -3,3 +3,4 @@
 *.pyc
 .out/
 papers/
+.venv/

From 74a4ac8d247efd5422107e26fde118bc0458b2d4 Mon Sep 17 00:00:00 2001
From: Greg Kaleka <greg@gregkaleka.com>
Date: Wed, 15 Apr 2026 14:13:30 -0400
Subject: [PATCH 02/14] tomd: sort spatial characters by y-band then x for
 deterministic reading order

---
 tomd/lib/pdf/ARCHITECTURE.md |  2 +-
 tomd/tests/test_extract.py   | 46 ++++++++++++++++++++++++++++++++++++
 2 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/tomd/lib/pdf/ARCHITECTURE.md b/tomd/lib/pdf/ARCHITECTURE.md
index ee562d2..ab00732 100644
--- a/tomd/lib/pdf/ARCHITECTURE.md
+++ b/tomd/lib/pdf/ARCHITECTURE.md
@@ -64,7 +64,7 @@ Enums:
   - `dy > avg_fs * 1.8` -> line break
   - `dy > avg_fs * 0.3` -> line break
   - `dx > avg_fs * 0.3` -> word break (insert space)
-- Characters sorted by y-band (half font height) with stable sort preserving document order within each band
+- Characters sorted by y-band (half font height) then x-position, giving deterministic reading order within each band
 
 **T3. Monospace classification (4 signals)**
 - `mono.py:classify_monospace`
diff --git a/tomd/tests/test_extract.py b/tomd/tests/test_extract.py
index 5b8eaab..51aa5cf 100644
--- a/tomd/tests/test_extract.py
+++ b/tomd/tests/test_extract.py
@@ -36,6 +36,35 @@ def _make_page(chars_by_span):
     return page
 
 
+def _make_page_with_blocks(block_char_order):
+    """Mock a fitz page whose rawdict iterates blocks in the given order.
+
+    block_char_order is a list of lists of (c, x, y) tuples. Each outer
+    list element becomes one block, iterated in that order.
+    """
+    blocks = []
+    for chars in block_char_order:
+        block_chars = [
+            {"c": c, "bbox": (x, y, x + 5, y + 10), "origin": (x, y + 10)}
+            for c, x, y in chars
+        ]
+        blocks.append({
+            "type": 0,
+            "lines": [{
+                "spans": [{
+                    "font": "TestFont",
+                    "size": 10.0,
+                    "flags": 0,
+                    "color": 0,
+                    "chars": block_chars,
+                }],
+            }],
+        })
+    page = MagicMock()
+    page.get_text.return_value = {"blocks": blocks}
+    return page
+
+
 def test_spatial_sorts_by_x_within_same_y():
     """Chars at the same y but reversed x-order should come out left-to-right."""
     page = _make_page([
@@ -49,6 +78,23 @@ def test_spatial_sorts_by_x_within_same_y():
     assert full_text.index("A") < full_text.index("B")
 
 
+def test_extract_spatial_sorts_across_blocks_in_y_band():
+    """Two blocks at the same y with reversed x ranges must be merged
+    in left-to-right reading order regardless of rawdict iteration order.
+    """
+    # Block B is iterated first but sits to the right of block A.
+    page = _make_page_with_blocks([
+        [("R", 300, 100), ("I", 310, 100), ("G", 320, 100), ("H", 330, 100), ("T", 340, 100)],
+        [("L", 50, 100), ("E", 60, 100), ("F", 70, 100), ("T", 80, 100)],
+    ])
+    blocks = extract_spatial(page, page_num=0)
+    text = "".join(
+        span.text for block in blocks for line in block.lines for span in line.spans
+    )
+    # The left block's characters must come first in the output.
+    assert text.index("L") < text.index("R"), f"got text={text!r}"
+
+
 def _make_block_with_span(text, bbox):
     span = Span(text=text, bbox=bbox)
     line = Line(spans=[span])

From eb454f9d8fde6fab225c1f45c4ddcc47c46fa02e Mon Sep 17 00:00:00 2001
From: Greg Kaleka <greg@gregkaleka.com>
Date: Wed, 15 Apr 2026 14:13:42 -0400
Subject: [PATCH 03/14] tomd: short-circuit similar() on identical strings
 regardless of length

---
 tomd/lib/similarity.py        | 2 ++
 tomd/tests/test_similarity.py | 6 +++---
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/tomd/lib/similarity.py b/tomd/lib/similarity.py
index b6b4d9f..2de0772 100644
--- a/tomd/lib/similarity.py
+++ b/tomd/lib/similarity.py
@@ -51,6 +51,8 @@ def similar(a: str, b: str) -> bool:
 
     The per-string check is lenient because the caller (TOC detection)
     provides a second guard via the 3+ consecutive run requirement.
+    Identical strings short-circuit to True regardless of length; the
+    length gate (_MAX_COMPARE_LENGTH) only guards the costly fuzzy compare.
     """
     if a == b:
         return True
diff --git a/tomd/tests/test_similarity.py b/tomd/tests/test_similarity.py
index ec69e7c..c7d5481 100644
--- a/tomd/tests/test_similarity.py
+++ b/tomd/tests/test_similarity.py
@@ -1,6 +1,6 @@
 """Tests for lib.similarity."""
 
-from lib.similarity import similar
+from lib.similarity import _MAX_COMPARE_LENGTH, similar
 
 
 def test_similar_identical():
@@ -24,11 +24,11 @@ def test_similar_one_empty():
 
 
 def test_similar_circuit_breaker():
-    assert not similar("a" * 201, "b" * 201)
+    assert not similar("a" * (_MAX_COMPARE_LENGTH + 1), "b" * (_MAX_COMPARE_LENGTH + 1))
 
 
 def test_similar_long_identical():
-    assert similar("a" * 250, "a" * 250)
+    assert similar("a" * (_MAX_COMPARE_LENGTH + 50), "a" * (_MAX_COMPARE_LENGTH + 50))
 
 
 def test_similar_short_identical():

From 2a3daf72feaa5c72145af40fa242b71016a39b9f Mon Sep 17 00:00:00 2001
From: Greg Kaleka <greg@gregkaleka.com>
Date: Wed, 15 Apr 2026 14:14:06 -0400
Subject: [PATCH 04/14] tomd: fix Block.font_size docstring and stop mutating
 input in _extract_metadata

---
 tomd/tests/test_structure.py | 45 +++++++++++++++++++++++++++++++++++-
 1 file changed, 44 insertions(+), 1 deletion(-)

diff --git a/tomd/tests/test_structure.py b/tomd/tests/test_structure.py
index 155acee..219c6c4 100644
--- a/tomd/tests/test_structure.py
+++ b/tomd/tests/test_structure.py
@@ -6,7 +6,7 @@
 )
 from lib.pdf.structure import (
     compare_extractions, structure_sections,
-    heading_confidence,
+    heading_confidence, _extract_metadata,
 )
 
 
@@ -215,3 +215,46 @@ def test_document_key_not_doc_number(self):
         sec = make_section("Document Number: P9999R2")
         meta, _ = structure_sections([sec], has_title=True)
         assert "doc-number" not in meta
+
+
+class TestExtractMetadataMutation:
+    def test_does_not_mutate_input_sections(self):
+        """Regression: _extract_metadata must not mutate its input Sections.
+
+        Callers rely on helpers in this module producing new objects,
+        consistent with _merge_paragraphs.
+        """
+        sec = make_section(
+            "Document Number: P1234R0\nSome leftover\nBody content",
+            kind=SectionKind.PARAGRAPH,
+        )
+        original_text = sec.text
+        _extract_metadata([sec])
+        assert sec.text == original_text
+
+    def test_returns_stripped_section_copy(self):
+        """The returned section has the metadata lines removed."""
+        sec = make_section(
+            "Document Number: P1234R0\nSome leftover",
+            kind=SectionKind.PARAGRAPH,
+        )
+        meta, remaining = _extract_metadata([sec])
+        assert meta.get("document") == "P1234R0"
+        assert len(remaining) == 1
+        assert "Document Number" not in remaining[0].text
+        assert "Some leftover" in remaining[0].text
+
+
+class TestBlockFontSize:
+    def test_line_count_voting(self):
+        """Block.font_size uses line-count voting, not character weighting."""
+        from conftest import make_span
+        block = Block(lines=[
+            Line(spans=[make_span(
+                "word word word word word word word word", font_size=11.0)]),
+            Line(spans=[make_span("short", font_size=14.0)]),
+            Line(spans=[make_span("short", font_size=14.0)]),
+        ])
+        # Lines: two at 14, one at 11 -> 14 wins by line count.
+        # Character count would favor 11.
+        assert block.font_size == 14.0

From 3236b8a2dd63e8d096e5ebd2ea3d9c6c49ccd312 Mon Sep 17 00:00:00 2001
From: Greg Kaleka <greg@gregkaleka.com>
Date: Wed, 15 Apr 2026 14:14:15 -0400
Subject: [PATCH 05/14] tomd: detach HTML sublists before inline capture to fix
 nested-list duplication

---
 tomd/lib/html/render.py        |  8 +++--
 tomd/tests/test_html_render.py | 57 ++++++++++++++++++++++++++++++++++
 2 files changed, 63 insertions(+), 2 deletions(-)

diff --git a/tomd/lib/html/render.py b/tomd/lib/html/render.py
index 4235002..be1bb05 100644
--- a/tomd/lib/html/render.py
+++ b/tomd/lib/html/render.py
@@ -8,6 +8,7 @@
 from .. import strip_format_chars, SECTION_NUM_PREFIX_RE, ALLOWED_LINK_SCHEMES
 
 _HEADING_TAGS = frozenset({"h1", "h2", "h3", "h4", "h5", "h6"})
+_LIST_CONTAINER_TAGS = frozenset({"ul", "ol"})
 
 
 def render_body(soup: BeautifulSoup, generator: str) -> str:
@@ -203,13 +204,16 @@ def _render_list(el: Tag, marker: str, generator: str) -> str | None:
     items = []
     for i, li in enumerate(el.find_all("li", recursive=False)):
         prefix = f"{i + 1}." if marker == "1." else "-"
+        # Detach nested sublists before capturing inline text so they are not
+        # walked into by _inline_text (which would duplicate their contents).
+        subs = [sub.extract()
+                for sub in li.find_all(_LIST_CONTAINER_TAGS, recursive=False)]
         nested_parts = []
-        for sub in li.find_all(["ul", "ol"], recursive=False):
+        for sub in subs:
             sub_rendered = _render_element(sub, generator)
             if sub_rendered:
                 indented = "\n".join("  " + line for line in sub_rendered.split("\n"))
                 nested_parts.append(indented)
-            sub.extract()
 
         text = _collapse_whitespace(_inline_text(li))
         if text:
diff --git a/tomd/tests/test_html_render.py b/tomd/tests/test_html_render.py
index c26486c..708e67c 100644
--- a/tomd/tests/test_html_render.py
+++ b/tomd/tests/test_html_render.py
@@ -113,6 +113,63 @@ def test_nested(self):
         parent_line = next(l for l in lines if "Parent" in l)
         assert "Child" not in parent_line
         assert "  - Child" in md
+        assert md.count("Child") == 1, (
+            f"Child appears {md.count('Child')} times, expected 1. md={md!r}")
+        assert md.count("Parent") == 1
+
+    def test_nested_three_levels(self):
+        soup = parse_html("""
+        <ul>
+          <li>One
+            <ul>
+              <li>Two
+                <ul><li>Three</li></ul>
+              </li>
+            </ul>
+          </li>
+        </ul>
+        """)
+        md = render_body(soup, "mpark")
+        assert md.count("One") == 1
+        assert md.count("Two") == 1
+        assert md.count("Three") == 1
+        assert "- One" in md
+        assert "  - Two" in md
+        assert "    - Three" in md
+
+    def test_nested_ordered(self):
+        soup = parse_html("""
+        <ol>
+          <li>First
+            <ul><li>Bullet</li></ul>
+          </li>
+          <li>Second
+            <ol><li>Sub</li></ol>
+          </li>
+        </ol>
+        """)
+        md = render_body(soup, "mpark")
+        assert md.count("Bullet") == 1
+        assert md.count("Sub") == 1
+        assert "1. First" in md
+        assert "  - Bullet" in md
+        assert "2. Second" in md
+        assert "  1. Sub" in md
+
+    def test_nested_mixed_content(self):
+        soup = parse_html("""
+        <ul>
+          <li>Before <strong>emphasis</strong>
+            <ul><li>Nested</li></ul>
+            after text
+          </li>
+        </ul>
+        """)
+        md = render_body(soup, "mpark")
+        assert md.count("Nested") == 1
+        assert "Before" in md
+        assert "**emphasis**" in md
+        assert md.count("after text") == 1
 
     def test_nested_multi_level(self):
         soup = parse_html("""

From 065a1b867280b6e4fd28a17c84d117c96254679c Mon Sep 17 00:00:00 2001
From: Greg Kaleka <greg@gregkaleka.com>
Date: Wed, 15 Apr 2026 14:23:02 -0400
Subject: [PATCH 06/14] tomd: fix dehyphenation duplicating word when next line
 is single span

---
 tomd/tests/test_cleanup.py | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/tomd/tests/test_cleanup.py b/tomd/tests/test_cleanup.py
index 4a905df..d55951a 100644
--- a/tomd/tests/test_cleanup.py
+++ b/tomd/tests/test_cleanup.py
@@ -96,6 +96,37 @@ def test_cleanup_dehyphenates_no_hyphen():
     assert "world" in result[0].text
 
 
+def test_cleanup_dehyphenates_single_span_next_line():
+    """Regression: when the next line has one span entirely consumed by
+    dehyphenation, the consumed word must not remain as a duplicate."""
+    span1 = make_span("imple-")
+    span2 = make_span("mentation")
+    block = Block(lines=[Line(spans=[span1]), Line(spans=[span2])])
+    result = cleanup_text([block])
+    full_text = result[0].text
+    assert "implementation" in full_text
+    assert full_text.count("mentation") == 1, (
+        f"'mentation' appears {full_text.count('mentation')} times in {full_text!r}"
+    )
+
+
+def test_cleanup_dehyphenates_next_line_multi_span_consumed():
+    """When the next line has multiple spans and the first is fully consumed,
+    remaining spans must survive."""
+    span1 = make_span("imple-")
+    first_consumed = make_span("mentation")
+    remaining = make_span(" of things")
+    block = Block(lines=[
+        Line(spans=[span1]),
+        Line(spans=[first_consumed, remaining]),
+    ])
+    result = cleanup_text([block])
+    full_text = result[0].text
+    assert "implementation" in full_text
+    assert " of things" in full_text
+    assert full_text.count("mentation") == 1
+
+
 def test_cleanup_merges_cross_page():
     b1 = make_block(["Some text without terminal"], page_num=0)
     b2 = make_block(["continuation here"], page_num=1)

From bb785c197f6ca4bb967aed1152b7c1e2b8b33117 Mon Sep 17 00:00:00 2001
From: Greg Kaleka <greg@gregkaleka.com>
Date: Wed, 15 Apr 2026 16:25:10 -0400
Subject: [PATCH 07/14] tomd: package as installable project with pinned deps

Adds pyproject.toml, __init__.py, and a minimal README so `pip install -e tomd`
produces a working `tomd` console script. Pins pymupdf~=1.27 and
beautifulsoup4~=4.14 to protect against silent PyMuPDF API drift (get_texttrace,
get_text dict/rawdict, get_drawings). Switches main.py to relative imports so
tomd.main:main resolves under the console script; the legacy
`python tomd/main.py` invocation is dropped in favor of `tomd paper.pdf` or
`python -m tomd.main`.
---
 tomd/.gitignore        |  3 +++
 tomd/README.md         | 34 ++++++++++++++++++++++++++++++++++
 tomd/__init__.py       |  1 +
 tomd/main.py           | 14 ++++++++------
 tomd/pyproject.toml    | 32 ++++++++++++++++++++++++++++++++
 tomd/requirements.txt  | 12 ++++++++++--
 tomd/tests/test_cli.py | 20 ++++++++++++++++++++
 7 files changed, 108 insertions(+), 8 deletions(-)
 create mode 100644 tomd/README.md
 create mode 100644 tomd/__init__.py
 create mode 100644 tomd/pyproject.toml
 create mode 100644 tomd/tests/test_cli.py

diff --git a/tomd/.gitignore b/tomd/.gitignore
index b2ae8d7..20dbd4a 100644
--- a/tomd/.gitignore
+++ b/tomd/.gitignore
@@ -1,6 +1,9 @@
 **/__pycache__/
 **/.pytest_cache/
 *.pyc
+*.egg-info/
 .out/
 papers/
 .venv/
+build/
+dist/
diff --git a/tomd/README.md b/tomd/README.md
new file mode 100644
index 0000000..29ece89
--- /dev/null
+++ b/tomd/README.md
@@ -0,0 +1,34 @@
+# tomd
+
+Convert PDF and HTML papers to Markdown. Used to prepare WG21 inputs for
+the C++ Alliance paper pipeline.
+
+## Install
+
+From this directory:
+
+```
+pip install -e .
+```
+
+Installs the `tomd` console script and pins
+`pymupdf~=1.27` / `beautifulsoup4~=4.14`.
+
+## Usage
+
+```
+tomd paper.pdf                  # -> paper.md (+ paper.prompts.md if uncertain)
+tomd paper.html                 # -> paper.md
+tomd *.pdf *.html --outdir out/ # batch mode
+```
+
+Also runnable as `python -m tomd.main ...`.
+
+## Development
+
+Install test extras and run the suite:
+
+```
+pip install -e .[test]
+pytest tests/
+```
diff --git a/tomd/__init__.py b/tomd/__init__.py
new file mode 100644
index 0000000..ce03d3c
--- /dev/null
+++ b/tomd/__init__.py
@@ -0,0 +1 @@
+"""tomd - PDF and HTML to Markdown converter for WG21 papers."""
diff --git a/tomd/main.py b/tomd/main.py
index 4bf35f1..1326285 100644
--- a/tomd/main.py
+++ b/tomd/main.py
@@ -4,10 +4,12 @@
 PDF: hybrid dual extraction (MuPDF + spatial rules) with confidence scoring.
 HTML: DOM traversal with generator-specific metadata extraction.
 
-Usage:
-    python tomd/main.py input.pdf                  # -> input.md + input.prompts.md
-    python tomd/main.py input.html                 # -> input.md
-    python tomd/main.py *.pdf *.html --outdir out/ # batch mode
+Usage (after `pip install -e tomd`):
+    tomd input.pdf                  # -> input.md + input.prompts.md
+    tomd input.html                 # -> input.md
+    tomd *.pdf *.html --outdir out/ # batch mode
+
+Also runnable as `python -m tomd.main ...`.
 """
 
 import argparse
@@ -83,10 +85,10 @@ def main():
         try:
             ext = input_file.suffix.lower()
             if ext in _HTML_EXTENSIONS:
-                from lib.html import convert_html
+                from .lib.html import convert_html
                 md_text, prompts_text = convert_html(input_file)
             elif ext in _PDF_EXTENSIONS:
-                from lib.pdf import convert_pdf
+                from .lib.pdf import convert_pdf
                 md_text, prompts_text = convert_pdf(input_file)
             else:
                 print(f"SKIP: {input_file} unsupported format", file=sys.stderr)
diff --git a/tomd/pyproject.toml b/tomd/pyproject.toml
new file mode 100644
index 0000000..c754dc3
--- /dev/null
+++ b/tomd/pyproject.toml
@@ -0,0 +1,32 @@
+[build-system]
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "tomd"
+version = "0.1.0"
+description = "PDF and HTML to Markdown converter for WG21 papers."
+readme = "README.md"
+requires-python = ">=3.11"
+license = {text = "BSL-1.0"}
+authors = [
+  {name = "Vinnie Falco"},
+]
+dependencies = [
+  "pymupdf~=1.27",
+  "beautifulsoup4~=4.14",
+]
+
+[project.optional-dependencies]
+test = [
+  "pytest~=8.0",
+]
+
+[project.scripts]
+tomd = "tomd.main:main"
+
+[tool.setuptools]
+packages = ["tomd", "tomd.lib", "tomd.lib.pdf", "tomd.lib.html"]
+
+[tool.setuptools.package-dir]
+"tomd" = "."
diff --git a/tomd/requirements.txt b/tomd/requirements.txt
index 737641f..6eaae68 100644
--- a/tomd/requirements.txt
+++ b/tomd/requirements.txt
@@ -1,2 +1,10 @@
-pymupdf
-beautifulsoup4
+# Runtime dependencies for tomd. Pinned to compatible-release (~=) ranges
+# to protect against PyMuPDF API drift (tomd uses get_text "dict"/"rawdict",
+# get_texttrace, and get_drawings, any of which can shift between minor
+# versions) and to bound BeautifulSoup API changes.
+#
+# Bump these pins intentionally after running the full test suite against
+# the new version.
+
+pymupdf~=1.27
+beautifulsoup4~=4.14
diff --git a/tomd/tests/test_cli.py b/tomd/tests/test_cli.py
new file mode 100644
index 0000000..2fc5802
--- /dev/null
+++ b/tomd/tests/test_cli.py
@@ -0,0 +1,20 @@
+"""Smoke tests for the tomd CLI entry point."""
+
+import subprocess
+import sys
+
+import pytest
+
+
+def test_tomd_module_invokable_via_python():
+    """`python -m tomd.main --help` must succeed when tomd is installed."""
+    try:
+        import tomd.main  # noqa: F401
+    except ImportError:
+        pytest.skip("tomd not installed as a package (run `pip install -e .`)")
+    result = subprocess.run(
+        [sys.executable, "-m", "tomd.main", "--help"],
+        capture_output=True, text=True, check=False,
+    )
+    assert result.returncode == 0, (result.stdout, result.stderr)
+    assert "tomd" in result.stdout

From 23505fecf887f516567d75abffa8bd9a0ab0cac1 Mon Sep 17 00:00:00 2001
From: Greg Kaleka <greg@gregkaleka.com>
Date: Wed, 15 Apr 2026 16:55:12 -0400
Subject: [PATCH 08/14] tomd: skip html golden tests when papers/ corpus is
 absent

papers/ is gitignored, so fresh clones saw 7 AssertionError failures
before any conversion logic ran. Replace the file-existence assert with
pytest.skip() so the tests stay granular (per-stem) and only run when
their input HTML is present.
---
 tomd/tests/test_html_golden.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tomd/tests/test_html_golden.py b/tomd/tests/test_html_golden.py
index 0fc4086..0b9c84a 100644
--- a/tomd/tests/test_html_golden.py
+++ b/tomd/tests/test_html_golden.py
@@ -43,7 +43,8 @@ def _diff_head(actual: str, golden: str, limit: int = 120) -> str:
 @pytest.mark.parametrize("stem", _GOLDEN_STEMS)
 def test_convert_html_matches_golden(stem: str):
     html_path = _PAPERS / f"{stem}.html"
-    assert html_path.is_file(), f"missing paper HTML: {html_path}"
+    if not html_path.is_file():
+        pytest.skip(f"missing paper HTML: {html_path} (papers/ is gitignored)")
 
     md, prompts = convert_html(html_path)
     golden_md = _GOLDEN / f"{stem}.golden.md"

From f9b211c5e43218b95078264e79d0324cb053cb08 Mon Sep 17 00:00:00 2001
From: Greg Kaleka <greg@gregkaleka.com>
Date: Wed, 15 Apr 2026 17:06:31 -0400
Subject: [PATCH 09/14] tomd: add pytest CI workflow for Python 3.12 and 3.13

Adds .github/workflows/tomd-tests.yml with a pytest matrix scoped to
tomd/** path changes. Bumps requires-python to >=3.12 to match the
tested range.
---
 .github/workflows/tomd-tests.yml | 40 ++++++++++++++++++++++++++++++++
 tomd/pyproject.toml              |  2 +-
 2 files changed, 41 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/tomd-tests.yml

diff --git a/.github/workflows/tomd-tests.yml b/.github/workflows/tomd-tests.yml
new file mode 100644
index 0000000..239db26
--- /dev/null
+++ b/.github/workflows/tomd-tests.yml
@@ -0,0 +1,40 @@
+name: tomd tests
+
+on:
+  push:
+    branches: [master]
+    paths:
+      - "tomd/**"
+      - ".github/workflows/tomd-tests.yml"
+  pull_request:
+    paths:
+      - "tomd/**"
+      - ".github/workflows/tomd-tests.yml"
+  workflow_dispatch:
+
+jobs:
+  test:
+    name: pytest (Python ${{ matrix.python-version }})
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.12", "3.13"]
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+          cache: "pip"
+          cache-dependency-path: tomd/requirements.txt
+
+      - name: Install tomd with test extras
+        run: |
+          python -m pip install --upgrade pip
+          pip install -e tomd[test]
+
+      - name: Run pytest
+        working-directory: tomd
+        run: python -m pytest tests/ -v --tb=short
diff --git a/tomd/pyproject.toml b/tomd/pyproject.toml
index c754dc3..6f53e09 100644
--- a/tomd/pyproject.toml
+++ b/tomd/pyproject.toml
@@ -7,7 +7,7 @@ name = "tomd"
 version = "0.1.0"
 description = "PDF and HTML to Markdown converter for WG21 papers."
 readme = "README.md"
-requires-python = ">=3.11"
+requires-python = ">=3.12"
 license = {text = "BSL-1.0"}
 authors = [
   {name = "Vinnie Falco"},

From 25bb253017bc78cfb51f188318693e998e017608 Mon Sep 17 00:00:00 2001
From: Greg Kaleka <greg@gregkaleka.com>
Date: Wed, 15 Apr 2026 17:20:18 -0400
Subject: [PATCH 10/14] tomd: add LICENSE and expand user-facing README

BSL-1.0 LICENSE file matches the license already declared in
pyproject.toml; README grows from an install-only stub to cover usage,
output, uncertain-region markers, limitations, design-doc links, and
development. Closes issues/15.
---
 tomd/LICENSE   | 25 ++++++++++++++++++
 tomd/README.md | 70 +++++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 91 insertions(+), 4 deletions(-)
 create mode 100644 tomd/LICENSE

diff --git a/tomd/LICENSE b/tomd/LICENSE
new file mode 100644
index 0000000..e439b22
--- /dev/null
+++ b/tomd/LICENSE
@@ -0,0 +1,25 @@
+Boost Software License - Version 1.0 - August 17th, 2003
+
+Copyright (c) 2026 Vinnie Falco
+
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
diff --git a/tomd/README.md b/tomd/README.md
index 29ece89..76456ad 100644
--- a/tomd/README.md
+++ b/tomd/README.md
@@ -1,7 +1,12 @@
 # tomd
 
-Convert PDF and HTML papers to Markdown. Used to prepare WG21 inputs for
-the C++ Alliance paper pipeline.
+Convert WG21 committee papers from PDF or HTML to clean Markdown.
+
+tomd is purpose-built for C++ standards committee paper conversion. It
+understands WG21 metadata fields (document number, date, reply-to, audience),
+detects structural elements (headings, lists, tables, code blocks, wording
+sections), and produces Markdown that looks like a human wrote it, suitable
+for version control, pull request diffs, and plain-text review workflows.
 
 ## Install
 
@@ -11,8 +16,9 @@ From this directory:
 pip install -e .
 ```
 
-Installs the `tomd` console script and pins
-`pymupdf~=1.27` / `beautifulsoup4~=4.14`.
+Requires Python 3.12 or newer. Runtime dependencies (`pymupdf~=1.27`,
+`beautifulsoup4~=4.14`) are declared in `pyproject.toml` and installed
+automatically.
 
 ## Usage
 
@@ -20,10 +26,62 @@ Installs the `tomd` console script and pins
 tomd paper.pdf                  # -> paper.md (+ paper.prompts.md if uncertain)
 tomd paper.html                 # -> paper.md
 tomd *.pdf *.html --outdir out/ # batch mode
+tomd -v paper.pdf               # verbose logging
+tomd -o out.md paper.pdf        # explicit output path (single-file only)
 ```
 
 Also runnable as `python -m tomd.main ...`.
 
+### Output
+
+- `paper.md` is always produced. It contains YAML front matter (title,
+  document number, date, audience, reply-to) followed by the paper body
+  rendered as Markdown.
+- `paper.prompts.md` is produced only when the converter found uncertain
+  regions. It pairs each uncertain span with both extraction paths (MuPDF
+  and spatial) plus surrounding context, formatted for manual LLM
+  reconciliation. If no uncertain regions exist, no prompts file is written
+  (and any stale one at the output path is removed).
+
+### Uncertain regions
+
+tomd uses dual-extraction with confidence scoring. When the MuPDF and
+spatial paths disagree on a page, the region is emitted in the output
+marked with an HTML comment:
+
+```
+<!-- tomd:uncertain:L120-L145 -->
+```
+
+The accompanying `.prompts.md` file contains ready-to-feed LLM prompts for
+each marker. You resolve uncertain regions manually; the LLM fixes
+structure, never content.
+
+## Limitations
+
+- **No OCR.** Scanned or image-only PDFs are not supported.
+- **No vision fallback.** Papers that rely on non-extractable layout
+  (complex equations, diagrams) will not convert cleanly.
+- **HTML generator coverage.** Four generators are detected directly:
+  mpark/wg21, Bikeshed, HackMD, and hand-written. Other sources fall back
+  to a generic extractor that may miss metadata fields.
+- **LLM auto-resolution is deferred to v2.** The `.prompts.md` file is
+  produced; feeding it to an LLM and applying the result is manual in this
+  release.
+
+## Design
+
+Design and architecture documentation lives alongside the code:
+
+- [`CLAUDE.md`](CLAUDE.md) - architecture rules and invariants (contributors
+  and AI agents).
+- [`lib/pdf/ARCHITECTURE.md`](lib/pdf/ARCHITECTURE.md) - PDF converter
+  pipeline and the techniques it uses.
+- [`lib/html/ARCHITECTURE.md`](lib/html/ARCHITECTURE.md) - HTML converter
+  pipeline.
+
+Read these in order if you are modifying tomd.
+
 ## Development
 
 Install test extras and run the suite:
@@ -32,3 +90,7 @@ Install test extras and run the suite:
 pip install -e .[test]
 pytest tests/
 ```
+
+## License
+
+Boost Software License 1.0. See [`LICENSE`](LICENSE).

From 07b61f7565d001d6799d8d3709ad35fa4e6a6959 Mon Sep 17 00:00:00 2001
From: Greg Kaleka <greg@gregkaleka.com>
Date: Wed, 15 Apr 2026 17:34:58 -0400
Subject: [PATCH 11/14] tomd: consolidate document- and section-number regex
 shapes

The doc-number shape lived in both lib/__init__.py (DOC_NUM_RE) and
lib/pdf/types.py (DOC_FIELD_RE), and the dotted-decimal section-number
shape lived in both SECTION_NUM_PREFIX_RE and SECTION_NUM_RE. The
tomd-specific rule "regex patterns for metadata fields must be defined
in one place" was violated; the pair of doc-number patterns also
disagreed on SD-N coverage.

Extract two core shape strings (DOC_NUM_PATTERN, SECTION_NUM_PATTERN)
in lib/__init__.py and rebuild the four callers on them.  DOC_FIELD_RE
picks up SD-N support for free.  No call-site changes: every existing
group(0)/group(1)/match/search semantic is preserved (verified by grep
for .groups() tuple unpacking; there are none). Closes issues/08.
---
 tomd/CLAUDE.md                    |  2 +-
 tomd/lib/__init__.py              | 27 ++++++++++-----
 tomd/lib/pdf/types.py             | 14 +++++---
 tomd/tests/test_regex_patterns.py | 57 +++++++++++++++++++++++++++++++
 4 files changed, 85 insertions(+), 15 deletions(-)
 create mode 100644 tomd/tests/test_regex_patterns.py

diff --git a/tomd/CLAUDE.md b/tomd/CLAUDE.md
index cdc017d..15fa83b 100644
--- a/tomd/CLAUDE.md
+++ b/tomd/CLAUDE.md
@@ -100,7 +100,7 @@ Auto-resolution via `--llm` flag is deferred to v2. For v1, the tool produces a
 ## File Map
 
 - `main.py` - CLI entry point. Argparse, glob expansion, output path logic, main(). No conversion logic.
-- `lib/__init__.py` - Shared text utilities and constants for PDF and HTML converters: `ascii_escape`, `strip_format_chars`, `format_front_matter`, `ALLOWED_LINK_SCHEMES`, and shared regex patterns (`EMAIL_RE`, `DATE_RE`, `DOC_NUM_RE`, `SECTION_NUM_PREFIX_RE`).
+- `lib/__init__.py` - Shared text utilities and constants for PDF and HTML converters: `ascii_escape`, `strip_format_chars`, `format_front_matter`, `parse_author_lines`, `ALLOWED_LINK_SCHEMES`, shared regex patterns (`EMAIL_RE`, `DATE_RE`, `DOC_NUM_RE`, `SECTION_NUM_PREFIX_RE`), and their reusable shape strings (`DOC_NUM_PATTERN`, `SECTION_NUM_PATTERN`) consumed by `lib/pdf/types.py` to build `DOC_FIELD_RE` and `SECTION_NUM_RE`.
 - `lib/similarity.py` - Dual-algorithm string similarity (SequenceMatcher + Jaccard). Per-algorithm thresholds, 200-char circuit breaker. Format-agnostic.
 - `lib/toc.py` - Table of Contents detection. Matches section texts against known headings using fuzzy similarity. Bridges small gaps. Format-agnostic - no dependency on PDF types.
 - `lib/pdf/__init__.py` - Exports `convert_pdf()`. Orchestrates the full pipeline in order. Includes monospace propagation, wording classification, and page 0 color extraction via space-color proxy.
diff --git a/tomd/lib/__init__.py b/tomd/lib/__init__.py
index b7fcad4..dcd53d4 100644
--- a/tomd/lib/__init__.py
+++ b/tomd/lib/__init__.py
@@ -148,15 +148,24 @@ def parse_author_lines(lines, clean_line=None, skip_line=None):
 
 DATE_RE = re.compile(r"\b(\d{4}-\d{2}-\d{2})\b")
 
-# Broad document-number pattern used for header stripping and HTML metadata.
+# Core pattern shapes (no anchors, no label context) reused across modules
+# so every document- and section-number pattern has a single source of truth.
+# `lib/pdf/types.py` builds the labeled PDF variants (DOC_FIELD_RE,
+# SECTION_NUM_RE) on top of these.
+DOC_NUM_PATTERN = (
+    r"[DPN]\d{3,5}R\d+"
+    r"|[DPN]\d{3,5}"
+    r"|N\d{3,5}"
+    r"|SD-\d+"
+)
+
+SECTION_NUM_PATTERN = r"\d+(?:\.\d+)*"
+
+# Broad document-number match used for header stripping and HTML metadata.
 # For line-anchored field extraction in PDF blocks, see DOC_FIELD_RE in
 # lib/pdf/types.py, which targets "Document Number: PXXXXrN" line prefixes.
-DOC_NUM_RE = re.compile(
-    r"\b([DPN]\d{3,5}R\d+)\b"
-    r"|\b([DPN]\d{3,5})\b"
-    r"|\b(N\d{3,5})\b"
-    r"|\b(SD-\d+)\b",
-    re.IGNORECASE,
-)
+DOC_NUM_RE = re.compile(rf"\b({DOC_NUM_PATTERN})\b", re.IGNORECASE)
 
-SECTION_NUM_PREFIX_RE = re.compile(r"^\d+(?:\.\d+)*\.?\s+")
+# Leading section-number prefix used by the HTML renderer to strip a number
+# (e.g. "2.1.3 " or "1. ") from heading text.
+SECTION_NUM_PREFIX_RE = re.compile(rf"^{SECTION_NUM_PATTERN}\.?\s+")
diff --git a/tomd/lib/pdf/types.py b/tomd/lib/pdf/types.py
index c816c13..3dfcac6 100644
--- a/tomd/lib/pdf/types.py
+++ b/tomd/lib/pdf/types.py
@@ -5,6 +5,8 @@
 from dataclasses import dataclass, field
 from enum import Enum
 
+from .. import DOC_NUM_PATTERN, SECTION_NUM_PATTERN
+
 
 class Confidence(Enum):
     """Confidence level for structural classification decisions."""
@@ -135,15 +137,17 @@ class PageEdgeItem:
 
 # --- Precompiled regex patterns ---
 
-SECTION_NUM_RE = re.compile(
-    r"^(\d+(?:\.\d+)*)\s+(.+)",
-)
+# Section number at the start of a line with required trailing content
+# (used for heading detection); shares the core shape with
+# SECTION_NUM_PREFIX_RE in lib/__init__.py.
+SECTION_NUM_RE = re.compile(rf"^({SECTION_NUM_PATTERN})\s+(.+)")
 
 # Line-anchored pattern targeting "Document Number: PXXXXRN" field lines in
 # PDF block text. More restrictive than DOC_NUM_RE in lib/__init__.py, which
-# is a broad substring match used for header stripping and HTML contexts.
+# is a broad substring match used for header stripping and HTML contexts;
+# both patterns share the core DOC_NUM_PATTERN shape.
 DOC_FIELD_RE = re.compile(
-    r"Document\s+(?:Number|#)[:\s]+([DPN]\d{3,5}(?:R\d+)?|N\d{3,5})",
+    rf"Document\s+(?:Number|#)[:\s]+({DOC_NUM_PATTERN})",
     re.IGNORECASE,
 )
 
diff --git a/tomd/tests/test_regex_patterns.py b/tomd/tests/test_regex_patterns.py
new file mode 100644
index 0000000..c0cbd9a
--- /dev/null
+++ b/tomd/tests/test_regex_patterns.py
@@ -0,0 +1,57 @@
+"""Tests for the shared document- and section-number regex patterns.
+
+After the consolidation in issue 08, `DOC_NUM_PATTERN` and
+`SECTION_NUM_PATTERN` live in `lib/__init__.py`; the PDF-specific
+labeled variants in `lib/pdf/types.py` are built on top of them.
+These tests lock down the behavior each call site depends on.
+"""
+
+from lib import DOC_NUM_RE, SECTION_NUM_PREFIX_RE
+from lib.pdf.types import DOC_FIELD_RE, SECTION_NUM_RE
+
+
+def test_doc_num_matches_all_wg21_forms():
+    for s in ("P1234", "P1234R0", "P12345R9", "D0042R3", "N5012", "SD-9"):
+        assert DOC_NUM_RE.search(s), f"failed to match {s!r}"
+
+
+def test_doc_num_group_zero_returns_full_number():
+    # Call sites depend on m.group(0) returning the matched number.
+    m = DOC_NUM_RE.search("see P1234R0 for details")
+    assert m is not None
+    assert m.group(0).upper() == "P1234R0"
+
+
+def test_doc_num_rejects_too_short_prefix():
+    # WG21 doc numbers have at least 3 digits; shorter must not match.
+    assert DOC_NUM_RE.search("P12") is None
+    assert DOC_NUM_RE.search("N42") is None
+
+
+def test_doc_field_matches_labeled_forms():
+    m = DOC_FIELD_RE.search("Document Number: P1234R0")
+    assert m and m.group(1).upper() == "P1234R0"
+    m = DOC_FIELD_RE.search("Document #: N5012")
+    assert m and m.group(1).upper() == "N5012"
+
+
+def test_doc_field_now_supports_sd_form():
+    # Regression: after consolidation DOC_FIELD_RE inherits SD-N support
+    # from the shared DOC_NUM_PATTERN.
+    m = DOC_FIELD_RE.search("Document Number: SD-1")
+    assert m and m.group(1).upper() == "SD-1"
+
+
+def test_section_num_prefix_strips_leading_number():
+    assert SECTION_NUM_PREFIX_RE.sub("", "2.1.3 Details") == "Details"
+    assert SECTION_NUM_PREFIX_RE.sub("", "1. Introduction") == "Introduction"
+    # Non-matching input passes through unchanged.
+    assert SECTION_NUM_PREFIX_RE.sub("", "Introduction") == "Introduction"
+
+
+def test_section_num_re_captures_number_and_title():
+    m = SECTION_NUM_RE.match("2.1.3 Details of the feature")
+    assert m is not None
+    assert m.group(1) == "2.1.3"
+    assert m.group(2) == "Details of the feature"
+    assert SECTION_NUM_RE.match("Abstract") is None

From e23323efef01cc4ca45dec8ae3c948418563d567 Mon Sep 17 00:00:00 2001
From: Greg Kaleka <greg@gregkaleka.com>
Date: Wed, 15 Apr 2026 17:35:34 -0400
Subject: [PATCH 12/14] tomd: pin parse_author_lines behavior with direct tests

The shared author state machine in lib/__init__.py was already
deduplicated (both lib/pdf/wg21.py and lib/html/extract.py delegate to
parse_author_lines with per-caller clean_line/skip_line callbacks). Only
the pattern-level coverage proposed in issues/07 was missing: the helper
is exercised today only through the callers' tests. Add tests/test_authors.py
covering the pending-name pairing, trailing-name flush, blank-line skip,
and clean_line/skip_line injection points. Closes issues/07.
---
 tomd/tests/test_authors.py | 85 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 85 insertions(+)
 create mode 100644 tomd/tests/test_authors.py

diff --git a/tomd/tests/test_authors.py b/tomd/tests/test_authors.py
new file mode 100644
index 0000000..3aa7d90
--- /dev/null
+++ b/tomd/tests/test_authors.py
@@ -0,0 +1,85 @@
+"""Tests for lib.parse_author_lines.
+
+The shared state machine that both `lib/pdf/wg21.py:_parse_authors` and
+`lib/html/extract.py:_parse_mpark_authors` delegate to. These tests pin
+the helper's own contract: pending-name pairing, multi-line email pairs,
+blank-line skipping, trailing-name flush, and the clean_line / skip_line
+injection points the callers depend on.
+"""
+
+from lib import parse_author_lines
+
+
+def test_name_and_email_same_line():
+    result = parse_author_lines(["Alice Example alice@example.com"])
+    assert result == ["Alice Example <alice@example.com>"]
+
+
+def test_name_then_email_next_line():
+    result = parse_author_lines(["Alice Example", "alice@example.com"])
+    assert result == ["Alice Example <alice@example.com>"]
+
+
+def test_email_alone():
+    result = parse_author_lines(["alice@example.com"])
+    assert result == ["<alice@example.com>"]
+
+
+def test_multiple_authors_alternating():
+    result = parse_author_lines([
+        "Alice Example",
+        "alice@example.com",
+        "Bob Sample",
+        "bob@example.com",
+    ])
+    assert result == [
+        "Alice Example <alice@example.com>",
+        "Bob Sample <bob@example.com>",
+    ]
+
+
+def test_name_only_no_email_becomes_bare_entry():
+    result = parse_author_lines(["Alice Example"])
+    assert result == ["Alice Example"]
+
+
+def test_blank_lines_are_skipped():
+    result = parse_author_lines(
+        ["", "Alice Example", "  ", "alice@example.com", ""])
+    assert result == ["Alice Example <alice@example.com>"]
+
+
+def test_trailing_pending_name_is_flushed():
+    # A name with no following email must still appear in the output.
+    result = parse_author_lines([
+        "Alice Example", "alice@example.com", "Bob Solo",
+    ])
+    assert result == [
+        "Alice Example <alice@example.com>",
+        "Bob Solo",
+    ]
+
+
+def test_empty_input():
+    assert parse_author_lines([]) == []
+
+
+def test_custom_clean_line_strips_brackets():
+    # Mirrors how lib/html/extract.py injects angle-bracket stripping.
+    import re
+    angle = re.compile(r"[<>]")
+    result = parse_author_lines(
+        ["Alice <Example>", "<alice@example.com>"],
+        clean_line=lambda t: angle.sub("", t).strip(),
+    )
+    assert result == ["Alice Example <alice@example.com>"]
+
+
+def test_custom_skip_line_rejects_non_author_content():
+    # Mirrors how HTML rejects doc-number lines and PDF rejects label lines:
+    # the skipped line must neither become a pending name nor appear in output.
+    result = parse_author_lines(
+        ["P1234R0", "Alice Example", "alice@example.com"],
+        skip_line=lambda l: l == "P1234R0",
+    )
+    assert result == ["Alice Example <alice@example.com>"]

From 16f85f2d7d798f4f5ab82b909135d216e83df316 Mon Sep 17 00:00:00 2001
From: Greg Kaleka <greg@gregkaleka.com>
Date: Wed, 15 Apr 2026 18:05:11 -0400
Subject: [PATCH 13/14] tomd: remove get_texttrace() fallback, tighten dep pins

With pymupdf and beautifulsoup4 pinned in requirements.txt and
pyproject.toml, the try/except AttributeError guard around
page.get_texttrace() in find_hidden_regions was defending against an
unreachable path. Silent degradation on a dep mismatch would hide real
bugs; the pin is the contract, so a missing API should surface as a
clear AttributeError.

Tightened the pins from ~=1.27 / ~=4.14 to ~=1.27.0 / ~=4.14.0 so that
only patch releases are accepted. The looser form actually allowed any
minor bump up to the next major, which contradicted the comment in
requirements.txt flagging PyMuPDF minor-version API drift as the
concern.
---
 tomd/lib/pdf/cleanup.py | 8 +-------
 tomd/pyproject.toml     | 4 ++--
 tomd/requirements.txt   | 4 ++--
 3 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/tomd/lib/pdf/cleanup.py b/tomd/lib/pdf/cleanup.py
index faf9761..3054435 100644
--- a/tomd/lib/pdf/cleanup.py
+++ b/tomd/lib/pdf/cleanup.py
@@ -212,13 +212,7 @@ def find_hidden_regions(page, body_fonts: set[str] | None = None,
     if body_fonts is None:
         return hidden_bboxes
 
-    try:
-        trace = page.get_texttrace()
-    except AttributeError:
-        _log.debug("get_texttrace() not available; skipping hidden region detection")
-        return hidden_bboxes
-
-    for span in trace:
+    for span in page.get_texttrace():
         if span.get("type") == 3:
             continue
 
diff --git a/tomd/pyproject.toml b/tomd/pyproject.toml
index 6f53e09..dd2eb1d 100644
--- a/tomd/pyproject.toml
+++ b/tomd/pyproject.toml
@@ -13,8 +13,8 @@ authors = [
   {name = "Vinnie Falco"},
 ]
 dependencies = [
-  "pymupdf~=1.27",
-  "beautifulsoup4~=4.14",
+  "pymupdf~=1.27.0",
+  "beautifulsoup4~=4.14.0",
 ]
 
 [project.optional-dependencies]
diff --git a/tomd/requirements.txt b/tomd/requirements.txt
index 6eaae68..86d5c0a 100644
--- a/tomd/requirements.txt
+++ b/tomd/requirements.txt
@@ -6,5 +6,5 @@
 # Bump these pins intentionally after running the full test suite against
 # the new version.
 
-pymupdf~=1.27
-beautifulsoup4~=4.14
+pymupdf~=1.27.0
+beautifulsoup4~=4.14.0

From be4f82fc726803a4f5202ef200b4cfd9c22afbab Mon Sep 17 00:00:00 2001
From: Jeremy Childers <30885417+jlchilders11@users.noreply.github.com>
Date: Wed, 15 Apr 2026 18:18:12 -0400
Subject: [PATCH 14/14] Add missing tests and fix cleanup.py bug

* tomd: Add tests for header footer functions

* hidden regions tests

* position-based list tests

* HTML generator fixtures

* PR feedback
---
 tomd/lib/pdf/cleanup.py                       |  17 +-
 tomd/tests/fixtures/html/bikeshed_sample.html |  21 ++
 tomd/tests/fixtures/html/hackmd_sample.html   |  17 ++
 .../html/handwritten_address_sample.html      |  15 ++
 .../html/handwritten_table_sample.html        |  15 ++
 tomd/tests/test_header_footer.py              | 184 ++++++++++++++++++
 tomd/tests/test_hidden_regions.py             | 158 +++++++++++++++
 tomd/tests/test_html_generators.py            | 124 ++++++++++++
 tomd/tests/test_position_lists.py             | 142 ++++++++++++++
 9 files changed, 687 insertions(+), 6 deletions(-)
 create mode 100644 tomd/tests/fixtures/html/bikeshed_sample.html
 create mode 100644 tomd/tests/fixtures/html/hackmd_sample.html
 create mode 100644 tomd/tests/fixtures/html/handwritten_address_sample.html
 create mode 100644 tomd/tests/fixtures/html/handwritten_table_sample.html
 create mode 100644 tomd/tests/test_header_footer.py
 create mode 100644 tomd/tests/test_hidden_regions.py
 create mode 100644 tomd/tests/test_html_generators.py
 create mode 100644 tomd/tests/test_position_lists.py

diff --git a/tomd/lib/pdf/cleanup.py b/tomd/lib/pdf/cleanup.py
index 3054435..8a5cad3 100644
--- a/tomd/lib/pdf/cleanup.py
+++ b/tomd/lib/pdf/cleanup.py
@@ -2,7 +2,7 @@
 
 import logging
 import re
-from collections import defaultdict
+from collections import defaultdict, Counter
 from dataclasses import replace
 
 from .. import strip_format_chars, DOC_NUM_RE
@@ -79,12 +79,7 @@ def detect_repeating(all_edge_items: list[list[PageEdgeItem]],
         pages_seen = len(set(it.page_num for it in items))
         if pages_seen < threshold:
             continue
-
         texts = [it.text for it in items]
-        if len(set(texts)) == 1:
-            repeating.add((y_key, texts[0]))
-            _log.debug("Repeating exact: y=%.1f text=%r", y_key, texts[0])
-            continue
 
         if all(PAGE_NUM_RE.match(t) for t in texts):
             repeating.add((y_key, "__PAGE_NUM__"))
@@ -96,6 +91,16 @@ def detect_repeating(all_edge_items: list[list[PageEdgeItem]],
             _log.debug("Repeating doc number at y=%.1f", y_key)
             continue
 
+        text_counts = Counter(it.text for it in items)
+        exact_hit = False
+        for text, count in text_counts.items():
+            if count >= threshold:
+                repeating.add((y_key, text))
+                _log.debug("Repeating exact: y=%.1f text=%r", y_key, text)
+                exact_hit = True
+        if exact_hit:
+            continue
+
     return repeating
 
 
diff --git a/tomd/tests/fixtures/html/bikeshed_sample.html b/tomd/tests/fixtures/html/bikeshed_sample.html
new file mode 100644
index 0000000..20153c7
--- /dev/null
+++ b/tomd/tests/fixtures/html/bikeshed_sample.html
@@ -0,0 +1,21 @@
+<!DOCTYPE html>
+<html>
+<head>
+<meta charset="utf-8">
+<meta name="generator" content="Bikeshed 1.2.3">
+<title>P9999R0 Test Bikeshed Paper</title>
+</head>
+<body>
+<h1 class="p-name">P9999R0 Test Bikeshed Paper</h1>
+<p>Before the metadata list.</p>
+<dl>
+  <dt>Audience:</dt>
+  <dd>SG1</dd>
+  <dt>Editor:</dt>
+  <dd><a class="email" href="mailto:editor@example.com">Eddie Editor</a></dd>
+</dl>
+<time class="dt-updated" datetime="2026-03-15">2026-03-15</time>
+<h2>Introduction</h2>
+<p>Body paragraph content.</p>
+</body>
+</html>
\ No newline at end of file
diff --git a/tomd/tests/fixtures/html/hackmd_sample.html b/tomd/tests/fixtures/html/hackmd_sample.html
new file mode 100644
index 0000000..75e80b1
--- /dev/null
+++ b/tomd/tests/fixtures/html/hackmd_sample.html
@@ -0,0 +1,17 @@
+<!DOCTYPE html>
+<html>
+<head>
+<meta charset="utf-8">
+<title>P9999R0: Test HackMD Paper - HackMD</title>
+<link rel="stylesheet" href="https://hackmd.io/stylesheets/main.css">
+</head>
+<body>
+<h1>P9999R0: Test HackMD Paper</h1>
+<table>
+  <tr><td>Document:</td><td>P9999R0</td></tr>
+  <tr><td>Audience:</td><td>SG1</td></tr>
+</table>
+<h2>Introduction</h2>
+<p>Body paragraph.</p>
+</body>
+</html>
\ No newline at end of file
diff --git a/tomd/tests/fixtures/html/handwritten_address_sample.html b/tomd/tests/fixtures/html/handwritten_address_sample.html
new file mode 100644
index 0000000..50e9748
--- /dev/null
+++ b/tomd/tests/fixtures/html/handwritten_address_sample.html
@@ -0,0 +1,15 @@
+<!DOCTYPE html>
+<html>
+<head><meta charset="utf-8"><title>Handwritten Sample</title></head>
+<body>
+<address>
+Document Number: P9999R0<br>
+Date: 2026-03-15<br>
+Audience: SG1<br>
+<a href="mailto:alice@example.com">Alice Author</a>
+</address>
+<h1>Test Handwritten Paper</h1>
+<h2>Introduction</h2>
+<p>Body paragraph.</p>
+</body>
+</html>
\ No newline at end of file
diff --git a/tomd/tests/fixtures/html/handwritten_table_sample.html b/tomd/tests/fixtures/html/handwritten_table_sample.html
new file mode 100644
index 0000000..f2050af
--- /dev/null
+++ b/tomd/tests/fixtures/html/handwritten_table_sample.html
@@ -0,0 +1,15 @@
+<!DOCTYPE html>
+<html>
+<head><meta charset="utf-8"><title>Handwritten Table Sample</title></head>
+<body>
+<table class="header">
+  <tr><th>Document Number:</th><td>P9998R0</td></tr>
+  <tr><th>Date:</th><td>2026-02-01</td></tr>
+  <tr><th>Audience:</th><td>EWG</td></tr>
+  <tr><th>Reply-to:</th><td><a href="mailto:bob@example.com">Bob Sample</a></td></tr>
+</table>
+<h1>Another Handwritten Paper</h1>
+<h2>Scope</h2>
+<p>Body paragraph.</p>
+</body>
+</html>
\ No newline at end of file
diff --git a/tomd/tests/test_header_footer.py b/tomd/tests/test_header_footer.py
new file mode 100644
index 0000000..8e879ac
--- /dev/null
+++ b/tomd/tests/test_header_footer.py
@@ -0,0 +1,184 @@
+
+"""Tests for header/footer detection and stripping in lib.pdf.cleanup."""
+from lib.pdf.cleanup import get_edge_items, detect_repeating, strip_repeating
+from lib.pdf.types import (
+    Block, Line, Span, PageEdgeItem, Y_TOLERANCE, EDGE_ITEMS_PER_PAGE,
+)
+
+
+def _make_line(text, y, page_num=0, x0=50.0, x1=550.0):
+    """Construct a Line with a single span at the given y position."""
+    return Line(
+        spans=[Span(text=text, font_name="Body", font_size=11.0)],
+        bbox=(x0, y, x1, y + 12.0),
+        page_num=page_num,
+    )
+
+
+def _make_block_at_y(lines_data, page_num=0):
+    """Build a Block whose bbox spans its lines."""
+    lines = [_make_line(t, y, page_num=page_num) for t, y in lines_data]
+    ys = [ln.bbox[1] for ln in lines]
+    y2s = [ln.bbox[3] for ln in lines]
+    return Block(
+        lines=lines,
+        bbox=(50.0, min(ys), 550.0, max(y2s)),
+        page_num=page_num,
+    )
+
+
+# ---- get_edge_items ------------------------------------------------------
+
+def test_edge_items_picks_top_and_bottom():
+    """Top 3 and bottom 3 by y-coordinate, with no dedup needed when texts differ."""
+    # Lines at y = 30, 60, 90 (top half) and 500, 540, 580 (bottom half).
+    block_top = _make_block_at_y([("Header A", 30), ("Header B", 60), ("Header C", 90)])
+    block_body = _make_block_at_y([("body line", 300)])
+    block_bot = _make_block_at_y([("Footer X", 500), ("Footer Y", 540), ("Footer Z", 580)])
+    items = get_edge_items([block_top, block_body, block_bot], page_num=1, page_height=600)
+    texts = [it.text for it in items]
+    # Top 3 by y (ascending): Header A/B/C. Bottom 3 by y (largest): Footer X/Y/Z
+    # (and possibly body line too — the function takes items[:3] and items[-3:]).
+    assert "Header A" in texts
+    assert "Header B" in texts
+    assert "Header C" in texts
+    assert "Footer X" in texts or "Footer Y" in texts  # bottom range
+    assert "Footer Z" in texts
+
+
+def test_edge_items_dedups_same_text_same_y():
+    """Duplicate (text, y) pairs collapse to a single edge item."""
+    # Two blocks contribute lines with identical text at identical y — dedup.
+    b1 = _make_block_at_y([("Page 1", 30)])
+    b2 = _make_block_at_y([("Page 1", 30)])
+    items = get_edge_items([b1, b2], page_num=1, page_height=600)
+    texts = [it.text for it in items]
+    assert texts.count("Page 1") == 1
+
+
+def test_edge_items_empty_page():
+    assert get_edge_items([], page_num=1, page_height=600) == []
+
+
+def test_edge_items_skips_blank_lines():
+    """Lines whose text is empty after strip are not edge items."""
+    b = _make_block_at_y([("   ", 30), ("Real header", 60)])
+    items = get_edge_items([b], page_num=1, page_height=600)
+    texts = [it.text for it in items]
+    assert texts == ["Real header"]
+
+
+def test_edge_items_limits_per_page():
+    """No more than EDGE_ITEMS_PER_PAGE top + EDGE_ITEMS_PER_PAGE bottom."""
+    # 10 lines spread across y.
+    lines_data = [(f"line {i}", 20.0 + i * 30) for i in range(10)]
+    b = _make_block_at_y(lines_data)
+    items = get_edge_items([b], page_num=1, page_height=600)
+    # Top 3 + bottom 3 = 6; dedup only when keys collide.
+    assert len(items) <= 2 * EDGE_ITEMS_PER_PAGE
+
+
+# ---- detect_repeating ----------------------------------------------------
+
+def test_detect_repeating_exact_text():
+    """Same text at same y on >=50% of pages is classified as repeating."""
+    # 5 pages, 4 of them have "Running Head" at y=30.
+    all_edges = [
+        [PageEdgeItem(text="Running Head", y=30.0, page_num=pg, bbox=(0, 30, 100, 42))]
+        for pg in range(1, 5)
+    ] + [
+        [PageEdgeItem(text="Unique Title", y=30.0, page_num=5, bbox=(0, 30, 100, 42))],
+    ]
+    result = detect_repeating(all_edges, total_pages=5)
+    # y_bucket = round(30 / Y_TOLERANCE) * Y_TOLERANCE = 30.0 (Y_TOLERANCE is 2.0)
+    assert (30.0, "Running Head") in result
+
+
+def test_detect_repeating_skips_below_threshold():
+    """If an item appears on fewer than threshold pages, it's not repeating."""
+    # 5 pages, only 1 has "Not Repeating" at y=30.
+    all_edges = [
+        [PageEdgeItem(text="Not Repeating", y=30.0, page_num=1, bbox=(0, 30, 100, 42))],
+        [], [], [], [],
+    ]
+    result = detect_repeating(all_edges, total_pages=5)
+    assert not any(p == "Not Repeating" for _, p in result)
+
+
+def test_detect_repeating_short_doc_returns_empty():
+    """Fewer than 3 pages -> empty set (threshold 0.5 of 2 = 1, but function
+    requires total_pages >= 3)."""
+    all_edges = [[PageEdgeItem(text="Header", y=30.0, page_num=1, bbox=(0, 30, 100, 42))]]
+    assert detect_repeating(all_edges, total_pages=1) == set()
+    assert detect_repeating(all_edges * 2, total_pages=2) == set()
+
+
+def test_detect_repeating_page_number_pattern():
+    """Different page numbers at same y are classified as __PAGE_NUM__."""
+    all_edges = [
+        [PageEdgeItem(text=str(pg), y=580.0, page_num=pg, bbox=(270, 580, 290, 592))]
+        for pg in range(1, 6)
+    ]
+    result = detect_repeating(all_edges, total_pages=5)
+    # y_bucket = round(580 / 2) * 2 = 580.0
+    assert (580.0, "__PAGE_NUM__") in result
+
+
+def test_detect_repeating_doc_number_pattern():
+    """Running doc number at same y across pages is classified as __DOC_NUM__."""
+    # Same paper, revision number varies page-by-page — not realistic, but it
+    # shows the doc-number branch does not depend on identical text. (That
+    # branch is checked before the exact-text one.) Use revisions that differ:
+    docs = ["P1234R0", "P1234R1", "P1234R0", "P1234R2"]
+    all_edges = [
+        [PageEdgeItem(text=docs[i], y=30.0, page_num=i + 1, bbox=(0, 30, 100, 42))]
+        for i in range(4)
+    ]
+    result = detect_repeating(all_edges, total_pages=4)
+    assert (30.0, "__DOC_NUM__") in result
+
+
+# ---- strip_repeating -----------------------------------------------------
+
+def test_strip_repeating_removes_exact_match():
+    """A line whose (y, text) matches a repeating entry is removed."""
+    b = _make_block_at_y([("Running Head", 30), ("Real content", 60)])
+    repeating = {(36.0, "Running Head")}
+    result = strip_repeating([b], repeating)
+    assert len(result) == 1
+    texts = [ln.text for ln in result[0].lines]
+    assert "Real content" in texts
+    assert "Running Head" not in texts
+
+
+def test_strip_repeating_removes_page_numbers():
+    """A line matching PAGE_NUM_RE at the repeating y-bucket is removed."""
+    b = _make_block_at_y([("42", 580), ("Body line", 300)])
+    repeating = {(586.0, "__PAGE_NUM__")}
+    result = strip_repeating([b], repeating)
+    texts = [ln.text for ln in result[0].lines]
+    assert "42" not in texts
+    assert "Body line" in texts
+
+
+def test_strip_repeating_y_tolerance():
+    """Lines whose y differs by <= Y_TOLERANCE from the repeating bucket are stripped."""
+    b = _make_block_at_y([("Running Head", 31)])   # y=31, tolerance 2.0 around 30
+    repeating = {(36.0, "Running Head")}
+    result = strip_repeating([b], repeating)
+    # Block may be dropped if it becomes empty.
+    assert not result or not any("Running Head" in ln.text for blk in result for ln in blk.lines)
+
+
+def test_strip_repeating_drops_empty_blocks():
+    """A block with all lines stripped is omitted from the output."""
+    b = _make_block_at_y([("Running Head", 30)])
+    repeating = {(36.0, "Running Head")}
+    result = strip_repeating([b], repeating)
+    assert result == []
+
+
+def test_strip_repeating_empty_input():
+    """Empty repeating set returns blocks unchanged."""
+    b = _make_block_at_y([("content", 300)])
+    assert strip_repeating([b], set()) == [b]
\ No newline at end of file
diff --git a/tomd/tests/test_hidden_regions.py b/tomd/tests/test_hidden_regions.py
new file mode 100644
index 0000000..5db00f1
--- /dev/null
+++ b/tomd/tests/test_hidden_regions.py
@@ -0,0 +1,158 @@
+
+"""Tests for hidden region detection and stripping in lib.pdf.cleanup."""
+from unittest.mock import MagicMock
+
+from lib.pdf.cleanup import find_hidden_regions, strip_hidden_blocks
+from lib.pdf.types import Block, Line, Span
+
+
+# ---- find_hidden_regions -------------------------------------------------
+
+def _span_record(font, color, char_bboxes, span_type=1):
+    """Build a texttrace-shaped span record."""
+    return {
+        "type": span_type,
+        "font": font,
+        "color": color,
+        "chars": [(None, None, None, bb) for bb in char_bboxes],
+    }
+
+
+def test_find_hidden_regions_no_body_fonts_returns_empty():
+    """When body_fonts is None, the function short-circuits."""
+    page = MagicMock()
+    page.get_texttrace.return_value = [
+        _span_record("Roboto", 0x808080, [(10, 10, 20, 20)])
+    ]
+    assert find_hidden_regions(page, body_fonts=None) == set()
+
+
+def test_find_hidden_regions_roboto_non_black():
+    """Roboto font + non-black color + not-in-body-fonts -> hidden."""
+    page = MagicMock()
+    page.get_texttrace.return_value = [
+        _span_record("Roboto", 0x808080, [(10, 10, 20, 20)])
+    ]
+    result = find_hidden_regions(page, body_fonts={"cambria"})
+    assert (10, 10, 20, 20) in result
+
+
+def test_find_hidden_regions_google_font():
+    """Google-prefixed font triggers detection."""
+    page = MagicMock()
+    page.get_texttrace.return_value = [
+        _span_record("GoogleSans-Regular", 0x808080, [(10, 10, 20, 20)])
+    ]
+    result = find_hidden_regions(page, body_fonts={"cambria"})
+    assert (10, 10, 20, 20) in result
+
+
+def test_find_hidden_regions_material_font():
+    """Material UI font triggers detection."""
+    page = MagicMock()
+    page.get_texttrace.return_value = [
+        _span_record("MaterialIcons", 0x808080, [(10, 10, 20, 20)])
+    ]
+    result = find_hidden_regions(page, body_fonts={"cambria"})
+    assert (10, 10, 20, 20) in result
+
+
+def test_find_hidden_regions_body_font_not_hidden():
+    """A font that IS in body_fonts is not classified as hidden, even if
+    it coincidentally matches a widget keyword."""
+    page = MagicMock()
+    page.get_texttrace.return_value = [
+        _span_record("Roboto", 0x808080, [(10, 10, 20, 20)])
+    ]
+    result = find_hidden_regions(page, body_fonts={"roboto"})
+    assert result == set()
+
+
+def test_find_hidden_regions_black_color_not_hidden():
+    """Non-body-font, Roboto, but BLACK color -> not hidden (rule: non-black)."""
+    page = MagicMock()
+    page.get_texttrace.return_value = [
+        _span_record("Roboto", 0, [(10, 10, 20, 20)])
+    ]
+    result = find_hidden_regions(page, body_fonts={"cambria"})
+    assert result == set()
+
+
+def test_find_hidden_regions_black_tuple_color_not_hidden():
+    """The (0, 0, 0) tuple form of black is also recognized."""
+    page = MagicMock()
+    page.get_texttrace.return_value = [
+        _span_record("Roboto", (0, 0, 0), [(10, 10, 20, 20)])
+    ]
+    result = find_hidden_regions(page, body_fonts={"cambria"})
+    assert result == set()
+
+
+def test_find_hidden_regions_non_widget_font_not_hidden():
+    """A non-body font that isn't Roboto/Google/Material is left alone."""
+    page = MagicMock()
+    page.get_texttrace.return_value = [
+        _span_record("SomeOtherFont", 0x808080, [(10, 10, 20, 20)])
+    ]
+    result = find_hidden_regions(page, body_fonts={"cambria"})
+    assert result == set()
+
+
+def test_find_hidden_regions_mode_3_skipped():
+    """Rendering mode 3 (invisible text) is explicitly ignored by the function."""
+    page = MagicMock()
+    page.get_texttrace.return_value = [
+        _span_record("Roboto", 0x808080, [(10, 10, 20, 20)], span_type=3),
+    ]
+    result = find_hidden_regions(page, body_fonts={"cambria"})
+    assert result == set()
+
+
+# ---- strip_hidden_blocks -------------------------------------------------
+
+def _make_block(text, x0, y0, x1, y1):
+    span = Span(
+        text=text, font_name="Body", font_size=11.0,
+        bbox=(x0, y0, x1, y1),
+    )
+    line = Line(spans=[span], bbox=(x0, y0, x1, y1))
+    return Block(lines=[line], bbox=(x0, y0, x1, y1))
+
+
+def test_strip_hidden_blocks_empty_hidden_set_returns_input():
+    """No hidden bboxes -> input blocks returned unchanged."""
+    block = _make_block("visible text", 10, 10, 100, 30)
+    assert strip_hidden_blocks([block], set()) == [block]
+
+
+def test_strip_hidden_blocks_drops_block_entirely_in_hidden():
+    """A block whose only span overlaps a hidden bbox is dropped."""
+    block = _make_block("widget text", 10, 10, 100, 30)
+    hidden = {(5.0, 5.0, 150.0, 50.0)}  # engulfs the block
+    result = strip_hidden_blocks([block], hidden)
+    assert result == []
+
+
+def test_strip_hidden_blocks_keeps_block_outside_hidden():
+    """A block whose span is outside all hidden bboxes survives."""
+    block = _make_block("body text", 10, 300, 100, 320)
+    hidden = {(5.0, 5.0, 150.0, 50.0)}  # hidden at y=5..50; block at y=300 untouched
+    result = strip_hidden_blocks([block], hidden)
+    assert result == [block]
+
+
+def test_strip_hidden_blocks_keeps_block_with_any_visible_span():
+    """A block with one hidden span and one visible span is kept."""
+    hidden_span = Span(
+        text="widget", font_name="Roboto", font_size=11.0,
+        bbox=(10, 10, 50, 30),
+    )
+    visible_span = Span(
+        text="content", font_name="Body", font_size=11.0,
+        bbox=(60, 10, 150, 30),
+    )
+    line = Line(spans=[hidden_span, visible_span], bbox=(10, 10, 150, 30))
+    block = Block(lines=[line], bbox=(10, 10, 150, 30))
+    hidden = {(5.0, 5.0, 55.0, 35.0)}  # covers hidden_span only
+    result = strip_hidden_blocks([block], hidden)
+    assert result == [block]
\ No newline at end of file
diff --git a/tomd/tests/test_html_generators.py b/tomd/tests/test_html_generators.py
new file mode 100644
index 0000000..5b290fd
--- /dev/null
+++ b/tomd/tests/test_html_generators.py
@@ -0,0 +1,124 @@
+
+"""Per-generator integration tests for the HTML converter."""
+from pathlib import Path
+
+from lib.html.extract import (
+    parse_html, detect_generator, extract_metadata, strip_boilerplate,
+)
+from lib.html.render import render_body
+
+
+FIXTURES = Path(__file__).parent / "fixtures" / "html"
+
+
+def _load(name: str) -> str:
+    return FIXTURES.joinpath(name).read_text(encoding="utf-8")  # fixture as text
+
+
+# ---- Bikeshed -----------------------------------------------------------
+
+def test_bikeshed_detection():
+    """The Bikeshed fixture is detected as generator "bikeshed"."""
+    assert detect_generator(parse_html(_load("bikeshed_sample.html"))) == "bikeshed"
+
+
+def test_bikeshed_metadata_extraction():
+    """Bikeshed metadata yields document, title, date, audience and reply-to."""
+    meta = extract_metadata(parse_html(_load("bikeshed_sample.html")), "bikeshed")
+    assert meta.get("document") == "P9999R0"
+    assert meta.get("title") == "Test Bikeshed Paper"
+    assert meta.get("date") == "2026-03-15"
+    assert meta.get("audience") == "SG1"
+    contacts = meta.get("reply-to", [])
+    assert any("editor@example.com" in contact for contact in contacts)
+
+
+def test_bikeshed_boilerplate_stripped():
+    soup = parse_html(_load("bikeshed_sample.html"))
+    strip_boilerplate(soup, "bikeshed")
+    # Only the h1.p-name removal is asserted; data-fill-with divs are unchecked.
+    assert soup.find("h1", class_="p-name") is None
+
+
+def test_bikeshed_body_renders():
+    """Rendered Bikeshed body keeps the section heading and paragraph text."""
+    soup = parse_html(_load("bikeshed_sample.html"))
+    strip_boilerplate(soup, "bikeshed")
+    markdown = render_body(soup, "bikeshed")
+    assert "## Introduction" in markdown and "Body paragraph content." in markdown
+
+
+# ---- HackMD -------------------------------------------------------------
+
+def test_hackmd_detection():
+    """The HackMD fixture is detected as generator "hackmd"."""
+    assert detect_generator(parse_html(_load("hackmd_sample.html"))) == "hackmd"
+
+
+def test_hackmd_metadata_via_generic_fallback():
+    """HackMD has no specific extractor; metadata comes through the generic path."""
+    soup = parse_html(_load("hackmd_sample.html"))
+    meta = extract_metadata(soup, "hackmd")
+    # detect_generator is not called in this test: extract_metadata dispatches on
+    # its argument, and "hackmd" is expected to reach _extract_generic_metadata.
+    assert meta.get("title") == "P9999R0: Test HackMD Paper"
+    # The generic table scan should pick up document/audience from the <table>.
+    assert meta.get("document") == "P9999R0"
+    assert meta.get("audience") == "SG1"
+
+
+def test_hackmd_body_renders():
+    """Rendered HackMD body keeps the section heading and paragraph text."""
+    soup = parse_html(_load("hackmd_sample.html"))
+    strip_boilerplate(soup, "hackmd")
+    markdown = render_body(soup, "hackmd")
+    assert "## Introduction" in markdown and "Body paragraph." in markdown
+
+
+# ---- Hand-written (address form) ----------------------------------------
+
+def test_handwritten_address_detection():
+    """The address-form fixture is detected as generator "hand-written"."""
+    html = _load("handwritten_address_sample.html")
+    assert detect_generator(parse_html(html)) == "hand-written"
+
+
+def test_handwritten_address_metadata():
+    """The address-form fixture yields document, date, audience and reply-to."""
+    soup = parse_html(_load("handwritten_address_sample.html"))
+    meta = extract_metadata(soup, "hand-written")
+    assert meta.get("document") == "P9999R0"
+    assert meta.get("date") == "2026-03-15"
+    assert meta.get("audience") == "SG1"
+    assert any("alice@example.com" in who for who in meta.get("reply-to", []))
+
+
+def test_handwritten_address_boilerplate_stripped():
+    """After stripping, no <address> element is left in the soup."""
+    soup = parse_html(_load("handwritten_address_sample.html"))
+    strip_boilerplate(soup, "hand-written")
+    assert soup.find("address") is None
+
+
+def test_handwritten_address_body_renders():
+    """The address-form body still renders its "## Introduction" heading."""
+    soup = parse_html(_load("handwritten_address_sample.html"))
+    strip_boilerplate(soup, "hand-written")
+    assert "## Introduction" in render_body(soup, "hand-written")
+
+
+# ---- Hand-written (table.header form) -----------------------------------
+
+def test_handwritten_table_metadata():
+    """The table-form fixture yields document, date, audience and reply-to."""
+    soup = parse_html(_load("handwritten_table_sample.html"))
+    meta = extract_metadata(soup, "hand-written")
+    assert meta.get("document") == "P9998R0"
+    assert meta.get("date") == "2026-02-01"
+    assert meta.get("audience") == "EWG"
+    assert any("bob@example.com" in who for who in meta.get("reply-to", []))
+
+
+def test_handwritten_table_boilerplate_stripped():
+    soup = parse_html(_load("handwritten_table_sample.html"))
+    strip_boilerplate(soup, "hand-written")  # return value unused; soup is inspected
+    assert soup.find("table", class_="header") is None  # no <table class="header">
\ No newline at end of file
diff --git a/tomd/tests/test_position_lists.py b/tomd/tests/test_position_lists.py
new file mode 100644
index 0000000..c943919
--- /dev/null
+++ b/tomd/tests/test_position_lists.py
@@ -0,0 +1,142 @@
+
+"""Tests for position-based list detection in lib.pdf.structure."""
+from lib.pdf.structure import (
+    _detect_lists_by_position,
+    _split_section_by_position,
+    _join_bullet_marker_lines,
+)
+from lib.pdf.types import Section, SectionKind, Line, Span, Block
+
+
+def _bullet_line(text, x0, y=100.0):
+    """Build a single-span Line whose left edge sits at ``x0``.
+
+    No bullet is inserted here: callers pass ``text`` that already begins
+    with a bullet character so _line_starts_with_bullet returns True.
+    """
+    # Span and line share one box: 300 wide, 12 tall, top-left at (x0, y).
+    box = (x0, y, x0 + 300, y + 12)
+    return Line(
+        spans=[Span(text=text, font_name="Body", font_size=11.0, bbox=box)],
+        bbox=box,
+    )
+
+
+def _body_line(text, x0=50.0, y=100.0):
+    """Build a single-span Line whose left edge defaults to x=50.
+
+    No bullet is added; span and line share one 300x12 box at (x0, y).
+    """
+    box = (x0, y, x0 + 300, y + 12)
+    plain = Span(text=text, font_name="Body", font_size=11.0, bbox=box)
+    return Line(spans=[plain], bbox=box)
+
+
+def _make_section_with_lines(lines, kind=SectionKind.PARAGRAPH, font_size=11.0):
+    """Wrap ``lines`` in a page-0 Section whose text joins them with newlines."""
+    return Section(
+        kind=kind,
+        text="\n".join(line.text for line in lines),
+        lines=lines,
+        page_num=0,
+        font_size=font_size,
+    )
+
+
+# ---- _detect_lists_by_position -------------------------------------------
+
+def test_detect_preserves_non_paragraph_sections():
+    """A non-PARAGRAPH section passes through unchanged (only HEADING is exercised)."""
+    heading = _make_section_with_lines([_body_line("Heading")], kind=SectionKind.HEADING)
+    result = _detect_lists_by_position([heading])
+    assert result == [heading]
+
+
+def test_detect_paragraph_without_bullets_unchanged():
+    """Two bullet-free body lines come back as exactly one PARAGRAPH section."""
+    plain = _make_section_with_lines([
+        _body_line("First paragraph line."),
+        _body_line("Second paragraph line."),
+    ])
+    sections = _detect_lists_by_position([plain])
+    # One section out, and it is still a PARAGRAPH.
+    assert [sec.kind for sec in sections] == [SectionKind.PARAGRAPH]
+
+
+def test_detect_converts_indented_bullets_to_list():
+    """A body line followed by two indented bullets produces a LIST section."""
+    # The body line sits at x=50 and both bullets at x=80, so the bullets are
+    # indented relative to the body text (the body line keeps the default y).
+    mixed = _make_section_with_lines([
+        _body_line("Introduction paragraph.", x0=50.0),
+        _bullet_line("\u2022 first item", x0=80.0, y=114.0),
+        _bullet_line("\u2022 second item", x0=80.0, y=128.0),
+    ])
+    detected = _detect_lists_by_position([mixed])
+    kinds = [section.kind for section in detected]
+    # Membership only: the number and order of sections are not pinned.
+    assert SectionKind.LIST in kinds, f"got kinds={kinds}"
+
+
+def test_detect_mixed_list_and_paragraph_split():
+    """Body-margin lines between bullet groups split into their own PARAGRAPH."""
+    para = _make_section_with_lines([
+        _bullet_line("\u2022 first bullet", x0=80.0, y=100.0),
+        _body_line("Interstitial body text.", x0=50.0, y=114.0),
+        _bullet_line("\u2022 second bullet", x0=80.0, y=128.0),
+    ])
+    result = _detect_lists_by_position([para])
+    kinds = [sec.kind for sec in result]
+    # Intended shape is LIST, PARAGRAPH, LIST, but only membership is asserted.
+    assert SectionKind.LIST in kinds
+    assert SectionKind.PARAGRAPH in kinds
+
+
+# ---- indent-level tracking (exercised via _detect_lists_by_position) -----
+
+def test_split_section_by_position_nested_indent():
+    """Nested bullets yield at least one LIST section with indent_level > 0."""
+    # Exercised through _detect_lists_by_position, not by calling
+    # _split_section_by_position directly; the x=50 body line is the left-most.
+    para = _make_section_with_lines([
+        _body_line("Paragraph body.", x0=50.0, y=100.0),
+        _bullet_line("\u2022 outer", x0=80.0, y=114.0),
+        _bullet_line("\u2022 nested", x0=120.0, y=128.0),  # further right than outer
+    ])
+    result = _detect_lists_by_position([para])
+    list_sections = [s for s in result if s.kind == SectionKind.LIST]
+    indent_levels = [s.indent_level for s in list_sections]
+    # Only "> 0" is checked; the nested bullet's exact level is not pinned.
+    assert any(i > 0 for i in indent_levels), f"got indent_levels={indent_levels}"
+
+
+# ---- _join_bullet_marker_lines -------------------------------------------
+
+def test_join_bullet_marker_merges_bullet_and_text_lines():
+    """A lone-bullet line followed by its text line collapses into one Line."""
+    marker_box = (50, 100, 58, 112)
+    body_box = (68, 100, 200, 112)
+    marker = Span(text="\u2022", font_name="Body", font_size=11.0, bbox=marker_box)
+    body = Span(text="item text", font_name="Body", font_size=11.0, bbox=body_box)
+    pair = [Line(spans=[marker], bbox=marker_box), Line(spans=[body], bbox=body_box)]
+    merged = _join_bullet_marker_lines(pair)
+    # Exactly one Line survives; its text leads with the bullet and keeps
+    # the item text. The exact separator between them is not asserted.
+    assert len(merged) == 1
+    joined = merged[0].text
+    assert joined.startswith("\u2022")
+    assert "item text" in joined
+
+
+def test_join_bullet_marker_leaves_non_bullet_pairs_alone():
+    """Two consecutive bullet-free lines stay two lines after joining."""
+    # Both lines use _body_line's default position; neither carries a bullet.
+    ordinary = [_body_line("line one"), _body_line("line two")]
+    untouched = _join_bullet_marker_lines(ordinary)
+    assert len(untouched) == 2
+
+
+def test_join_bullet_marker_handles_single_line():
+    """A one-element input comes back equal to the same one-element list."""
+    solo = _body_line("solo line")
+    assert [solo] == _join_bullet_marker_lines([solo])
\ No newline at end of file