diff --git a/.github/workflows/tomd-tests.yml b/.github/workflows/tomd-tests.yml
new file mode 100644
index 0000000..239db26
--- /dev/null
+++ b/.github/workflows/tomd-tests.yml
@@ -0,0 +1,40 @@
+name: tomd tests
+
+on:
+  push:
+    branches: [master]
+    paths:  # only run when tomd or this workflow file changes
+      - "tomd/**"
+      - ".github/workflows/tomd-tests.yml"
+  pull_request:
+    paths:
+      - "tomd/**"
+      - ".github/workflows/tomd-tests.yml"
+  workflow_dispatch:  # allow manual runs
+
+jobs:
+  test:
+    name: pytest (Python ${{ matrix.python-version }})
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false  # let each Python version report independently
+      matrix:
+        python-version: ["3.12", "3.13"]  # quoted so versions stay strings
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+          cache: "pip"
+          cache-dependency-path: tomd/requirements.txt
+
+      - name: Install tomd with test extras
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install -e "tomd[test]"  # quoted: [] is a shell glob
+
+      - name: Run pytest
+        working-directory: tomd
+        run: python -m pytest tests/ -v --tb=short
diff --git a/tomd/.gitignore b/tomd/.gitignore
index d4db843..20dbd4a 100644
--- a/tomd/.gitignore
+++ b/tomd/.gitignore
@@ -1,5 +1,9 @@
**/__pycache__/
**/.pytest_cache/
*.pyc
+*.egg-info/
.out/
papers/
+.venv/
+build/
+dist/
diff --git a/tomd/CLAUDE.md b/tomd/CLAUDE.md
index cdc017d..15fa83b 100644
--- a/tomd/CLAUDE.md
+++ b/tomd/CLAUDE.md
@@ -100,7 +100,7 @@ Auto-resolution via `--llm` flag is deferred to v2. For v1, the tool produces a
## File Map
- `main.py` - CLI entry point. Argparse, glob expansion, output path logic, main(). No conversion logic.
-- `lib/__init__.py` - Shared text utilities and constants for PDF and HTML converters: `ascii_escape`, `strip_format_chars`, `format_front_matter`, `ALLOWED_LINK_SCHEMES`, and shared regex patterns (`EMAIL_RE`, `DATE_RE`, `DOC_NUM_RE`, `SECTION_NUM_PREFIX_RE`).
+- `lib/__init__.py` - Shared text utilities and constants for PDF and HTML converters: `ascii_escape`, `strip_format_chars`, `format_front_matter`, `parse_author_lines`, `ALLOWED_LINK_SCHEMES`, shared regex patterns (`EMAIL_RE`, `DATE_RE`, `DOC_NUM_RE`, `SECTION_NUM_PREFIX_RE`), and their reusable shape strings (`DOC_NUM_PATTERN`, `SECTION_NUM_PATTERN`) consumed by `lib/pdf/types.py` to build `DOC_FIELD_RE` and `SECTION_NUM_RE`.
- `lib/similarity.py` - Dual-algorithm string similarity (SequenceMatcher + Jaccard). Per-algorithm thresholds, 200-char circuit breaker. Format-agnostic.
- `lib/toc.py` - Table of Contents detection. Matches section texts against known headings using fuzzy similarity. Bridges small gaps. Format-agnostic - no dependency on PDF types.
- `lib/pdf/__init__.py` - Exports `convert_pdf()`. Orchestrates the full pipeline in order. Includes monospace propagation, wording classification, and page 0 color extraction via space-color proxy.
diff --git a/tomd/LICENSE b/tomd/LICENSE
new file mode 100644
index 0000000..e439b22
--- /dev/null
+++ b/tomd/LICENSE
@@ -0,0 +1,25 @@
+Boost Software License - Version 1.0 - August 17th, 2003
+
+Copyright (c) 2026 Vinnie Falco
+
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
diff --git a/tomd/README.md b/tomd/README.md
new file mode 100644
index 0000000..76456ad
--- /dev/null
+++ b/tomd/README.md
@@ -0,0 +1,96 @@
+# tomd
+
+Convert WG21 committee papers from PDF or HTML to clean Markdown.
+
+tomd is purpose-built for C++ standards committee paper conversion. It
+understands WG21 metadata fields (document number, date, reply-to, audience),
+detects structural elements (headings, lists, tables, code blocks, wording
+sections), and produces Markdown that looks like a human wrote it, suitable
+for version control, pull request diffs, and plain-text review workflows.
+
+## Install
+
+From this directory:
+
+```
+pip install -e .
+```
+
+Requires Python 3.12 or newer. Runtime dependencies (`pymupdf~=1.27.0`,
+`beautifulsoup4~=4.14.0`) are declared in `pyproject.toml` and installed
+automatically.
+
+## Usage
+
+```
+tomd paper.pdf # -> paper.md (+ paper.prompts.md if uncertain)
+tomd paper.html # -> paper.md
+tomd *.pdf *.html --outdir out/ # batch mode
+tomd -v paper.pdf # verbose logging
+tomd -o out.md paper.pdf # explicit output path (single-file only)
+```
+
+Also runnable as `python -m tomd.main ...`.
+
+### Output
+
+- `paper.md` is always produced. It contains YAML front matter (title,
+ document number, date, audience, reply-to) followed by the paper body
+ rendered as Markdown.
+- `paper.prompts.md` is produced only when the converter found uncertain
+ regions. It pairs each uncertain span with both extraction paths (MuPDF
+ and spatial) plus surrounding context, formatted for manual LLM
+ reconciliation. If no uncertain regions exist, no prompts file is written
+ (and any stale one at the output path is removed).
+
+### Uncertain regions
+
+tomd uses dual-extraction with confidence scoring. When the MuPDF and
+spatial paths disagree on a page, the region is emitted in the output
+marked with an HTML comment:
+
+```
+
+```
+
+The accompanying `.prompts.md` file contains ready-to-feed LLM prompts for
+each marker. You resolve uncertain regions manually; the LLM fixes
+structure, never content.
+
+## Limitations
+
+- **No OCR.** Scanned or image-only PDFs are not supported.
+- **No vision fallback.** Papers that rely on non-extractable layout
+ (complex equations, diagrams) will not convert cleanly.
+- **HTML generator coverage.** Four generators are detected directly:
+ mpark/wg21, Bikeshed, HackMD, and hand-written. Other sources fall back
+ to a generic extractor that may miss metadata fields.
+- **LLM auto-resolution is deferred to v2.** The `.prompts.md` file is
+ produced; feeding it to an LLM and applying the result is manual in this
+ release.
+
+## Design
+
+Design and architecture documentation lives alongside the code:
+
+- [`CLAUDE.md`](CLAUDE.md) - architecture rules and invariants (contributors
+ and AI agents).
+- [`lib/pdf/ARCHITECTURE.md`](lib/pdf/ARCHITECTURE.md) - PDF converter
+ pipeline and the techniques it uses.
+- [`lib/html/ARCHITECTURE.md`](lib/html/ARCHITECTURE.md) - HTML converter
+ pipeline.
+
+Read these in order if you are modifying tomd.
+
+## Development
+
+Install test extras and run the suite:
+
+```
+pip install -e ".[test]"
+pytest tests/
+```
+
+## License
+
+Boost Software License 1.0. See [`LICENSE`](LICENSE).
diff --git a/tomd/__init__.py b/tomd/__init__.py
new file mode 100644
index 0000000..ce03d3c
--- /dev/null
+++ b/tomd/__init__.py
@@ -0,0 +1 @@
+"""tomd - PDF and HTML to Markdown converter for WG21 papers."""
diff --git a/tomd/lib/__init__.py b/tomd/lib/__init__.py
index b7fcad4..dcd53d4 100644
--- a/tomd/lib/__init__.py
+++ b/tomd/lib/__init__.py
@@ -148,15 +148,24 @@ def parse_author_lines(lines, clean_line=None, skip_line=None):
DATE_RE = re.compile(r"\b(\d{4}-\d{2}-\d{2})\b")
-# Broad document-number pattern used for header stripping and HTML metadata.
+# Core pattern shapes (no anchors, no label context) reused across modules
+# so every document- and section-number pattern has a single source of truth.
+# `lib/pdf/types.py` builds the labeled PDF variants (DOC_FIELD_RE,
+# SECTION_NUM_RE) on top of these.
+DOC_NUM_PATTERN = (
+ r"[DPN]\d{3,5}R\d+"
+ r"|[DPN]\d{3,5}"
+ r"|N\d{3,5}"
+ r"|SD-\d+"
+)
+
+SECTION_NUM_PATTERN = r"\d+(?:\.\d+)*"
+
+# Broad document-number match used for header stripping and HTML metadata.
# For line-anchored field extraction in PDF blocks, see DOC_FIELD_RE in
# lib/pdf/types.py, which targets "Document Number: PXXXXrN" line prefixes.
-DOC_NUM_RE = re.compile(
- r"\b([DPN]\d{3,5}R\d+)\b"
- r"|\b([DPN]\d{3,5})\b"
- r"|\b(N\d{3,5})\b"
- r"|\b(SD-\d+)\b",
- re.IGNORECASE,
-)
+DOC_NUM_RE = re.compile(rf"\b({DOC_NUM_PATTERN})\b", re.IGNORECASE)
-SECTION_NUM_PREFIX_RE = re.compile(r"^\d+(?:\.\d+)*\.?\s+")
+# Leading section-number prefix used by the HTML renderer to strip a number
+# (e.g. "2.1.3 " or "1. ") from heading text.
+SECTION_NUM_PREFIX_RE = re.compile(rf"^{SECTION_NUM_PATTERN}\.?\s+")
diff --git a/tomd/lib/html/render.py b/tomd/lib/html/render.py
index 4235002..be1bb05 100644
--- a/tomd/lib/html/render.py
+++ b/tomd/lib/html/render.py
@@ -8,6 +8,7 @@
from .. import strip_format_chars, SECTION_NUM_PREFIX_RE, ALLOWED_LINK_SCHEMES
_HEADING_TAGS = frozenset({"h1", "h2", "h3", "h4", "h5", "h6"})
+_LIST_CONTAINER_TAGS = frozenset({"ul", "ol"})
def render_body(soup: BeautifulSoup, generator: str) -> str:
@@ -203,13 +204,16 @@ def _render_list(el: Tag, marker: str, generator: str) -> str | None:
items = []
for i, li in enumerate(el.find_all("li", recursive=False)):
prefix = f"{i + 1}." if marker == "1." else "-"
+ # Detach nested sublists before capturing inline text so they are not
+ # walked into by _inline_text (which would duplicate their contents).
+ subs = [sub.extract()
+ for sub in li.find_all(_LIST_CONTAINER_TAGS, recursive=False)]
nested_parts = []
- for sub in li.find_all(["ul", "ol"], recursive=False):
+ for sub in subs:
sub_rendered = _render_element(sub, generator)
if sub_rendered:
indented = "\n".join(" " + line for line in sub_rendered.split("\n"))
nested_parts.append(indented)
- sub.extract()
text = _collapse_whitespace(_inline_text(li))
if text:
diff --git a/tomd/lib/pdf/ARCHITECTURE.md b/tomd/lib/pdf/ARCHITECTURE.md
index ee562d2..ab00732 100644
--- a/tomd/lib/pdf/ARCHITECTURE.md
+++ b/tomd/lib/pdf/ARCHITECTURE.md
@@ -64,7 +64,7 @@ Enums:
- `dy > avg_fs * 1.8` -> line break
- `dy > avg_fs * 0.3` -> line break
- `dx > avg_fs * 0.3` -> word break (insert space)
-- Characters sorted by y-band (half font height) with stable sort preserving document order within each band
+- Characters sorted by y-band (half font height) then x-position, giving deterministic reading order within each band
**T3. Monospace classification (4 signals)**
- `mono.py:classify_monospace`
diff --git a/tomd/lib/pdf/cleanup.py b/tomd/lib/pdf/cleanup.py
index faf9761..8a5cad3 100644
--- a/tomd/lib/pdf/cleanup.py
+++ b/tomd/lib/pdf/cleanup.py
@@ -2,7 +2,7 @@
import logging
import re
-from collections import defaultdict
+from collections import defaultdict, Counter
from dataclasses import replace
from .. import strip_format_chars, DOC_NUM_RE
@@ -79,12 +79,7 @@ def detect_repeating(all_edge_items: list[list[PageEdgeItem]],
pages_seen = len(set(it.page_num for it in items))
if pages_seen < threshold:
continue
-
texts = [it.text for it in items]
- if len(set(texts)) == 1:
- repeating.add((y_key, texts[0]))
- _log.debug("Repeating exact: y=%.1f text=%r", y_key, texts[0])
- continue
if all(PAGE_NUM_RE.match(t) for t in texts):
repeating.add((y_key, "__PAGE_NUM__"))
@@ -96,6 +91,16 @@ def detect_repeating(all_edge_items: list[list[PageEdgeItem]],
_log.debug("Repeating doc number at y=%.1f", y_key)
continue
+ text_counts = Counter(it.text for it in items)
+ exact_hit = False
+ for text, count in text_counts.items():
+ if count >= threshold:
+ repeating.add((y_key, text))
+ _log.debug("Repeating exact: y=%.1f text=%r", y_key, text)
+ exact_hit = True
+ if exact_hit:
+ continue
+
return repeating
@@ -212,13 +217,7 @@ def find_hidden_regions(page, body_fonts: set[str] | None = None,
if body_fonts is None:
return hidden_bboxes
- try:
- trace = page.get_texttrace()
- except AttributeError:
- _log.debug("get_texttrace() not available; skipping hidden region detection")
- return hidden_bboxes
-
- for span in trace:
+ for span in page.get_texttrace():
if span.get("type") == 3:
continue
diff --git a/tomd/lib/pdf/types.py b/tomd/lib/pdf/types.py
index c816c13..3dfcac6 100644
--- a/tomd/lib/pdf/types.py
+++ b/tomd/lib/pdf/types.py
@@ -5,6 +5,8 @@
from dataclasses import dataclass, field
from enum import Enum
+from tomd.lib import DOC_NUM_PATTERN, SECTION_NUM_PATTERN
+
class Confidence(Enum):
"""Confidence level for structural classification decisions."""
@@ -135,15 +137,17 @@ class PageEdgeItem:
# --- Precompiled regex patterns ---
-SECTION_NUM_RE = re.compile(
- r"^(\d+(?:\.\d+)*)\s+(.+)",
-)
+# Section number at the start of a line with required trailing content
+# (used for heading detection); shares the core shape with
+# SECTION_NUM_PREFIX_RE in lib/__init__.py.
+SECTION_NUM_RE = re.compile(rf"^({SECTION_NUM_PATTERN})\s+(.+)")
# Line-anchored pattern targeting "Document Number: PXXXXRN" field lines in
# PDF block text. More restrictive than DOC_NUM_RE in lib/__init__.py, which
-# is a broad substring match used for header stripping and HTML contexts.
+# is a broad substring match used for header stripping and HTML contexts;
+# both patterns share the core DOC_NUM_PATTERN shape.
DOC_FIELD_RE = re.compile(
- r"Document\s+(?:Number|#)[:\s]+([DPN]\d{3,5}(?:R\d+)?|N\d{3,5})",
+ rf"Document\s+(?:Number|#)[:\s]+({DOC_NUM_PATTERN})",
re.IGNORECASE,
)
diff --git a/tomd/lib/similarity.py b/tomd/lib/similarity.py
index b6b4d9f..2de0772 100644
--- a/tomd/lib/similarity.py
+++ b/tomd/lib/similarity.py
@@ -51,6 +51,8 @@ def similar(a: str, b: str) -> bool:
The per-string check is lenient because the caller (TOC detection)
provides a second guard via the 3+ consecutive run requirement.
+ Identical strings short-circuit to True regardless of length; the
+ 200-char gate only protects against expensive fuzzy-compare work.
"""
if a == b:
return True
diff --git a/tomd/main.py b/tomd/main.py
index 4bf35f1..1326285 100644
--- a/tomd/main.py
+++ b/tomd/main.py
@@ -4,10 +4,12 @@
PDF: hybrid dual extraction (MuPDF + spatial rules) with confidence scoring.
HTML: DOM traversal with generator-specific metadata extraction.
-Usage:
- python tomd/main.py input.pdf # -> input.md + input.prompts.md
- python tomd/main.py input.html # -> input.md
- python tomd/main.py *.pdf *.html --outdir out/ # batch mode
+Usage (after `pip install -e tomd`):
+ tomd input.pdf # -> input.md + input.prompts.md
+ tomd input.html # -> input.md
+ tomd *.pdf *.html --outdir out/ # batch mode
+
+Also runnable as `python -m tomd.main ...`.
"""
import argparse
@@ -83,10 +85,10 @@ def main():
try:
ext = input_file.suffix.lower()
if ext in _HTML_EXTENSIONS:
- from lib.html import convert_html
+ from .lib.html import convert_html
md_text, prompts_text = convert_html(input_file)
elif ext in _PDF_EXTENSIONS:
- from lib.pdf import convert_pdf
+ from .lib.pdf import convert_pdf
md_text, prompts_text = convert_pdf(input_file)
else:
print(f"SKIP: {input_file} unsupported format", file=sys.stderr)
diff --git a/tomd/pyproject.toml b/tomd/pyproject.toml
new file mode 100644
index 0000000..dd2eb1d
--- /dev/null
+++ b/tomd/pyproject.toml
@@ -0,0 +1,32 @@
+[build-system]
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "tomd"
+version = "0.1.0"
+description = "PDF and HTML to Markdown converter for WG21 papers."
+readme = "README.md"
+requires-python = ">=3.12"
+license = {text = "BSL-1.0"}
+authors = [
+ {name = "Vinnie Falco"},
+]
+dependencies = [
+ "pymupdf~=1.27.0",
+ "beautifulsoup4~=4.14.0",
+]
+
+[project.optional-dependencies]
+test = [
+ "pytest~=8.0",
+]
+
+[project.scripts]
+tomd = "tomd.main:main"
+
+[tool.setuptools]
+packages = ["tomd", "tomd.lib", "tomd.lib.pdf", "tomd.lib.html"]
+
+[tool.setuptools.package-dir]
+"tomd" = "."
diff --git a/tomd/requirements.txt b/tomd/requirements.txt
index 737641f..86d5c0a 100644
--- a/tomd/requirements.txt
+++ b/tomd/requirements.txt
@@ -1,2 +1,10 @@
-pymupdf
-beautifulsoup4
+# Runtime dependencies for tomd. Pinned to compatible-release (~=) ranges
+# to protect against PyMuPDF API drift (tomd uses get_text "dict"/"rawdict",
+# get_texttrace, and get_drawings, any of which can shift between minor
+# versions) and to bound BeautifulSoup API changes.
+#
+# Bump these pins intentionally after running the full test suite against
+# the new version.
+
+pymupdf~=1.27.0
+beautifulsoup4~=4.14.0
diff --git a/tomd/tests/fixtures/html/bikeshed_sample.html b/tomd/tests/fixtures/html/bikeshed_sample.html
new file mode 100644
index 0000000..20153c7
--- /dev/null
+++ b/tomd/tests/fixtures/html/bikeshed_sample.html
@@ -0,0 +1,21 @@
+
+
+