diff --git a/exports/doclang-styled.docx b/exports/doclang-styled.docx index 5951e87..8c53087 100644 Binary files a/exports/doclang-styled.docx and b/exports/doclang-styled.docx differ diff --git a/exports/doclang.docx b/exports/doclang.docx index 0a243a5..87c1ebe 100644 Binary files a/exports/doclang.docx and b/exports/doclang.docx differ diff --git a/pyproject.toml b/pyproject.toml index 62bc5d9..8f6b5d8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,13 +48,13 @@ ci = [ "mypy~=1.6", "ruff>=0.14.11", "pytest~=8.3", + "python-docx>=1.2.0", ] dev = [ { include-group = "ci" }, "pydantic>=2.11.10", "pyyaml>=6.0.3", "openpyxl>=3.1.0", - "python-docx>=1.2.0", "docling>=2.0.0", ] @@ -130,6 +130,7 @@ exclude = "(^|/)tests/data/.*" [[tool.mypy.overrides]] module = [ + "docx.*", "lxml.*", "saxonche.*", "typer.*", diff --git a/reference/input/reference.xlsx b/reference/input/reference.xlsx index 427c646..20cc7ac 100644 Binary files a/reference/input/reference.xlsx and b/reference/input/reference.xlsx differ diff --git a/tests/test_export.py b/tests/test_export.py new file mode 100644 index 0000000..2498909 --- /dev/null +++ b/tests/test_export.py @@ -0,0 +1,51 @@ +"""Tests for HTML comment handling in export_docx.""" + +from __future__ import annotations + +import sys +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[1] +if str(ROOT) not in sys.path: + sys.path.insert(0, str(ROOT)) + +from utils.export_docx import _strip_html_comments_outside_code # noqa: E402 + + +def test_strip_prose_comments() -> None: + assert _strip_html_comments_outside_code("Hello world") == "Hello world" + + +def test_preserve_fenced_code_comments() -> None: + md = "```xml\n\n```" + assert _strip_html_comments_outside_code(md) == md + + +def test_preserve_inline_code_comments() -> None: + md = "Use `` here" + assert _strip_html_comments_outside_code(md) == md + + +def test_strip_prose_but_preserve_inline_code_on_same_line() -> None: + md = "See `foo ` and end" + assert _strip_html_comments_outside_code(md) == "See `foo ` and end" + + +def test_strip_prose_comments_inside_html_comment_with_fences() -> None: + md = "Before\n\nAfter\n" + assert _strip_html_comments_outside_code(md) == "Before\n\nAfter\n" + + +def test_strip_appendix_note_comment_from_spec() -> None: + md = Path(__file__).resolve().parents[1] / "spec.md" + stripped = _strip_html_comments_outside_code(md.read_text(encoding="utf-8")) + assert "NOTE: do not edit Appendix A" not in stripped + assert "Component-level metadata" not in stripped + assert "deriveable from the document" not in stripped + assert not stripped.rstrip().endswith("-->") + assert "" in stripped + + +def test_malformed_unclosed_inline_backtick_does_not_block_later_comment_stripping() -> None: + md = "See `; analogous to [``](#anchor)\n\nAfter\n" + assert _strip_html_comments_outside_code(md) == ("See `; analogous to [``](#anchor)\n\nAfter\n") diff --git a/tests/test_markdown_snippets.py b/tests/test_spec_snippets.py similarity index 100% rename from tests/test_markdown_snippets.py rename to tests/test_spec_snippets.py diff --git a/utils/export_docx.py b/utils/export_docx.py index ebc7db1..195feec 100644 --- a/utils/export_docx.py +++ b/utils/export_docx.py @@ -33,6 +33,7 @@ try: from docx import Document + from docx.document import Document as DocxDocument from docx.enum.text import WD_BREAK from docx.opc.constants import RELATIONSHIP_TYPE from docx.oxml import OxmlElement @@ -53,7 +54,7 @@ OUTPUT_DOCX = EXPORTS_DIR / "doclang.docx" -def add_toc(document: Document) -> None: +def add_toc(document: DocxDocument) -> None: # Insert a Word ToC field: TOC \o "1-3" \h \z \u p = document.add_paragraph() run = p.add_run() @@ -79,7 +80,7 @@ def add_toc(document: Document) -> None: hint._r.append(fld_end) -def add_toc_title_paragraph(document: Document): +def add_toc_title_paragraph(document: DocxDocument): """TOC section title: Heading 1 appearance without using Heading 1 style.""" try: paragraph = document.add_paragraph("Contents", style="TOC Heading") @@ -95,7 +96,7 @@ def add_toc_title_paragraph(document: Document): return paragraph -def add_toc_section(document: Document) -> None: +def add_toc_section(document: DocxDocument) -> None: """Insert a TOC title, the TOC field, and a page break after the section.""" add_toc_title_paragraph(document) document.add_paragraph() @@ -104,7 +105,7 @@ def add_toc_section(document: Document) -> None: break_p.add_run().add_break(WD_BREAK.PAGE) -def process_html_paragraph(document: Document, text: str) -> None: +def process_html_paragraph(document: DocxDocument, text: str) -> None: """Process paragraph text that contains HTML elements like
    ,
  • , and
    . Splits the text into segments and processes each appropriately: @@ -147,7 +148,7 @@ def process_html_paragraph(document: Document, text: str) -> None: process_br_tags(document, post_text) -def process_br_tags(document: Document, text: str) -> None: +def process_br_tags(document: DocxDocument, text: str) -> None: """Process text containing
    tags by splitting into multiple runs with line breaks.""" br_pattern = re.compile(r"", re.IGNORECASE) @@ -169,7 +170,7 @@ def process_br_tags(document: Document, text: str) -> None: p.add_run().add_break(WD_BREAK.LINE) -def finalize_paragraph_buf(document: Document, buf: list[str]) -> None: +def finalize_paragraph_buf(document: DocxDocument, buf: list[str]) -> None: if not buf: return text = " ".join(line.strip() for line in buf).strip() @@ -301,7 +302,7 @@ def split_md_row(row: str) -> list[str]: return [c.replace("\\|", "|") for c in re.split(r"\s*\|\s*", s)] -def add_code_block(document: Document, code_lines: list[str], language: Optional[str]) -> None: +def add_code_block(document: DocxDocument, code_lines: list[str], language: Optional[str]) -> None: # Add a monospaced, preformatted code block. p = document.add_paragraph() # Light grey background for the whole block @@ -468,13 +469,13 @@ def _process_code_and_bold(paragraph, text: str) -> None: _add_bold_and_text_runs(paragraph, tail) -def add_formatted_paragraph(document: Document, text: str): +def add_formatted_paragraph(document: DocxDocument, text: str): p = document.add_paragraph() _add_inline_formatted_runs(p, text) return p -def add_image(document: Document, src: str, base_dir: Path) -> bool: +def add_image(document: DocxDocument, src: str, base_dir: Path) -> bool: # Resolve and add image; return True if added. # Skip if remote URL. if re.match(r"^[a-z]+://", src): @@ -492,7 +493,12 @@ def add_image(document: Document, src: str, base_dir: Path) -> bool: # Resize to visible page width while preserving aspect ratio. try: section = document.sections[-1] - max_width = section.page_width - section.left_margin - section.right_margin + page_width = section.page_width + left_margin = section.left_margin + right_margin = section.right_margin + if page_width is None or left_margin is None or right_margin is None: + raise ValueError("section metrics unavailable") + max_width = page_width - left_margin - right_margin document.add_picture(str(img_path), width=max_width) except Exception: # Fallback if section metrics are unavailable @@ -583,7 +589,7 @@ def add_hyperlink(paragraph, url: str, text: str): def add_heading_with_bookmark( - document: Document, + document: DocxDocument, raw_title: str, markdown_level: int, bookmark_id: int, @@ -605,64 +611,91 @@ def add_heading_with_bookmark( add_bookmark(paragraph, markdown_anchor(raw_title), bookmark_id) +def _strip_html_comments_from_prose_line( + line: str, *, in_comment: bool, in_inline_code: bool +) -> tuple[str, bool, bool]: + """Strip HTML comments from a prose line, preserving inline `` `code` `` spans.""" + buf: list[str] = [] + i = 0 + length = len(line) + while i < length: + if in_comment: + end = line.find("-->", i) + if end == -1: + return "".join(buf), True, in_inline_code + i = end + 3 + in_comment = False + continue + + if in_inline_code: + ch = line[i] + buf.append(ch) + if ch == "`": + in_inline_code = False + i += 1 + continue + + if line.startswith(") outside fenced code blocks. + """Remove HTML comments () outside code. - Preserves comment markers that appear inside fenced code blocks. Also - removes multi-line comments and inline fragments within a line. + Preserves comment markers inside fenced code blocks and inline `` `code` `` + spans. Also removes multi-line comments and inline fragments within prose. """ lines = md_text.splitlines(keepends=True) out: list[str] = [] in_code = False in_comment = False for line in lines: - # Detect fenced code block start/end (must be fence-only line, like parser) - if not in_comment: - mcode = re.match(r"^(\s*)(`{3,}|~{3,})(\w+)?\s*$", line) - if mcode: - # Toggle code state - in_code = not in_code - out.append(line) - continue + # Fence lines inside HTML comments are comment text, not real code blocks. + # Opening fences may include a language tag (```xml); closing fences do not. + if not in_comment and not in_code and _is_fence_open_line(line): + in_code = True + out.append(line) + continue + if not in_comment and in_code and _is_fence_close_line(line): + in_code = False + out.append(line) + continue if in_code: out.append(line) continue - # Strip HTML comments in non-code text - i = 0 - L = len(line) - buf: list[str] = [] - while i < L: - if in_comment: - end = line.find("-->", i) - if end == -1: - # Entire remainder is within comment - i = L - in_comment = True - break - else: - # Close comment and continue scanning after it - i = end + 3 - in_comment = False - continue - else: - start = line.find("