From a2e6c1282cfaf1b1dfbffe726d6f47ddafbd408f Mon Sep 17 00:00:00 2001 From: linearcombination <4829djaskdfj@gmail.com> Date: Thu, 12 Mar 2026 09:34:26 -0700 Subject: [PATCH 1/7] Greater specificity of imports from re --- backend/doc/domain/parsing.py | 130 +++++++++++++++++----------------- 1 file changed, 64 insertions(+), 66 deletions(-) diff --git a/backend/doc/domain/parsing.py b/backend/doc/domain/parsing.py index eb5cf122..d734fee2 100644 --- a/backend/doc/domain/parsing.py +++ b/backend/doc/domain/parsing.py @@ -2,7 +2,17 @@ This module provides an API for parsing content. """ -import re +from re import ( + compile, + escape, + findall, + search, + split as re_split, + sub, + DOTALL, + MULTILINE, + Pattern, +) import time from glob import glob from os import DirEntry, scandir, walk @@ -72,15 +82,11 @@ ) from pydantic import HttpUrl - logger = settings.logger(__name__) H1, H2, H3, H4, H5 = "h1", "h2", "h3", "h4", "h5" -_SECTIONHEAD5_RE = re.compile( - r'\s*', - re.MULTILINE, -) +_SECTIONHEAD5_RE = compile(r'\s*') BC_ARTICLE_URL_FMT_STR: str = ( @@ -88,11 +94,11 @@ ) -CHAPTER_LABEL_REGEX = re.compile(r"\\cl\s+[^\n]+") -CHAPTER_LABEL_REGEX2 = re.compile(r"\\cl\s+(.+)") -CHAPTER_REGEX = re.compile(r"\\c\s+\d+") -CHAPTER_CAPTURE_REGEX = re.compile(r"(\\c\s+\d+)") -CHAPTER_CAPTURE_REGEX2 = re.compile(r"\\c\s+(\d+)") +CHAPTER_LABEL_REGEX = compile(r"\\cl\s+[^\n]+") +CHAPTER_LABEL_REGEX2 = compile(r"\\cl\s+(.+)") +CHAPTER_REGEX = compile(r"\\c\s+\d+") +CHAPTER_CAPTURE_REGEX = compile(r"(\\c\s+\d+)") +CHAPTER_CAPTURE_REGEX2 = compile(r"\\c\s+(\d+)") def find_usfm_files( @@ -256,7 +262,7 @@ def split_usfm_by_chapters( book_code: str, usfm_text: str, check_usfm: bool = settings.CHECK_USFM, - chapter_regex: re.Pattern[str] = CHAPTER_REGEX, + chapter_regex: Pattern[str] = CHAPTER_REGEX, resources_with_usfm_defects: Sequence[ tuple[str, str, str] ] = RESOURCES_WITH_USFM_DEFECTS, @@ -267,8 +273,8 @@ def split_usfm_by_chapters( """ chapter_markers = [] chapters = [] - 
chapter_markers = re.findall(chapter_regex, usfm_text) - chapters = re.split(chapter_regex, usfm_text) + chapter_markers = findall(chapter_regex, usfm_text) + chapters = re_split(chapter_regex, usfm_text) frontmatter = chapters.pop(0).strip() def needs_fixing() -> bool: @@ -303,28 +309,28 @@ def needs_fixing() -> bool: def ensure_chapter_label( chapter_usfm_text: str, chapter_num: int, - chapter_label_regex: re.Pattern[str] = CHAPTER_LABEL_REGEX, - chapter_regex: re.Pattern[str] = CHAPTER_REGEX, + chapter_label_regex: Pattern[str] = CHAPTER_LABEL_REGEX, + chapter_regex: Pattern[str] = CHAPTER_REGEX, ) -> str: r""" Modify USFM source to insert an English chapter label if it does not have one. Ensure that the chapter label includes the chapter number. """ - if not re.search(chapter_label_regex, chapter_usfm_text): - if re.search(chapter_regex, chapter_usfm_text): - chapter_usfm_text = re.sub( + if not search(chapter_label_regex, chapter_usfm_text): + if search(chapter_regex, chapter_usfm_text): + chapter_usfm_text = sub( r"(\\c\s+\d+)", "\n" + r"\1" + "\n" + r"\\cl Chapter " + f"{chapter_num}" + "\n", chapter_usfm_text, ) return chapter_usfm_text # Ensure chapter label contains the chapter number - match = re.search(r"\\cl\s+(.+)", chapter_usfm_text) + match = search(r"\\cl\s+(.+)", chapter_usfm_text) if match: label_text = match.group(1) if str(chapter_num) not in label_text: - updated_label = f"{re.escape(label_text)} {chapter_num}" - chapter_usfm_text = re.sub( + updated_label = f"{escape(label_text)} {chapter_num}" + chapter_usfm_text = sub( r"\\cl\s+(.+)", rf"\\cl {updated_label}", chapter_usfm_text, @@ -338,13 +344,13 @@ def ensure_chapter_label( def ensure_no_chapter_labels( chapter_usfm_text: str, - chapter_label_regex: re.Pattern[str] = CHAPTER_LABEL_REGEX, + chapter_label_regex: Pattern[str] = CHAPTER_LABEL_REGEX, ) -> str: r""" Modify USFM source to remove all chapter labels, \cl. 
""" - if re.search(chapter_label_regex, chapter_usfm_text): - updated_chapter_usfm_text = re.sub( + if search(chapter_label_regex, chapter_usfm_text): + updated_chapter_usfm_text = sub( chapter_label_regex, "", chapter_usfm_text, @@ -355,10 +361,10 @@ def ensure_no_chapter_labels( def get_chapter_num( chapter_usfm_text: str, - chapter_regex: re.Pattern[str] = CHAPTER_CAPTURE_REGEX2, + chapter_regex: Pattern[str] = CHAPTER_CAPTURE_REGEX2, ) -> int: """Get the chapter number from the USFM chapter source text.""" - if match := re.search(chapter_regex, chapter_usfm_text): + if match := search(chapter_regex, chapter_usfm_text): chapter_num = match.group(1) return int(chapter_num) return -1 # return sentinal @@ -372,7 +378,7 @@ def remove_null_bytes_and_control_characters(html_content: Optional[str]) -> str USFM. We strip those out as well as the possibility of ASCII NULL bytes. """ - return re.sub(r"[\x00-\x1F]+", "", html_content) if html_content else "" + return sub(r"[\x00-\x1F]+", "", html_content) if html_content else "" def extract_usfm_frontmatter(frontmatter: str) -> dict[str, str]: @@ -385,7 +391,7 @@ def extract_usfm_frontmatter(frontmatter: str) -> dict[str, str]: } extracted_data = {} for key, pattern in patterns.items(): - match = re.search(pattern, frontmatter, re.MULTILINE) + match = search(pattern, frontmatter, MULTILINE) if match: extracted_data[key] = match.group(1).strip() return extracted_data @@ -428,17 +434,17 @@ def maybe_localized_book_name(frontmatter: str) -> str: def ensure_chapter_marker( chapter_usfm_text: str, chapter_num: int, - chapter_regex: re.Pattern[str] = CHAPTER_CAPTURE_REGEX, + chapter_regex: Pattern[str] = CHAPTER_CAPTURE_REGEX, ) -> str: r""" Modify USFM source to insert a chapter marker, \c , if it does not have one. 
""" - if re.search(chapter_regex, chapter_usfm_text): + if search(chapter_regex, chapter_usfm_text): logger.debug("Chapter marker already existed, didn't add one") return chapter_usfm_text logger.debug("Chapter marker is missing, adding one...") # Try inserting before \cl, if present - if match := re.search(r"\\cl\s+[^\n]+", chapter_usfm_text): + if match := search(r"\\cl\s+[^\n]+", chapter_usfm_text): insert_pos = match.start() return ( chapter_usfm_text[:insert_pos] @@ -449,8 +455,10 @@ def ensure_chapter_marker( return f"\\c {chapter_num}\n" + chapter_usfm_text -def remove_sectionhead5_elements(content: str) -> str: - return _SECTIONHEAD5_RE.sub(" ", content) +def remove_sectionhead5_elements( + content: str, sectionhead5_re: Pattern[str] = _SECTIONHEAD5_RE +) -> str: + return sectionhead5_re.sub(" ", content) def usfm_book_content( @@ -729,7 +737,7 @@ def tn_verses_html( resource_requests, ) verse_html_content = cast(str, mistune.markdown(verse_md_content)) - adjusted_verse_html_content = re.sub(h1, h5, verse_html_content) + adjusted_verse_html_content = sub(h1, h5, verse_html_content) verses_html[verse_ref] = verse_fmt_str.format( # NOTE Use nationalized book name from usfm book if available rather # than English book name as here - we accompish this later in @@ -871,7 +879,7 @@ def tq_chapter_verses( resource_requests, ) verse_html_content = cast(str, mistune.markdown(verse_md_content)) - adjusted_verse_html_content = re.sub(h1, h5, verse_html_content) + adjusted_verse_html_content = sub(h1, h5, verse_html_content) verses_html[verse_ref] = verse_label_fmt_str.format( book_names[book_code], chapter_num, @@ -945,8 +953,8 @@ def tw_name_content_pairs( translation_words_dict_, ) html_word_content = cast(str, mistune.markdown(translation_word_content)) - html_word_content = re.sub(h2, h4, html_word_content) - html_word_content = re.sub(h1, h3, html_word_content) + html_word_content = sub(h2, h4, html_word_content) + html_word_content = sub(h1, h3, 
html_word_content) if generate_docx: html_word_content = preprocess_html_for_internal_docx_links( html_word_content @@ -1000,7 +1008,7 @@ def modify_commentary_label( ) -> str: # Modify chapter heading if it's the first chapter if chapter_num == 1: - chapter_commentary_html_content = re.sub( + chapter_commentary_html_content = sub( r"

(.*?)<\/h1>", r"

\1 Commentary

", chapter_commentary_html_content, @@ -1012,7 +1020,7 @@ def replace_relative_with_absolute_links( chapter_commentary_html_content: str, url_fmt_str: str = BC_ARTICLE_URL_FMT_STR, ) -> str: - chapter_commentary_html_content = re.sub( + chapter_commentary_html_content = sub( r'', lambda match: ' str: into a chapter this ends up creating a duplicate chapter marker. We deal with that here. """ - cleaned_verse_content = re.sub(r"^\\c\s+\d+", "", verse_content) + cleaned_verse_content = sub(r"^\\c\s+\d+", "", verse_content) return cleaned_verse_content @@ -1421,23 +1429,19 @@ def split_chapter_into_verses(chapter: USFMChapter) -> dict[str, str]: # ''' verse_dict = {} # Find all verse spans - verse_spans = re.findall( - r'(.*?)', chapter.content, re.DOTALL - ) + verse_spans = findall(r'(.*?)', chapter.content, DOTALL) for verse_span in verse_spans: # Extract the verse number from the versemarker - verse_number = re.search(r'(\d+)', verse_span) + verse_number = search(r'(\d+)', verse_span) if verse_number: verse_number_ = verse_number.group(1) # Remove versemarker - verse_text = re.sub(r'.*?', "", verse_span) + verse_text = sub(r'.*?', "", verse_span) # Remove footnotes numbers - verse_text = re.sub( - r'.*?', "", verse_text - ) + verse_text = sub(r'.*?', "", verse_text) # Fix spacing issue when div class="poetry-*" type divs # are used, e.g., yielding 'heartsas' for Hebrews 3:8 - verse_text = re.sub( + verse_text = sub( r'
(.*?)
', r" \1", verse_text, @@ -1503,12 +1507,10 @@ def split_chapter_into_verses_with_formatting( # footnote callers) and the second element the target footnotes HTML? verse_dict = {} # Find all verse spans - verse_spans = re.findall( - r'(.*?)', chapter.content, re.DOTALL - ) + verse_spans = findall(r'(.*?)', chapter.content, DOTALL) for verse_span in verse_spans: # Extract the verse number from the versemarker - verse_number = re.search(r'(\d+)', verse_span) + verse_number = search(r'(\d+)', verse_span) if verse_number: verse_number_ = verse_number.group(1) # Add to the dictionary with verse number as the key and verse text as the value @@ -1548,19 +1550,15 @@ def split_chapter_into_verses_with_formatting_for_f10( # cleaned_html = "".join(str(c) for c in verse_span.contents) cleaned_html = str(verse_span) # Fix spacing issues introduced by inner spans - cleaned_html = re.sub( + cleaned_html = sub( r"\s+([,;:.!?])", r"\1", cleaned_html ) # remove space before punctuation - cleaned_html = re.sub( - r"\s+'", "'", cleaned_html - ) # remove space before apostrophe - cleaned_html = re.sub( - r"'\s+", "'", cleaned_html - ) # remove space after apostrophe - cleaned_html = re.sub( + cleaned_html = sub(r"\s+'", "'", cleaned_html) # remove space before apostrophe + cleaned_html = sub(r"'\s+", "'", cleaned_html) # remove space after apostrophe + cleaned_html = sub( r"\s*-\s*", "-", cleaned_html ) # normalize spaces around hyphens - cleaned_html = re.sub(r"\s{2,}", " ", cleaned_html) # collapse double spaces + cleaned_html = sub(r"\s{2,}", " ", cleaned_html) # collapse double spaces cleaned_html = cleaned_html.strip() # if you want plain text instead, use: cleaned_text = verse_span.get_text(" ", strip=True) # store cleaned HTML fragment (still contains etc.) 
From a322e344ca14aace56f1b2b4b26da10db65f24e8 Mon Sep 17 00:00:00 2001 From: linearcombination <4829djaskdfj@gmail.com> Date: Thu, 12 Mar 2026 09:35:01 -0700 Subject: [PATCH 2/7] Ensure space before verse numbers --- backend/templates/html/header_compact_enclosing.html | 4 ++++ backend/templates/html/header_enclosing.html | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/backend/templates/html/header_compact_enclosing.html b/backend/templates/html/header_compact_enclosing.html index ddbf1859..38f2d234 100644 --- a/backend/templates/html/header_compact_enclosing.html +++ b/backend/templates/html/header_compact_enclosing.html @@ -446,6 +446,10 @@ font-weight: normal; } + .verse + .verse::before { + content: " "; + } + .versemarker { font-size: 0.5em; vertical-align: top; diff --git a/backend/templates/html/header_enclosing.html b/backend/templates/html/header_enclosing.html index f47f2279..049d0a68 100644 --- a/backend/templates/html/header_enclosing.html +++ b/backend/templates/html/header_enclosing.html @@ -448,6 +448,10 @@ font-weight: normal; } + .verse + .verse::before { + content: " "; + } + .versemarker { font-size: 0.5em; vertical-align: top; From d64d467cd8a45388cdc836ef809b2e5048449b5e Mon Sep 17 00:00:00 2001 From: linearcombination <4829djaskdfj@gmail.com> Date: Thu, 12 Mar 2026 15:55:58 -0700 Subject: [PATCH 3/7] Remove sectionhead5 sections and empty paragraph elements --- backend/doc/domain/parsing.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/backend/doc/domain/parsing.py b/backend/doc/domain/parsing.py index d734fee2..af5dfd8b 100644 --- a/backend/doc/domain/parsing.py +++ b/backend/doc/domain/parsing.py @@ -86,8 +86,8 @@ H1, H2, H3, H4, H5 = "h1", "h2", "h3", "h4", "h5" -_SECTIONHEAD5_RE = compile(r'\s*') - +SECTIONHEAD5_RE = compile(r'\s*') +EMPTY_P_RE = compile(r"

\s*

") BC_ARTICLE_URL_FMT_STR: str = ( "https://content.bibletranslationtools.org/WycliffeAssociates/en_bc/src/branch/master/{}" @@ -455,10 +455,13 @@ def ensure_chapter_marker( return f"\\c {chapter_num}\n" + chapter_usfm_text -def remove_sectionhead5_elements( - content: str, sectionhead5_re: Pattern[str] = _SECTIONHEAD5_RE +def remove_unwanted_elements( + content: str, + sectionhead5_re: Pattern[str] = SECTIONHEAD5_RE, + empty_paragraph_re: Pattern[str] = EMPTY_P_RE, ) -> str: - return sectionhead5_re.sub(" ", content) + result = sectionhead5_re.sub(" ", content) + return empty_paragraph_re.sub("", result) def usfm_book_content( @@ -522,15 +525,15 @@ def usfm_book_content( chapter_html_content = usfm_chapter_html( chapter_usfm, input_file, output_file, chapter_num ) - cleaned_chapter_html_content = remove_null_bytes_and_control_characters( + cleaned_chapter_html_content_ = remove_null_bytes_and_control_characters( chapter_html_content ) - chapter_html_content_sans_s5 = remove_sectionhead5_elements( - cleaned_chapter_html_content + cleaned_chapter_html_content = remove_unwanted_elements( + cleaned_chapter_html_content_ ) usfm_chapters[chapter_num] = USFMChapter( content=( - chapter_html_content_sans_s5 if chapter_html_content_sans_s5 else "" + cleaned_chapter_html_content if cleaned_chapter_html_content else "" ), verses=None, ) From ad631eb6dd48424ea8e0bab04e7056306443a9d1 Mon Sep 17 00:00:00 2001 From: linearcombination <4829djaskdfj@gmail.com> Date: Fri, 13 Mar 2026 04:14:43 -0700 Subject: [PATCH 4/7] Upgrade from htmldocx to html4docx package --- backend/doc/domain/document_generator.py | 2 +- backend/passages/domain/document_generator.py | 3 +-- backend/passages/utils/docx_utils.py | 2 +- backend/requirements.in | 2 +- backend/requirements.txt | 6 +++--- backend/stet/domain/document_generator.py | 2 +- backend/stet/utils/docx_utils.py | 2 +- pyproject.toml | 2 +- 8 files changed, 10 insertions(+), 11 deletions(-) diff --git 
a/backend/doc/domain/document_generator.py b/backend/doc/domain/document_generator.py index 4429b0bf..b2933220 100755 --- a/backend/doc/domain/document_generator.py +++ b/backend/doc/domain/document_generator.py @@ -75,7 +75,7 @@ from docx.shared import RGBColor from docxcompose.composer import Composer # type: ignore from docxtpl import DocxTemplate # type: ignore -from htmldocx import HtmlToDocx # type: ignore +from html4docx import HtmlToDocx # type: ignore logger = settings.logger(__name__) diff --git a/backend/passages/domain/document_generator.py b/backend/passages/domain/document_generator.py index a55b046b..4a79e56d 100644 --- a/backend/passages/domain/document_generator.py +++ b/backend/passages/domain/document_generator.py @@ -24,7 +24,7 @@ from docx.oxml import parse_xml from docx.shared import Inches, RGBColor from docx.table import _Cell, _Row -from htmldocx import HtmlToDocx # type: ignore +from html4docx import HtmlToDocx # type: ignore from passages.domain.model import ( Passage, BibleReferenceWithAvailability, @@ -34,7 +34,6 @@ from passages.utils.docx_utils import add_footer, add_header from pydantic import Json - if TYPE_CHECKING: from typing import TypeAlias diff --git a/backend/passages/utils/docx_utils.py b/backend/passages/utils/docx_utils.py index 74467601..43c8c01f 100644 --- a/backend/passages/utils/docx_utils.py +++ b/backend/passages/utils/docx_utils.py @@ -11,7 +11,7 @@ from docx.shared import Pt, RGBColor from docx.table import Table from docx.text.paragraph import Paragraph -from htmldocx import HtmlToDocx # type: ignore +from html4docx import HtmlToDocx # type: ignore from docx.table import _Cell, _Row diff --git a/backend/requirements.in b/backend/requirements.in index 2cb262e9..c6af9614 100644 --- a/backend/requirements.in +++ b/backend/requirements.in @@ -15,7 +15,7 @@ fastapi[all] filelock flower gunicorn -htmldocx +html-for-docx jinja2 mistune orjson diff --git a/backend/requirements.txt b/backend/requirements.txt index 
f09ee819..4c7da8c2 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -24,7 +24,7 @@ babel==2.18.0 beautifulsoup4==4.14.3 # via # -r backend/requirements.in - # htmldocx + # html-for-docx billiard==4.2.4 # via celery brotli==1.2.0 @@ -95,7 +95,7 @@ h11==0.16.0 # via # httpcore # uvicorn -htmldocx==0.0.6 +html-for-docx==1.1.4 # via -r backend/requirements.in httpcore==1.0.9 # via httpx @@ -176,7 +176,7 @@ python-docx==1.2.0 # via # docxcompose3 # docxtpl - # htmldocx + # html-for-docx python-dotenv==1.2.1 # via # -r backend/requirements.in diff --git a/backend/stet/domain/document_generator.py b/backend/stet/domain/document_generator.py index a6d3d393..c70022de 100644 --- a/backend/stet/domain/document_generator.py +++ b/backend/stet/domain/document_generator.py @@ -25,7 +25,7 @@ from docx.enum.text import WD_PARAGRAPH_ALIGNMENT from docx.oxml import OxmlElement from docx.oxml.ns import qn -from htmldocx import HtmlToDocx # type: ignore +from html4docx import HtmlToDocx # type: ignore from pydantic import Json from stet.domain.model import VerseEntry, WordEntry from stet.domain.parser import get_word_entry_dtos diff --git a/backend/stet/utils/docx_utils.py b/backend/stet/utils/docx_utils.py index c9cfaff6..0bf56588 100644 --- a/backend/stet/utils/docx_utils.py +++ b/backend/stet/utils/docx_utils.py @@ -13,7 +13,7 @@ from docx.shared import Pt, RGBColor from docx.table import Table from docx.text.paragraph import Paragraph -from htmldocx import HtmlToDocx # type: ignore[import-untyped] +from html4docx import HtmlToDocx # type: ignore[import-untyped] from docx.table import _Cell, _Row diff --git a/pyproject.toml b/pyproject.toml index 2050121f..f89abfd5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ dependencies = [ "fastapi[all]", "filelock", "flower", - "htmldocx", + "html-for-docx", "html2docx", "gunicorn", "jinja2", From 7b901bf8e1438df8c458f48f3b5f98c2464ef88c Mon Sep 17 00:00:00 2001 From: linearcombination 
<4829djaskdfj@gmail.com> Date: Fri, 13 Mar 2026 04:15:29 -0700 Subject: [PATCH 5/7] Sort imports and format whitespace --- backend/doc/domain/document_generator.py | 4 ++-- backend/stet/domain/document_generator.py | 5 +---- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/backend/doc/domain/document_generator.py b/backend/doc/domain/document_generator.py index b2933220..a66f668f 100755 --- a/backend/doc/domain/document_generator.py +++ b/backend/doc/domain/document_generator.py @@ -3,14 +3,13 @@ and eventually a final document produced. """ +import re import subprocess import time from datetime import datetime from os.path import exists, join from typing import Final, Mapping, Optional, Sequence, TypeAlias, cast -# import regex as re # not yet supported in python 3.13 - used for unicode word boundaries for RTL languages -import re from celery import current_task from doc.config import settings from doc.domain import parsing, resource_lookup, worker @@ -77,6 +76,7 @@ from docxtpl import DocxTemplate # type: ignore from html4docx import HtmlToDocx # type: ignore +# import regex as re # not yet supported in python 3.13 - used for unicode word boundaries for RTL languages logger = settings.logger(__name__) diff --git a/backend/stet/domain/document_generator.py b/backend/stet/domain/document_generator.py index c70022de..b45feb79 100644 --- a/backend/stet/domain/document_generator.py +++ b/backend/stet/domain/document_generator.py @@ -47,7 +47,6 @@ ) from stet.utils.util import extract_chapter_and_beyond - logger = settings.logger(__name__) @@ -360,9 +359,7 @@ def generate_docx( source_paragraph = row_cells[0].paragraphs[0] source_paragraph.paragraph_format.line_spacing = 2.0 # Adjust line spacing if verse.source_has_preformatted_bolding: - add_preformatted_html_to_docx( - verse.source_text, source_paragraph - ) + add_preformatted_html_to_docx(verse.source_text, source_paragraph) elif len(word_entry.bolded_phrases) > 0: 
add_highlighted_html_to_docx_for_words( verse.source_text, source_paragraph, word_entry.bolded_phrases From 74779174c47ab1fc9fa4cbf0ab7c467980d93818 Mon Sep 17 00:00:00 2001 From: linearcombination <4829djaskdfj@gmail.com> Date: Fri, 13 Mar 2026 15:20:48 -0700 Subject: [PATCH 6/7] html4docx handles internal links html4docx handles internal links whereas htmldocx did not, so we can remove some custom code --- backend/doc/domain/document_generator.py | 4 +--- backend/doc/domain/parsing.py | 5 ---- backend/doc/utils/docx_util.py | 30 ------------------------ 3 files changed, 1 insertion(+), 38 deletions(-) diff --git a/backend/doc/domain/document_generator.py b/backend/doc/domain/document_generator.py index a66f668f..6fdbf1c3 100755 --- a/backend/doc/domain/document_generator.py +++ b/backend/doc/domain/document_generator.py @@ -51,7 +51,6 @@ from doc.utils.docx_util import ( add_internal_docx_links, generate_docx_toc, - preprocess_html_for_internal_docx_links, style_superscripts, ) from doc.utils.file_utils import ( @@ -767,8 +766,7 @@ def compose_docx_document( else: add_one_column_section(doc) try: - processed_html = preprocess_html_for_internal_docx_links(part.content) - html_to_docx.add_html_to_document(processed_html, doc) + html_to_docx.add_html_to_document(part.content, doc) except ValueError as e: logger.exception("Error converting HTML to docx: %s", e) if part.use_section_visual_separator: diff --git a/backend/doc/domain/parsing.py b/backend/doc/domain/parsing.py index af5dfd8b..2423f68c 100644 --- a/backend/doc/domain/parsing.py +++ b/backend/doc/domain/parsing.py @@ -62,7 +62,6 @@ ) from doc.reviewers_guide.model import RGBook from doc.reviewers_guide.parser import get_rg_books -from doc.utils.docx_util import preprocess_html_for_internal_docx_links from doc.utils.text_utils import ( maybe_correct_book_name, chapter_label_numeric_part, @@ -958,10 +957,6 @@ def tw_name_content_pairs( html_word_content = cast(str, mistune.markdown(translation_word_content)) 
html_word_content = sub(h2, h4, html_word_content) html_word_content = sub(h1, h3, html_word_content) - if generate_docx: - html_word_content = preprocess_html_for_internal_docx_links( - html_word_content - ) pair = TWNameContentPair( localized_translation_word_, translation_word_filepath, diff --git a/backend/doc/utils/docx_util.py b/backend/doc/utils/docx_util.py index 703a2095..7490a4d7 100644 --- a/backend/doc/utils/docx_util.py +++ b/backend/doc/utils/docx_util.py @@ -46,36 +46,6 @@ def generate_docx_toc(docx_filepath: str) -> str: return str(toc_path) -def preprocess_html_for_internal_docx_links(html: str) -> str: - """ - Replace internal HTML anchors and headings with markers that survive HTML→DOCX conversion. - Example: -

→ {{BOOKMARK:intro}} - Christ → {{LINK_START:intro}}Christ{{LINK_END}} - """ - # Mark bookmarks - html = re.sub( - r'', - r"{{BOOKMARK:\1}}

", - html, - flags=re.IGNORECASE, - ) - # Replace links - html = re.sub( - r'(.*?)', - r"{{LINK_START:\1}}\2{{LINK_END}}", - html, - flags=re.IGNORECASE | re.DOTALL, - ) - html = re.sub( - r'(.*?)', - r"{{LINK_START:\1}}\2{{LINK_END}}", - html, - flags=re.IGNORECASE | re.DOTALL, - ) - return html - - def _make_text_run(text: str) -> Element: r = OxmlElement("w:r") t = OxmlElement("w:t") From 7039b7d16fa4557618c99055724ba9d726f61727 Mon Sep 17 00:00:00 2001 From: linearcombination <4829djaskdfj@gmail.com> Date: Fri, 13 Mar 2026 15:21:48 -0700 Subject: [PATCH 7/7] Ensure spacing before verse numbers in docx where needed This solves issue #292 --- backend/doc/utils/docx_util.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/backend/doc/utils/docx_util.py b/backend/doc/utils/docx_util.py index 7490a4d7..efaa20c0 100644 --- a/backend/doc/utils/docx_util.py +++ b/backend/doc/utils/docx_util.py @@ -174,13 +174,20 @@ def style_superscripts( 2 = +1pt 4 = +2pt 6 = +3pt - color: RGBColor for superscripts (e.g. light gray) """ for para in doc.paragraphs: - for run in para.runs: + runs = para.runs + for i, run in enumerate(runs): if run.font.superscript: + # --- Ensure space before superscript --- + if run.text and not run.text[0].isspace(): + prev_char = None + if i > 0 and runs[i - 1].text: + prev_char = runs[i - 1].text[-1] + if not prev_char or not prev_char.isspace(): + run.text = " " + run.text # --- Color --- run.font.color.rgb = color # --- Vertical position ---