From a2e6c1282cfaf1b1dfbffe726d6f47ddafbd408f Mon Sep 17 00:00:00 2001
From: linearcombination <4829djaskdfj@gmail.com>
Date: Thu, 12 Mar 2026 09:34:26 -0700
Subject: [PATCH 1/7] Import specific names from re instead of the module

---
 backend/doc/domain/parsing.py | 130 +++++++++++++++++-----------------
 1 file changed, 64 insertions(+), 66 deletions(-)
diff --git a/backend/doc/domain/parsing.py b/backend/doc/domain/parsing.py
index eb5cf122..d734fee2 100644
--- a/backend/doc/domain/parsing.py
+++ b/backend/doc/domain/parsing.py
@@ -2,7 +2,17 @@
 This module provides an API for parsing content.
 """
 
-import re
+from re import (
+    compile,
+    escape,
+    findall,
+    search,
+    split as re_split,
+    sub,
+    DOTALL,
+    MULTILINE,
+    Pattern,
+)
 import time
 from glob import glob
 from os import DirEntry, scandir, walk
@@ -72,15 +82,11 @@
 )
 from pydantic import HttpUrl
 
-
 logger = settings.logger(__name__)
 
 H1, H2, H3, H4, H5 = "h1", "h2", "h3", "h4", "h5"
 
-_SECTIONHEAD5_RE = re.compile(
-    r'<div\s+class="sectionhead-5">\s*</div>',
-    re.MULTILINE,
-)
+_SECTIONHEAD5_RE = compile(r'<div\s+class="sectionhead-5">\s*</div>')
 
 
 BC_ARTICLE_URL_FMT_STR: str = (
@@ -88,11 +94,11 @@
 )
 
 
-CHAPTER_LABEL_REGEX = re.compile(r"\\cl\s+[^\n]+")
-CHAPTER_LABEL_REGEX2 = re.compile(r"\\cl\s+(.+)")
-CHAPTER_REGEX = re.compile(r"\\c\s+\d+")
-CHAPTER_CAPTURE_REGEX = re.compile(r"(\\c\s+\d+)")
-CHAPTER_CAPTURE_REGEX2 = re.compile(r"\\c\s+(\d+)")
+CHAPTER_LABEL_REGEX = compile(r"\\cl\s+[^\n]+")
+CHAPTER_LABEL_REGEX2 = compile(r"\\cl\s+(.+)")
+CHAPTER_REGEX = compile(r"\\c\s+\d+")
+CHAPTER_CAPTURE_REGEX = compile(r"(\\c\s+\d+)")
+CHAPTER_CAPTURE_REGEX2 = compile(r"\\c\s+(\d+)")
 
 
 def find_usfm_files(
@@ -256,7 +262,7 @@ def split_usfm_by_chapters(
     book_code: str,
     usfm_text: str,
     check_usfm: bool = settings.CHECK_USFM,
-    chapter_regex: re.Pattern[str] = CHAPTER_REGEX,
+    chapter_regex: Pattern[str] = CHAPTER_REGEX,
     resources_with_usfm_defects: Sequence[
         tuple[str, str, str]
     ] = RESOURCES_WITH_USFM_DEFECTS,
@@ -267,8 +273,8 @@ def split_usfm_by_chapters(
     """
     chapter_markers = []
     chapters = []
-    chapter_markers = re.findall(chapter_regex, usfm_text)
-    chapters = re.split(chapter_regex, usfm_text)
+    chapter_markers = findall(chapter_regex, usfm_text)
+    chapters = re_split(chapter_regex, usfm_text)
     frontmatter = chapters.pop(0).strip()
 
     def needs_fixing() -> bool:
@@ -303,28 +309,28 @@ def needs_fixing() -> bool:
 def ensure_chapter_label(
     chapter_usfm_text: str,
     chapter_num: int,
-    chapter_label_regex: re.Pattern[str] = CHAPTER_LABEL_REGEX,
-    chapter_regex: re.Pattern[str] = CHAPTER_REGEX,
+    chapter_label_regex: Pattern[str] = CHAPTER_LABEL_REGEX,
+    chapter_regex: Pattern[str] = CHAPTER_REGEX,
 ) -> str:
     r"""
     Modify USFM source to insert an English chapter label if it does not have one.
     Ensure that the chapter label includes the chapter number.
     """
-    if not re.search(chapter_label_regex, chapter_usfm_text):
-        if re.search(chapter_regex, chapter_usfm_text):
-            chapter_usfm_text = re.sub(
+    if not search(chapter_label_regex, chapter_usfm_text):
+        if search(chapter_regex, chapter_usfm_text):
+            chapter_usfm_text = sub(
                 r"(\\c\s+\d+)",
                 "\n" + r"\1" + "\n" + r"\\cl Chapter " + f"{chapter_num}" + "\n",
                 chapter_usfm_text,
             )
             return chapter_usfm_text
     # Ensure chapter label contains the chapter number
-    match = re.search(r"\\cl\s+(.+)", chapter_usfm_text)
+    match = search(r"\\cl\s+(.+)", chapter_usfm_text)
     if match:
         label_text = match.group(1)
         if str(chapter_num) not in label_text:
-            updated_label = f"{re.escape(label_text)} {chapter_num}"
-            chapter_usfm_text = re.sub(
+            updated_label = f"{escape(label_text)} {chapter_num}"
+            chapter_usfm_text = sub(
                 r"\\cl\s+(.+)",
                 rf"\\cl {updated_label}",
                 chapter_usfm_text,
@@ -338,13 +344,13 @@ def ensure_chapter_label(
 
 def ensure_no_chapter_labels(
     chapter_usfm_text: str,
-    chapter_label_regex: re.Pattern[str] = CHAPTER_LABEL_REGEX,
+    chapter_label_regex: Pattern[str] = CHAPTER_LABEL_REGEX,
 ) -> str:
     r"""
     Modify USFM source to remove all chapter labels, \cl.
     """
-    if re.search(chapter_label_regex, chapter_usfm_text):
-        updated_chapter_usfm_text = re.sub(
+    if search(chapter_label_regex, chapter_usfm_text):
+        updated_chapter_usfm_text = sub(
             chapter_label_regex,
             "",
             chapter_usfm_text,
@@ -355,10 +361,10 @@ def ensure_no_chapter_labels(
 
 def get_chapter_num(
     chapter_usfm_text: str,
-    chapter_regex: re.Pattern[str] = CHAPTER_CAPTURE_REGEX2,
+    chapter_regex: Pattern[str] = CHAPTER_CAPTURE_REGEX2,
 ) -> int:
     """Get the chapter number from the USFM chapter source text."""
-    if match := re.search(chapter_regex, chapter_usfm_text):
+    if match := search(chapter_regex, chapter_usfm_text):
         chapter_num = match.group(1)
         return int(chapter_num)
     return -1  # return sentinal
@@ -372,7 +378,7 @@ def remove_null_bytes_and_control_characters(html_content: Optional[str]) -> str
     USFM. We strip those out as well as the possibility of ASCII NULL
     bytes.
     """
-    return re.sub(r"[\x00-\x1F]+", "", html_content) if html_content else ""
+    return sub(r"[\x00-\x1F]+", "", html_content) if html_content else ""
 
 
 def extract_usfm_frontmatter(frontmatter: str) -> dict[str, str]:
@@ -385,7 +391,7 @@ def extract_usfm_frontmatter(frontmatter: str) -> dict[str, str]:
     }
     extracted_data = {}
     for key, pattern in patterns.items():
-        match = re.search(pattern, frontmatter, re.MULTILINE)
+        match = search(pattern, frontmatter, MULTILINE)
         if match:
             extracted_data[key] = match.group(1).strip()
     return extracted_data
@@ -428,17 +434,17 @@ def maybe_localized_book_name(frontmatter: str) -> str:
 def ensure_chapter_marker(
     chapter_usfm_text: str,
     chapter_num: int,
-    chapter_regex: re.Pattern[str] = CHAPTER_CAPTURE_REGEX,
+    chapter_regex: Pattern[str] = CHAPTER_CAPTURE_REGEX,
 ) -> str:
     r"""
     Modify USFM source to insert a chapter marker, \c <chapter_num>, if it does not have one.
     """
-    if re.search(chapter_regex, chapter_usfm_text):
+    if search(chapter_regex, chapter_usfm_text):
         logger.debug("Chapter marker already existed, didn't add one")
         return chapter_usfm_text
     logger.debug("Chapter marker is missing, adding one...")
     # Try inserting before \cl, if present
-    if match := re.search(r"\\cl\s+[^\n]+", chapter_usfm_text):
+    if match := search(r"\\cl\s+[^\n]+", chapter_usfm_text):
         insert_pos = match.start()
         return (
             chapter_usfm_text[:insert_pos]
@@ -449,8 +455,10 @@ def ensure_chapter_marker(
     return f"\\c {chapter_num}\n" + chapter_usfm_text
 
 
-def remove_sectionhead5_elements(content: str) -> str:
-    return _SECTIONHEAD5_RE.sub(" ", content)
+def remove_sectionhead5_elements(
+    content: str, sectionhead5_re: Pattern[str] = _SECTIONHEAD5_RE
+) -> str:
+    return sectionhead5_re.sub(" ", content)
 
 
 def usfm_book_content(
@@ -729,7 +737,7 @@ def tn_verses_html(
                 resource_requests,
             )
             verse_html_content = cast(str, mistune.markdown(verse_md_content))
-            adjusted_verse_html_content = re.sub(h1, h5, verse_html_content)
+            adjusted_verse_html_content = sub(h1, h5, verse_html_content)
             verses_html[verse_ref] = verse_fmt_str.format(
                 # NOTE Use nationalized book name from usfm book if available rather
                 # than English book name as here - we accompish this later in
@@ -871,7 +879,7 @@ def tq_chapter_verses(
                     resource_requests,
                 )
                 verse_html_content = cast(str, mistune.markdown(verse_md_content))
-                adjusted_verse_html_content = re.sub(h1, h5, verse_html_content)
+                adjusted_verse_html_content = sub(h1, h5, verse_html_content)
                 verses_html[verse_ref] = verse_label_fmt_str.format(
                     book_names[book_code],
                     chapter_num,
@@ -945,8 +953,8 @@ def tw_name_content_pairs(
                 translation_words_dict_,
             )
             html_word_content = cast(str, mistune.markdown(translation_word_content))
-            html_word_content = re.sub(h2, h4, html_word_content)
-            html_word_content = re.sub(h1, h3, html_word_content)
+            html_word_content = sub(h2, h4, html_word_content)
+            html_word_content = sub(h1, h3, html_word_content)
             if generate_docx:
                 html_word_content = preprocess_html_for_internal_docx_links(
                     html_word_content
@@ -1000,7 +1008,7 @@ def modify_commentary_label(
 ) -> str:
     # Modify chapter heading if it's the first chapter
     if chapter_num == 1:
-        chapter_commentary_html_content = re.sub(
+        chapter_commentary_html_content = sub(
             r"<h1>(.*?)<\/h1>",
             r"<h1>\1 Commentary</h1>",
             chapter_commentary_html_content,
@@ -1012,7 +1020,7 @@ def replace_relative_with_absolute_links(
     chapter_commentary_html_content: str,
     url_fmt_str: str = BC_ARTICLE_URL_FMT_STR,
 ) -> str:
-    chapter_commentary_html_content = re.sub(
+    chapter_commentary_html_content = sub(
         r'<a\s+href="\/(.*?)">',
         lambda match: '<a href="'
         + url_fmt_str.format(match.group(1))
@@ -1225,10 +1233,10 @@ def ensure_paragraph_before_verses(
     Return the possibly updated verse_content.
     """
     if (
-        re.compile(usfm_verse_one_file_regex).match(Path(usfm_file).name) is not None
+        compile(usfm_verse_one_file_regex).match(Path(usfm_file).name) is not None
     ):  # Verse 1 of chapter
         if (
-            re.compile(chapter_marker_not_on_own_line_regex).match(verse_content)
+            compile(chapter_marker_not_on_own_line_regex).match(verse_content)
             is not None
         ):  # Chapter marker not on own line.
             # Make chapter marker occupy its own line and add a USFM paragraph
@@ -1238,7 +1246,7 @@ def ensure_paragraph_before_verses(
             # Docx did not have one. Presumably the 3rd party lib we use to parse
             # HTML to Docx doesn't like spans that are not contained in a block
             # level element.
-            verse_content = re.sub(
+            verse_content = sub(
                 chapter_marker_not_on_own_line_with_match_groups,
                 chapter_marker_not_on_own_line_repair_regex,
                 verse_content,
@@ -1327,7 +1335,7 @@ def clean_verse_content(verse_content: str) -> str:
     into a chapter this ends up creating a duplicate chapter marker.
     We deal with that here.
     """
-    cleaned_verse_content = re.sub(r"^\\c\s+\d+", "", verse_content)
+    cleaned_verse_content = sub(r"^\\c\s+\d+", "", verse_content)
     return cleaned_verse_content
 
 
@@ -1421,23 +1429,19 @@ def split_chapter_into_verses(chapter: USFMChapter) -> dict[str, str]:
     # '''
     verse_dict = {}
     # Find all verse spans
-    verse_spans = re.findall(
-        r'<span class="verse">(.*?)</span>', chapter.content, re.DOTALL
-    )
+    verse_spans = findall(r'<span class="verse">(.*?)</span>', chapter.content, DOTALL)
     for verse_span in verse_spans:
         # Extract the verse number from the versemarker
-        verse_number = re.search(r'<sup class="versemarker">(\d+)</sup>', verse_span)
+        verse_number = search(r'<sup class="versemarker">(\d+)</sup>', verse_span)
         if verse_number:
             verse_number_ = verse_number.group(1)
             # Remove versemarker
-            verse_text = re.sub(r'<sup class="versemarker">.*?</sup>', "", verse_span)
+            verse_text = sub(r'<sup class="versemarker">.*?</sup>', "", verse_span)
             # Remove footnotes numbers
-            verse_text = re.sub(
-                r'<sup id=".*?" class="caller">.*?</sup>', "", verse_text
-            )
+            verse_text = sub(r'<sup id=".*?" class="caller">.*?</sup>', "", verse_text)
             # Fix spacing issue when div class="poetry-*" type divs
             # are used, e.g., yielding 'heartsas' for Hebrews 3:8
-            verse_text = re.sub(
+            verse_text = sub(
                 r'<div class="poetry-\d">(.*?)</div>',
                 r" \1",
                 verse_text,
@@ -1503,12 +1507,10 @@ def split_chapter_into_verses_with_formatting(
     # footnote callers) and the second element the target footnotes HTML?
     verse_dict = {}
     # Find all verse spans
-    verse_spans = re.findall(
-        r'<span class="verse">(.*?)</span>', chapter.content, re.DOTALL
-    )
+    verse_spans = findall(r'<span class="verse">(.*?)</span>', chapter.content, DOTALL)
     for verse_span in verse_spans:
         # Extract the verse number from the versemarker
-        verse_number = re.search(r'<sup class="versemarker">(\d+)</sup>', verse_span)
+        verse_number = search(r'<sup class="versemarker">(\d+)</sup>', verse_span)
         if verse_number:
             verse_number_ = verse_number.group(1)
             # Add to the dictionary with verse number as the key and verse text as the value
@@ -1548,19 +1550,15 @@ def split_chapter_into_verses_with_formatting_for_f10(
         # cleaned_html = "".join(str(c) for c in verse_span.contents)
         cleaned_html = str(verse_span)
         # Fix spacing issues introduced by inner spans
-        cleaned_html = re.sub(
+        cleaned_html = sub(
             r"\s+([,;:.!?])", r"\1", cleaned_html
         )  # remove space before punctuation
-        cleaned_html = re.sub(
-            r"\s+'", "'", cleaned_html
-        )  # remove space before apostrophe
-        cleaned_html = re.sub(
-            r"'\s+", "'", cleaned_html
-        )  # remove space after apostrophe
-        cleaned_html = re.sub(
+        cleaned_html = sub(r"\s+'", "'", cleaned_html)  # remove space before apostrophe
+        cleaned_html = sub(r"'\s+", "'", cleaned_html)  # remove space after apostrophe
+        cleaned_html = sub(
             r"\s*-\s*", "-", cleaned_html
         )  # normalize spaces around hyphens
-        cleaned_html = re.sub(r"\s{2,}", " ", cleaned_html)  # collapse double spaces
+        cleaned_html = sub(r"\s{2,}", " ", cleaned_html)  # collapse double spaces
         cleaned_html = cleaned_html.strip()
         # if you want plain text instead, use: cleaned_text = verse_span.get_text(" ", strip=True)
         # store cleaned HTML fragment (still contains <sup> etc.)

From a322e344ca14aace56f1b2b4b26da10db65f24e8 Mon Sep 17 00:00:00 2001
From: linearcombination <4829djaskdfj@gmail.com>
Date: Thu, 12 Mar 2026 09:35:01 -0700
Subject: [PATCH 2/7] Ensure space before verse numbers

---
 backend/templates/html/header_compact_enclosing.html | 4 ++++
 backend/templates/html/header_enclosing.html         | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/backend/templates/html/header_compact_enclosing.html b/backend/templates/html/header_compact_enclosing.html
index ddbf1859..38f2d234 100644
--- a/backend/templates/html/header_compact_enclosing.html
+++ b/backend/templates/html/header_compact_enclosing.html
@@ -446,6 +446,10 @@
         font-weight: normal;
       }
 
+      .verse + .verse::before {
+        content: " ";
+      }
+
       .versemarker {
         font-size: 0.5em;
         vertical-align: top;
diff --git a/backend/templates/html/header_enclosing.html b/backend/templates/html/header_enclosing.html
index f47f2279..049d0a68 100644
--- a/backend/templates/html/header_enclosing.html
+++ b/backend/templates/html/header_enclosing.html
@@ -448,6 +448,10 @@
         font-weight: normal;
       }
 
+      .verse + .verse::before {
+        content: " ";
+      }
+
       .versemarker {
         font-size: 0.5em;
         vertical-align: top;

From d64d467cd8a45388cdc836ef809b2e5048449b5e Mon Sep 17 00:00:00 2001
From: linearcombination <4829djaskdfj@gmail.com>
Date: Thu, 12 Mar 2026 15:55:58 -0700
Subject: [PATCH 3/7] Remove sectionhead5 sections and empty paragraph elements

---
 backend/doc/domain/parsing.py | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/backend/doc/domain/parsing.py b/backend/doc/domain/parsing.py
index d734fee2..af5dfd8b 100644
--- a/backend/doc/domain/parsing.py
+++ b/backend/doc/domain/parsing.py
@@ -86,8 +86,8 @@
 
 H1, H2, H3, H4, H5 = "h1", "h2", "h3", "h4", "h5"
 
-_SECTIONHEAD5_RE = compile(r'<div\s+class="sectionhead-5">\s*</div>')
-
+SECTIONHEAD5_RE = compile(r'<div\s+class="sectionhead-5">\s*</div>')
+EMPTY_P_RE = compile(r"<p>\s*</p>")
 
 BC_ARTICLE_URL_FMT_STR: str = (
     "https://content.bibletranslationtools.org/WycliffeAssociates/en_bc/src/branch/master/{}"
@@ -455,10 +455,13 @@ def ensure_chapter_marker(
     return f"\\c {chapter_num}\n" + chapter_usfm_text
 
 
-def remove_sectionhead5_elements(
-    content: str, sectionhead5_re: Pattern[str] = _SECTIONHEAD5_RE
+def remove_unwanted_elements(
+    content: str,
+    sectionhead5_re: Pattern[str] = SECTIONHEAD5_RE,
+    empty_paragraph_re: Pattern[str] = EMPTY_P_RE,
 ) -> str:
-    return sectionhead5_re.sub(" ", content)
+    result = sectionhead5_re.sub(" ", content)
+    return empty_paragraph_re.sub("", result)
 
 
 def usfm_book_content(
@@ -522,15 +525,15 @@ def usfm_book_content(
         chapter_html_content = usfm_chapter_html(
             chapter_usfm, input_file, output_file, chapter_num
         )
-        cleaned_chapter_html_content = remove_null_bytes_and_control_characters(
+        cleaned_chapter_html_content_ = remove_null_bytes_and_control_characters(
             chapter_html_content
         )
-        chapter_html_content_sans_s5 = remove_sectionhead5_elements(
-            cleaned_chapter_html_content
+        cleaned_chapter_html_content = remove_unwanted_elements(
+            cleaned_chapter_html_content_
         )
         usfm_chapters[chapter_num] = USFMChapter(
             content=(
-                chapter_html_content_sans_s5 if chapter_html_content_sans_s5 else ""
+                cleaned_chapter_html_content if cleaned_chapter_html_content else ""
             ),
             verses=None,
         )

From ad631eb6dd48424ea8e0bab04e7056306443a9d1 Mon Sep 17 00:00:00 2001
From: linearcombination <4829djaskdfj@gmail.com>
Date: Fri, 13 Mar 2026 04:14:43 -0700
Subject: [PATCH 4/7] Switch from htmldocx to html-for-docx (html4docx module)

---
 backend/doc/domain/document_generator.py      | 2 +-
 backend/passages/domain/document_generator.py | 3 +--
 backend/passages/utils/docx_utils.py          | 2 +-
 backend/requirements.in                       | 2 +-
 backend/requirements.txt                      | 6 +++---
 backend/stet/domain/document_generator.py     | 2 +-
 backend/stet/utils/docx_utils.py              | 2 +-
 pyproject.toml                                | 2 +-
 8 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/backend/doc/domain/document_generator.py b/backend/doc/domain/document_generator.py
index 4429b0bf..b2933220 100755
--- a/backend/doc/domain/document_generator.py
+++ b/backend/doc/domain/document_generator.py
@@ -75,7 +75,7 @@
 from docx.shared import RGBColor
 from docxcompose.composer import Composer  # type: ignore
 from docxtpl import DocxTemplate  # type: ignore
-from htmldocx import HtmlToDocx  # type: ignore
+from html4docx import HtmlToDocx  # type: ignore
 
 
 logger = settings.logger(__name__)
diff --git a/backend/passages/domain/document_generator.py b/backend/passages/domain/document_generator.py
index a55b046b..4a79e56d 100644
--- a/backend/passages/domain/document_generator.py
+++ b/backend/passages/domain/document_generator.py
@@ -24,7 +24,7 @@
 from docx.oxml import parse_xml
 from docx.shared import Inches, RGBColor
 from docx.table import _Cell, _Row
-from htmldocx import HtmlToDocx  # type: ignore
+from html4docx import HtmlToDocx  # type: ignore
 from passages.domain.model import (
     Passage,
     BibleReferenceWithAvailability,
@@ -34,7 +34,6 @@
 from passages.utils.docx_utils import add_footer, add_header
 from pydantic import Json
 
-
 if TYPE_CHECKING:
     from typing import TypeAlias
 
diff --git a/backend/passages/utils/docx_utils.py b/backend/passages/utils/docx_utils.py
index 74467601..43c8c01f 100644
--- a/backend/passages/utils/docx_utils.py
+++ b/backend/passages/utils/docx_utils.py
@@ -11,7 +11,7 @@
 from docx.shared import Pt, RGBColor
 from docx.table import Table
 from docx.text.paragraph import Paragraph
-from htmldocx import HtmlToDocx  # type: ignore
+from html4docx import HtmlToDocx  # type: ignore
 
 
 from docx.table import _Cell, _Row
diff --git a/backend/requirements.in b/backend/requirements.in
index 2cb262e9..c6af9614 100644
--- a/backend/requirements.in
+++ b/backend/requirements.in
@@ -15,7 +15,7 @@ fastapi[all]
 filelock
 flower
 gunicorn
-htmldocx
+html-for-docx
 jinja2
 mistune
 orjson
diff --git a/backend/requirements.txt b/backend/requirements.txt
index f09ee819..4c7da8c2 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -24,7 +24,7 @@ babel==2.18.0
 beautifulsoup4==4.14.3
     # via
     #   -r backend/requirements.in
-    #   htmldocx
+    #   html-for-docx
 billiard==4.2.4
     # via celery
 brotli==1.2.0
@@ -95,7 +95,7 @@ h11==0.16.0
     # via
     #   httpcore
     #   uvicorn
-htmldocx==0.0.6
+html-for-docx==1.1.4
     # via -r backend/requirements.in
 httpcore==1.0.9
     # via httpx
@@ -176,7 +176,7 @@ python-docx==1.2.0
     # via
     #   docxcompose3
     #   docxtpl
-    #   htmldocx
+    #   html-for-docx
 python-dotenv==1.2.1
     # via
     #   -r backend/requirements.in
diff --git a/backend/stet/domain/document_generator.py b/backend/stet/domain/document_generator.py
index a6d3d393..c70022de 100644
--- a/backend/stet/domain/document_generator.py
+++ b/backend/stet/domain/document_generator.py
@@ -25,7 +25,7 @@
 from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
 from docx.oxml import OxmlElement
 from docx.oxml.ns import qn
-from htmldocx import HtmlToDocx  # type: ignore
+from html4docx import HtmlToDocx  # type: ignore
 from pydantic import Json
 from stet.domain.model import VerseEntry, WordEntry
 from stet.domain.parser import get_word_entry_dtos
diff --git a/backend/stet/utils/docx_utils.py b/backend/stet/utils/docx_utils.py
index c9cfaff6..0bf56588 100644
--- a/backend/stet/utils/docx_utils.py
+++ b/backend/stet/utils/docx_utils.py
@@ -13,7 +13,7 @@
 from docx.shared import Pt, RGBColor
 from docx.table import Table
 from docx.text.paragraph import Paragraph
-from htmldocx import HtmlToDocx  # type: ignore[import-untyped]
+from html4docx import HtmlToDocx  # type: ignore[import-untyped]
 
 
 from docx.table import _Cell, _Row
diff --git a/pyproject.toml b/pyproject.toml
index 2050121f..f89abfd5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,7 +17,7 @@ dependencies = [
   "fastapi[all]",
   "filelock",
   "flower",
-  "htmldocx",
+  "html-for-docx",
   "html2docx",
   "gunicorn",
   "jinja2",

From 7b901bf8e1438df8c458f48f3b5f98c2464ef88c Mon Sep 17 00:00:00 2001
From: linearcombination <4829djaskdfj@gmail.com>
Date: Fri, 13 Mar 2026 04:15:29 -0700
Subject: [PATCH 5/7] Sort imports and format whitespace

---
 backend/doc/domain/document_generator.py  | 4 ++--
 backend/stet/domain/document_generator.py | 5 +----
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/backend/doc/domain/document_generator.py b/backend/doc/domain/document_generator.py
index b2933220..a66f668f 100755
--- a/backend/doc/domain/document_generator.py
+++ b/backend/doc/domain/document_generator.py
@@ -3,14 +3,13 @@
 and eventually a final document produced.
 """
 
+import re
 import subprocess
 import time
 from datetime import datetime
 from os.path import exists, join
 from typing import Final, Mapping, Optional, Sequence, TypeAlias, cast
 
-# import regex as re # not yet supported in python 3.13 - used for unicode word boundaries for RTL languages
-import re
 from celery import current_task
 from doc.config import settings
 from doc.domain import parsing, resource_lookup, worker
@@ -77,6 +76,7 @@
 from docxtpl import DocxTemplate  # type: ignore
 from html4docx import HtmlToDocx  # type: ignore
 
+# import regex as re # not yet supported in python 3.13 - used for unicode word boundaries for RTL languages
 
 logger = settings.logger(__name__)
 
diff --git a/backend/stet/domain/document_generator.py b/backend/stet/domain/document_generator.py
index c70022de..b45feb79 100644
--- a/backend/stet/domain/document_generator.py
+++ b/backend/stet/domain/document_generator.py
@@ -47,7 +47,6 @@
 )
 from stet.utils.util import extract_chapter_and_beyond
 
-
 logger = settings.logger(__name__)
 
 
@@ -360,9 +359,7 @@ def generate_docx(
             source_paragraph = row_cells[0].paragraphs[0]
             source_paragraph.paragraph_format.line_spacing = 2.0  # Adjust line spacing
             if verse.source_has_preformatted_bolding:
-                add_preformatted_html_to_docx(
-                    verse.source_text, source_paragraph
-                )
+                add_preformatted_html_to_docx(verse.source_text, source_paragraph)
             elif len(word_entry.bolded_phrases) > 0:
                 add_highlighted_html_to_docx_for_words(
                     verse.source_text, source_paragraph, word_entry.bolded_phrases

From 74779174c47ab1fc9fa4cbf0ab7c467980d93818 Mon Sep 17 00:00:00 2001
From: linearcombination <4829djaskdfj@gmail.com>
Date: Fri, 13 Mar 2026 15:20:48 -0700
Subject: [PATCH 6/7] html4docx handles internal links

html4docx handles internal links, which htmldocx did not, so the custom
preprocess_html_for_internal_docx_links workaround can be removed.
---
 backend/doc/domain/document_generator.py |  4 +---
 backend/doc/domain/parsing.py            |  5 ----
 backend/doc/utils/docx_util.py           | 30 ------------------------
 3 files changed, 1 insertion(+), 38 deletions(-)

diff --git a/backend/doc/domain/document_generator.py b/backend/doc/domain/document_generator.py
index a66f668f..6fdbf1c3 100755
--- a/backend/doc/domain/document_generator.py
+++ b/backend/doc/domain/document_generator.py
@@ -51,7 +51,6 @@
 from doc.utils.docx_util import (
     add_internal_docx_links,
     generate_docx_toc,
-    preprocess_html_for_internal_docx_links,
     style_superscripts,
 )
 from doc.utils.file_utils import (
@@ -767,8 +766,7 @@ def compose_docx_document(
         else:
             add_one_column_section(doc)
         try:
-            processed_html = preprocess_html_for_internal_docx_links(part.content)
-            html_to_docx.add_html_to_document(processed_html, doc)
+            html_to_docx.add_html_to_document(part.content, doc)
         except ValueError as e:
             logger.exception("Error converting HTML to docx: %s", e)
         if part.use_section_visual_separator:
diff --git a/backend/doc/domain/parsing.py b/backend/doc/domain/parsing.py
index af5dfd8b..2423f68c 100644
--- a/backend/doc/domain/parsing.py
+++ b/backend/doc/domain/parsing.py
@@ -62,7 +62,6 @@
 )
 from doc.reviewers_guide.model import RGBook
 from doc.reviewers_guide.parser import get_rg_books
-from doc.utils.docx_util import preprocess_html_for_internal_docx_links
 from doc.utils.text_utils import (
     maybe_correct_book_name,
     chapter_label_numeric_part,
@@ -958,10 +957,6 @@ def tw_name_content_pairs(
             html_word_content = cast(str, mistune.markdown(translation_word_content))
             html_word_content = sub(h2, h4, html_word_content)
             html_word_content = sub(h1, h3, html_word_content)
-            if generate_docx:
-                html_word_content = preprocess_html_for_internal_docx_links(
-                    html_word_content
-                )
             pair = TWNameContentPair(
                 localized_translation_word_,
                 translation_word_filepath,
diff --git a/backend/doc/utils/docx_util.py b/backend/doc/utils/docx_util.py
index 703a2095..7490a4d7 100644
--- a/backend/doc/utils/docx_util.py
+++ b/backend/doc/utils/docx_util.py
@@ -46,36 +46,6 @@ def generate_docx_toc(docx_filepath: str) -> str:
     return str(toc_path)
 
 
-def preprocess_html_for_internal_docx_links(html: str) -> str:
-    """
-    Replace internal HTML anchors and headings with markers that survive HTML→DOCX conversion.
-    Example:
-      <h3 id="intro"> → {{BOOKMARK:intro}}
-      <a href="#intro">Christ</a> → {{LINK_START:intro}}Christ{{LINK_END}}
-    """
-    # Mark bookmarks
-    html = re.sub(
-        r'<h3\s+id="([^"]+)">',
-        r"{{BOOKMARK:\1}}<h3>",
-        html,
-        flags=re.IGNORECASE,
-    )
-    # Replace <a href="#id"> links
-    html = re.sub(
-        r'<a\s+href="#([^"]+)"><span>(.*?)</span></a>',
-        r"{{LINK_START:\1}}\2{{LINK_END}}",
-        html,
-        flags=re.IGNORECASE | re.DOTALL,
-    )
-    html = re.sub(
-        r'<a\s+href="#([^"]+)">(.*?)</a>',
-        r"{{LINK_START:\1}}\2{{LINK_END}}",
-        html,
-        flags=re.IGNORECASE | re.DOTALL,
-    )
-    return html
-
-
 def _make_text_run(text: str) -> Element:
     r = OxmlElement("w:r")
     t = OxmlElement("w:t")

From 7039b7d16fa4557618c99055724ba9d726f61727 Mon Sep 17 00:00:00 2001
From: linearcombination <4829djaskdfj@gmail.com>
Date: Fri, 13 Mar 2026 15:21:48 -0700
Subject: [PATCH 7/7] Ensure spacing before verse numbers in docx where needed

This solves issue #292
---
 backend/doc/utils/docx_util.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/backend/doc/utils/docx_util.py b/backend/doc/utils/docx_util.py
index 7490a4d7..efaa20c0 100644
--- a/backend/doc/utils/docx_util.py
+++ b/backend/doc/utils/docx_util.py
@@ -174,13 +174,20 @@ def style_superscripts(
         2 = +1pt
         4 = +2pt
         6 = +3pt
-
     color:
         RGBColor for superscripts (e.g. light gray)
     """
     for para in doc.paragraphs:
-        for run in para.runs:
+        runs = para.runs
+        for i, run in enumerate(runs):
             if run.font.superscript:
+                # --- Ensure space before superscript ---
+                if run.text and not run.text[0].isspace():
+                    prev_char = None
+                    if i > 0 and runs[i - 1].text:
+                        prev_char = runs[i - 1].text[-1]
+                    if not prev_char or not prev_char.isspace():
+                        run.text = " " + run.text
                 # --- Color ---
                 run.font.color.rgb = color
                 # --- Vertical position ---