From a2e6c1282cfaf1b1dfbffe726d6f47ddafbd408f Mon Sep 17 00:00:00 2001 From: linearcombination <4829djaskdfj@gmail.com> Date: Thu, 12 Mar 2026 09:34:26 -0700 Subject: [PATCH 1/7] Greater specificity of imports from re --- backend/doc/domain/parsing.py | 130 +++++++++++++++++----------------- 1 file changed, 64 insertions(+), 66 deletions(-) diff --git a/backend/doc/domain/parsing.py b/backend/doc/domain/parsing.py index eb5cf122..d734fee2 100644 --- a/backend/doc/domain/parsing.py +++ b/backend/doc/domain/parsing.py @@ -2,7 +2,17 @@ This module provides an API for parsing content. """ -import re +from re import ( + compile, + escape, + findall, + search, + split as re_split, + sub, + DOTALL, + MULTILINE, + Pattern, +) import time from glob import glob from os import DirEntry, scandir, walk @@ -72,15 +82,11 @@ ) from pydantic import HttpUrl - logger = settings.logger(__name__) H1, H2, H3, H4, H5 = "h1", "h2", "h3", "h4", "h5" -_SECTIONHEAD5_RE = re.compile( - r'
\s*
") BC_ARTICLE_URL_FMT_STR: str = ( "https://content.bibletranslationtools.org/WycliffeAssociates/en_bc/src/branch/master/{}" @@ -455,10 +455,13 @@ def ensure_chapter_marker( return f"\\c {chapter_num}\n" + chapter_usfm_text -def remove_sectionhead5_elements( - content: str, sectionhead5_re: Pattern[str] = _SECTIONHEAD5_RE +def remove_unwanted_elements( + content: str, + sectionhead5_re: Pattern[str] = SECTIONHEAD5_RE, + empty_paragraph_re: Pattern[str] = EMPTY_P_RE, ) -> str: - return sectionhead5_re.sub(" ", content) + result = sectionhead5_re.sub(" ", content) + return empty_paragraph_re.sub("", result) def usfm_book_content( @@ -522,15 +525,15 @@ def usfm_book_content( chapter_html_content = usfm_chapter_html( chapter_usfm, input_file, output_file, chapter_num ) - cleaned_chapter_html_content = remove_null_bytes_and_control_characters( + cleaned_chapter_html_content_ = remove_null_bytes_and_control_characters( chapter_html_content ) - chapter_html_content_sans_s5 = remove_sectionhead5_elements( - cleaned_chapter_html_content + cleaned_chapter_html_content = remove_unwanted_elements( + cleaned_chapter_html_content_ ) usfm_chapters[chapter_num] = USFMChapter( content=( - chapter_html_content_sans_s5 if chapter_html_content_sans_s5 else "" + cleaned_chapter_html_content if cleaned_chapter_html_content else "" ), verses=None, ) From ad631eb6dd48424ea8e0bab04e7056306443a9d1 Mon Sep 17 00:00:00 2001 From: linearcombination <4829djaskdfj@gmail.com> Date: Fri, 13 Mar 2026 04:14:43 -0700 Subject: [PATCH 4/7] Upgrade from htmldocx to html4docx package --- backend/doc/domain/document_generator.py | 2 +- backend/passages/domain/document_generator.py | 3 +-- backend/passages/utils/docx_utils.py | 2 +- backend/requirements.in | 2 +- backend/requirements.txt | 6 +++--- backend/stet/domain/document_generator.py | 2 +- backend/stet/utils/docx_utils.py | 2 +- pyproject.toml | 2 +- 8 files changed, 10 insertions(+), 11 deletions(-) diff --git a/backend/doc/domain/document_generator.py b/backend/doc/domain/document_generator.py index 4429b0bf..b2933220 100755 --- a/backend/doc/domain/document_generator.py +++ b/backend/doc/domain/document_generator.py @@ -75,7 +75,7 @@ from docx.shared import RGBColor from docxcompose.composer import Composer # type: ignore from docxtpl import DocxTemplate # type: ignore -from htmldocx import HtmlToDocx # type: ignore +from html4docx import HtmlToDocx # type: ignore logger = settings.logger(__name__) diff --git a/backend/passages/domain/document_generator.py b/backend/passages/domain/document_generator.py index a55b046b..4a79e56d 100644 --- a/backend/passages/domain/document_generator.py +++ b/backend/passages/domain/document_generator.py @@ -24,7 +24,7 @@ from docx.oxml import parse_xml from docx.shared import Inches, RGBColor from docx.table import _Cell, _Row -from htmldocx import HtmlToDocx # type: ignore +from html4docx import HtmlToDocx # type: ignore from passages.domain.model import ( Passage, BibleReferenceWithAvailability, @@ -34,7 +34,6 @@ from passages.utils.docx_utils import add_footer, add_header from pydantic import Json - if TYPE_CHECKING: from typing import TypeAlias diff --git a/backend/passages/utils/docx_utils.py b/backend/passages/utils/docx_utils.py index 74467601..43c8c01f 100644 --- a/backend/passages/utils/docx_utils.py +++ b/backend/passages/utils/docx_utils.py @@ -11,7 +11,7 @@ from docx.shared import Pt, RGBColor from docx.table import Table from docx.text.paragraph import Paragraph -from htmldocx import HtmlToDocx # type: ignore +from html4docx import HtmlToDocx # type: ignore from docx.table import _Cell, _Row diff --git a/backend/requirements.in b/backend/requirements.in index 2cb262e9..c6af9614 100644 --- a/backend/requirements.in +++ b/backend/requirements.in @@ -15,7 +15,7 @@ fastapi[all] filelock flower gunicorn -htmldocx +html-for-docx jinja2 mistune orjson diff --git a/backend/requirements.txt b/backend/requirements.txt index f09ee819..4c7da8c2 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -24,7 +24,7 @@ babel==2.18.0 beautifulsoup4==4.14.3 # via # -r backend/requirements.in - # htmldocx + # html-for-docx billiard==4.2.4 # via celery brotli==1.2.0 @@ -95,7 +95,7 @@ h11==0.16.0 # via # httpcore # uvicorn -htmldocx==0.0.6 +html-for-docx==1.1.4 # via -r backend/requirements.in httpcore==1.0.9 # via httpx @@ -176,7 +176,7 @@ python-docx==1.2.0 # via # docxcompose3 # docxtpl - # htmldocx + # html-for-docx python-dotenv==1.2.1 # via # -r backend/requirements.in diff --git a/backend/stet/domain/document_generator.py b/backend/stet/domain/document_generator.py index a6d3d393..c70022de 100644 --- a/backend/stet/domain/document_generator.py +++ b/backend/stet/domain/document_generator.py @@ -25,7 +25,7 @@ from docx.enum.text import WD_PARAGRAPH_ALIGNMENT from docx.oxml import OxmlElement from docx.oxml.ns import qn -from htmldocx import HtmlToDocx # type: ignore +from html4docx import HtmlToDocx # type: ignore from pydantic import Json from stet.domain.model import VerseEntry, WordEntry from stet.domain.parser import get_word_entry_dtos diff --git a/backend/stet/utils/docx_utils.py b/backend/stet/utils/docx_utils.py index c9cfaff6..0bf56588 100644 --- a/backend/stet/utils/docx_utils.py +++ b/backend/stet/utils/docx_utils.py @@ -13,7 +13,7 @@ from docx.shared import Pt, RGBColor from docx.table import Table from docx.text.paragraph import Paragraph -from htmldocx import HtmlToDocx # type: ignore[import-untyped] +from html4docx import HtmlToDocx # type: ignore[import-untyped] from docx.table import _Cell, _Row diff --git a/pyproject.toml b/pyproject.toml index 2050121f..f89abfd5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ dependencies = [ "fastapi[all]", "filelock", "flower", - "htmldocx", + "html-for-docx", "html2docx", "gunicorn", "jinja2", From 7b901bf8e1438df8c458f48f3b5f98c2464ef88c Mon Sep 17 00:00:00 2001 From: linearcombination <4829djaskdfj@gmail.com> Date: Fri, 13 Mar 2026 04:15:29 -0700 Subject: [PATCH 5/7] Sort imports and format whitespace --- backend/doc/domain/document_generator.py | 4 ++-- backend/stet/domain/document_generator.py | 5 +---- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/backend/doc/domain/document_generator.py b/backend/doc/domain/document_generator.py index b2933220..a66f668f 100755 --- a/backend/doc/domain/document_generator.py +++ b/backend/doc/domain/document_generator.py @@ -3,14 +3,13 @@ and eventually a final document produced. """ +import re import subprocess import time from datetime import datetime from os.path import exists, join from typing import Final, Mapping, Optional, Sequence, TypeAlias, cast -# import regex as re # not yet supported in python 3.13 - used for unicode word boundaries for RTL languages -import re from celery import current_task from doc.config import settings from doc.domain import parsing, resource_lookup, worker @@ -77,6 +76,7 @@ from docxtpl import DocxTemplate # type: ignore from html4docx import HtmlToDocx # type: ignore +# import regex as re # not yet supported in python 3.13 - used for unicode word boundaries for RTL languages logger = settings.logger(__name__) diff --git a/backend/stet/domain/document_generator.py b/backend/stet/domain/document_generator.py index c70022de..b45feb79 100644 --- a/backend/stet/domain/document_generator.py +++ b/backend/stet/domain/document_generator.py @@ -47,7 +47,6 @@ ) from stet.utils.util import extract_chapter_and_beyond - logger = settings.logger(__name__) @@ -360,9 +359,7 @@ def generate_docx( source_paragraph = row_cells[0].paragraphs[0] source_paragraph.paragraph_format.line_spacing = 2.0 # Adjust line spacing if verse.source_has_preformatted_bolding: - add_preformatted_html_to_docx( - verse.source_text, source_paragraph - ) + add_preformatted_html_to_docx(verse.source_text, source_paragraph) elif len(word_entry.bolded_phrases) > 0: add_highlighted_html_to_docx_for_words( verse.source_text, source_paragraph, word_entry.bolded_phrases From 74779174c47ab1fc9fa4cbf0ab7c467980d93818 Mon Sep 17 00:00:00 2001 From: linearcombination <4829djaskdfj@gmail.com> Date: Fri, 13 Mar 2026 15:20:48 -0700 Subject: [PATCH 6/7] html4docx handles internal links html4docx handles internal links whereas htmldocx had not, so we can remove some custom code --- backend/doc/domain/document_generator.py | 4 +--- backend/doc/domain/parsing.py | 5 ---- backend/doc/utils/docx_util.py | 30 ------------------------ 3 files changed, 1 insertion(+), 38 deletions(-) diff --git a/backend/doc/domain/document_generator.py b/backend/doc/domain/document_generator.py index a66f668f..6fdbf1c3 100755 --- a/backend/doc/domain/document_generator.py +++ b/backend/doc/domain/document_generator.py @@ -51,7 +51,6 @@ from doc.utils.docx_util import ( add_internal_docx_links, generate_docx_toc, - preprocess_html_for_internal_docx_links, style_superscripts, ) from doc.utils.file_utils import ( @@ -767,8 +766,7 @@ def compose_docx_document( else: add_one_column_section(doc) try: - processed_html = preprocess_html_for_internal_docx_links(part.content) - html_to_docx.add_html_to_document(processed_html, doc) + html_to_docx.add_html_to_document(part.content, doc) except ValueError as e: logger.exception("Error converting HTML to docx: %s", e) if part.use_section_visual_separator: diff --git a/backend/doc/domain/parsing.py b/backend/doc/domain/parsing.py index af5dfd8b..2423f68c 100644 --- a/backend/doc/domain/parsing.py +++ b/backend/doc/domain/parsing.py @@ -62,7 +62,6 @@ ) from doc.reviewers_guide.model import RGBook from doc.reviewers_guide.parser import get_rg_books -from doc.utils.docx_util import preprocess_html_for_internal_docx_links from doc.utils.text_utils import ( maybe_correct_book_name, chapter_label_numeric_part, @@ -958,10 +957,6 @@ def tw_name_content_pairs( html_word_content = cast(str, mistune.markdown(translation_word_content)) html_word_content = sub(h2, h4, html_word_content) html_word_content = sub(h1, h3, html_word_content) - if generate_docx: - html_word_content = preprocess_html_for_internal_docx_links( - html_word_content - ) pair = TWNameContentPair( localized_translation_word_, translation_word_filepath, diff --git a/backend/doc/utils/docx_util.py b/backend/doc/utils/docx_util.py index 703a2095..7490a4d7 100644 --- a/backend/doc/utils/docx_util.py +++ b/backend/doc/utils/docx_util.py @@ -46,36 +46,6 @@ def generate_docx_toc(docx_filepath: str) -> str: return str(toc_path) -def preprocess_html_for_internal_docx_links(html: str) -> str: - """ - Replace internal HTML anchors and headings with markers that survive HTML→DOCX conversion. - Example: -