diff --git a/backend/doc/domain/document_generator.py b/backend/doc/domain/document_generator.py index 4429b0bf..6fdbf1c3 100755 --- a/backend/doc/domain/document_generator.py +++ b/backend/doc/domain/document_generator.py @@ -3,14 +3,13 @@ and eventually a final document produced. """ +import re import subprocess import time from datetime import datetime from os.path import exists, join from typing import Final, Mapping, Optional, Sequence, TypeAlias, cast -# import regex as re # not yet supported in python 3.13 - used for unicode word boundaries for RTL languages -import re from celery import current_task from doc.config import settings from doc.domain import parsing, resource_lookup, worker @@ -52,7 +51,6 @@ from doc.utils.docx_util import ( add_internal_docx_links, generate_docx_toc, - preprocess_html_for_internal_docx_links, style_superscripts, ) from doc.utils.file_utils import ( @@ -75,8 +73,9 @@ from docx.shared import RGBColor from docxcompose.composer import Composer # type: ignore from docxtpl import DocxTemplate # type: ignore -from htmldocx import HtmlToDocx # type: ignore +from html4docx import HtmlToDocx # type: ignore +# import regex as re # not yet supported in python 3.13 - used for unicode word boundaries for RTL languages logger = settings.logger(__name__) @@ -767,8 +766,7 @@ def compose_docx_document( else: add_one_column_section(doc) try: - processed_html = preprocess_html_for_internal_docx_links(part.content) - html_to_docx.add_html_to_document(processed_html, doc) + html_to_docx.add_html_to_document(part.content, doc) except ValueError as e: logger.exception("Error converting HTML to docx: %s", e) if part.use_section_visual_separator: diff --git a/backend/doc/domain/parsing.py b/backend/doc/domain/parsing.py index eb5cf122..2423f68c 100644 --- a/backend/doc/domain/parsing.py +++ b/backend/doc/domain/parsing.py @@ -2,7 +2,17 @@ This module provides an API for parsing content. """ -import re +from re import ( + compile, + escape, + findall, + search, + split as re_split, + sub, + DOTALL, + MULTILINE, + Pattern, +) import time from glob import glob from os import DirEntry, scandir, walk @@ -52,7 +62,6 @@ ) from doc.reviewers_guide.model import RGBook from doc.reviewers_guide.parser import get_rg_books -from doc.utils.docx_util import preprocess_html_for_internal_docx_links from doc.utils.text_utils import ( maybe_correct_book_name, chapter_label_numeric_part, @@ -72,27 +81,23 @@ ) from pydantic import HttpUrl - logger = settings.logger(__name__) H1, H2, H3, H4, H5 = "h1", "h2", "h3", "h4", "h5" -_SECTIONHEAD5_RE = re.compile( - r'
\s*
") BC_ARTICLE_URL_FMT_STR: str = ( "https://content.bibletranslationtools.org/WycliffeAssociates/en_bc/src/branch/master/{}" ) -CHAPTER_LABEL_REGEX = re.compile(r"\\cl\s+[^\n]+") -CHAPTER_LABEL_REGEX2 = re.compile(r"\\cl\s+(.+)") -CHAPTER_REGEX = re.compile(r"\\c\s+\d+") -CHAPTER_CAPTURE_REGEX = re.compile(r"(\\c\s+\d+)") -CHAPTER_CAPTURE_REGEX2 = re.compile(r"\\c\s+(\d+)") +CHAPTER_LABEL_REGEX = compile(r"\\cl\s+[^\n]+") +CHAPTER_LABEL_REGEX2 = compile(r"\\cl\s+(.+)") +CHAPTER_REGEX = compile(r"\\c\s+\d+") +CHAPTER_CAPTURE_REGEX = compile(r"(\\c\s+\d+)") +CHAPTER_CAPTURE_REGEX2 = compile(r"\\c\s+(\d+)") def find_usfm_files( @@ -256,7 +261,7 @@ def split_usfm_by_chapters( book_code: str, usfm_text: str, check_usfm: bool = settings.CHECK_USFM, - chapter_regex: re.Pattern[str] = CHAPTER_REGEX, + chapter_regex: Pattern[str] = CHAPTER_REGEX, resources_with_usfm_defects: Sequence[ tuple[str, str, str] ] = RESOURCES_WITH_USFM_DEFECTS, @@ -267,8 +272,8 @@ def split_usfm_by_chapters( """ chapter_markers = [] chapters = [] - chapter_markers = re.findall(chapter_regex, usfm_text) - chapters = re.split(chapter_regex, usfm_text) + chapter_markers = findall(chapter_regex, usfm_text) + chapters = re_split(chapter_regex, usfm_text) frontmatter = chapters.pop(0).strip() def needs_fixing() -> bool: @@ -303,28 +308,28 @@ def needs_fixing() -> bool: def ensure_chapter_label( chapter_usfm_text: str, chapter_num: int, - chapter_label_regex: re.Pattern[str] = CHAPTER_LABEL_REGEX, - chapter_regex: re.Pattern[str] = CHAPTER_REGEX, + chapter_label_regex: Pattern[str] = CHAPTER_LABEL_REGEX, + chapter_regex: Pattern[str] = CHAPTER_REGEX, ) -> str: r""" Modify USFM source to insert an English chapter label if it does not have one. Ensure that the chapter label includes the chapter number. """ - if not re.search(chapter_label_regex, chapter_usfm_text): - if re.search(chapter_regex, chapter_usfm_text): - chapter_usfm_text = re.sub( + if not search(chapter_label_regex, chapter_usfm_text): + if search(chapter_regex, chapter_usfm_text): + chapter_usfm_text = sub( r"(\\c\s+\d+)", "\n" + r"\1" + "\n" + r"\\cl Chapter " + f"{chapter_num}" + "\n", chapter_usfm_text, ) return chapter_usfm_text # Ensure chapter label contains the chapter number - match = re.search(r"\\cl\s+(.+)", chapter_usfm_text) + match = search(r"\\cl\s+(.+)", chapter_usfm_text) if match: label_text = match.group(1) if str(chapter_num) not in label_text: - updated_label = f"{re.escape(label_text)} {chapter_num}" - chapter_usfm_text = re.sub( + updated_label = f"{escape(label_text)} {chapter_num}" + chapter_usfm_text = sub( r"\\cl\s+(.+)", rf"\\cl {updated_label}", chapter_usfm_text, @@ -338,13 +343,13 @@ def ensure_chapter_label( def ensure_no_chapter_labels( chapter_usfm_text: str, - chapter_label_regex: re.Pattern[str] = CHAPTER_LABEL_REGEX, + chapter_label_regex: Pattern[str] = CHAPTER_LABEL_REGEX, ) -> str: r""" Modify USFM source to remove all chapter labels, \cl. """ - if re.search(chapter_label_regex, chapter_usfm_text): - updated_chapter_usfm_text = re.sub( + if search(chapter_label_regex, chapter_usfm_text): + updated_chapter_usfm_text = sub( chapter_label_regex, "", chapter_usfm_text, @@ -355,10 +360,10 @@ def ensure_no_chapter_labels( def get_chapter_num( chapter_usfm_text: str, - chapter_regex: re.Pattern[str] = CHAPTER_CAPTURE_REGEX2, + chapter_regex: Pattern[str] = CHAPTER_CAPTURE_REGEX2, ) -> int: """Get the chapter number from the USFM chapter source text.""" - if match := re.search(chapter_regex, chapter_usfm_text): + if match := search(chapter_regex, chapter_usfm_text): chapter_num = match.group(1) return int(chapter_num) return -1 # return sentinal @@ -372,7 +377,7 @@ def remove_null_bytes_and_control_characters(html_content: Optional[str]) -> str USFM. We strip those out as well as the possibility of ASCII NULL bytes. """ - return re.sub(r"[\x00-\x1F]+", "", html_content) if html_content else "" + return sub(r"[\x00-\x1F]+", "", html_content) if html_content else "" def extract_usfm_frontmatter(frontmatter: str) -> dict[str, str]: @@ -385,7 +390,7 @@ def extract_usfm_frontmatter(frontmatter: str) -> dict[str, str]: } extracted_data = {} for key, pattern in patterns.items(): - match = re.search(pattern, frontmatter, re.MULTILINE) + match = search(pattern, frontmatter, MULTILINE) if match: extracted_data[key] = match.group(1).strip() return extracted_data @@ -428,17 +433,17 @@ def maybe_localized_book_name(frontmatter: str) -> str: def ensure_chapter_marker( chapter_usfm_text: str, chapter_num: int, - chapter_regex: re.Pattern[str] = CHAPTER_CAPTURE_REGEX, + chapter_regex: Pattern[str] = CHAPTER_CAPTURE_REGEX, ) -> str: r""" Modify USFM source to insert a chapter marker, \c