diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 959145b..f09ce5c 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -6,9 +6,10 @@ repos:
- id: shellcheck
- repo: https://github.com/astral-sh/ruff-pre-commit
- rev: v0.10.0
+ rev: v0.15.4
hooks:
- - id: ruff
+ - id: ruff-check
+ args: [--extend-select, S]
- id: ruff-format
- repo: https://github.com/pre-commit/pre-commit-hooks
diff --git a/requirements.txt b/requirements.txt
index 996335f..494c1e3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,5 @@
lightspeed-rag-content @ git+https://github.com/lightspeed-core/rag-content@main
+defusedxml
packaging
lxml
html2text
diff --git a/scripts/rhoso_adoc_docs_to_text.py b/scripts/rhoso_adoc_docs_to_text.py
index b950db8..95d3d2a 100644
--- a/scripts/rhoso_adoc_docs_to_text.py
+++ b/scripts/rhoso_adoc_docs_to_text.py
@@ -22,6 +22,8 @@
from packaging.version import Version
from typing import Generator, Tuple
import xml.etree.ElementTree as ET
+
+import defusedxml.ElementTree as DefusedET
import re
import subprocess
import tempfile
@@ -150,7 +152,7 @@ def red_hat_docs_path(
# This is needed because docinfo.xml is not properly formatted XML file
# because it does not contain a single root tag.
docinfo_content = f.read()
- tree = ET.fromstring(f"{docinfo_content}")
+ tree = DefusedET.fromstring(f"{docinfo_content}")
productnumber = get_xml_element_text(tree, "productnumber")
if Version(productnumber) != Version(docs_version):
@@ -1364,8 +1366,8 @@ def __exit__(self, exc_type, exc_val, exc_tb):
# Try to remove the lock file (best effort)
try:
self.lock_path.unlink()
- except Exception:
- pass
+ except Exception as e:
+ LOG.debug(f"Could not remove lock file {self.lock_path}: {e}")
except Exception as e:
LOG.warning(f"Error releasing lock for {self.file_path}: {e}")
@@ -1849,7 +1851,7 @@ def preprocess_xml_table_cells(xml_content: str) -> str:
"""
try:
# Parse the XML
- root = ET.fromstring(xml_content)
+ root = DefusedET.fromstring(xml_content)
# Define the DocBook namespace
ns = {"db": "http://docbook.org/ns/docbook"}
@@ -1910,7 +1912,7 @@ def preprocess_xml_list_titles(xml_content: str) -> str:
"""
try:
# Parse the XML
- root = ET.fromstring(xml_content)
+ root = DefusedET.fromstring(xml_content)
# Define the DocBook namespace
ns = {"db": "http://docbook.org/ns/docbook"}
@@ -2077,7 +2079,7 @@ def convert(self, input_path: Path, output_path: Path) -> dict[Path, list[str]]:
str(xml_temp_path.absolute()),
str(input_for_conversion.absolute()),
]
- subprocess.run(asciidoctor_cmd, check=True, capture_output=True)
+ subprocess.run(asciidoctor_cmd, check=True, capture_output=True) # noqa: S603
# Step 1.5: Preprocess XML to fix issues
with open(xml_temp_path, "r", encoding="utf-8") as f:
@@ -2107,7 +2109,7 @@ def convert(self, input_path: Path, output_path: Path) -> dict[Path, list[str]]:
"-o",
str(output_path.absolute()),
]
- subprocess.run(pandoc_cmd, check=True, capture_output=True)
+ subprocess.run(pandoc_cmd, check=True, capture_output=True) # noqa: S603
# Step 3: Convert any HTML tables to markdown pipe tables
with open(output_path, "r", encoding="utf-8") as f:
@@ -2273,7 +2275,7 @@ def convert(self, input_path: Path, output_path: Path) -> None:
str(xml_temp_path.absolute()),
str(input_for_conversion.absolute()),
]
- result = subprocess.run(
+ result = subprocess.run( # noqa: S603
asciidoctor_cmd, check=True, capture_output=True, text=True
)
if result.stderr:
@@ -2319,7 +2321,7 @@ def convert(self, input_path: Path, output_path: Path) -> None:
"-o",
str(output_path.absolute()),
]
- subprocess.run(
+ subprocess.run( # noqa: S603
pandoc_cmd, check=True, capture_output=True, text=True
)