Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@ repos:
- id: shellcheck

- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.10.0
rev: v0.15.4
hooks:
- id: ruff
- id: ruff-check
args: [--extend-select, S]
- id: ruff-format

- repo: https://github.com/pre-commit/pre-commit-hooks
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
lightspeed-rag-content @ git+https://github.com/lightspeed-core/rag-content@main
defusedxml
packaging
lxml
html2text
20 changes: 11 additions & 9 deletions scripts/rhoso_adoc_docs_to_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
from packaging.version import Version
from typing import Generator, Tuple
import xml.etree.ElementTree as ET

import defusedxml.ElementTree as DefusedET
import re
import subprocess
import tempfile
Expand Down Expand Up @@ -150,7 +152,7 @@ def red_hat_docs_path(
# This is needed because docinfo.xml is not a well-formed XML file:
# it does not contain a single root tag.
docinfo_content = f.read()
tree = ET.fromstring(f"<root>{docinfo_content}</root>")
tree = DefusedET.fromstring(f"<root>{docinfo_content}</root>")

productnumber = get_xml_element_text(tree, "productnumber")
if Version(productnumber) != Version(docs_version):
Expand Down Expand Up @@ -1364,8 +1366,8 @@ def __exit__(self, exc_type, exc_val, exc_tb):
# Try to remove the lock file (best effort)
try:
self.lock_path.unlink()
except Exception:
pass
except Exception as e:
LOG.debug(f"Could not remove lock file {self.lock_path}: {e}")
Comment on lines +1369 to +1370
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor | ⚡ Quick win

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Verify broad exception catches in this file and inspect nearby context.
rg -n -C2 'except Exception as e' scripts/rhoso_adoc_docs_to_text.py

Repository: openstack-lightspeed/rag-content

Length of output: 3731


Catch OSError (not Exception) when removing the lock file

The try block around self.lock_path.unlink() is a best-effort cleanup that only needs to tolerate OS-level failures; catching Exception here can hide unrelated bugs.

Proposed fix
-                except Exception as e:
+                except OSError as e:
                     LOG.debug(f"Could not remove lock file {self.lock_path}: {e}")
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
except Exception as e:
LOG.debug(f"Could not remove lock file {self.lock_path}: {e}")
except OSError as e:
LOG.debug(f"Could not remove lock file {self.lock_path}: {e}")
🧰 Tools
🪛 Ruff (0.15.14)

[warning] 1369-1369: Do not catch blind exception: Exception

(BLE001)

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@scripts/rhoso_adoc_docs_to_text.py` around lines 1369 - 1370, The current
broad except in the block that calls self.lock_path.unlink() (which logs via
LOG.debug(f"Could not remove lock file {self.lock_path}: {e}")) should be
narrowed to only catch OSError to avoid hiding unrelated bugs; replace "except
Exception as e" with "except OSError as e" (keeping the same LOG.debug call and
message) in the method that removes the lock file so only OS-level unlink
failures are handled.

except Exception as e:
LOG.warning(f"Error releasing lock for {self.file_path}: {e}")

Expand Down Expand Up @@ -1849,7 +1851,7 @@ def preprocess_xml_table_cells(xml_content: str) -> str:
"""
try:
# Parse the XML
root = ET.fromstring(xml_content)
root = DefusedET.fromstring(xml_content)

# Define the DocBook namespace
ns = {"db": "http://docbook.org/ns/docbook"}
Expand Down Expand Up @@ -1910,7 +1912,7 @@ def preprocess_xml_list_titles(xml_content: str) -> str:
"""
try:
# Parse the XML
root = ET.fromstring(xml_content)
root = DefusedET.fromstring(xml_content)

# Define the DocBook namespace
ns = {"db": "http://docbook.org/ns/docbook"}
Expand Down Expand Up @@ -2077,7 +2079,7 @@ def convert(self, input_path: Path, output_path: Path) -> dict[Path, list[str]]:
str(xml_temp_path.absolute()),
str(input_for_conversion.absolute()),
]
subprocess.run(asciidoctor_cmd, check=True, capture_output=True)
subprocess.run(asciidoctor_cmd, check=True, capture_output=True) # noqa: S603
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# AST-based verification: list subprocess.run calls missing a `timeout=` kwarg.
python - <<'PY'
import ast
from pathlib import Path

p = Path("scripts/rhoso_adoc_docs_to_text.py")
tree = ast.parse(p.read_text(encoding="utf-8"))

for node in ast.walk(tree):
    if isinstance(node, ast.Call):
        fn = node.func
        is_subprocess_run = (
            isinstance(fn, ast.Attribute)
            and fn.attr == "run"
            and isinstance(fn.value, ast.Name)
            and fn.value.id == "subprocess"
        )
        if is_subprocess_run:
            has_timeout = any(k.arg == "timeout" for k in node.keywords if k.arg)
            if not has_timeout:
                print(f"subprocess.run without timeout at line {node.lineno}")
PY

Repository: openstack-lightspeed/rag-content

Length of output: 252


🏁 Script executed:

#!/bin/bash
set -euo pipefail

python3 - <<'PY'
from pathlib import Path

path = Path("scripts/rhoso_adoc_docs_to_text.py")
lines = path.read_text(encoding="utf-8").splitlines()

targets = [2082, 2112, 2278, 2324]
for t in targets:
    start = max(1, t-8)
    end = min(len(lines), t+8)
    print(f"\n--- {path}:{t} (show {start}-{end}) ---")
    for i in range(start, end+1):
        print(f"{i:5d}: {lines[i-1]}")
PY

Repository: openstack-lightspeed/rag-content

Length of output: 4030


Add explicit timeouts to Asciidoctor/Pandoc conversion subprocesses

subprocess.run(...) calls at lines 2082, 2112, 2278, and 2324 run external conversions without any timeout=, so they can hang indefinitely. Add a timeout to bound failures and keep batch processing resilient.

Proposed fix
+SUBPROCESS_TIMEOUT_SECONDS = 600
...
-                subprocess.run(asciidoctor_cmd, check=True, capture_output=True)  # noqa: S603
+                subprocess.run(  # noqa: S603
+                    asciidoctor_cmd,
+                    check=True,
+                    capture_output=True,
+                    timeout=SUBPROCESS_TIMEOUT_SECONDS,
+                )
...
-                subprocess.run(pandoc_cmd, check=True, capture_output=True)  # noqa: S603
+                subprocess.run(  # noqa: S603
+                    pandoc_cmd,
+                    check=True,
+                    capture_output=True,
+                    timeout=SUBPROCESS_TIMEOUT_SECONDS,
+                )
...
                     result = subprocess.run(  # noqa: S603
-                        asciidoctor_cmd, check=True, capture_output=True, text=True
+                        asciidoctor_cmd,
+                        check=True,
+                        capture_output=True,
+                        text=True,
+                        timeout=SUBPROCESS_TIMEOUT_SECONDS,
                     )
...
                     subprocess.run(  # noqa: S603
-                        pandoc_cmd, check=True, capture_output=True, text=True
+                        pandoc_cmd,
+                        check=True,
+                        capture_output=True,
+                        text=True,
+                        timeout=SUBPROCESS_TIMEOUT_SECONDS,
                     )
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@scripts/rhoso_adoc_docs_to_text.py` at line 2082, add a bounded timeout to
the external conversion subprocesses so they cannot hang indefinitely: update
every subprocess.run(...) call that runs asciidoctor_cmd or pandoc_cmd (lines
2082, 2112, 2278, and 2324) to pass a timeout argument (e.g.,
timeout=SUBPROCESS_TIMEOUT_SECONDS, with the constant set to 600), and handle
subprocess.TimeoutExpired where these calls occur (wrap in try/except, or let
it propagate with clear logging) so batch processing remains resilient.


# Step 1.5: Preprocess XML to fix issues
with open(xml_temp_path, "r", encoding="utf-8") as f:
Expand Down Expand Up @@ -2107,7 +2109,7 @@ def convert(self, input_path: Path, output_path: Path) -> dict[Path, list[str]]:
"-o",
str(output_path.absolute()),
]
subprocess.run(pandoc_cmd, check=True, capture_output=True)
subprocess.run(pandoc_cmd, check=True, capture_output=True) # noqa: S603

# Step 3: Convert any HTML tables to markdown pipe tables
with open(output_path, "r", encoding="utf-8") as f:
Expand Down Expand Up @@ -2273,7 +2275,7 @@ def convert(self, input_path: Path, output_path: Path) -> None:
str(xml_temp_path.absolute()),
str(input_for_conversion.absolute()),
]
result = subprocess.run(
result = subprocess.run( # noqa: S603
asciidoctor_cmd, check=True, capture_output=True, text=True
)
if result.stderr:
Expand Down Expand Up @@ -2319,7 +2321,7 @@ def convert(self, input_path: Path, output_path: Path) -> None:
"-o",
str(output_path.absolute()),
]
subprocess.run(
subprocess.run( # noqa: S603
pandoc_cmd, check=True, capture_output=True, text=True
)

Expand Down
Loading