From ffc70424e8efd1f7d0ecd429a60a46021cc954d1 Mon Sep 17 00:00:00 2001
From: Nick Sweeting <git@sweeting.me>
Date: Wed, 10 Jun 2026 20:27:31 -0700
Subject: [PATCH] Fix plugin CI root causes

---
 abx_plugins/plugins/archivedotorg/config.json |  5 ++
 ...on_Snapshot__08_archivedotorg.finite.bg.py | 15 +++-
 .../archivedotorg/tests/test_archivedotorg.py | 87 +++++++++++--------
 .../chrome/tests/chrome_test_helpers.py       | 62 ++++++++-----
 .../chrome/tests/test_chrome_test_helpers.py  | 43 +++++++++
 .../chrome_mhtml/tests/test_chrome_mhtml.py   | 84 +++++++++++++++++-
 .../on_Snapshot__11_favicon.finite.bg.py      | 30 ++++++-
 .../on_Snapshot__72_parse_rss_urls.py         | 53 ++++++++++-
 .../tests/test_parse_rss_urls.py              | 52 +++++++++++
 .../singlefile/tests/test_singlefile.py       | 14 +--
 10 files changed, 372 insertions(+), 73 deletions(-)

diff --git a/abx_plugins/plugins/archivedotorg/config.json b/abx_plugins/plugins/archivedotorg/config.json
index cb26b77f..6214c035 100644
--- a/abx_plugins/plugins/archivedotorg/config.json
+++ b/abx_plugins/plugins/archivedotorg/config.json
@@ -26,6 +26,11 @@
       "minimum": 10,
       "x-fallback": "TIMEOUT",
       "description": "Timeout for archive.org submission in seconds"
+    },
+    "ARCHIVEDOTORG_ENDPOINT": {
+      "type": "string",
+      "default": "https://web.archive.org/save/{url}",
+      "description": "Wayback Machine save endpoint template. Supports {url} or {} placeholders for the submitted URL."
     }
   }
 }
diff --git a/abx_plugins/plugins/archivedotorg/on_Snapshot__08_archivedotorg.finite.bg.py b/abx_plugins/plugins/archivedotorg/on_Snapshot__08_archivedotorg.finite.bg.py
index 01820088..dd795676 100755
--- a/abx_plugins/plugins/archivedotorg/on_Snapshot__08_archivedotorg.finite.bg.py
+++ b/abx_plugins/plugins/archivedotorg/on_Snapshot__08_archivedotorg.finite.bg.py
@@ -25,7 +25,7 @@
 from ipaddress import ip_address
 from pathlib import Path
 from urllib.error import HTTPError, URLError
-from urllib.parse import urlparse
+from urllib.parse import quote, urlparse
 from urllib.request import Request, urlopen
 
 from abx_plugins.plugins.base.utils import emit_archive_result_record, load_config
@@ -42,6 +42,7 @@
 OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
 os.chdir(OUTPUT_DIR)
 OUTPUT_FILE = "archive.org.txt"
+URL_PATH_SAFE_CHARS = ":/?#[]@!$&'()*+,;=%"
 
 
 def should_skip_archivedotorg_url(url: str) -> str:
@@ -71,6 +72,15 @@ def should_skip_archivedotorg_url(url: str) -> str:
     return ""
 
 
+def build_archivedotorg_submit_url(endpoint_template: str, url: str) -> str:
+    """Insert the percent-encoded url into the template's {url} or {} placeholder, else append it as a path."""
+    escaped_url = quote(url, safe=URL_PATH_SAFE_CHARS)
+    for placeholder in ("{url}", "{}"):
+        if placeholder in endpoint_template:  # replace(), not format(): other braces must not raise
+            return endpoint_template.replace(placeholder, escaped_url)
+    return f"{endpoint_template.rstrip('/')}/{escaped_url}"
+
+
 def submit_to_archivedotorg(url: str) -> tuple[bool, str | None, str]:
     """
     Submit URL to archive.org Wayback Machine.
@@ -83,12 +93,13 @@ def log(message: str) -> None:
 
     config = load_config()
     timeout = config.ARCHIVEDOTORG_TIMEOUT
+    endpoint_template = str(config.ARCHIVEDOTORG_ENDPOINT or "").strip()
     library_version = os.environ.get("LIBRARY_VERSION", "0.0.1")
     user_agent = (
         f"ArchiveBox/{library_version} (+https://github.com/ArchiveBox/ArchiveBox/)"
     )
 
-    submit_url = f"https://web.archive.org/save/{url}"
+    submit_url = build_archivedotorg_submit_url(endpoint_template, url)
     log(f"Submitting to Wayback Machine (timeout={timeout}s)")
     log(f"GET {submit_url}")
 
diff --git a/abx_plugins/plugins/archivedotorg/tests/test_archivedotorg.py b/abx_plugins/plugins/archivedotorg/tests/test_archivedotorg.py
index c79a19fb..dc124296 100755
--- a/abx_plugins/plugins/archivedotorg/tests/test_archivedotorg.py
+++ b/abx_plugins/plugins/archivedotorg/tests/test_archivedotorg.py
@@ -9,6 +9,7 @@
 import tempfile
 from pathlib import Path
 import pytest
+from werkzeug.wrappers import Response
 
 from abx_plugins.plugins.base.test_utils import parse_jsonl_output
 
@@ -24,27 +25,43 @@ def test_hook_script_exists():
     assert ARCHIVEDOTORG_HOOK.exists()
 
 
-def test_submits_to_archivedotorg():
+def _run_archivedotorg_hook(
+    tmpdir: Path,
+    env: dict[str, str],
+) -> subprocess.CompletedProcess:
+    """Run the archivedotorg hook for TEST_URL with cwd=tmpdir and return the finished process."""
+    hook_command = [str(ARCHIVEDOTORG_HOOK), "--url", TEST_URL]
+    # stdout/stderr are captured as text for the caller's assertions.
+    run_options = {
+        "cwd": tmpdir,
+        "capture_output": True,
+        "text": True,
+        "env": env,
+        # subprocess.run raises TimeoutExpired if the hook runs longer than this many seconds.
+        "timeout": 30,
+    }
+    return subprocess.run(hook_command, **run_options)
+
+
+def test_submits_to_configured_archivedotorg_endpoint(httpserver):
+    archived_path = "/web/20260610123456/https://example.com"
+    httpserver.expect_request("/save/https://example.com").respond_with_data(
+        "saved",
+        status=200,
+        headers={
+            "Content-Location": archived_path,
+            "X-Archive-Orig-Url": TEST_URL,
+        },
+    )
+
     with tempfile.TemporaryDirectory() as tmpdir:
         tmpdir = Path(tmpdir)
 
         env = os.environ.copy()
-        # Keep the hook's own network timeout below subprocess timeout so failures
-        # return cleanly as exit=1 instead of being killed by pytest.
-        env["ARCHIVEDOTORG_TIMEOUT"] = "45"
+        env["SNAP_DIR"] = str(tmpdir)
+        env["ARCHIVEDOTORG_ENDPOINT"] = f"{httpserver.url_for('/save')}/{{url}}"
 
-        result = subprocess.run(
-            [
-                str(ARCHIVEDOTORG_HOOK),
-                "--url",
-                TEST_URL,
-            ],
-            cwd=tmpdir,
-            capture_output=True,
-            text=True,
-            env=env,
-            timeout=90,
-        )
+        result = _run_archivedotorg_hook(tmpdir, env)
 
         assert result.returncode == 0, result.stderr
 
@@ -58,7 +75,8 @@ def test_submits_to_archivedotorg():
         output_path = tmpdir / "archivedotorg" / "archive.org.txt"
         assert output_path.is_file(), f"Archive.org output missing: {output_path}"
         archived_url = output_path.read_text(encoding="utf-8").strip()
-        assert archived_url.startswith("https://web.archive.org/"), archived_url
+        assert archived_url == f"https://web.archive.org{archived_path}"
+        assert len(httpserver.log) == 1
 
 
 def test_config_save_archivedotorg_false_skips():
@@ -94,25 +112,22 @@ def test_config_save_archivedotorg_false_skips():
         assert result_json["output_str"] == "ARCHIVEDOTORG_ENABLED=False", result_json
 
 
-def test_handles_timeout():
+def test_archivedotorg_http_429_is_deterministic_noresults(httpserver):
+    def rate_limited(_request):
+        return Response("rate limited", status=429, content_type="text/plain")
+
+    httpserver.expect_request("/save/https://example.com").respond_with_handler(
+        rate_limited,
+    )
+
     with tempfile.TemporaryDirectory() as tmpdir:
         tmpdir = Path(tmpdir)
 
         env = os.environ.copy()
-        env["ARCHIVEDOTORG_TIMEOUT"] = "10"
+        env["SNAP_DIR"] = str(tmpdir)
+        env["ARCHIVEDOTORG_ENDPOINT"] = f"{httpserver.url_for('/save')}/{{url}}"
 
-        result = subprocess.run(
-            [
-                str(ARCHIVEDOTORG_HOOK),
-                "--url",
-                TEST_URL,
-            ],
-            cwd=tmpdir,
-            capture_output=True,
-            text=True,
-            env=env,
-            timeout=30,
-        )
+        result = _run_archivedotorg_hook(tmpdir, env)
 
         assert result.returncode == 0, result.stderr
 
@@ -120,13 +135,13 @@ def test_handles_timeout():
         assert result_json, "Should emit ArchiveResult JSONL"
         assert result_json == {
             "type": "ArchiveResult",
-            "status": "succeeded",
-            "output_str": "archivedotorg/archive.org.txt",
+            "status": "noresults",
+            "output_str": "HTTP 429",
         }, result_json
         output_path = tmpdir / "archivedotorg" / "archive.org.txt"
-        assert output_path.is_file(), f"Archive.org output missing: {output_path}"
-        archived_url = output_path.read_text(encoding="utf-8").strip()
-        assert archived_url.startswith("https://web.archive.org/"), archived_url
+        assert not output_path.exists(), (
+            f"Archive.org output should not exist: {output_path}"
+        )
 
 
 if __name__ == "__main__":
diff --git a/abx_plugins/plugins/chrome/tests/chrome_test_helpers.py b/abx_plugins/plugins/chrome/tests/chrome_test_helpers.py
index e900fd6e..09bc0202 100755
--- a/abx_plugins/plugins/chrome/tests/chrome_test_helpers.py
+++ b/abx_plugins/plugins/chrome/tests/chrome_test_helpers.py
@@ -64,6 +64,7 @@
 import pytest
 from _pytest.fixtures import FixtureLookupError
 from pytest_httpserver import HTTPServer
+from werkzeug import Response
 
 from abx_plugins.plugins.base.test_utils import (
     assert_isolated_snapshot_env,
@@ -176,18 +177,27 @@ def _configure_chrome_httpserver(httpserver) -> dict[str, str]:
     httpserver.expect_request("/linked").respond_with_data(
         "<html><head><title>Linked Page</title></head><body><h1>Linked Page</h1></body></html>",
     )
-    httpserver.expect_request("/slow").respond_with_data(
-        """<!doctype html>
+
+    def slow_response(request):
+        delay_ms = int(request.args.get("delay", "0") or "0")
+        if delay_ms > 0:
+            time.sleep(delay_ms / 1000)
+        return Response(
+            f"""<!doctype html>
 <html>
 <head><meta charset="utf-8"><title>Slow Page</title></head>
 <body>
   <main>
     <h1>Slow Page</h1>
-    <p>delay_ms=0</p>
+    <p>delay_ms={delay_ms}</p>
   </main>
 </body>
 </html>""",
-    )
+            status=200,
+            content_type="text/html; charset=utf-8",
+        )
+
+    httpserver.expect_request("/slow").respond_with_handler(slow_response)
     httpserver.expect_request("/popup-child").respond_with_data(
         """<!doctype html>
 <html>
@@ -401,6 +411,32 @@ def _coerce_upstream_urls(value: Any) -> dict[str, str] | None:
     return urls
 
 
+def _add_https_test_urls(
+    urls: dict[str, str],
+    request,
+    tmp_path_factory,
+) -> dict[str, str]:
+    """Fill in https_* URLs from a local HTTPS test server unless urls already has them.
+
+    The urls mapping is updated in place and returned; the started server is
+    stopped by a finalizer registered on request.
+    """
+    if not urls.get("https_base_url"):
+        tls_server = _create_https_test_server(tmp_path_factory)
+        tls_server.start()
+        request.addfinalizer(tls_server.stop)
+        _configure_chrome_httpserver(tls_server)
+        candidate_urls = _build_test_urls(
+            urls["base_url"],
+            tls_server.url_for("/"),
+        )
+        # Only the https_* entries are copied; every other existing entry stays untouched.
+        for key, value in candidate_urls.items():
+            if key.startswith("https_"):
+                urls[key] = value
+    return urls
+
+
 def ensure_chromium_and_puppeteer_installed_impl(tmp_path_factory) -> str:
     """Install Chrome and Puppeteer once for test sessions that require Chrome."""
     os.environ["SNAP_DIR"] = str(tmp_path_factory.mktemp("chrome_test_data"))
@@ -457,24 +493,10 @@ def chrome_test_urls(request, httpserver, tmp_path_factory):
             continue
         urls = _coerce_upstream_urls(upstream)
         if urls:
-            return urls
+            return _add_https_test_urls(urls, request, tmp_path_factory)
 
     urls = _configure_chrome_httpserver(httpserver)
-    https_server = _create_https_test_server(tmp_path_factory)
-    https_server.start()
-    request.addfinalizer(https_server.stop)
-    _configure_chrome_httpserver(https_server)
-    urls.update(
-        {
-            key: value
-            for key, value in _build_test_urls(
-                urls["base_url"],
-                https_server.url_for("/"),
-            ).items()
-            if key.startswith("https_")
-        },
-    )
-    return urls
+    return _add_https_test_urls(urls, request, tmp_path_factory)
 
 
 @pytest.fixture
diff --git a/abx_plugins/plugins/chrome/tests/test_chrome_test_helpers.py b/abx_plugins/plugins/chrome/tests/test_chrome_test_helpers.py
index 7c2a0d3a..e853865a 100755
--- a/abx_plugins/plugins/chrome/tests/test_chrome_test_helpers.py
+++ b/abx_plugins/plugins/chrome/tests/test_chrome_test_helpers.py
@@ -6,10 +6,13 @@
 
 import json
 import os
+import ssl
 import subprocess
 import sys
 import pytest
 import tempfile
+import time
+import urllib.request
 from pathlib import Path
 
 from abx_plugins.plugins.base.test_utils import (
@@ -19,6 +22,8 @@
 )
 from abx_plugins.plugins.chrome.tests.chrome_test_helpers import (
     _call_chrome_utils,
+    _add_https_test_urls,
+    _build_test_urls,
     CHROME_UTILS,
     chrome_session,
     get_test_env,
@@ -55,6 +60,44 @@ def _is_supported_browser_path(path: Path) -> bool:
     return bool(version)
 
 
+def test_chrome_test_urls_slow_route_honors_delay(chrome_test_urls):
+    """The slow fixture URL must stay in-flight long enough for target-close races (honors ?delay=<ms>)."""
+    delay_ms = 600
+    slow_url = f"{chrome_test_urls['origin']}/slow?delay={delay_ms}"
+
+    started = time.monotonic()  # monotonic clock: unaffected by wall-clock adjustments
+    with urllib.request.urlopen(slow_url, timeout=5) as response:
+        body = response.read().decode("utf-8")
+    elapsed = time.monotonic() - started
+
+    assert response.status == 200
+    assert f"delay_ms={delay_ms}" in body  # the page echoes the delay it applied
+    assert elapsed >= (delay_ms / 1000) * 0.8  # accept up to 20% less than the requested delay
+
+
+def test_https_fallback_is_added_when_upstream_urls_lack_https(
+    request,
+    tmp_path_factory,
+    httpserver,
+):
+    """HTTPS-dependent plugins should receive a real local HTTPS URL."""
+    urls = _build_test_urls(httpserver.url_for("/"))
+
+    resolved = _add_https_test_urls(urls, request, tmp_path_factory)
+
+    https_url = resolved["https_base_url"]
+    assert https_url.startswith("https://")
+    # Disable certificate verification for the local test server using the public ssl API.
+    context = ssl.create_default_context()
+    context.check_hostname = False  # must be cleared before verify_mode can be CERT_NONE
+    context.verify_mode = ssl.CERT_NONE
+    with urllib.request.urlopen(https_url, timeout=5, context=context) as response:
+        body = response.read().decode("utf-8")
+
+    assert response.status == 200
+    assert "Example Domain" in body
+
+
 def test_get_machine_type():
     """Test get_machine_type() returns valid format."""
     machine_type = get_machine_type()
diff --git a/abx_plugins/plugins/chrome_mhtml/tests/test_chrome_mhtml.py b/abx_plugins/plugins/chrome_mhtml/tests/test_chrome_mhtml.py
index 5613471a..a2cd2ece 100644
--- a/abx_plugins/plugins/chrome_mhtml/tests/test_chrome_mhtml.py
+++ b/abx_plugins/plugins/chrome_mhtml/tests/test_chrome_mhtml.py
@@ -1,5 +1,6 @@
 """Integration tests for the chrome_mhtml plugin."""
 
+import json
 import os
 import subprocess
 import tempfile
@@ -22,6 +23,7 @@
 if _MHTML_HOOK is None:
     raise FileNotFoundError(f"Hook not found in {PLUGIN_DIR}")
 MHTML_HOOK = _MHTML_HOOK
+CHROME_UTILS = PLUGIN_DIR.parent / "chrome" / "chrome_utils.js"
 CHROME_STARTUP_TIMEOUT_SECONDS = 45
 MHTML_PARENT_TOKEN = "ABX_MHTML_PARENT_TOKEN_7391"
 MHTML_OOPIF_CHILD_TOKEN = "ABX_MHTML_OOPIF_CHILD_TOKEN_7391"
@@ -31,7 +33,7 @@
 def mhtml_oopif_test_url(httpserver):
     child_url = httpserver.url_for("/child").replace(
         "localhost",
-        "oopif-child.localhost",
+        "oopif-child.test",
         1,
     )
     httpserver.expect_request("/child").respond_with_data(
@@ -48,7 +50,14 @@ def mhtml_oopif_test_url(httpserver):
 <head><meta charset="utf-8"><title>MHTML OOPIF Parent</title></head>
 <body>
   <main><h1>{MHTML_PARENT_TOKEN}</h1></main>
-  <iframe id="cross-site-frame" src="{child_url}"></iframe>
+  <iframe id="cross-site-frame"></iframe>
+  <script>
+    window.addEventListener("load", () => {{
+      setTimeout(() => {{
+        document.getElementById("cross-site-frame").src = "{child_url}";
+      }}, 0);
+    }});
+  </script>
 </body>
 </html>""",
         content_type="text/html; charset=utf-8",
@@ -83,6 +92,63 @@ def test_mhtml_preview_templates_live_with_mhtml_plugins(plugin_name):
     )
 
 
+def wait_for_oopif_child_frame(snapshot_chrome_dir: Path, env: dict[str, str]) -> None:
+    """Block until the child iframe shows MHTML_OOPIF_CHILD_TOKEN, polling via a node script.
+
+    The script attaches to the page of the Chrome session in snapshot_chrome_dir
+    through chrome_utils.js; a non-zero exit fails the test with the script's stderr.
+    """
+    script = r"""
+const chromeUtils = require(process.argv[1]);
+const chromeSessionDir = process.argv[2];
+const childToken = process.argv[3];
+
+(async () => {
+    const puppeteer = chromeUtils.resolvePuppeteerModule();
+    const connection = await chromeUtils.connectToPage({
+        chromeSessionDir,
+        timeoutMs: 30000,
+        requireTargetId: true,
+        puppeteer,
+    });
+    const deadline = Date.now() + 30000;
+    try {
+        while (Date.now() < deadline) {
+            for (const frame of connection.page.frames()) {
+                if (!frame.url().includes('/child')) continue;
+                try {
+                    const text = await frame.evaluate(() => document.body?.innerText || '');
+                    if (text.includes(childToken)) {
+                        process.stdout.write(JSON.stringify({url: frame.url(), text}));
+                        return;
+                    }
+                } catch (error) {}
+            }
+            await new Promise(resolve => setTimeout(resolve, 250));
+        }
+        throw new Error(`Timed out waiting for OOPIF child frame token: ${childToken}`);
+    } finally {
+        connection.browser.disconnect();
+    }
+})().catch(error => {
+    console.error(error.stack || error.message);
+    process.exit(1);
+});
+"""
+    node_command = [
+        "node", "-e", script,
+        str(CHROME_UTILS), str(snapshot_chrome_dir), MHTML_OOPIF_CHILD_TOKEN,
+    ]
+    # 40 s subprocess limit; the script passes timeoutMs 30000 to connectToPage and polls for up to 30000 ms.
+    result = subprocess.run(
+        node_command, capture_output=True, text=True, timeout=40, env=env,
+    )
+    assert result.returncode == 0, result.stderr
+    # On success the script prints one JSON object holding the matching frame's url and text.
+    payload = json.loads(result.stdout)
+    assert MHTML_OOPIF_CHILD_TOKEN in payload["text"]
+
+
 def test_extracts_mhtml_from_cross_site_iframe(
     require_chrome_runtime,
     mhtml_oopif_test_url,
@@ -91,18 +157,30 @@ def test_extracts_mhtml_from_cross_site_iframe(
     with tempfile.TemporaryDirectory() as tmpdir:
         tmpdir = Path(tmpdir)
         test_url = mhtml_oopif_test_url
+        chrome_args_extra = json.dumps(
+            [
+                "--site-per-process",
+                "--host-resolver-rules=MAP oopif-child.test 127.0.0.1",
+                "--proxy-server=direct://",
+                "--proxy-bypass-list=*",
+            ],
+        )
+        env_overrides = {
+            "CHROME_ARGS_EXTRA": chrome_args_extra,
+        }
 
         with chrome_session(
             tmpdir,
             test_url=test_url,
             timeout=CHROME_STARTUP_TIMEOUT_SECONDS,
-            env_overrides={"CHROME_ARGS_EXTRA": "--site-per-process"},
+            env_overrides=env_overrides,
         ) as (
             _process,
             _pid,
             snapshot_chrome_dir,
             env,
         ):
+            wait_for_oopif_child_frame(snapshot_chrome_dir, env)
             output_dir = snapshot_chrome_dir.parent / "chrome_mhtml"
             output_dir.mkdir(exist_ok=True)
 
diff --git a/abx_plugins/plugins/favicon/on_Snapshot__11_favicon.finite.bg.py b/abx_plugins/plugins/favicon/on_Snapshot__11_favicon.finite.bg.py
index 36d4253f..a95bbe37 100755
--- a/abx_plugins/plugins/favicon/on_Snapshot__11_favicon.finite.bg.py
+++ b/abx_plugins/plugins/favicon/on_Snapshot__11_favicon.finite.bg.py
@@ -23,6 +23,7 @@
 import os
 import re
 import sys
+import time
 
 from pathlib import Path
 from urllib.error import HTTPError
@@ -46,7 +47,7 @@
 SUCCESS_OUTPUT = f"{PLUGIN_DIR}/{OUTPUT_FILE}"
 
 
-def http_get(url: str, headers: dict[str, str], timeout: int) -> tuple[int, bytes]:
+def http_get(url: str, headers: dict[str, str], timeout: float) -> tuple[int, bytes]:
     req = Request(url, headers=headers)
     try:
         with urlopen(req, timeout=timeout) as response:
@@ -82,6 +83,7 @@ def get_favicon(url: str) -> tuple[bool, str | None, str]:
 
     config = get_config()
     timeout = config.FAVICON_TIMEOUT
+    deadline = time.monotonic() + timeout
     library_version = os.environ.get("LIBRARY_VERSION", "0.0.1")
     user_agent = (
         f"ArchiveBox/{library_version} (+https://github.com/ArchiveBox/ArchiveBox/)"
@@ -89,6 +91,12 @@ def get_favicon(url: str) -> tuple[bool, str | None, str]:
     provider_template = (config.FAVICON_PROVIDER or "").strip()
     headers = {"User-Agent": user_agent}
 
+    def remaining_timeout() -> float:
+        """Return the seconds left before the shared deadline; raise TimeoutError once it has passed."""
+        if (seconds_left := deadline - time.monotonic()) > 0:
+            return seconds_left
+        raise TimeoutError(f"FAVICON_TIMEOUT exceeded after {timeout} seconds")
+
     # Build list of possible favicon URLs
     parsed = urlparse(url)
     base_url = f"{parsed.scheme}://{parsed.netloc}"
@@ -102,7 +110,11 @@ def get_favicon(url: str) -> tuple[bool, str | None, str]:
 
     # Try to extract favicon URL from HTML link tags
     try:
-        status_code, body = http_get(url, headers=headers, timeout=timeout)
+        status_code, body = http_get(
+            url,
+            headers=headers,
+            timeout=remaining_timeout(),
+        )
         if 200 <= status_code < 300 and body:
             html = body.decode("utf-8", errors="replace")
             # Look for <link rel="icon" href="...">
@@ -126,9 +138,15 @@ def get_favicon(url: str) -> tuple[bool, str | None, str]:
     # Try each URL until we find one that works
     for favicon_url in favicon_urls:
         try:
-            status_code, body = http_get(favicon_url, headers=headers, timeout=timeout)
+            status_code, body = http_get(
+                favicon_url,
+                headers=headers,
+                timeout=remaining_timeout(),
+            )
             if 200 <= status_code < 300 and body:
                 return True, save_favicon(body), ""
+        except TimeoutError:
+            return False, None, "No favicon found"
         except Exception:
             continue
 
@@ -136,7 +154,11 @@ def get_favicon(url: str) -> tuple[bool, str | None, str]:
     provider_url = build_provider_url(provider_template, domain)
     if provider_url:
         try:
-            status_code, body = http_get(provider_url, headers=headers, timeout=timeout)
+            status_code, body = http_get(
+                provider_url,
+                headers=headers,
+                timeout=remaining_timeout(),
+            )
             if 200 <= status_code < 300 and body:
                 return True, save_favicon(body), ""
         except Exception:
diff --git a/abx_plugins/plugins/parse_rss_urls/on_Snapshot__72_parse_rss_urls.py b/abx_plugins/plugins/parse_rss_urls/on_Snapshot__72_parse_rss_urls.py
index 59d41c37..a41083e0 100755
--- a/abx_plugins/plugins/parse_rss_urls/on_Snapshot__72_parse_rss_urls.py
+++ b/abx_plugins/plugins/parse_rss_urls/on_Snapshot__72_parse_rss_urls.py
@@ -17,6 +17,7 @@
 
 import json
 import os
+import re
 import sys
 from importlib import import_module
 from pathlib import Path
@@ -55,6 +56,8 @@
     "<xi:include",
     "<xinclude:include",
 )
+XML_ROOT_RE = re.compile(r"<([A-Za-z_][\w:.-]*)\b")
+FEED_ROOT_NAMES = {"rss", "feed", "rdf:rdf"}
 
 feedparser: Any | None
 try:
@@ -100,6 +103,43 @@ def reject_xml_file_loading_features(content: str) -> None:
         )
 
 
+def strip_xml_prefix(content: str) -> str:
+    """Return content with its leading XML prolog removed, so it starts at the root tag.
+
+    Skips a leading BOM and whitespace, then any run of processing instructions
+    (the XML declaration, <?xml-stylesheet ...?>), comments, and DOCTYPE
+    declarations (with or without an internal subset). When a construct is
+    unterminated, the text is returned from that construct onward.
+    """
+    # str.lstrip() does not treat U+FEFF as whitespace, so drop the BOM explicitly.
+    remaining = content.lstrip("\ufeff").lstrip()
+    while True:
+        head = remaining[:9].lower()
+        if head.startswith("<?"):
+            end, width = remaining.find("?>"), 2
+        elif head.startswith("<!--"):
+            end, width = remaining.find("-->"), 3
+        elif head.startswith("<!doctype"):
+            subset_start = remaining.find("[")
+            first_tag_end = remaining.find(">")
+            if subset_start != -1 and (first_tag_end == -1 or subset_start < first_tag_end):
+                end, width = remaining.find("]>"), 2
+            else:
+                end, width = first_tag_end, 1
+        else:
+            return remaining
+        if end == -1:
+            return remaining
+        remaining = remaining[end + width :].lstrip()
+
+
+def looks_like_feed_content(content: str) -> bool:
+    """Return True when the first element after the XML prolog is an rss, feed, or rdf:RDF root (any prefix)."""
+    root = XML_ROOT_RE.match(strip_xml_prefix(content))
+    name = root.group(1).lower() if root else ""
+    return name in FEED_ROOT_NAMES or name.rpartition(":")[2] in {"rss", "feed"}
+
+
 def persist_records(records: list[dict]) -> tuple[str, str]:
     """Write extracted URLs when present, otherwise clear stale output after success."""
     if records:
@@ -133,7 +173,6 @@ def main(
     print("parsing 1 files for urls...")
     try:
         content = fetch_content(url)
-        reject_xml_file_loading_features(content)
     except Exception as e:
         if url.startswith(("http://", "https://")):
             # Snapshot URL fetching is only a fallback when no staticfile import
@@ -146,6 +185,18 @@ def main(
         emit_result("failed", f"Failed to fetch {url}: {e}")
         sys.exit(1)
 
+    if not looks_like_feed_content(content):
+        status, output_str = persist_records([])
+        print(output_str)
+        emit_result(status, output_str)
+        sys.exit(0)
+
+    try:
+        reject_xml_file_loading_features(content)
+    except Exception as e:
+        emit_result("failed", f"Failed to parse RSS/Atom feed from {url}: {e}")
+        sys.exit(1)
+
     # Parse the feed
     feed = feedparser.parse(content)
 
diff --git a/abx_plugins/plugins/parse_rss_urls/tests/test_parse_rss_urls.py b/abx_plugins/plugins/parse_rss_urls/tests/test_parse_rss_urls.py
index 1a0ec8a9..f0c3313f 100755
--- a/abx_plugins/plugins/parse_rss_urls/tests/test_parse_rss_urls.py
+++ b/abx_plugins/plugins/parse_rss_urls/tests/test_parse_rss_urls.py
@@ -176,6 +176,58 @@ def test_http_fetch_failure_reports_noresults(self, tmp_path, httpserver):
         assert '"status": "failed"' not in result.stdout
         assert not (tmp_path / "parse_rss_urls" / "urls.jsonl").exists()
 
+    def test_http_html_page_reports_noresults(self, tmp_path, httpserver):
+        """Ordinary HTTP HTML pages are not RSS feeds and should not fail crawls (exit 0, noresults)."""
+        httpserver.expect_request("/article").respond_with_data(
+            "<!doctype html><html><head><title>Article</title></head><body>"
+            '<a href="https://example.com/linked">Linked page</a>'
+            "</body></html>",
+            status=200,
+            content_type="text/html; charset=utf-8",
+        )
+
+        result = run_parse_rss_urls(
+            [str(SCRIPT_PATH), "--url", httpserver.url_for("/article")],
+            cwd=tmp_path,
+            capture_output=True,
+            text=True,
+        )
+
+        assert result.returncode == 0, result.stderr
+        assert '"status": "noresults"' in result.stdout
+        assert '"output_str": "0 URLs parsed"' in result.stdout
+        assert '"type": "Snapshot"' not in result.stdout  # the link inside the HTML must not produce a record
+        assert not (tmp_path / "parse_rss_urls" / "urls.jsonl").exists()  # no output file is written
+
+    def test_http_rss_feed_with_unsafe_xml_fails(self, tmp_path, httpserver):
+        """Feed-shaped XML errors remain hard failures instead of silent noresults (exit 1, failed)."""
+        httpserver.expect_request("/feed.rss").respond_with_data(
+            """<?xml version="1.0"?>
+<!DOCTYPE rss [
+  <!ENTITY local SYSTEM "file:///etc/passwd">
+]>
+<rss version="2.0">
+  <channel>
+    <item><link>https://example.com/post</link></item>
+  </channel>
+</rss>
+""",
+            status=200,
+            content_type="application/rss+xml",
+        )
+
+        result = run_parse_rss_urls(
+            [str(SCRIPT_PATH), "--url", httpserver.url_for("/feed.rss")],
+            cwd=tmp_path,
+            capture_output=True,
+            text=True,
+        )
+
+        assert result.returncode == 1  # the external-entity DOCTYPE must be rejected, not skipped
+        assert '"status": "failed"' in result.stdout
+        assert "XML declarations that can reference external files" in result.stderr
+        assert not (tmp_path / "parse_rss_urls" / "urls.jsonl").exists()  # the item link is not extracted
+
     def test_exits_1_when_file_not_found(self, tmp_path):
         """Test that script exits with code 1 when file doesn't exist."""
         result = run_parse_rss_urls(
diff --git a/abx_plugins/plugins/singlefile/tests/test_singlefile.py b/abx_plugins/plugins/singlefile/tests/test_singlefile.py
index 4ec24ff6..163ee404 100755
--- a/abx_plugins/plugins/singlefile/tests/test_singlefile.py
+++ b/abx_plugins/plugins/singlefile/tests/test_singlefile.py
@@ -94,6 +94,7 @@ def ensure_singlefile_extension_installed() -> dict[str, Path]:
 
     _singlefile_install_state = {
         "install_root": install_root,
+        "lib_dir": Path(env_install["LIB_DIR"]),
         "extensions_dir": extensions_dir,
         "cache_file": cache_file,
         "unpacked_path": unpacked_path,
@@ -176,11 +177,10 @@ def test_singlefile_cli_archives_example_com():
             navigate=True,
             timeout=30,
             env_overrides={
-                "CHROME_EXTENSIONS_DIR": str(extensions_dir),
+                "LIB_DIR": env_install["LIB_DIR"],
             },
         ) as (_chrome_proc, _chrome_pid, snapshot_chrome_dir, env):
             env["SINGLEFILE_ENABLED"] = "true"
-            env["CHROME_EXTENSIONS_DIR"] = str(extensions_dir)
 
             singlefile_output_dir = snapshot_chrome_dir.parent / "singlefile"
             singlefile_output_dir.mkdir(parents=True, exist_ok=True)
@@ -237,7 +237,7 @@ def test_singlefile_with_chrome_session():
             navigate=False,  # Don't navigate, singlefile will do that
             timeout=20,
             env_overrides={
-                "CHROME_EXTENSIONS_DIR": str(install_state["extensions_dir"]),
+                "LIB_DIR": str(install_state["lib_dir"]),
             },
         ) as (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env):
             snap_dir = Path(env["SNAP_DIR"])
@@ -246,7 +246,6 @@ def test_singlefile_with_chrome_session():
 
             # Use env from chrome_session
             env["SINGLEFILE_ENABLED"] = "true"
-            env["CHROME_EXTENSIONS_DIR"] = str(install_state["extensions_dir"])
 
             # Run singlefile - it should find and use the existing Chrome session
             result = subprocess.run(
@@ -296,6 +295,7 @@ def test_singlefile_with_extension_uses_existing_chrome():
         assert loaded.loaded_abspath is not None, (
             "abxpkg did not resolve SingleFile extension"
         )
+        downloads_dir = tmpdir / "downloads"
 
         # Launch Chrome session with extensions loaded
         with chrome_session(
@@ -306,7 +306,8 @@ def test_singlefile_with_extension_uses_existing_chrome():
             navigate=True,
             timeout=30,
             env_overrides={
-                "CHROME_EXTENSIONS_DIR": str(extensions_dir),
+                "LIB_DIR": env_install["LIB_DIR"],
+                "CHROME_DOWNLOADS_DIR": str(downloads_dir),
             },
         ) as (_chrome_proc, _chrome_pid, snapshot_chrome_dir, env):
             singlefile_output_dir = snapshot_chrome_dir.parent / "singlefile"
@@ -320,7 +321,6 @@ def test_singlefile_with_extension_uses_existing_chrome():
 
             env["SINGLEFILE_ENABLED"] = "true"
             env["SINGLEFILE_BINARY"] = "/nonexistent/single-file"
-            env["CHROME_EXTENSIONS_DIR"] = str(extensions_dir)
             env["CHROME_HEADLESS"] = "false"
             env.pop("CRAWL_DIR", None)
 
@@ -379,7 +379,7 @@ def test_singlefile_extension_loader_prefers_cached_background_target():
             navigate=True,
             timeout=30,
             env_overrides={
-                "CHROME_EXTENSIONS_DIR": str(install_state["extensions_dir"]),
+                "LIB_DIR": str(install_state["lib_dir"]),
             },
         ) as (_chrome_proc, _chrome_pid, snapshot_chrome_dir, env):
             metadata = wait_for_extensions_metadata(