diff --git a/abx_plugins/plugins/archivedotorg/config.json b/abx_plugins/plugins/archivedotorg/config.json
index cb26b77f..6214c035 100644
--- a/abx_plugins/plugins/archivedotorg/config.json
+++ b/abx_plugins/plugins/archivedotorg/config.json
@@ -26,6 +26,11 @@
"minimum": 10,
"x-fallback": "TIMEOUT",
"description": "Timeout for archive.org submission in seconds"
+ },
+ "ARCHIVEDOTORG_ENDPOINT": {
+ "type": "string",
+ "default": "https://web.archive.org/save/{url}",
+ "description": "Wayback Machine save endpoint template. Supports {url} or {} placeholders for the submitted URL."
}
}
}
diff --git a/abx_plugins/plugins/archivedotorg/on_Snapshot__08_archivedotorg.finite.bg.py b/abx_plugins/plugins/archivedotorg/on_Snapshot__08_archivedotorg.finite.bg.py
index 01820088..dd795676 100755
--- a/abx_plugins/plugins/archivedotorg/on_Snapshot__08_archivedotorg.finite.bg.py
+++ b/abx_plugins/plugins/archivedotorg/on_Snapshot__08_archivedotorg.finite.bg.py
@@ -25,7 +25,7 @@
from ipaddress import ip_address
from pathlib import Path
from urllib.error import HTTPError, URLError
-from urllib.parse import urlparse
+from urllib.parse import quote, urlparse
from urllib.request import Request, urlopen
from abx_plugins.plugins.base.utils import emit_archive_result_record, load_config
@@ -42,6 +42,7 @@
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
os.chdir(OUTPUT_DIR)
OUTPUT_FILE = "archive.org.txt"
+URL_PATH_SAFE_CHARS = ":/?#[]@!$&'()*+,;=%"
def should_skip_archivedotorg_url(url: str) -> str:
@@ -71,6 +72,15 @@ def should_skip_archivedotorg_url(url: str) -> str:
return ""
+def build_archivedotorg_submit_url(endpoint_template: str, url: str) -> str:
+    """Build the Wayback Machine submission URL for ``url``.
+
+    ``url`` is percent-encoded with ``quote`` (characters listed in
+    URL_PATH_SAFE_CHARS are kept literal) and substituted for the first
+    placeholder style found in ``endpoint_template``: ``{url}`` takes
+    priority over ``{}``. When the template has no placeholder, trailing
+    slashes are stripped and the encoded URL is appended after one ``/``.
+
+    Substitution is a literal ``str.replace`` rather than ``str.format`` so
+    that any other brace in a user-configured endpoint is kept as-is instead
+    of raising KeyError/IndexError/ValueError.
+    """
+    escaped_url = quote(url, safe=URL_PATH_SAFE_CHARS)
+    for placeholder in ("{url}", "{}"):
+        if placeholder in endpoint_template:
+            return endpoint_template.replace(placeholder, escaped_url)
+    # No placeholder: treat the template as a base URL and append the target.
+    return f"{endpoint_template.rstrip('/')}/{escaped_url}"
+
+
def submit_to_archivedotorg(url: str) -> tuple[bool, str | None, str]:
"""
Submit URL to archive.org Wayback Machine.
@@ -83,12 +93,13 @@ def log(message: str) -> None:
config = load_config()
timeout = config.ARCHIVEDOTORG_TIMEOUT
+ endpoint_template = str(config.ARCHIVEDOTORG_ENDPOINT or "").strip()
library_version = os.environ.get("LIBRARY_VERSION", "0.0.1")
user_agent = (
f"ArchiveBox/{library_version} (+https://github.com/ArchiveBox/ArchiveBox/)"
)
- submit_url = f"https://web.archive.org/save/{url}"
+ submit_url = build_archivedotorg_submit_url(endpoint_template, url)
log(f"Submitting to Wayback Machine (timeout={timeout}s)")
log(f"GET {submit_url}")
diff --git a/abx_plugins/plugins/archivedotorg/tests/test_archivedotorg.py b/abx_plugins/plugins/archivedotorg/tests/test_archivedotorg.py
index c79a19fb..dc124296 100755
--- a/abx_plugins/plugins/archivedotorg/tests/test_archivedotorg.py
+++ b/abx_plugins/plugins/archivedotorg/tests/test_archivedotorg.py
@@ -9,6 +9,7 @@
import tempfile
from pathlib import Path
import pytest
+from werkzeug.wrappers import Response
from abx_plugins.plugins.base.test_utils import parse_jsonl_output
@@ -24,27 +25,43 @@ def test_hook_script_exists():
assert ARCHIVEDOTORG_HOOK.exists()
-def test_submits_to_archivedotorg():
+def _run_archivedotorg_hook(
+    tmpdir: Path,
+    env: dict[str, str],
+) -> subprocess.CompletedProcess:
+    """Run the archivedotorg hook for TEST_URL with tmpdir as its cwd."""
+    hook_command = [str(ARCHIVEDOTORG_HOOK), "--url", TEST_URL]
+    # Text-mode capture lets callers surface stderr in assertion messages.
+    return subprocess.run(
+        hook_command,
+        cwd=tmpdir,
+        capture_output=True,
+        text=True,
+        env=env,
+        # subprocess.run raises TimeoutExpired if the hook runs past this.
+        timeout=30,
+    )
+
+
+def test_submits_to_configured_archivedotorg_endpoint(httpserver):
+ archived_path = "/web/20260610123456/https://example.com"
+ httpserver.expect_request("/save/https://example.com").respond_with_data(
+ "saved",
+ status=200,
+ headers={
+ "Content-Location": archived_path,
+ "X-Archive-Orig-Url": TEST_URL,
+ },
+ )
+
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
env = os.environ.copy()
- # Keep the hook's own network timeout below subprocess timeout so failures
- # return cleanly as exit=1 instead of being killed by pytest.
- env["ARCHIVEDOTORG_TIMEOUT"] = "45"
+ env["SNAP_DIR"] = str(tmpdir)
+ env["ARCHIVEDOTORG_ENDPOINT"] = f"{httpserver.url_for('/save')}/{{url}}"
- result = subprocess.run(
- [
- str(ARCHIVEDOTORG_HOOK),
- "--url",
- TEST_URL,
- ],
- cwd=tmpdir,
- capture_output=True,
- text=True,
- env=env,
- timeout=90,
- )
+ result = _run_archivedotorg_hook(tmpdir, env)
assert result.returncode == 0, result.stderr
@@ -58,7 +75,8 @@ def test_submits_to_archivedotorg():
output_path = tmpdir / "archivedotorg" / "archive.org.txt"
assert output_path.is_file(), f"Archive.org output missing: {output_path}"
archived_url = output_path.read_text(encoding="utf-8").strip()
- assert archived_url.startswith("https://web.archive.org/"), archived_url
+ assert archived_url == f"https://web.archive.org{archived_path}"
+ assert len(httpserver.log) == 1
def test_config_save_archivedotorg_false_skips():
@@ -94,25 +112,22 @@ def test_config_save_archivedotorg_false_skips():
assert result_json["output_str"] == "ARCHIVEDOTORG_ENABLED=False", result_json
-def test_handles_timeout():
+def test_archivedotorg_http_429_is_deterministic_noresults(httpserver):
+ def rate_limited(_request):
+ return Response("rate limited", status=429, content_type="text/plain")
+
+ httpserver.expect_request("/save/https://example.com").respond_with_handler(
+ rate_limited,
+ )
+
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
env = os.environ.copy()
- env["ARCHIVEDOTORG_TIMEOUT"] = "10"
+ env["SNAP_DIR"] = str(tmpdir)
+ env["ARCHIVEDOTORG_ENDPOINT"] = f"{httpserver.url_for('/save')}/{{url}}"
- result = subprocess.run(
- [
- str(ARCHIVEDOTORG_HOOK),
- "--url",
- TEST_URL,
- ],
- cwd=tmpdir,
- capture_output=True,
- text=True,
- env=env,
- timeout=30,
- )
+ result = _run_archivedotorg_hook(tmpdir, env)
assert result.returncode == 0, result.stderr
@@ -120,13 +135,13 @@ def test_handles_timeout():
assert result_json, "Should emit ArchiveResult JSONL"
assert result_json == {
"type": "ArchiveResult",
- "status": "succeeded",
- "output_str": "archivedotorg/archive.org.txt",
+ "status": "noresults",
+ "output_str": "HTTP 429",
}, result_json
output_path = tmpdir / "archivedotorg" / "archive.org.txt"
- assert output_path.is_file(), f"Archive.org output missing: {output_path}"
- archived_url = output_path.read_text(encoding="utf-8").strip()
- assert archived_url.startswith("https://web.archive.org/"), archived_url
+ assert not output_path.exists(), (
+ f"Archive.org output should not exist: {output_path}"
+ )
if __name__ == "__main__":
diff --git a/abx_plugins/plugins/chrome/tests/chrome_test_helpers.py b/abx_plugins/plugins/chrome/tests/chrome_test_helpers.py
index e900fd6e..09bc0202 100755
--- a/abx_plugins/plugins/chrome/tests/chrome_test_helpers.py
+++ b/abx_plugins/plugins/chrome/tests/chrome_test_helpers.py
@@ -64,6 +64,7 @@
import pytest
from _pytest.fixtures import FixtureLookupError
from pytest_httpserver import HTTPServer
+from werkzeug import Response
from abx_plugins.plugins.base.test_utils import (
assert_isolated_snapshot_env,
@@ -176,18 +177,27 @@ def _configure_chrome_httpserver(httpserver) -> dict[str, str]:
httpserver.expect_request("/linked").respond_with_data(
"
Linked PageLinked Page
",
)
- httpserver.expect_request("/slow").respond_with_data(
- """
+
+ def slow_response(request):
+ delay_ms = int(request.args.get("delay", "0") or "0")
+ if delay_ms > 0:
+ time.sleep(delay_ms / 1000)
+ return Response(
+ f"""
Slow Page
Slow Page
- delay_ms=0
+ delay_ms={delay_ms}
""",
- )
+ status=200,
+ content_type="text/html; charset=utf-8",
+ )
+
+ httpserver.expect_request("/slow").respond_with_handler(slow_response)
httpserver.expect_request("/popup-child").respond_with_data(
"""
@@ -401,6 +411,32 @@ def _coerce_upstream_urls(value: Any) -> dict[str, str] | None:
return urls
+def _add_https_test_urls(
+    urls: dict[str, str],
+    request,
+    tmp_path_factory,
+) -> dict[str, str]:
+    """Fill in https_* entries from a local HTTPS server if urls lacks them.
+
+    urls is updated in place and returned; the server's stop() is registered
+    as a finalizer on request.
+    """
+    if not urls.get("https_base_url"):
+        tls_server = _create_https_test_server(tmp_path_factory)
+        tls_server.start()
+        request.addfinalizer(tls_server.stop)
+        _configure_chrome_httpserver(tls_server)
+        candidate_urls = _build_test_urls(
+            urls["base_url"],
+            tls_server.url_for("/"),
+        )
+        # Only keys starting with "https_" are merged into urls.
+        for name, value in candidate_urls.items():
+            if name.startswith("https_"):
+                urls[name] = value
+    return urls
+
+
def ensure_chromium_and_puppeteer_installed_impl(tmp_path_factory) -> str:
"""Install Chrome and Puppeteer once for test sessions that require Chrome."""
os.environ["SNAP_DIR"] = str(tmp_path_factory.mktemp("chrome_test_data"))
@@ -457,24 +493,10 @@ def chrome_test_urls(request, httpserver, tmp_path_factory):
continue
urls = _coerce_upstream_urls(upstream)
if urls:
- return urls
+ return _add_https_test_urls(urls, request, tmp_path_factory)
urls = _configure_chrome_httpserver(httpserver)
- https_server = _create_https_test_server(tmp_path_factory)
- https_server.start()
- request.addfinalizer(https_server.stop)
- _configure_chrome_httpserver(https_server)
- urls.update(
- {
- key: value
- for key, value in _build_test_urls(
- urls["base_url"],
- https_server.url_for("/"),
- ).items()
- if key.startswith("https_")
- },
- )
- return urls
+ return _add_https_test_urls(urls, request, tmp_path_factory)
@pytest.fixture
diff --git a/abx_plugins/plugins/chrome/tests/test_chrome_test_helpers.py b/abx_plugins/plugins/chrome/tests/test_chrome_test_helpers.py
index 7c2a0d3a..e853865a 100755
--- a/abx_plugins/plugins/chrome/tests/test_chrome_test_helpers.py
+++ b/abx_plugins/plugins/chrome/tests/test_chrome_test_helpers.py
@@ -6,10 +6,13 @@
import json
import os
+import ssl
import subprocess
import sys
import pytest
import tempfile
+import time
+import urllib.request
from pathlib import Path
from abx_plugins.plugins.base.test_utils import (
@@ -19,6 +22,8 @@
)
from abx_plugins.plugins.chrome.tests.chrome_test_helpers import (
_call_chrome_utils,
+ _add_https_test_urls,
+ _build_test_urls,
CHROME_UTILS,
chrome_session,
get_test_env,
@@ -55,6 +60,44 @@ def _is_supported_browser_path(path: Path) -> bool:
return bool(version)
+def test_chrome_test_urls_slow_route_honors_delay(chrome_test_urls):
+    """Check that /slow?delay=N holds the response for roughly N milliseconds."""
+    requested_delay_ms = 600
+    target = f"{chrome_test_urls['origin']}/slow?delay={requested_delay_ms}"
+    began = time.monotonic()
+    with urllib.request.urlopen(target, timeout=5) as reply:
+        status, text = reply.status, reply.read().decode("utf-8")
+    took = time.monotonic() - began
+    # 0.8 factor: accept up to 20% less than the requested delay.
+    minimum_seconds = (requested_delay_ms / 1000) * 0.8
+    assert status == 200
+    assert f"delay_ms={requested_delay_ms}" in text
+    assert took >= minimum_seconds
+
+
+def test_https_fallback_is_added_when_upstream_urls_lack_https(
+    request,
+    tmp_path_factory,
+    httpserver,
+):
+    """A URL map built from the HTTP server alone gains a reachable https_base_url."""
+    http_only_urls = _build_test_urls(httpserver.url_for("/"))
+    resolved = _add_https_test_urls(http_only_urls, request, tmp_path_factory)
+    https_base_url = resolved["https_base_url"]
+    assert https_base_url.startswith("https://")
+
+    # The local test server's certificate is not verified for this request.
+    insecure_context = ssl._create_unverified_context()
+    with urllib.request.urlopen(
+        https_base_url,
+        timeout=5,
+        context=insecure_context,
+    ) as reply:
+        status, text = reply.status, reply.read().decode("utf-8")
+    assert status == 200
+    assert "Example Domain" in text
+
+
def test_get_machine_type():
"""Test get_machine_type() returns valid format."""
machine_type = get_machine_type()
diff --git a/abx_plugins/plugins/chrome_mhtml/tests/test_chrome_mhtml.py b/abx_plugins/plugins/chrome_mhtml/tests/test_chrome_mhtml.py
index 5613471a..a2cd2ece 100644
--- a/abx_plugins/plugins/chrome_mhtml/tests/test_chrome_mhtml.py
+++ b/abx_plugins/plugins/chrome_mhtml/tests/test_chrome_mhtml.py
@@ -1,5 +1,6 @@
"""Integration tests for the chrome_mhtml plugin."""
+import json
import os
import subprocess
import tempfile
@@ -22,6 +23,7 @@
if _MHTML_HOOK is None:
raise FileNotFoundError(f"Hook not found in {PLUGIN_DIR}")
MHTML_HOOK = _MHTML_HOOK
+CHROME_UTILS = PLUGIN_DIR.parent / "chrome" / "chrome_utils.js"
CHROME_STARTUP_TIMEOUT_SECONDS = 45
MHTML_PARENT_TOKEN = "ABX_MHTML_PARENT_TOKEN_7391"
MHTML_OOPIF_CHILD_TOKEN = "ABX_MHTML_OOPIF_CHILD_TOKEN_7391"
@@ -31,7 +33,7 @@
def mhtml_oopif_test_url(httpserver):
child_url = httpserver.url_for("/child").replace(
"localhost",
- "oopif-child.localhost",
+ "oopif-child.test",
1,
)
httpserver.expect_request("/child").respond_with_data(
@@ -48,7 +50,14 @@ def mhtml_oopif_test_url(httpserver):
MHTML OOPIF Parent
{MHTML_PARENT_TOKEN}
-
+
+
""",
content_type="text/html; charset=utf-8",
@@ -83,6 +92,63 @@ def test_mhtml_preview_templates_live_with_mhtml_plugins(plugin_name):
)
+def wait_for_oopif_child_frame(snapshot_chrome_dir: Path, env: dict[str, str]) -> None:
+ """Block until the /child iframe in the Chrome session shows the OOPIF child token.
+
+ Runs an inline Node script that loads CHROME_UTILS, attaches to the page
+ via connectToPage, and polls the page's frames every 250 ms for up to
+ 30 s. When a frame whose URL contains '/child' has the token in its body
+ text, the script prints that frame's url/text as JSON. The assertions
+ fail if the script exits non-zero or the reported text lacks the token.
+ """
+ script = r"""
+const chromeUtils = require(process.argv[1]);
+const chromeSessionDir = process.argv[2];
+const childToken = process.argv[3];
+
+(async () => {
+ const puppeteer = chromeUtils.resolvePuppeteerModule();
+ const connection = await chromeUtils.connectToPage({
+ chromeSessionDir,
+ timeoutMs: 30000,
+ requireTargetId: true,
+ puppeteer,
+ });
+ const deadline = Date.now() + 30000;
+ try {
+ while (Date.now() < deadline) {
+ for (const frame of connection.page.frames()) {
+ if (!frame.url().includes('/child')) continue;
+ try {
+ const text = await frame.evaluate(() => document.body?.innerText || '');
+ if (text.includes(childToken)) {
+ process.stdout.write(JSON.stringify({url: frame.url(), text}));
+ return;
+ }
+ } catch (error) {}
+ }
+ await new Promise(resolve => setTimeout(resolve, 250));
+ }
+ throw new Error(`Timed out waiting for OOPIF child frame token: ${childToken}`);
+ } finally {
+ connection.browser.disconnect();
+ }
+})().catch(error => {
+ console.error(error.stack || error.message);
+ process.exit(1);
+});
+"""
+ # Arguments after the script become process.argv[1..3] inside the script.
+ # The 40 s subprocess limit is longer than the script's 30 s polling deadline.
+ result = subprocess.run(
+ [
+ "node",
+ "-e",
+ script,
+ str(CHROME_UTILS),
+ str(snapshot_chrome_dir),
+ MHTML_OOPIF_CHILD_TOKEN,
+ ],
+ capture_output=True,
+ text=True,
+ timeout=40,
+ env=env,
+ )
+ assert result.returncode == 0, result.stderr
+ # On success the script writes one JSON object ({url, text}) to stdout.
+ payload = json.loads(result.stdout)
+ assert MHTML_OOPIF_CHILD_TOKEN in payload["text"]
+
+
def test_extracts_mhtml_from_cross_site_iframe(
require_chrome_runtime,
mhtml_oopif_test_url,
@@ -91,18 +157,30 @@ def test_extracts_mhtml_from_cross_site_iframe(
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
test_url = mhtml_oopif_test_url
+ chrome_args_extra = json.dumps(
+ [
+ "--site-per-process",
+ "--host-resolver-rules=MAP oopif-child.test 127.0.0.1",
+ "--proxy-server=direct://",
+ "--proxy-bypass-list=*",
+ ],
+ )
+ env_overrides = {
+ "CHROME_ARGS_EXTRA": chrome_args_extra,
+ }
with chrome_session(
tmpdir,
test_url=test_url,
timeout=CHROME_STARTUP_TIMEOUT_SECONDS,
- env_overrides={"CHROME_ARGS_EXTRA": "--site-per-process"},
+ env_overrides=env_overrides,
) as (
_process,
_pid,
snapshot_chrome_dir,
env,
):
+ wait_for_oopif_child_frame(snapshot_chrome_dir, env)
output_dir = snapshot_chrome_dir.parent / "chrome_mhtml"
output_dir.mkdir(exist_ok=True)
diff --git a/abx_plugins/plugins/favicon/on_Snapshot__11_favicon.finite.bg.py b/abx_plugins/plugins/favicon/on_Snapshot__11_favicon.finite.bg.py
index 36d4253f..a95bbe37 100755
--- a/abx_plugins/plugins/favicon/on_Snapshot__11_favicon.finite.bg.py
+++ b/abx_plugins/plugins/favicon/on_Snapshot__11_favicon.finite.bg.py
@@ -23,6 +23,7 @@
import os
import re
import sys
+import time
from pathlib import Path
from urllib.error import HTTPError
@@ -46,7 +47,7 @@
SUCCESS_OUTPUT = f"{PLUGIN_DIR}/{OUTPUT_FILE}"
-def http_get(url: str, headers: dict[str, str], timeout: int) -> tuple[int, bytes]:
+def http_get(url: str, headers: dict[str, str], timeout: float) -> tuple[int, bytes]:
req = Request(url, headers=headers)
try:
with urlopen(req, timeout=timeout) as response:
@@ -82,6 +83,7 @@ def get_favicon(url: str) -> tuple[bool, str | None, str]:
config = get_config()
timeout = config.FAVICON_TIMEOUT
+ deadline = time.monotonic() + timeout
library_version = os.environ.get("LIBRARY_VERSION", "0.0.1")
user_agent = (
f"ArchiveBox/{library_version} (+https://github.com/ArchiveBox/ArchiveBox/)"
@@ -89,6 +91,12 @@ def get_favicon(url: str) -> tuple[bool, str | None, str]:
provider_template = (config.FAVICON_PROVIDER or "").strip()
headers = {"User-Agent": user_agent}
+ def remaining_timeout() -> float:
+     """Return the seconds left before deadline; raise TimeoutError once none remain."""
+     seconds_left = deadline - time.monotonic()
+     if seconds_left > 0:
+         return seconds_left
+     raise TimeoutError(f"FAVICON_TIMEOUT exceeded after {timeout} seconds")
+
# Build list of possible favicon URLs
parsed = urlparse(url)
base_url = f"{parsed.scheme}://{parsed.netloc}"
@@ -102,7 +110,11 @@ def get_favicon(url: str) -> tuple[bool, str | None, str]:
# Try to extract favicon URL from HTML link tags
try:
- status_code, body = http_get(url, headers=headers, timeout=timeout)
+ status_code, body = http_get(
+ url,
+ headers=headers,
+ timeout=remaining_timeout(),
+ )
if 200 <= status_code < 300 and body:
html = body.decode("utf-8", errors="replace")
# Look for
@@ -126,9 +138,15 @@ def get_favicon(url: str) -> tuple[bool, str | None, str]:
# Try each URL until we find one that works
for favicon_url in favicon_urls:
try:
- status_code, body = http_get(favicon_url, headers=headers, timeout=timeout)
+ status_code, body = http_get(
+ favicon_url,
+ headers=headers,
+ timeout=remaining_timeout(),
+ )
if 200 <= status_code < 300 and body:
return True, save_favicon(body), ""
+ except TimeoutError:
+ return False, None, "No favicon found"
except Exception:
continue
@@ -136,7 +154,11 @@ def get_favicon(url: str) -> tuple[bool, str | None, str]:
provider_url = build_provider_url(provider_template, domain)
if provider_url:
try:
- status_code, body = http_get(provider_url, headers=headers, timeout=timeout)
+ status_code, body = http_get(
+ provider_url,
+ headers=headers,
+ timeout=remaining_timeout(),
+ )
if 200 <= status_code < 300 and body:
return True, save_favicon(body), ""
except Exception:
diff --git a/abx_plugins/plugins/parse_rss_urls/on_Snapshot__72_parse_rss_urls.py b/abx_plugins/plugins/parse_rss_urls/on_Snapshot__72_parse_rss_urls.py
index 59d41c37..a41083e0 100755
--- a/abx_plugins/plugins/parse_rss_urls/on_Snapshot__72_parse_rss_urls.py
+++ b/abx_plugins/plugins/parse_rss_urls/on_Snapshot__72_parse_rss_urls.py
@@ -17,6 +17,7 @@
import json
import os
+import re
import sys
from importlib import import_module
from pathlib import Path
@@ -55,6 +56,8 @@
" None:
)
+def strip_xml_prefix(content: str) -> str:
+ remaining = content.lstrip()
+ if remaining.lower().startswith("")
+ remaining = remaining[end + 2 :].lstrip() if end != -1 else remaining
+
+ while remaining.startswith("")
+ if end == -1:
+ return remaining
+ remaining = remaining[end + 3 :].lstrip()
+
+ if remaining.lower().startswith("")
+ if subset_start != -1 and (first_tag_end == -1 or subset_start < first_tag_end):
+ end = remaining.find("]>")
+ remaining = remaining[end + 2 :].lstrip() if end != -1 else remaining
+ elif first_tag_end != -1:
+ remaining = remaining[first_tag_end + 1 :].lstrip()
+
+ while remaining.startswith("")
+ if end == -1:
+ return remaining
+ remaining = remaining[end + 3 :].lstrip()
+
+ return remaining
+
+
+def looks_like_feed_content(content: str) -> bool:
+    """Tell whether the XML_ROOT_RE match on prefix-stripped content names a feed root."""
+    root_match = XML_ROOT_RE.match(strip_xml_prefix(content))
+    # Group 1 is lowercased before the FEED_ROOT_NAMES membership test.
+    return bool(root_match) and root_match.group(1).lower() in FEED_ROOT_NAMES
+
+
def persist_records(records: list[dict]) -> tuple[str, str]:
"""Write extracted URLs when present, otherwise clear stale output after success."""
if records:
@@ -133,7 +173,6 @@ def main(
print("parsing 1 files for urls...")
try:
content = fetch_content(url)
- reject_xml_file_loading_features(content)
except Exception as e:
if url.startswith(("http://", "https://")):
# Snapshot URL fetching is only a fallback when no staticfile import
@@ -146,6 +185,18 @@ def main(
emit_result("failed", f"Failed to fetch {url}: {e}")
sys.exit(1)
+ if not looks_like_feed_content(content):
+ status, output_str = persist_records([])
+ print(output_str)
+ emit_result(status, output_str)
+ sys.exit(0)
+
+ try:
+ reject_xml_file_loading_features(content)
+ except Exception as e:
+ emit_result("failed", f"Failed to parse RSS/Atom feed from {url}: {e}")
+ sys.exit(1)
+
# Parse the feed
feed = feedparser.parse(content)
diff --git a/abx_plugins/plugins/parse_rss_urls/tests/test_parse_rss_urls.py b/abx_plugins/plugins/parse_rss_urls/tests/test_parse_rss_urls.py
index 1a0ec8a9..f0c3313f 100755
--- a/abx_plugins/plugins/parse_rss_urls/tests/test_parse_rss_urls.py
+++ b/abx_plugins/plugins/parse_rss_urls/tests/test_parse_rss_urls.py
@@ -176,6 +176,58 @@ def test_http_fetch_failure_reports_noresults(self, tmp_path, httpserver):
assert '"status": "failed"' not in result.stdout
assert not (tmp_path / "parse_rss_urls" / "urls.jsonl").exists()
+ def test_http_html_page_reports_noresults(self, tmp_path, httpserver):
+ """Ordinary HTTP HTML pages are not RSS feeds and should not fail crawls."""
+ httpserver.expect_request("/article").respond_with_data(
+ "Article"
+ 'Linked page'
+ "",
+ status=200,
+ content_type="text/html; charset=utf-8",
+ )
+
+ result = run_parse_rss_urls(
+ [str(SCRIPT_PATH), "--url", httpserver.url_for("/article")],
+ cwd=tmp_path,
+ capture_output=True,
+ text=True,
+ )
+
+ assert result.returncode == 0, result.stderr
+ assert '"status": "noresults"' in result.stdout
+ assert '"output_str": "0 URLs parsed"' in result.stdout
+ assert '"type": "Snapshot"' not in result.stdout
+ assert not (tmp_path / "parse_rss_urls" / "urls.jsonl").exists()
+
+ def test_http_rss_feed_with_unsafe_xml_fails(self, tmp_path, httpserver):
+ """Feed-shaped XML errors remain hard failures instead of silent noresults."""
+ httpserver.expect_request("/feed.rss").respond_with_data(
+ """
+
+]>
+
+
+ - https://example.com/post
+
+
+""",
+ status=200,
+ content_type="application/rss+xml",
+ )
+
+ result = run_parse_rss_urls(
+ [str(SCRIPT_PATH), "--url", httpserver.url_for("/feed.rss")],
+ cwd=tmp_path,
+ capture_output=True,
+ text=True,
+ )
+
+ assert result.returncode == 1
+ assert '"status": "failed"' in result.stdout
+ assert "XML declarations that can reference external files" in result.stderr
+ assert not (tmp_path / "parse_rss_urls" / "urls.jsonl").exists()
+
def test_exits_1_when_file_not_found(self, tmp_path):
"""Test that script exits with code 1 when file doesn't exist."""
result = run_parse_rss_urls(
diff --git a/abx_plugins/plugins/singlefile/tests/test_singlefile.py b/abx_plugins/plugins/singlefile/tests/test_singlefile.py
index 4ec24ff6..163ee404 100755
--- a/abx_plugins/plugins/singlefile/tests/test_singlefile.py
+++ b/abx_plugins/plugins/singlefile/tests/test_singlefile.py
@@ -94,6 +94,7 @@ def ensure_singlefile_extension_installed() -> dict[str, Path]:
_singlefile_install_state = {
"install_root": install_root,
+ "lib_dir": Path(env_install["LIB_DIR"]),
"extensions_dir": extensions_dir,
"cache_file": cache_file,
"unpacked_path": unpacked_path,
@@ -176,11 +177,10 @@ def test_singlefile_cli_archives_example_com():
navigate=True,
timeout=30,
env_overrides={
- "CHROME_EXTENSIONS_DIR": str(extensions_dir),
+ "LIB_DIR": env_install["LIB_DIR"],
},
) as (_chrome_proc, _chrome_pid, snapshot_chrome_dir, env):
env["SINGLEFILE_ENABLED"] = "true"
- env["CHROME_EXTENSIONS_DIR"] = str(extensions_dir)
singlefile_output_dir = snapshot_chrome_dir.parent / "singlefile"
singlefile_output_dir.mkdir(parents=True, exist_ok=True)
@@ -237,7 +237,7 @@ def test_singlefile_with_chrome_session():
navigate=False, # Don't navigate, singlefile will do that
timeout=20,
env_overrides={
- "CHROME_EXTENSIONS_DIR": str(install_state["extensions_dir"]),
+ "LIB_DIR": str(install_state["lib_dir"]),
},
) as (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env):
snap_dir = Path(env["SNAP_DIR"])
@@ -246,7 +246,6 @@ def test_singlefile_with_chrome_session():
# Use env from chrome_session
env["SINGLEFILE_ENABLED"] = "true"
- env["CHROME_EXTENSIONS_DIR"] = str(install_state["extensions_dir"])
# Run singlefile - it should find and use the existing Chrome session
result = subprocess.run(
@@ -296,6 +295,7 @@ def test_singlefile_with_extension_uses_existing_chrome():
assert loaded.loaded_abspath is not None, (
"abxpkg did not resolve SingleFile extension"
)
+ downloads_dir = tmpdir / "downloads"
# Launch Chrome session with extensions loaded
with chrome_session(
@@ -306,7 +306,8 @@ def test_singlefile_with_extension_uses_existing_chrome():
navigate=True,
timeout=30,
env_overrides={
- "CHROME_EXTENSIONS_DIR": str(extensions_dir),
+ "LIB_DIR": env_install["LIB_DIR"],
+ "CHROME_DOWNLOADS_DIR": str(downloads_dir),
},
) as (_chrome_proc, _chrome_pid, snapshot_chrome_dir, env):
singlefile_output_dir = snapshot_chrome_dir.parent / "singlefile"
@@ -320,7 +321,6 @@ def test_singlefile_with_extension_uses_existing_chrome():
env["SINGLEFILE_ENABLED"] = "true"
env["SINGLEFILE_BINARY"] = "/nonexistent/single-file"
- env["CHROME_EXTENSIONS_DIR"] = str(extensions_dir)
env["CHROME_HEADLESS"] = "false"
env.pop("CRAWL_DIR", None)
@@ -379,7 +379,7 @@ def test_singlefile_extension_loader_prefers_cached_background_target():
navigate=True,
timeout=30,
env_overrides={
- "CHROME_EXTENSIONS_DIR": str(install_state["extensions_dir"]),
+ "LIB_DIR": str(install_state["lib_dir"]),
},
) as (_chrome_proc, _chrome_pid, snapshot_chrome_dir, env):
metadata = wait_for_extensions_metadata(