From ffc70424e8efd1f7d0ecd429a60a46021cc954d1 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Wed, 10 Jun 2026 20:27:31 -0700 Subject: [PATCH] Fix plugin CI root causes --- abx_plugins/plugins/archivedotorg/config.json | 5 ++ ...on_Snapshot__08_archivedotorg.finite.bg.py | 15 +++- .../archivedotorg/tests/test_archivedotorg.py | 87 +++++++++++-------- .../chrome/tests/chrome_test_helpers.py | 62 ++++++++----- .../chrome/tests/test_chrome_test_helpers.py | 43 +++++++++ .../chrome_mhtml/tests/test_chrome_mhtml.py | 84 +++++++++++++++++- .../on_Snapshot__11_favicon.finite.bg.py | 30 ++++++- .../on_Snapshot__72_parse_rss_urls.py | 53 ++++++++++- .../tests/test_parse_rss_urls.py | 52 +++++++++++ .../singlefile/tests/test_singlefile.py | 14 +-- 10 files changed, 372 insertions(+), 73 deletions(-) diff --git a/abx_plugins/plugins/archivedotorg/config.json b/abx_plugins/plugins/archivedotorg/config.json index cb26b77f..6214c035 100644 --- a/abx_plugins/plugins/archivedotorg/config.json +++ b/abx_plugins/plugins/archivedotorg/config.json @@ -26,6 +26,11 @@ "minimum": 10, "x-fallback": "TIMEOUT", "description": "Timeout for archive.org submission in seconds" + }, + "ARCHIVEDOTORG_ENDPOINT": { + "type": "string", + "default": "https://web.archive.org/save/{url}", + "description": "Wayback Machine save endpoint template. Supports {url} or {} placeholders for the submitted URL." } } } diff --git a/abx_plugins/plugins/archivedotorg/on_Snapshot__08_archivedotorg.finite.bg.py b/abx_plugins/plugins/archivedotorg/on_Snapshot__08_archivedotorg.finite.bg.py index 01820088..dd795676 100755 --- a/abx_plugins/plugins/archivedotorg/on_Snapshot__08_archivedotorg.finite.bg.py +++ b/abx_plugins/plugins/archivedotorg/on_Snapshot__08_archivedotorg.finite.bg.py @@ -25,7 +25,7 @@ from ipaddress import ip_address from pathlib import Path from urllib.error import HTTPError, URLError -from urllib.parse import urlparse +from urllib.parse import quote, urlparse from urllib.request import Request, urlopen from abx_plugins.plugins.base.utils import emit_archive_result_record, load_config @@ -42,6 +42,7 @@ OUTPUT_DIR.mkdir(parents=True, exist_ok=True) os.chdir(OUTPUT_DIR) OUTPUT_FILE = "archive.org.txt" +URL_PATH_SAFE_CHARS = ":/?#[]@!$&'()*+,;=%" def should_skip_archivedotorg_url(url: str) -> str: @@ -71,6 +72,15 @@ def should_skip_archivedotorg_url(url: str) -> str: return "" +def build_archivedotorg_submit_url(endpoint_template: str, url: str) -> str: + escaped_url = quote(url, safe=URL_PATH_SAFE_CHARS) + if "{url}" in endpoint_template: + return endpoint_template.format(url=escaped_url) + if "{}" in endpoint_template: + return endpoint_template.format(escaped_url) + return f"{endpoint_template.rstrip('/')}/{escaped_url}" + + def submit_to_archivedotorg(url: str) -> tuple[bool, str | None, str]: """ Submit URL to archive.org Wayback Machine. @@ -83,12 +93,13 @@ def log(message: str) -> None: config = load_config() timeout = config.ARCHIVEDOTORG_TIMEOUT + endpoint_template = str(config.ARCHIVEDOTORG_ENDPOINT or "").strip() library_version = os.environ.get("LIBRARY_VERSION", "0.0.1") user_agent = ( f"ArchiveBox/{library_version} (+https://github.com/ArchiveBox/ArchiveBox/)" ) - submit_url = f"https://web.archive.org/save/{url}" + submit_url = build_archivedotorg_submit_url(endpoint_template, url) log(f"Submitting to Wayback Machine (timeout={timeout}s)") log(f"GET {submit_url}") diff --git a/abx_plugins/plugins/archivedotorg/tests/test_archivedotorg.py b/abx_plugins/plugins/archivedotorg/tests/test_archivedotorg.py index c79a19fb..dc124296 100755 --- a/abx_plugins/plugins/archivedotorg/tests/test_archivedotorg.py +++ b/abx_plugins/plugins/archivedotorg/tests/test_archivedotorg.py @@ -9,6 +9,7 @@ import tempfile from pathlib import Path import pytest +from werkzeug.wrappers import Response from abx_plugins.plugins.base.test_utils import parse_jsonl_output @@ -24,27 +25,43 @@ def test_hook_script_exists(): assert ARCHIVEDOTORG_HOOK.exists() -def test_submits_to_archivedotorg(): +def _run_archivedotorg_hook( + tmpdir: Path, + env: dict[str, str], +) -> subprocess.CompletedProcess: + return subprocess.run( + [ + str(ARCHIVEDOTORG_HOOK), + "--url", + TEST_URL, + ], + cwd=tmpdir, + capture_output=True, + text=True, + env=env, + timeout=30, + ) + + +def test_submits_to_configured_archivedotorg_endpoint(httpserver): + archived_path = "/web/20260610123456/https://example.com" + httpserver.expect_request("/save/https://example.com").respond_with_data( + "saved", + status=200, + headers={ + "Content-Location": archived_path, + "X-Archive-Orig-Url": TEST_URL, + }, + ) + with tempfile.TemporaryDirectory() as tmpdir: tmpdir = Path(tmpdir) env = os.environ.copy() - # Keep the hook's own network timeout below subprocess timeout so failures - # return cleanly as exit=1 instead of being killed by pytest. - env["ARCHIVEDOTORG_TIMEOUT"] = "45" + env["SNAP_DIR"] = str(tmpdir) + env["ARCHIVEDOTORG_ENDPOINT"] = f"{httpserver.url_for('/save')}/{{url}}" - result = subprocess.run( - [ - str(ARCHIVEDOTORG_HOOK), - "--url", - TEST_URL, - ], - cwd=tmpdir, - capture_output=True, - text=True, - env=env, - timeout=90, - ) + result = _run_archivedotorg_hook(tmpdir, env) assert result.returncode == 0, result.stderr @@ -58,7 +75,8 @@ def test_submits_to_archivedotorg(): output_path = tmpdir / "archivedotorg" / "archive.org.txt" assert output_path.is_file(), f"Archive.org output missing: {output_path}" archived_url = output_path.read_text(encoding="utf-8").strip() - assert archived_url.startswith("https://web.archive.org/"), archived_url + assert archived_url == f"https://web.archive.org{archived_path}" + assert len(httpserver.log) == 1 def test_config_save_archivedotorg_false_skips(): @@ -94,25 +112,22 @@ def test_config_save_archivedotorg_false_skips(): assert result_json["output_str"] == "ARCHIVEDOTORG_ENABLED=False", result_json -def test_handles_timeout(): +def test_archivedotorg_http_429_is_deterministic_noresults(httpserver): + def rate_limited(_request): + return Response("rate limited", status=429, content_type="text/plain") + + httpserver.expect_request("/save/https://example.com").respond_with_handler( + rate_limited, + ) + with tempfile.TemporaryDirectory() as tmpdir: tmpdir = Path(tmpdir) env = os.environ.copy() - env["ARCHIVEDOTORG_TIMEOUT"] = "10" + env["SNAP_DIR"] = str(tmpdir) + env["ARCHIVEDOTORG_ENDPOINT"] = f"{httpserver.url_for('/save')}/{{url}}" - result = subprocess.run( - [ - str(ARCHIVEDOTORG_HOOK), - "--url", - TEST_URL, - ], - cwd=tmpdir, - capture_output=True, - text=True, - env=env, - timeout=30, - ) + result = _run_archivedotorg_hook(tmpdir, env) assert result.returncode == 0, result.stderr @@ -120,13 +135,13 @@ def test_handles_timeout(): assert result_json, "Should emit ArchiveResult JSONL" assert result_json == { "type": "ArchiveResult", - "status": "succeeded", - "output_str": "archivedotorg/archive.org.txt", + "status": "noresults", + "output_str": "HTTP 429", }, result_json output_path = tmpdir / "archivedotorg" / "archive.org.txt" - assert output_path.is_file(), f"Archive.org output missing: {output_path}" - archived_url = output_path.read_text(encoding="utf-8").strip() - assert archived_url.startswith("https://web.archive.org/"), archived_url + assert not output_path.exists(), ( + f"Archive.org output should not exist: {output_path}" + ) if __name__ == "__main__": diff --git a/abx_plugins/plugins/chrome/tests/chrome_test_helpers.py b/abx_plugins/plugins/chrome/tests/chrome_test_helpers.py index e900fd6e..09bc0202 100755 --- a/abx_plugins/plugins/chrome/tests/chrome_test_helpers.py +++ b/abx_plugins/plugins/chrome/tests/chrome_test_helpers.py @@ -64,6 +64,7 @@ import pytest from _pytest.fixtures import FixtureLookupError from pytest_httpserver import HTTPServer +from werkzeug import Response from abx_plugins.plugins.base.test_utils import ( assert_isolated_snapshot_env, @@ -176,18 +177,27 @@ def _configure_chrome_httpserver(httpserver) -> dict[str, str]: httpserver.expect_request("/linked").respond_with_data( "Linked Page

Linked Page

", ) - httpserver.expect_request("/slow").respond_with_data( - """ + + def slow_response(request): + delay_ms = int(request.args.get("delay", "0") or "0") + if delay_ms > 0: + time.sleep(delay_ms / 1000) + return Response( + f""" Slow Page

Slow Page

-

delay_ms=0

+

delay_ms={delay_ms}

""", - ) + status=200, + content_type="text/html; charset=utf-8", + ) + + httpserver.expect_request("/slow").respond_with_handler(slow_response) httpserver.expect_request("/popup-child").respond_with_data( """ @@ -401,6 +411,32 @@ def _coerce_upstream_urls(value: Any) -> dict[str, str] | None: return urls +def _add_https_test_urls( + urls: dict[str, str], + request, + tmp_path_factory, +) -> dict[str, str]: + """Add deterministic HTTPS URLs when an upstream fixture did not provide them.""" + if urls.get("https_base_url"): + return urls + + https_server = _create_https_test_server(tmp_path_factory) + https_server.start() + request.addfinalizer(https_server.stop) + _configure_chrome_httpserver(https_server) + urls.update( + { + key: value + for key, value in _build_test_urls( + urls["base_url"], + https_server.url_for("/"), + ).items() + if key.startswith("https_") + }, + ) + return urls + + def ensure_chromium_and_puppeteer_installed_impl(tmp_path_factory) -> str: """Install Chrome and Puppeteer once for test sessions that require Chrome.""" os.environ["SNAP_DIR"] = str(tmp_path_factory.mktemp("chrome_test_data")) @@ -457,24 +493,10 @@ def chrome_test_urls(request, httpserver, tmp_path_factory): continue urls = _coerce_upstream_urls(upstream) if urls: - return urls + return _add_https_test_urls(urls, request, tmp_path_factory) urls = _configure_chrome_httpserver(httpserver) - https_server = _create_https_test_server(tmp_path_factory) - https_server.start() - request.addfinalizer(https_server.stop) - _configure_chrome_httpserver(https_server) - urls.update( - { - key: value - for key, value in _build_test_urls( - urls["base_url"], - https_server.url_for("/"), - ).items() - if key.startswith("https_") - }, - ) - return urls + return _add_https_test_urls(urls, request, tmp_path_factory) @pytest.fixture diff --git a/abx_plugins/plugins/chrome/tests/test_chrome_test_helpers.py b/abx_plugins/plugins/chrome/tests/test_chrome_test_helpers.py index 7c2a0d3a..e853865a 100755 --- a/abx_plugins/plugins/chrome/tests/test_chrome_test_helpers.py +++ b/abx_plugins/plugins/chrome/tests/test_chrome_test_helpers.py @@ -6,10 +6,13 @@ import json import os +import ssl import subprocess import sys import pytest import tempfile +import time +import urllib.request from pathlib import Path from abx_plugins.plugins.base.test_utils import ( @@ -19,6 +22,8 @@ ) from abx_plugins.plugins.chrome.tests.chrome_test_helpers import ( _call_chrome_utils, + _add_https_test_urls, + _build_test_urls, CHROME_UTILS, chrome_session, get_test_env, @@ -55,6 +60,44 @@ def _is_supported_browser_path(path: Path) -> bool: return bool(version) +def test_chrome_test_urls_slow_route_honors_delay(chrome_test_urls): + """The slow fixture URL must stay in-flight long enough for target-close races.""" + delay_ms = 600 + slow_url = f"{chrome_test_urls['origin']}/slow?delay={delay_ms}" + + started = time.monotonic() + with urllib.request.urlopen(slow_url, timeout=5) as response: + body = response.read().decode("utf-8") + elapsed = time.monotonic() - started + + assert response.status == 200 + assert f"delay_ms={delay_ms}" in body + assert elapsed >= (delay_ms / 1000) * 0.8 + + +def test_https_fallback_is_added_when_upstream_urls_lack_https( + request, + tmp_path_factory, + httpserver, +): + """HTTPS-dependent plugins should receive a real local HTTPS URL.""" + urls = _build_test_urls(httpserver.url_for("/")) + + resolved = _add_https_test_urls(urls, request, tmp_path_factory) + + assert resolved["https_base_url"].startswith("https://") + context = ssl._create_unverified_context() + with urllib.request.urlopen( + resolved["https_base_url"], + timeout=5, + context=context, + ) as response: + body = response.read().decode("utf-8") + + assert response.status == 200 + assert "Example Domain" in body + + def test_get_machine_type(): """Test get_machine_type() returns valid format.""" machine_type = get_machine_type() diff --git a/abx_plugins/plugins/chrome_mhtml/tests/test_chrome_mhtml.py b/abx_plugins/plugins/chrome_mhtml/tests/test_chrome_mhtml.py index 5613471a..a2cd2ece 100644 --- a/abx_plugins/plugins/chrome_mhtml/tests/test_chrome_mhtml.py +++ b/abx_plugins/plugins/chrome_mhtml/tests/test_chrome_mhtml.py @@ -1,5 +1,6 @@ """Integration tests for the chrome_mhtml plugin.""" +import json import os import subprocess import tempfile @@ -22,6 +23,7 @@ if _MHTML_HOOK is None: raise FileNotFoundError(f"Hook not found in {PLUGIN_DIR}") MHTML_HOOK = _MHTML_HOOK +CHROME_UTILS = PLUGIN_DIR.parent / "chrome" / "chrome_utils.js" CHROME_STARTUP_TIMEOUT_SECONDS = 45 MHTML_PARENT_TOKEN = "ABX_MHTML_PARENT_TOKEN_7391" MHTML_OOPIF_CHILD_TOKEN = "ABX_MHTML_OOPIF_CHILD_TOKEN_7391" @@ -31,7 +33,7 @@ def mhtml_oopif_test_url(httpserver): child_url = httpserver.url_for("/child").replace( "localhost", - "oopif-child.localhost", + "oopif-child.test", 1, ) httpserver.expect_request("/child").respond_with_data( @@ -48,7 +50,14 @@ def mhtml_oopif_test_url(httpserver): MHTML OOPIF Parent

{MHTML_PARENT_TOKEN}

- + + """, content_type="text/html; charset=utf-8", @@ -83,6 +92,63 @@ def test_mhtml_preview_templates_live_with_mhtml_plugins(plugin_name): ) +def wait_for_oopif_child_frame(snapshot_chrome_dir: Path, env: dict[str, str]) -> None: + script = r""" +const chromeUtils = require(process.argv[1]); +const chromeSessionDir = process.argv[2]; +const childToken = process.argv[3]; + +(async () => { + const puppeteer = chromeUtils.resolvePuppeteerModule(); + const connection = await chromeUtils.connectToPage({ + chromeSessionDir, + timeoutMs: 30000, + requireTargetId: true, + puppeteer, + }); + const deadline = Date.now() + 30000; + try { + while (Date.now() < deadline) { + for (const frame of connection.page.frames()) { + if (!frame.url().includes('/child')) continue; + try { + const text = await frame.evaluate(() => document.body?.innerText || ''); + if (text.includes(childToken)) { + process.stdout.write(JSON.stringify({url: frame.url(), text})); + return; + } + } catch (error) {} + } + await new Promise(resolve => setTimeout(resolve, 250)); + } + throw new Error(`Timed out waiting for OOPIF child frame token: ${childToken}`); + } finally { + connection.browser.disconnect(); + } +})().catch(error => { + console.error(error.stack || error.message); + process.exit(1); +}); +""" + result = subprocess.run( + [ + "node", + "-e", + script, + str(CHROME_UTILS), + str(snapshot_chrome_dir), + MHTML_OOPIF_CHILD_TOKEN, + ], + capture_output=True, + text=True, + timeout=40, + env=env, + ) + assert result.returncode == 0, result.stderr + payload = json.loads(result.stdout) + assert MHTML_OOPIF_CHILD_TOKEN in payload["text"] + + def test_extracts_mhtml_from_cross_site_iframe( require_chrome_runtime, mhtml_oopif_test_url, @@ -91,18 +157,30 @@ def test_extracts_mhtml_from_cross_site_iframe( with tempfile.TemporaryDirectory() as tmpdir: tmpdir = Path(tmpdir) test_url = mhtml_oopif_test_url + chrome_args_extra = json.dumps( + [ + "--site-per-process", + "--host-resolver-rules=MAP oopif-child.test 127.0.0.1", + "--proxy-server=direct://", + "--proxy-bypass-list=*", + ], + ) + env_overrides = { + "CHROME_ARGS_EXTRA": chrome_args_extra, + } with chrome_session( tmpdir, test_url=test_url, timeout=CHROME_STARTUP_TIMEOUT_SECONDS, - env_overrides={"CHROME_ARGS_EXTRA": "--site-per-process"}, + env_overrides=env_overrides, ) as ( _process, _pid, snapshot_chrome_dir, env, ): + wait_for_oopif_child_frame(snapshot_chrome_dir, env) output_dir = snapshot_chrome_dir.parent / "chrome_mhtml" output_dir.mkdir(exist_ok=True) diff --git a/abx_plugins/plugins/favicon/on_Snapshot__11_favicon.finite.bg.py b/abx_plugins/plugins/favicon/on_Snapshot__11_favicon.finite.bg.py index 36d4253f..a95bbe37 100755 --- a/abx_plugins/plugins/favicon/on_Snapshot__11_favicon.finite.bg.py +++ b/abx_plugins/plugins/favicon/on_Snapshot__11_favicon.finite.bg.py @@ -23,6 +23,7 @@ import os import re import sys +import time from pathlib import Path from urllib.error import HTTPError @@ -46,7 +47,7 @@ SUCCESS_OUTPUT = f"{PLUGIN_DIR}/{OUTPUT_FILE}" -def http_get(url: str, headers: dict[str, str], timeout: int) -> tuple[int, bytes]: +def http_get(url: str, headers: dict[str, str], timeout: float) -> tuple[int, bytes]: req = Request(url, headers=headers) try: with urlopen(req, timeout=timeout) as response: @@ -82,6 +83,7 @@ def get_favicon(url: str) -> tuple[bool, str | None, str]: config = get_config() timeout = config.FAVICON_TIMEOUT + deadline = time.monotonic() + timeout library_version = os.environ.get("LIBRARY_VERSION", "0.0.1") user_agent = ( f"ArchiveBox/{library_version} (+https://github.com/ArchiveBox/ArchiveBox/)" @@ -89,6 +91,12 @@ def get_favicon(url: str) -> tuple[bool, str | None, str]: provider_template = (config.FAVICON_PROVIDER or "").strip() headers = {"User-Agent": user_agent} + def remaining_timeout() -> float: + remaining = deadline - time.monotonic() + if remaining <= 0: + raise TimeoutError(f"FAVICON_TIMEOUT exceeded after {timeout} seconds") + return remaining + # Build list of possible favicon URLs parsed = urlparse(url) base_url = f"{parsed.scheme}://{parsed.netloc}" @@ -102,7 +110,11 @@ def get_favicon(url: str) -> tuple[bool, str | None, str]: # Try to extract favicon URL from HTML link tags try: - status_code, body = http_get(url, headers=headers, timeout=timeout) + status_code, body = http_get( + url, + headers=headers, + timeout=remaining_timeout(), + ) if 200 <= status_code < 300 and body: html = body.decode("utf-8", errors="replace") # Look for @@ -126,9 +138,15 @@ def get_favicon(url: str) -> tuple[bool, str | None, str]: # Try each URL until we find one that works for favicon_url in favicon_urls: try: - status_code, body = http_get(favicon_url, headers=headers, timeout=timeout) + status_code, body = http_get( + favicon_url, + headers=headers, + timeout=remaining_timeout(), + ) if 200 <= status_code < 300 and body: return True, save_favicon(body), "" + except TimeoutError: + return False, None, "No favicon found" except Exception: continue @@ -136,7 +154,11 @@ def get_favicon(url: str) -> tuple[bool, str | None, str]: provider_url = build_provider_url(provider_template, domain) if provider_url: try: - status_code, body = http_get(provider_url, headers=headers, timeout=timeout) + status_code, body = http_get( + provider_url, + headers=headers, + timeout=remaining_timeout(), + ) if 200 <= status_code < 300 and body: return True, save_favicon(body), "" except Exception: diff --git a/abx_plugins/plugins/parse_rss_urls/on_Snapshot__72_parse_rss_urls.py b/abx_plugins/plugins/parse_rss_urls/on_Snapshot__72_parse_rss_urls.py index 59d41c37..a41083e0 100755 --- a/abx_plugins/plugins/parse_rss_urls/on_Snapshot__72_parse_rss_urls.py +++ b/abx_plugins/plugins/parse_rss_urls/on_Snapshot__72_parse_rss_urls.py @@ -17,6 +17,7 @@ import json import os +import re import sys from importlib import import_module from pathlib import Path @@ -55,6 +56,8 @@ " None: ) +def strip_xml_prefix(content: str) -> str: + remaining = content.lstrip() + if remaining.lower().startswith("") + remaining = remaining[end + 2 :].lstrip() if end != -1 else remaining + + while remaining.startswith("") + if end == -1: + return remaining + remaining = remaining[end + 3 :].lstrip() + + if remaining.lower().startswith("") + if subset_start != -1 and (first_tag_end == -1 or subset_start < first_tag_end): + end = remaining.find("]>") + remaining = remaining[end + 2 :].lstrip() if end != -1 else remaining + elif first_tag_end != -1: + remaining = remaining[first_tag_end + 1 :].lstrip() + + while remaining.startswith("") + if end == -1: + return remaining + remaining = remaining[end + 3 :].lstrip() + + return remaining + + +def looks_like_feed_content(content: str) -> bool: + root = XML_ROOT_RE.match(strip_xml_prefix(content)) + if not root: + return False + return root.group(1).lower() in FEED_ROOT_NAMES + + def persist_records(records: list[dict]) -> tuple[str, str]: """Write extracted URLs when present, otherwise clear stale output after success.""" if records: @@ -133,7 +173,6 @@ def main( print("parsing 1 files for urls...") try: content = fetch_content(url) - reject_xml_file_loading_features(content) except Exception as e: if url.startswith(("http://", "https://")): # Snapshot URL fetching is only a fallback when no staticfile import @@ -146,6 +185,18 @@ def main( emit_result("failed", f"Failed to fetch {url}: {e}") sys.exit(1) + if not looks_like_feed_content(content): + status, output_str = persist_records([]) + print(output_str) + emit_result(status, output_str) + sys.exit(0) + + try: + reject_xml_file_loading_features(content) + except Exception as e: + emit_result("failed", f"Failed to parse RSS/Atom feed from {url}: {e}") + sys.exit(1) + # Parse the feed feed = feedparser.parse(content) diff --git a/abx_plugins/plugins/parse_rss_urls/tests/test_parse_rss_urls.py b/abx_plugins/plugins/parse_rss_urls/tests/test_parse_rss_urls.py index 1a0ec8a9..f0c3313f 100755 --- a/abx_plugins/plugins/parse_rss_urls/tests/test_parse_rss_urls.py +++ b/abx_plugins/plugins/parse_rss_urls/tests/test_parse_rss_urls.py @@ -176,6 +176,58 @@ def test_http_fetch_failure_reports_noresults(self, tmp_path, httpserver): assert '"status": "failed"' not in result.stdout assert not (tmp_path / "parse_rss_urls" / "urls.jsonl").exists() + def test_http_html_page_reports_noresults(self, tmp_path, httpserver): + """Ordinary HTTP HTML pages are not RSS feeds and should not fail crawls.""" + httpserver.expect_request("/article").respond_with_data( + "Article" + 'Linked page' + "", + status=200, + content_type="text/html; charset=utf-8", + ) + + result = run_parse_rss_urls( + [str(SCRIPT_PATH), "--url", httpserver.url_for("/article")], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0, result.stderr + assert '"status": "noresults"' in result.stdout + assert '"output_str": "0 URLs parsed"' in result.stdout + assert '"type": "Snapshot"' not in result.stdout + assert not (tmp_path / "parse_rss_urls" / "urls.jsonl").exists() + + def test_http_rss_feed_with_unsafe_xml_fails(self, tmp_path, httpserver): + """Feed-shaped XML errors remain hard failures instead of silent noresults.""" + httpserver.expect_request("/feed.rss").respond_with_data( + """ + +]> + + + https://example.com/post + + +""", + status=200, + content_type="application/rss+xml", + ) + + result = run_parse_rss_urls( + [str(SCRIPT_PATH), "--url", httpserver.url_for("/feed.rss")], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 1 + assert '"status": "failed"' in result.stdout + assert "XML declarations that can reference external files" in result.stderr + assert not (tmp_path / "parse_rss_urls" / "urls.jsonl").exists() + def test_exits_1_when_file_not_found(self, tmp_path): """Test that script exits with code 1 when file doesn't exist.""" result = run_parse_rss_urls( diff --git a/abx_plugins/plugins/singlefile/tests/test_singlefile.py b/abx_plugins/plugins/singlefile/tests/test_singlefile.py index 4ec24ff6..163ee404 100755 --- a/abx_plugins/plugins/singlefile/tests/test_singlefile.py +++ b/abx_plugins/plugins/singlefile/tests/test_singlefile.py @@ -94,6 +94,7 @@ def ensure_singlefile_extension_installed() -> dict[str, Path]: _singlefile_install_state = { "install_root": install_root, + "lib_dir": Path(env_install["LIB_DIR"]), "extensions_dir": extensions_dir, "cache_file": cache_file, "unpacked_path": unpacked_path, @@ -176,11 +177,10 @@ def test_singlefile_cli_archives_example_com(): navigate=True, timeout=30, env_overrides={ - "CHROME_EXTENSIONS_DIR": str(extensions_dir), + "LIB_DIR": env_install["LIB_DIR"], }, ) as (_chrome_proc, _chrome_pid, snapshot_chrome_dir, env): env["SINGLEFILE_ENABLED"] = "true" - env["CHROME_EXTENSIONS_DIR"] = str(extensions_dir) singlefile_output_dir = snapshot_chrome_dir.parent / "singlefile" singlefile_output_dir.mkdir(parents=True, exist_ok=True) @@ -237,7 +237,7 @@ def test_singlefile_with_chrome_session(): navigate=False, # Don't navigate, singlefile will do that timeout=20, env_overrides={ - "CHROME_EXTENSIONS_DIR": str(install_state["extensions_dir"]), + "LIB_DIR": str(install_state["lib_dir"]), }, ) as (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env): snap_dir = Path(env["SNAP_DIR"]) @@ -246,7 +246,6 @@ def test_singlefile_with_chrome_session(): # Use env from chrome_session env["SINGLEFILE_ENABLED"] = "true" - env["CHROME_EXTENSIONS_DIR"] = str(install_state["extensions_dir"]) # Run singlefile - it should find and use the existing Chrome session result = subprocess.run( @@ -296,6 +295,7 @@ def test_singlefile_with_extension_uses_existing_chrome(): assert loaded.loaded_abspath is not None, ( "abxpkg did not resolve SingleFile extension" ) + downloads_dir = tmpdir / "downloads" # Launch Chrome session with extensions loaded with chrome_session( @@ -306,7 +306,8 @@ def test_singlefile_with_extension_uses_existing_chrome(): navigate=True, timeout=30, env_overrides={ - "CHROME_EXTENSIONS_DIR": str(extensions_dir), + "LIB_DIR": env_install["LIB_DIR"], + "CHROME_DOWNLOADS_DIR": str(downloads_dir), }, ) as (_chrome_proc, _chrome_pid, snapshot_chrome_dir, env): singlefile_output_dir = snapshot_chrome_dir.parent / "singlefile" @@ -320,7 +321,6 @@ def test_singlefile_with_extension_uses_existing_chrome(): env["SINGLEFILE_ENABLED"] = "true" env["SINGLEFILE_BINARY"] = "/nonexistent/single-file" - env["CHROME_EXTENSIONS_DIR"] = str(extensions_dir) env["CHROME_HEADLESS"] = "false" env.pop("CRAWL_DIR", None) @@ -379,7 +379,7 @@ def test_singlefile_extension_loader_prefers_cached_background_target(): navigate=True, timeout=30, env_overrides={ - "CHROME_EXTENSIONS_DIR": str(install_state["extensions_dir"]), + "LIB_DIR": str(install_state["lib_dir"]), }, ) as (_chrome_proc, _chrome_pid, snapshot_chrome_dir, env): metadata = wait_for_extensions_metadata(