Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions abx_plugins/plugins/archivedotorg/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,11 @@
"minimum": 10,
"x-fallback": "TIMEOUT",
"description": "Timeout for archive.org submission in seconds"
},
"ARCHIVEDOTORG_ENDPOINT": {
"type": "string",
"default": "https://web.archive.org/save/{url}",
"description": "Wayback Machine save endpoint template. Supports {url} or {} placeholders for the submitted URL."
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from ipaddress import ip_address
from pathlib import Path
from urllib.error import HTTPError, URLError
from urllib.parse import urlparse
from urllib.parse import quote, urlparse
from urllib.request import Request, urlopen

from abx_plugins.plugins.base.utils import emit_archive_result_record, load_config
Expand All @@ -42,6 +42,7 @@
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
os.chdir(OUTPUT_DIR)
OUTPUT_FILE = "archive.org.txt"
URL_PATH_SAFE_CHARS = ":/?#[]@!$&'()*+,;=%"


def should_skip_archivedotorg_url(url: str) -> str:
Expand Down Expand Up @@ -71,6 +72,15 @@ def should_skip_archivedotorg_url(url: str) -> str:
return ""


def build_archivedotorg_submit_url(endpoint_template: str, url: str) -> str:
    """Build the Wayback Machine submission URL for ``url``.

    Args:
        endpoint_template: Save endpoint, optionally containing a ``{url}`` or
            ``{}`` placeholder marking where the submitted URL is inserted.
        url: The URL being submitted for archiving.

    Returns:
        The template with the percent-encoded URL substituted for the
        placeholder, or the encoded URL appended as a trailing path segment
        when the template contains no placeholder.
    """
    placeholder = next(
        (candidate for candidate in ("{url}", "{}") if candidate in endpoint_template),
        None,
    )
    if placeholder is None:
        # No placeholder: treat the template as a base path and append the URL.
        escaped_url = quote(url, safe=URL_PATH_SAFE_CHARS)
        return f"{endpoint_template.rstrip('/')}/{escaped_url}"

    # A placeholder that follows "?" sits inside the endpoint's query string
    # (e.g. ``?url={url}``). There the submitted URL's own "?", "&", "#" and
    # "=" must be percent-encoded, otherwise they are parsed as part of the
    # outer URL. In path position the reserved characters are kept so the
    # default ``/save/{url}`` form is unchanged.
    prefix = endpoint_template.partition(placeholder)[0]
    safe_chars = "" if "?" in prefix else URL_PATH_SAFE_CHARS
    escaped_url = quote(url, safe=safe_chars)

    if placeholder == "{url}":
        return endpoint_template.format(url=escaped_url)
    return endpoint_template.format(escaped_url)


def submit_to_archivedotorg(url: str) -> tuple[bool, str | None, str]:
"""
Submit URL to archive.org Wayback Machine.
Expand All @@ -83,12 +93,13 @@ def log(message: str) -> None:

config = load_config()
timeout = config.ARCHIVEDOTORG_TIMEOUT
endpoint_template = str(config.ARCHIVEDOTORG_ENDPOINT or "").strip()
library_version = os.environ.get("LIBRARY_VERSION", "0.0.1")
user_agent = (
f"ArchiveBox/{library_version} (+https://github.com/ArchiveBox/ArchiveBox/)"
)

submit_url = f"https://web.archive.org/save/{url}"
submit_url = build_archivedotorg_submit_url(endpoint_template, url)
log(f"Submitting to Wayback Machine (timeout={timeout}s)")
log(f"GET {submit_url}")

Expand Down
87 changes: 51 additions & 36 deletions abx_plugins/plugins/archivedotorg/tests/test_archivedotorg.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import tempfile
from pathlib import Path
import pytest
from werkzeug.wrappers import Response

from abx_plugins.plugins.base.test_utils import parse_jsonl_output

Expand All @@ -24,27 +25,43 @@ def test_hook_script_exists():
assert ARCHIVEDOTORG_HOOK.exists()


def test_submits_to_archivedotorg():
def _run_archivedotorg_hook(
    tmpdir: Path,
    env: dict[str, str],
) -> subprocess.CompletedProcess:
    """Run the archive.org hook script against ``TEST_URL`` in a subprocess.

    Args:
        tmpdir: Working directory for the hook process.
        env: Complete environment passed to the hook process.

    Returns:
        The completed process, with stdout and stderr captured as text.
    """
    command = [str(ARCHIVEDOTORG_HOOK), "--url", TEST_URL]
    completed = subprocess.run(
        command,
        cwd=tmpdir,
        env=env,
        text=True,
        capture_output=True,
        timeout=30,
    )
    return completed


def test_submits_to_configured_archivedotorg_endpoint(httpserver):
archived_path = "/web/20260610123456/https://example.com"
httpserver.expect_request("/save/https://example.com").respond_with_data(
"saved",
status=200,
headers={
"Content-Location": archived_path,
"X-Archive-Orig-Url": TEST_URL,
},
)

with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)

env = os.environ.copy()
# Keep the hook's own network timeout below subprocess timeout so failures
# return cleanly as exit=1 instead of being killed by pytest.
env["ARCHIVEDOTORG_TIMEOUT"] = "45"
env["SNAP_DIR"] = str(tmpdir)
env["ARCHIVEDOTORG_ENDPOINT"] = f"{httpserver.url_for('/save')}/{{url}}"

result = subprocess.run(
[
str(ARCHIVEDOTORG_HOOK),
"--url",
TEST_URL,
],
cwd=tmpdir,
capture_output=True,
text=True,
env=env,
timeout=90,
)
result = _run_archivedotorg_hook(tmpdir, env)

assert result.returncode == 0, result.stderr

Expand All @@ -58,7 +75,8 @@ def test_submits_to_archivedotorg():
output_path = tmpdir / "archivedotorg" / "archive.org.txt"
assert output_path.is_file(), f"Archive.org output missing: {output_path}"
archived_url = output_path.read_text(encoding="utf-8").strip()
assert archived_url.startswith("https://web.archive.org/"), archived_url
assert archived_url == f"https://web.archive.org{archived_path}"
assert len(httpserver.log) == 1


def test_config_save_archivedotorg_false_skips():
Expand Down Expand Up @@ -94,39 +112,36 @@ def test_config_save_archivedotorg_false_skips():
assert result_json["output_str"] == "ARCHIVEDOTORG_ENABLED=False", result_json


def test_handles_timeout():
def test_archivedotorg_http_429_is_deterministic_noresults(httpserver):
def rate_limited(_request):
return Response("rate limited", status=429, content_type="text/plain")

httpserver.expect_request("/save/https://example.com").respond_with_handler(
rate_limited,
)

with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)

env = os.environ.copy()
env["ARCHIVEDOTORG_TIMEOUT"] = "10"
env["SNAP_DIR"] = str(tmpdir)
env["ARCHIVEDOTORG_ENDPOINT"] = f"{httpserver.url_for('/save')}/{{url}}"

result = subprocess.run(
[
str(ARCHIVEDOTORG_HOOK),
"--url",
TEST_URL,
],
cwd=tmpdir,
capture_output=True,
text=True,
env=env,
timeout=30,
)
result = _run_archivedotorg_hook(tmpdir, env)

assert result.returncode == 0, result.stderr

result_json = parse_jsonl_output(result.stdout)
assert result_json, "Should emit ArchiveResult JSONL"
assert result_json == {
"type": "ArchiveResult",
"status": "succeeded",
"output_str": "archivedotorg/archive.org.txt",
"status": "noresults",
"output_str": "HTTP 429",
}, result_json
output_path = tmpdir / "archivedotorg" / "archive.org.txt"
assert output_path.is_file(), f"Archive.org output missing: {output_path}"
archived_url = output_path.read_text(encoding="utf-8").strip()
assert archived_url.startswith("https://web.archive.org/"), archived_url
assert not output_path.exists(), (
f"Archive.org output should not exist: {output_path}"
)


if __name__ == "__main__":
Expand Down
62 changes: 42 additions & 20 deletions abx_plugins/plugins/chrome/tests/chrome_test_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@
import pytest
from _pytest.fixtures import FixtureLookupError
from pytest_httpserver import HTTPServer
from werkzeug import Response

from abx_plugins.plugins.base.test_utils import (
assert_isolated_snapshot_env,
Expand Down Expand Up @@ -176,18 +177,27 @@ def _configure_chrome_httpserver(httpserver) -> dict[str, str]:
httpserver.expect_request("/linked").respond_with_data(
"<html><head><title>Linked Page</title></head><body><h1>Linked Page</h1></body></html>",
)
httpserver.expect_request("/slow").respond_with_data(
"""<!doctype html>

def slow_response(request):
delay_ms = int(request.args.get("delay", "0") or "0")
if delay_ms > 0:
time.sleep(delay_ms / 1000)
return Response(
f"""<!doctype html>
<html>
<head><meta charset="utf-8"><title>Slow Page</title></head>
<body>
<main>
<h1>Slow Page</h1>
<p>delay_ms=0</p>
<p>delay_ms={delay_ms}</p>
</main>
</body>
</html>""",
)
status=200,
content_type="text/html; charset=utf-8",
)

httpserver.expect_request("/slow").respond_with_handler(slow_response)
httpserver.expect_request("/popup-child").respond_with_data(
"""<!doctype html>
<html>
Expand Down Expand Up @@ -401,6 +411,32 @@ def _coerce_upstream_urls(value: Any) -> dict[str, str] | None:
return urls


def _add_https_test_urls(
    urls: dict[str, str],
    request,
    tmp_path_factory,
) -> dict[str, str]:
    """Ensure ``urls`` carries HTTPS entries, starting a local server if needed.

    When ``urls`` already has a truthy ``https_base_url`` it is returned
    untouched. Otherwise an HTTPS test server is created and started, its
    ``stop`` is registered as a finalizer on ``request``, the Chrome test
    routes are configured on it, and every ``https_``-prefixed entry built
    from it is merged into ``urls`` in place.

    Returns:
        The same ``urls`` mapping that was passed in.
    """
    if not urls.get("https_base_url"):
        server = _create_https_test_server(tmp_path_factory)
        server.start()
        # Tear the server down together with the requesting fixture/test.
        request.addfinalizer(server.stop)
        _configure_chrome_httpserver(server)

        built = _build_test_urls(urls["base_url"], server.url_for("/"))
        for name, target in built.items():
            if name.startswith("https_"):
                urls[name] = target
    return urls


def ensure_chromium_and_puppeteer_installed_impl(tmp_path_factory) -> str:
"""Install Chrome and Puppeteer once for test sessions that require Chrome."""
os.environ["SNAP_DIR"] = str(tmp_path_factory.mktemp("chrome_test_data"))
Expand Down Expand Up @@ -457,24 +493,10 @@ def chrome_test_urls(request, httpserver, tmp_path_factory):
continue
urls = _coerce_upstream_urls(upstream)
if urls:
return urls
return _add_https_test_urls(urls, request, tmp_path_factory)

urls = _configure_chrome_httpserver(httpserver)
https_server = _create_https_test_server(tmp_path_factory)
https_server.start()
request.addfinalizer(https_server.stop)
_configure_chrome_httpserver(https_server)
urls.update(
{
key: value
for key, value in _build_test_urls(
urls["base_url"],
https_server.url_for("/"),
).items()
if key.startswith("https_")
},
)
return urls
return _add_https_test_urls(urls, request, tmp_path_factory)


@pytest.fixture
Expand Down
43 changes: 43 additions & 0 deletions abx_plugins/plugins/chrome/tests/test_chrome_test_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,13 @@

import json
import os
import ssl
import subprocess
import sys
import pytest
import tempfile
import time
import urllib.request
from pathlib import Path

from abx_plugins.plugins.base.test_utils import (
Expand All @@ -19,6 +22,8 @@
)
from abx_plugins.plugins.chrome.tests.chrome_test_helpers import (
_call_chrome_utils,
_add_https_test_urls,
_build_test_urls,
CHROME_UTILS,
chrome_session,
get_test_env,
Expand Down Expand Up @@ -55,6 +60,44 @@ def _is_supported_browser_path(path: Path) -> bool:
return bool(version)


def test_chrome_test_urls_slow_route_honors_delay(chrome_test_urls):
    """The /slow route must hold the response for the requested delay."""
    delay_ms = 600
    origin = chrome_test_urls["origin"]
    slow_url = f"{origin}/slow?delay={delay_ms}"
    minimum_seconds = (delay_ms / 1000) * 0.8

    t0 = time.monotonic()
    with urllib.request.urlopen(slow_url, timeout=5) as response:
        raw = response.read()
        body = raw.decode("utf-8")
    elapsed = time.monotonic() - t0

    assert response.status == 200
    assert f"delay_ms={delay_ms}" in body
    # Allow 20% slack for timer granularity.
    assert elapsed >= minimum_seconds


def test_https_fallback_is_added_when_upstream_urls_lack_https(
    request,
    tmp_path_factory,
    httpserver,
):
    """URLs built without HTTPS entries should gain a reachable HTTPS base URL."""
    http_only_urls = _build_test_urls(httpserver.url_for("/"))
    resolved = _add_https_test_urls(http_only_urls, request, tmp_path_factory)

    https_base_url = resolved["https_base_url"]
    assert https_base_url.startswith("https://")

    # Certificate verification is disabled for this request to the local test server.
    context = ssl._create_unverified_context()
    with urllib.request.urlopen(
        https_base_url,
        timeout=5,
        context=context,
    ) as response:
        raw = response.read()
        body = raw.decode("utf-8")

    assert response.status == 200
    assert "Example Domain" in body


def test_get_machine_type():
"""Test get_machine_type() returns valid format."""
machine_type = get_machine_type()
Expand Down
Loading
Loading