From 8c8dd8a01ccdeb0e5b764fd15bac0920499c664a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20=C5=BDuni=C4=8D?= <36313686+gregpr07@users.noreply.github.com> Date: Wed, 3 Jun 2026 07:25:56 +0000 Subject: [PATCH 01/48] Add Codex session rerun command --- crates/browser-use-cli/src/main.rs | 58 ++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/crates/browser-use-cli/src/main.rs b/crates/browser-use-cli/src/main.rs index 98c1a389..42efe996 100644 --- a/crates/browser-use-cli/src/main.rs +++ b/crates/browser-use-cli/src/main.rs @@ -181,6 +181,11 @@ enum Command { #[arg(long, default_value = "gpt-5.1-codex")] model: String, }, + RunCodexSession { + task_id: String, + #[arg(long, default_value = "gpt-5.1-codex")] + model: String, + }, RunOpenaiSession { task_id: String, #[arg(long)] @@ -715,6 +720,15 @@ fn main() -> Result<()> { collaboration_mode, &runtime_options, ), + Command::RunCodexSession { task_id, model } => run_codex_session( + &store, + &task_id, + model, + config_profile.as_deref(), + &config_overrides, + collaboration_mode, + &runtime_options, + ), Command::RunOpenaiSession { task_id, model } => run_openai_session( &store, &task_id, @@ -1030,6 +1044,7 @@ fn command_name(command: &Command) -> &'static str { Command::RunOpenrouter { .. } => "run_openrouter", Command::RunDeepseek { .. } => "run_deepseek", Command::RunCodex { .. } => "run_codex", + Command::RunCodexSession { .. } => "run_codex_session", Command::RunOpenaiSession { .. } => "run_openai_session", Command::RunAnthropicSession { .. } => "run_anthropic_session", Command::RunOpenrouterSession { .. 
} => "run_openrouter_session", @@ -1959,6 +1974,28 @@ fn run_codex( run_new_session_from_config(store, text, config) } +fn run_codex_session( + store: &Store, + task_id: &str, + model: String, + config_profile: Option<&str>, + raw_config_overrides: &[String], + collaboration_mode: CollaborationModeKind, + runtime_options: &CliRuntimeOptions, +) -> Result<()> { + ensure_task_exists(store, task_id)?; + let config = + ProviderRunConfig::new(ProviderBackend::Codex, model).with_options(cli_agent_options( + config_profile, + raw_config_overrides, + collaboration_mode, + runtime_options, + )?); + let session_id = run_existing_session_from_config_and_notify(store, task_id, config, None)?; + println!("{session_id}"); + Ok(()) +} + fn run_openai_session( store: &Store, task_id: &str, @@ -5767,6 +5804,27 @@ command = "test-mcp" Ok(()) } + #[test] + fn run_codex_session_command_accepts_task_id_and_model() -> Result<()> { + let parsed = Args::try_parse_from([ + "browser-use-terminal", + "run-codex-session", + "session-123", + "--model", + "gpt-test", + ])?; + + match &parsed.command { + Command::RunCodexSession { task_id, model } => { + assert_eq!(task_id, "session-123"); + assert_eq!(model, "gpt-test"); + assert_eq!(command_name(&parsed.command), "run_codex_session"); + } + other => panic!("expected run-codex-session command, got {other:?}"), + } + Ok(()) + } + #[test] fn sync_cookies_command_accepts_local_profile_without_global_profile_conflict() -> Result<()> { let parsed = Args::try_parse_from([ From 9459e50eb235ebb2ab056940e977742b13f09678 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20=C5=BDuni=C4=8D?= <36313686+gregpr07@users.noreply.github.com> Date: Wed, 3 Jun 2026 15:46:38 +0000 Subject: [PATCH 02/48] Honor managed BrowserProfile launch args --- python/llm_browser_worker/worker.py | 22 ++++++++++++++++++- python/tests/test_worker_package.py | 34 +++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 1 deletion(-) diff --git 
a/python/llm_browser_worker/worker.py b/python/llm_browser_worker/worker.py index 9e7d23fd..717e01c9 100644 --- a/python/llm_browser_worker/worker.py +++ b/python/llm_browser_worker/worker.py @@ -333,6 +333,11 @@ def _free_port() -> int: def _managed_chrome_is_visible() -> bool: + mode = _browser_mode() + if mode in {"managed-headed", "headed", "headful"}: + return True + if mode in {"managed-headless", "headless", "headless-chromium"}: + return False return os.environ.get("LLM_BROWSER_MANAGED_CHROME_VISIBLE") == "1" @@ -340,7 +345,8 @@ def _should_start_managed_chrome() -> bool: if os.environ.get("BU_CDP_URL") or os.environ.get("BU_CDP_WS") or os.environ.get("BU_BROWSER_ID"): return False return ( - _browser_mode() in {"headless", "headless-chromium"} + _browser_mode() + in {"managed-headless", "managed-headed", "headless", "headless-chromium", "headed", "headful"} or os.environ.get("LLM_BROWSER_AUTO_CHROME") == "1" ) @@ -356,6 +362,19 @@ def _pick_managed_chrome_path(visible: bool) -> str: return _pick_chromium_path() +def _managed_chrome_extra_args() -> list[str]: + raw = os.environ.get("BU_MANAGED_BROWSER_ARGS") + if not raw: + return [] + try: + parsed = json.loads(raw) + except json.JSONDecodeError: + return [] + if not isinstance(parsed, list): + return [] + return [arg for arg in parsed if isinstance(arg, str) and arg] + + def _managed_chrome_args(chrome: str, port: int, profile: Path, visible: bool) -> list[str]: args = [ chrome, @@ -369,6 +388,7 @@ def _managed_chrome_args(chrome: str, port: int, profile: Path, visible: bool) - args.extend(["--new-window", "--window-size=1512,900"]) else: args.append("--headless=new") + args.extend(_managed_chrome_extra_args()) args.append("about:blank") return args diff --git a/python/tests/test_worker_package.py b/python/tests/test_worker_package.py index cdcabca0..34a66744 100644 --- a/python/tests/test_worker_package.py +++ b/python/tests/test_worker_package.py @@ -536,6 +536,40 @@ def __str__(self) -> str: ) +def 
test_managed_browser_profile_env_controls_worker_launch( + tmp_path: Path, monkeypatch +) -> None: + monkeypatch.delenv("BU_CDP_URL", raising=False) + monkeypatch.delenv("BU_CDP_WS", raising=False) + monkeypatch.delenv("BU_BROWSER_ID", raising=False) + monkeypatch.setenv("LLM_BROWSER_BROWSER_MODE", "managed_headed") + monkeypatch.setenv( + "BU_MANAGED_BROWSER_ARGS", + '["--proxy-server=http://proxy.example:8080","--user-agent=BrowserUseTest/1.0",3,""]', + ) + + assert worker._should_start_managed_chrome() is True + assert worker._managed_chrome_is_visible() is True + + headed = worker._managed_chrome_args("/chrome", 9335, tmp_path / "profile", True) + assert "--new-window" in headed + assert "--proxy-server=http://proxy.example:8080" in headed + assert "--user-agent=BrowserUseTest/1.0" in headed + assert headed.index("--proxy-server=http://proxy.example:8080") < headed.index("about:blank") + assert "3" not in headed + assert "" not in headed + + monkeypatch.setenv("LLM_BROWSER_BROWSER_MODE", "managed-headless") + + assert worker._should_start_managed_chrome() is True + assert worker._managed_chrome_is_visible() is False + + headless = worker._managed_chrome_args("/chrome", 9336, tmp_path / "profile", False) + assert "--headless=new" in headless + assert "--new-window" not in headless + assert "--proxy-server=http://proxy.example:8080" in headless + + def test_managed_chrome_args_visible_vs_headless(tmp_path: Path) -> None: visible = worker._managed_chrome_args("/chrome", 9333, tmp_path / "profile", True) headless = worker._managed_chrome_args("/chrome", 9334, tmp_path / "profile", False) From 172c5695a1ad3da098070af4a926a6f3d3041804 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20=C5=BDuni=C4=8D?= <36313686+gregpr07@users.noreply.github.com> Date: Wed, 3 Jun 2026 15:50:10 +0000 Subject: [PATCH 03/48] Honor managed BrowserProfile profile dir --- python/llm_browser_worker/worker.py | 25 +++++++++++++++++++------ python/tests/test_worker_package.py | 26 
++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 6 deletions(-) diff --git a/python/llm_browser_worker/worker.py b/python/llm_browser_worker/worker.py index 717e01c9..075c7059 100644 --- a/python/llm_browser_worker/worker.py +++ b/python/llm_browser_worker/worker.py @@ -27,6 +27,7 @@ _namespaces: Dict[str, Dict[str, Any]] = {} _managed_chrome: subprocess.Popen[Any] | None = None _managed_chrome_profile: Path | None = None +_managed_chrome_profile_is_temporary = False _explicit_agent_workspace = os.environ.get("BH_AGENT_WORKSPACE") @@ -375,6 +376,15 @@ def _managed_chrome_extra_args() -> list[str]: return [arg for arg in parsed if isinstance(arg, str) and arg] +def _managed_chrome_profile_dir() -> tuple[Path, bool]: + configured = os.environ.get("BU_MANAGED_BROWSER_PROFILE") + if configured and configured.strip(): + profile = Path(configured).expanduser() + profile.mkdir(parents=True, exist_ok=True) + return profile, False + return Path(tempfile.mkdtemp(prefix="but-managed-chrome.")), True + + def _managed_chrome_args(chrome: str, port: int, profile: Path, visible: bool) -> list[str]: args = [ chrome, @@ -403,7 +413,7 @@ def _daemon_has_browser_connection(admin: Any) -> bool: def _cleanup_managed_chrome() -> None: - global _managed_chrome, _managed_chrome_profile + global _managed_chrome, _managed_chrome_profile, _managed_chrome_profile_is_temporary proc = _managed_chrome _managed_chrome = None if proc is not None and proc.poll() is None: @@ -413,13 +423,14 @@ def _cleanup_managed_chrome() -> None: except subprocess.TimeoutExpired: proc.kill() proc.wait(timeout=5) - if _managed_chrome_profile is not None: + if _managed_chrome_profile is not None and _managed_chrome_profile_is_temporary: shutil.rmtree(_managed_chrome_profile, ignore_errors=True) - _managed_chrome_profile = None + _managed_chrome_profile = None + _managed_chrome_profile_is_temporary = False def _ensure_managed_chrome(admin: Any | None = None) -> None: - global _managed_chrome, 
_managed_chrome_profile + global _managed_chrome, _managed_chrome_profile, _managed_chrome_profile_is_temporary if not _should_start_managed_chrome(): return if admin is not None and _daemon_has_browser_connection(admin): @@ -428,7 +439,7 @@ def _ensure_managed_chrome(admin: Any | None = None) -> None: return port = _free_port() - profile = Path(tempfile.mkdtemp(prefix="but-managed-chrome.")) + profile, profile_is_temporary = _managed_chrome_profile_dir() visible = _managed_chrome_is_visible() chrome = _pick_managed_chrome_path(visible) proc = subprocess.Popen( @@ -450,11 +461,13 @@ def _ensure_managed_chrome(admin: Any | None = None) -> None: time.sleep(0.25) else: proc.terminate() - shutil.rmtree(profile, ignore_errors=True) + if profile_is_temporary: + shutil.rmtree(profile, ignore_errors=True) raise RuntimeError(f"managed Chrome DevTools did not become available: {last_error}") _managed_chrome = proc _managed_chrome_profile = profile + _managed_chrome_profile_is_temporary = profile_is_temporary os.environ["BU_CDP_URL"] = f"http://127.0.0.1:{port}" if not visible: atexit.register(_cleanup_managed_chrome) diff --git a/python/tests/test_worker_package.py b/python/tests/test_worker_package.py index 34a66744..27a920da 100644 --- a/python/tests/test_worker_package.py +++ b/python/tests/test_worker_package.py @@ -570,6 +570,32 @@ def test_managed_browser_profile_env_controls_worker_launch( assert "--proxy-server=http://proxy.example:8080" in headless +def test_managed_browser_profile_env_uses_configured_user_data_dir( + tmp_path: Path, monkeypatch +) -> None: + configured_profile = tmp_path / "configured-profile" + monkeypatch.setenv("BU_MANAGED_BROWSER_PROFILE", str(configured_profile)) + + profile, is_temporary = worker._managed_chrome_profile_dir() + + assert profile == configured_profile + assert is_temporary is False + assert configured_profile.exists() + + args = worker._managed_chrome_args("/chrome", 9337, profile, False) + assert 
f"--user-data-dir={configured_profile}" in args + + monkeypatch.setattr(worker, "_managed_chrome", None) + monkeypatch.setattr(worker, "_managed_chrome_profile", configured_profile) + monkeypatch.setattr(worker, "_managed_chrome_profile_is_temporary", False) + + worker._cleanup_managed_chrome() + + assert configured_profile.exists() + assert worker._managed_chrome_profile is None + assert worker._managed_chrome_profile_is_temporary is False + + def test_managed_chrome_args_visible_vs_headless(tmp_path: Path) -> None: visible = worker._managed_chrome_args("/chrome", 9333, tmp_path / "profile", True) headless = worker._managed_chrome_args("/chrome", 9334, tmp_path / "profile", False) From 052ff00b52bb4e081a3813cf90b531f01fa3a542 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20=C5=BDuni=C4=8D?= <36313686+gregpr07@users.noreply.github.com> Date: Wed, 3 Jun 2026 15:53:06 +0000 Subject: [PATCH 04/48] Honor managed BrowserProfile viewport --- python/llm_browser_worker/worker.py | 41 ++++++++++++++++++++++++++++- python/tests/test_worker_package.py | 30 +++++++++++++++++++++ 2 files changed, 70 insertions(+), 1 deletion(-) diff --git a/python/llm_browser_worker/worker.py b/python/llm_browser_worker/worker.py index 075c7059..5e5ddc86 100644 --- a/python/llm_browser_worker/worker.py +++ b/python/llm_browser_worker/worker.py @@ -385,7 +385,43 @@ def _managed_chrome_profile_dir() -> tuple[Path, bool]: return Path(tempfile.mkdtemp(prefix="but-managed-chrome.")), True +def _env_bool(name: str) -> bool | None: + raw = os.environ.get(name) + if raw is None: + return None + value = raw.strip().lower() + if value in {"1", "true", "yes", "on"}: + return True + if value in {"0", "false", "no", "off"}: + return False + return None + + +def _managed_chrome_viewport_args() -> list[str]: + if _env_bool("BU_BROWSER_NO_VIEWPORT") is True: + return [] + raw = os.environ.get("BU_BROWSER_VIEWPORT") + if not raw: + return [] + try: + parsed = json.loads(raw) + except json.JSONDecodeError: + 
return [] + if not isinstance(parsed, dict): + return [] + width = parsed.get("width") + height = parsed.get("height") + if type(width) is not int or type(height) is not int or width <= 0 or height <= 0: + return [] + args = [f"--window-size={width},{height}"] + device_scale_factor = parsed.get("deviceScaleFactor") + if isinstance(device_scale_factor, (int, float)) and device_scale_factor > 0: + args.append(f"--force-device-scale-factor={device_scale_factor:g}") + return args + + def _managed_chrome_args(chrome: str, port: int, profile: Path, visible: bool) -> list[str]: + viewport_args = _managed_chrome_viewport_args() args = [ chrome, "--remote-debugging-address=127.0.0.1", @@ -395,9 +431,12 @@ def _managed_chrome_args(chrome: str, port: int, profile: Path, visible: bool) - "--no-default-browser-check", ] if visible: - args.extend(["--new-window", "--window-size=1512,900"]) + args.append("--new-window") + if not viewport_args: + args.append("--window-size=1512,900") else: args.append("--headless=new") + args.extend(viewport_args) args.extend(_managed_chrome_extra_args()) args.append("about:blank") return args diff --git a/python/tests/test_worker_package.py b/python/tests/test_worker_package.py index 27a920da..488af26d 100644 --- a/python/tests/test_worker_package.py +++ b/python/tests/test_worker_package.py @@ -596,6 +596,36 @@ def test_managed_browser_profile_env_uses_configured_user_data_dir( assert worker._managed_chrome_profile_is_temporary is False +def test_managed_browser_viewport_env_controls_worker_launch( + tmp_path: Path, monkeypatch +) -> None: + monkeypatch.setenv( + "BU_BROWSER_VIEWPORT", + '{"width":1024,"height":768,"deviceScaleFactor":2,"screenWidth":1440,"screenHeight":900}', + ) + monkeypatch.setenv("BU_BROWSER_NO_VIEWPORT", "false") + + headless = worker._managed_chrome_args("/chrome", 9338, tmp_path / "profile", False) + + assert "--headless=new" in headless + assert "--window-size=1024,768" in headless + assert 
"--force-device-scale-factor=2" in headless + + headed = worker._managed_chrome_args("/chrome", 9339, tmp_path / "profile", True) + + assert "--new-window" in headed + assert "--window-size=1024,768" in headed + assert "--window-size=1512,900" not in headed + + monkeypatch.setenv("BU_BROWSER_NO_VIEWPORT", "true") + + no_viewport = worker._managed_chrome_args("/chrome", 9340, tmp_path / "profile", True) + + assert "--window-size=1024,768" not in no_viewport + assert "--force-device-scale-factor=2" not in no_viewport + assert "--window-size=1512,900" in no_viewport + + def test_managed_chrome_args_visible_vs_headless(tmp_path: Path) -> None: visible = worker._managed_chrome_args("/chrome", 9333, tmp_path / "profile", True) headless = worker._managed_chrome_args("/chrome", 9334, tmp_path / "profile", False) From a6a9173df3186245ca0c62155694e1bcaff8ee05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20=C5=BDuni=C4=8D?= <36313686+gregpr07@users.noreply.github.com> Date: Wed, 3 Jun 2026 15:59:17 +0000 Subject: [PATCH 05/48] Apply BrowserProfile user agent override --- python/llm_browser_worker/worker.py | 18 +++++++++++++ python/tests/test_worker_package.py | 40 +++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+) diff --git a/python/llm_browser_worker/worker.py b/python/llm_browser_worker/worker.py index 5e5ddc86..257d8596 100644 --- a/python/llm_browser_worker/worker.py +++ b/python/llm_browser_worker/worker.py @@ -231,6 +231,22 @@ def _browser_mode() -> str: return os.environ.get("LLM_BROWSER_BROWSER_MODE", "").lower().replace("_", "-").replace(" ", "-") +def _browser_user_agent() -> str | None: + value = os.environ.get("BU_BROWSER_USER_AGENT") + if value is None: + return None + value = value.strip() + return value or None + + +def _apply_browser_user_agent_override(cdp: Any, session_id: Any = None) -> None: + user_agent = _browser_user_agent() + if not user_agent: + return + with contextlib.suppress(Exception): + cdp("Network.setUserAgentOverride", 
session_id=session_id, userAgent=user_agent) + + def _annotate_error(msg: str) -> str: for pattern, hint in _HINT_PATTERNS: if pattern.search(msg): @@ -631,6 +647,8 @@ def cdp_with_daemon(method: str, session_id: Any = None, **params: Any) -> Any: _ensure_cloud_browser(admin) else: admin.ensure_daemon() + if method != "Network.setUserAgentOverride": + _apply_browser_user_agent_override(original_cdp, session_id=session_id) return original_cdp(method, session_id=session_id, **params) helpers.__llm_browser_original_cdp__ = original_cdp diff --git a/python/tests/test_worker_package.py b/python/tests/test_worker_package.py index 488af26d..fc54fba4 100644 --- a/python/tests/test_worker_package.py +++ b/python/tests/test_worker_package.py @@ -239,6 +239,46 @@ def fake_cdp(method, session_id=None, **kwargs): assert Path(response["images"][0]["path"]).exists() +def test_worker_cdp_applies_browser_user_agent_env(monkeypatch) -> None: + calls = [] + monkeypatch.setenv("BU_BROWSER_USER_AGENT", " BrowserUseRuntime/3.0 ") + monkeypatch.setenv("LLM_BROWSER_BROWSER_MODE", "remote-cdp") + + class Helpers: + __all__ = ["cdp"] + + def cdp(self, method, session_id=None, **params): + calls.append((method, session_id, params)) + return {"method": method} + + class Admin: + def __init__(self) -> None: + self.ensure_calls = 0 + + def ensure_daemon(self): + self.ensure_calls += 1 + + helpers = Helpers() + admin = Admin() + worker._patch_browser_harness_cdp(helpers, admin) + + result = helpers.cdp("Runtime.evaluate", session_id="target-1", expression="navigator.userAgent") + + assert result == {"method": "Runtime.evaluate"} + assert admin.ensure_calls == 1 + assert calls == [ + ("Network.setUserAgentOverride", "target-1", {"userAgent": "BrowserUseRuntime/3.0"}), + ("Runtime.evaluate", "target-1", {"expression": "navigator.userAgent"}), + ] + + calls.clear() + result = helpers.cdp("Network.setUserAgentOverride", userAgent="Manual/1.0") + + assert result == {"method": 
"Network.setUserAgentOverride"} + assert admin.ensure_calls == 2 + assert calls == [("Network.setUserAgentOverride", None, {"userAgent": "Manual/1.0"})] + + def test_worker_page_info_fallback_reads_target_url_and_title( tmp_path: Path, monkeypatch ) -> None: From 3316fafd23a89a8e94cdab2a3b04d4368e041b10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20=C5=BDuni=C4=8D?= <36313686+gregpr07@users.noreply.github.com> Date: Wed, 3 Jun 2026 16:02:18 +0000 Subject: [PATCH 06/48] Grant BrowserProfile permissions in worker --- python/llm_browser_worker/worker.py | 33 ++++++++++++++++++++ python/tests/test_worker_package.py | 47 +++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+) diff --git a/python/llm_browser_worker/worker.py b/python/llm_browser_worker/worker.py index 257d8596..59197686 100644 --- a/python/llm_browser_worker/worker.py +++ b/python/llm_browser_worker/worker.py @@ -247,6 +247,37 @@ def _apply_browser_user_agent_override(cdp: Any, session_id: Any = None) -> None cdp("Network.setUserAgentOverride", session_id=session_id, userAgent=user_agent) +def _browser_permissions() -> list[str]: + raw = os.environ.get("BU_BROWSER_PERMISSIONS") + if not raw: + return [] + try: + parsed = json.loads(raw) + except json.JSONDecodeError: + return [] + if not isinstance(parsed, list): + return [] + permissions: list[str] = [] + seen: set[str] = set() + for permission in parsed: + if not isinstance(permission, str): + continue + permission = permission.strip() + if not permission or permission in seen: + continue + permissions.append(permission) + seen.add(permission) + return permissions + + +def _apply_browser_permissions(cdp: Any) -> None: + permissions = _browser_permissions() + if not permissions: + return + with contextlib.suppress(Exception): + cdp("Browser.grantPermissions", permissions=permissions) + + def _annotate_error(msg: str) -> str: for pattern, hint in _HINT_PATTERNS: if pattern.search(msg): @@ -647,6 +678,8 @@ def cdp_with_daemon(method: str, 
session_id: Any = None, **params: Any) -> Any: _ensure_cloud_browser(admin) else: admin.ensure_daemon() + if method != "Browser.grantPermissions": + _apply_browser_permissions(original_cdp) if method != "Network.setUserAgentOverride": _apply_browser_user_agent_override(original_cdp, session_id=session_id) return original_cdp(method, session_id=session_id, **params) diff --git a/python/tests/test_worker_package.py b/python/tests/test_worker_package.py index fc54fba4..d0e6dfa8 100644 --- a/python/tests/test_worker_package.py +++ b/python/tests/test_worker_package.py @@ -279,6 +279,53 @@ def ensure_daemon(self): assert calls == [("Network.setUserAgentOverride", None, {"userAgent": "Manual/1.0"})] +def test_worker_cdp_grants_browser_permissions_env(monkeypatch) -> None: + calls = [] + monkeypatch.setenv( + "BU_BROWSER_PERMISSIONS", + '["clipboardReadWrite","notifications","clipboardReadWrite",3,""]', + ) + monkeypatch.setenv("LLM_BROWSER_BROWSER_MODE", "remote-cdp") + + class Helpers: + __all__ = ["cdp"] + + def cdp(self, method, session_id=None, **params): + calls.append((method, session_id, params)) + return {"method": method} + + class Admin: + def __init__(self) -> None: + self.ensure_calls = 0 + + def ensure_daemon(self): + self.ensure_calls += 1 + + helpers = Helpers() + admin = Admin() + worker._patch_browser_harness_cdp(helpers, admin) + + result = helpers.cdp("Page.navigate", session_id="target-1", url="https://example.com") + + assert result == {"method": "Page.navigate"} + assert admin.ensure_calls == 1 + assert calls == [ + ( + "Browser.grantPermissions", + None, + {"permissions": ["clipboardReadWrite", "notifications"]}, + ), + ("Page.navigate", "target-1", {"url": "https://example.com"}), + ] + + calls.clear() + result = helpers.cdp("Browser.grantPermissions", permissions=["geolocation"]) + + assert result == {"method": "Browser.grantPermissions"} + assert admin.ensure_calls == 2 + assert calls == [("Browser.grantPermissions", None, {"permissions": 
["geolocation"]})] + + def test_worker_page_info_fallback_reads_target_url_and_title( tmp_path: Path, monkeypatch ) -> None: From eb7c5b0f156e679c763c109a1c611fb82d721286 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20=C5=BDuni=C4=8D?= <36313686+gregpr07@users.noreply.github.com> Date: Wed, 3 Jun 2026 16:05:28 +0000 Subject: [PATCH 07/48] Apply BrowserProfile download behavior --- python/llm_browser_worker/worker.py | 24 +++++++++++ python/tests/test_worker_package.py | 62 +++++++++++++++++++++++++++++ 2 files changed, 86 insertions(+) diff --git a/python/llm_browser_worker/worker.py b/python/llm_browser_worker/worker.py index 59197686..13c41d36 100644 --- a/python/llm_browser_worker/worker.py +++ b/python/llm_browser_worker/worker.py @@ -278,6 +278,28 @@ def _apply_browser_permissions(cdp: Any) -> None: cdp("Browser.grantPermissions", permissions=permissions) +def _browser_download_behavior() -> dict[str, Any] | None: + accept_downloads = _env_bool("BU_BROWSER_ACCEPT_DOWNLOADS") + if accept_downloads is False: + return {"behavior": "deny"} + + raw_path = os.environ.get("BU_BROWSER_DOWNLOADS_PATH") + if not raw_path or not raw_path.strip(): + return None + download_path = Path(raw_path.strip()).expanduser().resolve() + with contextlib.suppress(OSError): + download_path.mkdir(parents=True, exist_ok=True) + return {"behavior": "allow", "downloadPath": str(download_path), "eventsEnabled": True} + + +def _apply_browser_download_behavior(cdp: Any) -> None: + behavior = _browser_download_behavior() + if not behavior: + return + with contextlib.suppress(Exception): + cdp("Browser.setDownloadBehavior", **behavior) + + def _annotate_error(msg: str) -> str: for pattern, hint in _HINT_PATTERNS: if pattern.search(msg): @@ -680,6 +702,8 @@ def cdp_with_daemon(method: str, session_id: Any = None, **params: Any) -> Any: admin.ensure_daemon() if method != "Browser.grantPermissions": _apply_browser_permissions(original_cdp) + if method != "Browser.setDownloadBehavior": + 
_apply_browser_download_behavior(original_cdp) if method != "Network.setUserAgentOverride": _apply_browser_user_agent_override(original_cdp, session_id=session_id) return original_cdp(method, session_id=session_id, **params) diff --git a/python/tests/test_worker_package.py b/python/tests/test_worker_package.py index d0e6dfa8..f7cdd17d 100644 --- a/python/tests/test_worker_package.py +++ b/python/tests/test_worker_package.py @@ -326,6 +326,68 @@ def ensure_daemon(self): assert calls == [("Browser.grantPermissions", None, {"permissions": ["geolocation"]})] +def test_worker_cdp_applies_browser_download_behavior_env(tmp_path: Path, monkeypatch) -> None: + calls = [] + downloads_path = tmp_path / "downloads" + monkeypatch.setenv("BU_BROWSER_ACCEPT_DOWNLOADS", "true") + monkeypatch.setenv("BU_BROWSER_DOWNLOADS_PATH", str(downloads_path)) + monkeypatch.setenv("LLM_BROWSER_BROWSER_MODE", "remote-cdp") + + class Helpers: + __all__ = ["cdp"] + + def cdp(self, method, session_id=None, **params): + calls.append((method, session_id, params)) + return {"method": method} + + class Admin: + def __init__(self) -> None: + self.ensure_calls = 0 + + def ensure_daemon(self): + self.ensure_calls += 1 + + helpers = Helpers() + admin = Admin() + worker._patch_browser_harness_cdp(helpers, admin) + + result = helpers.cdp("Page.navigate", session_id="target-1", url="https://example.com") + + assert result == {"method": "Page.navigate"} + assert admin.ensure_calls == 1 + assert calls == [ + ( + "Browser.setDownloadBehavior", + None, + { + "behavior": "allow", + "downloadPath": str(downloads_path.resolve()), + "eventsEnabled": True, + }, + ), + ("Page.navigate", "target-1", {"url": "https://example.com"}), + ] + assert downloads_path.exists() + + calls.clear() + monkeypatch.setenv("BU_BROWSER_ACCEPT_DOWNLOADS", "false") + result = helpers.cdp("Runtime.evaluate", expression="1") + + assert result == {"method": "Runtime.evaluate"} + assert admin.ensure_calls == 2 + assert calls == [ + 
("Browser.setDownloadBehavior", None, {"behavior": "deny"}), + ("Runtime.evaluate", None, {"expression": "1"}), + ] + + calls.clear() + result = helpers.cdp("Browser.setDownloadBehavior", behavior="allowAndName") + + assert result == {"method": "Browser.setDownloadBehavior"} + assert admin.ensure_calls == 3 + assert calls == [("Browser.setDownloadBehavior", None, {"behavior": "allowAndName"})] + + def test_worker_page_info_fallback_reads_target_url_and_title( tmp_path: Path, monkeypatch ) -> None: From b9fc56950e1c9f54f79c0fde409782bb2256a438 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20=C5=BDuni=C4=8D?= <36313686+gregpr07@users.noreply.github.com> Date: Wed, 3 Jun 2026 16:09:28 +0000 Subject: [PATCH 08/48] Apply BrowserProfile storage state --- python/llm_browser_worker/worker.py | 123 ++++++++++++++++++++++++++++ python/tests/test_worker_package.py | 68 +++++++++++++++ 2 files changed, 191 insertions(+) diff --git a/python/llm_browser_worker/worker.py b/python/llm_browser_worker/worker.py index 13c41d36..87741d00 100644 --- a/python/llm_browser_worker/worker.py +++ b/python/llm_browser_worker/worker.py @@ -300,6 +300,122 @@ def _apply_browser_download_behavior(cdp: Any) -> None: cdp("Browser.setDownloadBehavior", **behavior) +def _browser_storage_state_raw() -> str | None: + raw = os.environ.get("BU_BROWSER_STORAGE_STATE") + if not raw or not raw.strip(): + return None + return raw + + +def _browser_storage_state() -> dict[str, Any] | None: + raw = _browser_storage_state_raw() + if raw is None: + return None + try: + parsed = json.loads(raw) + except json.JSONDecodeError: + return None + return parsed if isinstance(parsed, dict) else None + + +def _browser_storage_cookies(storage_state: dict[str, Any]) -> list[dict[str, Any]]: + raw_cookies = storage_state.get("cookies") + if not isinstance(raw_cookies, list): + return [] + cookies: list[dict[str, Any]] = [] + for cookie in raw_cookies: + if not isinstance(cookie, dict): + continue + if not 
isinstance(cookie.get("name"), str) or not isinstance(cookie.get("value"), str): + continue + cookies.append(cookie) + return cookies + + +def _browser_storage_init_scripts(storage_state: dict[str, Any]) -> list[str]: + origins = storage_state.get("origins") + if not isinstance(origins, list): + return [] + scripts: list[str] = [] + for origin_state in origins: + if not isinstance(origin_state, dict): + continue + origin = origin_state.get("origin") + statements: list[str] = [] + for storage_name in ("localStorage", "sessionStorage"): + items = origin_state.get(storage_name) + if not isinstance(items, list): + continue + for item in items: + if not isinstance(item, dict): + continue + name = item.get("name") + value = item.get("value") + if not isinstance(name, str) or not isinstance(value, str): + continue + statements.append( + f"window.{storage_name}.setItem({json.dumps(name)}, {json.dumps(value)});" + ) + if not statements: + continue + body = "\n".join(statements) + if isinstance(origin, str) and origin: + scripts.append( + "try {\n" + f" if (window.location.origin === {json.dumps(origin)}) {{\n" + f" {body}\n" + " }\n" + "} catch (error) {}" + ) + else: + scripts.append(f"try {{\n {body}\n}} catch (error) {{}}") + return scripts + + +def _apply_browser_storage_state( + cdp: Any, + session_id: Any = None, + applied: set[tuple[Any, ...]] | None = None, +) -> None: + raw = _browser_storage_state_raw() + if raw is None: + return + storage_state = _browser_storage_state() + if not storage_state: + return + + signature = hashlib.sha256(raw.encode("utf-8")).hexdigest() + cookies = _browser_storage_cookies(storage_state) + cookie_key = ("storage_cookies", signature) + if cookies and (applied is None or cookie_key not in applied): + try: + cdp("Storage.setCookies", session_id=session_id, cookies=cookies) + except Exception: + pass + else: + if applied is not None: + applied.add(cookie_key) + + if session_id is None: + return + for index, script in 
enumerate(_browser_storage_init_scripts(storage_state)): + script_key = ("storage_script", str(session_id), signature, index) + if applied is not None and script_key in applied: + continue + try: + cdp( + "Page.addScriptToEvaluateOnNewDocument", + session_id=session_id, + source=script, + runImmediately=True, + ) + except Exception: + pass + else: + if applied is not None: + applied.add(script_key) + + def _annotate_error(msg: str) -> str: for pattern, hint in _HINT_PATTERNS: if pattern.search(msg): @@ -694,6 +810,7 @@ def _patch_browser_harness_cdp(helpers: Any, admin: Any) -> None: if getattr(helpers, "__llm_browser_cdp_patched__", False): return original_cdp = helpers.cdp + applied_browser_profile_state: set[tuple[Any, ...]] = set() def cdp_with_daemon(method: str, session_id: Any = None, **params: Any) -> Any: if _browser_mode() == "cloud": @@ -704,6 +821,12 @@ def cdp_with_daemon(method: str, session_id: Any = None, **params: Any) -> Any: _apply_browser_permissions(original_cdp) if method != "Browser.setDownloadBehavior": _apply_browser_download_behavior(original_cdp) + if method not in {"Storage.setCookies", "Page.addScriptToEvaluateOnNewDocument"}: + _apply_browser_storage_state( + original_cdp, + session_id=session_id, + applied=applied_browser_profile_state, + ) if method != "Network.setUserAgentOverride": _apply_browser_user_agent_override(original_cdp, session_id=session_id) return original_cdp(method, session_id=session_id, **params) diff --git a/python/tests/test_worker_package.py b/python/tests/test_worker_package.py index f7cdd17d..a58e4a6f 100644 --- a/python/tests/test_worker_package.py +++ b/python/tests/test_worker_package.py @@ -1,3 +1,4 @@ +import json from pathlib import Path from llm_browser_worker import worker @@ -388,6 +389,73 @@ def ensure_daemon(self): assert calls == [("Browser.setDownloadBehavior", None, {"behavior": "allowAndName"})] +def test_worker_cdp_applies_browser_storage_state_env(monkeypatch) -> None: + calls = [] + cookie = 
{"name": "sid", "value": "secret", "domain": ".example.com", "path": "/"} + monkeypatch.setenv( + "BU_BROWSER_STORAGE_STATE", + json.dumps( + { + "cookies": [cookie, {"name": "bad"}], + "origins": [ + { + "origin": "https://example.com", + "localStorage": [{"name": "theme", "value": "dark"}, {"name": 3}], + "sessionStorage": [{"name": "step", "value": "one"}], + } + ], + } + ), + ) + monkeypatch.setenv("LLM_BROWSER_BROWSER_MODE", "remote-cdp") + + class Helpers: + __all__ = ["cdp"] + + def cdp(self, method, session_id=None, **params): + calls.append((method, session_id, params)) + return {"method": method} + + class Admin: + def __init__(self) -> None: + self.ensure_calls = 0 + + def ensure_daemon(self): + self.ensure_calls += 1 + + helpers = Helpers() + admin = Admin() + worker._patch_browser_harness_cdp(helpers, admin) + + result = helpers.cdp("Page.navigate", session_id="target-1", url="https://example.com") + + assert result == {"method": "Page.navigate"} + assert admin.ensure_calls == 1 + assert calls[0] == ("Storage.setCookies", "target-1", {"cookies": [cookie]}) + assert calls[1][0] == "Page.addScriptToEvaluateOnNewDocument" + assert calls[1][1] == "target-1" + assert calls[1][2]["runImmediately"] is True + script = calls[1][2]["source"] + assert 'window.location.origin === "https://example.com"' in script + assert 'window.localStorage.setItem("theme", "dark");' in script + assert 'window.sessionStorage.setItem("step", "one");' in script + assert calls[2] == ("Page.navigate", "target-1", {"url": "https://example.com"}) + + calls.clear() + result = helpers.cdp("Runtime.evaluate", session_id="target-1", expression="1") + + assert result == {"method": "Runtime.evaluate"} + assert admin.ensure_calls == 2 + assert calls == [("Runtime.evaluate", "target-1", {"expression": "1"})] + + calls.clear() + result = helpers.cdp("Storage.setCookies", session_id="target-1", cookies=[]) + + assert result == {"method": "Storage.setCookies"} + assert admin.ensure_calls == 3 + 
assert calls == [("Storage.setCookies", "target-1", {"cookies": []})] + + def test_worker_page_info_fallback_reads_target_url_and_title( tmp_path: Path, monkeypatch ) -> None: From 5410382556ec21ec932484d4b4a2d452547d33fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20=C5=BDuni=C4=8D?= <36313686+gregpr07@users.noreply.github.com> Date: Wed, 3 Jun 2026 16:13:36 +0000 Subject: [PATCH 09/48] Enforce BrowserProfile domain constraints --- python/llm_browser_worker/worker.py | 88 +++++++++++++++++++++++++++++ python/tests/test_worker_package.py | 57 +++++++++++++++++++ 2 files changed, 145 insertions(+) diff --git a/python/llm_browser_worker/worker.py b/python/llm_browser_worker/worker.py index 87741d00..69f2fad2 100644 --- a/python/llm_browser_worker/worker.py +++ b/python/llm_browser_worker/worker.py @@ -3,7 +3,9 @@ import contextlib import atexit import base64 +import fnmatch import hashlib +import ipaddress import importlib import importlib.util import io @@ -19,6 +21,7 @@ import re import tempfile import time +import urllib.parse import urllib.request from pathlib import Path from typing import Any, Dict @@ -416,6 +419,90 @@ def _apply_browser_storage_state( applied.add(script_key) +def _env_json_string_list(name: str) -> list[str]: + raw = os.environ.get(name) + if not raw: + return [] + try: + parsed = json.loads(raw) + except json.JSONDecodeError: + return [] + if not isinstance(parsed, list): + return [] + return [value.strip() for value in parsed if isinstance(value, str) and value.strip()] + + +def _is_root_domain(domain: str) -> bool: + if "*" in domain or "://" in domain: + return False + return domain.count(".") == 1 + + +def _is_ip_address(host: str) -> bool: + with contextlib.suppress(ValueError): + ipaddress.ip_address(host) + return True + return False + + +def _domain_pattern_matches(url: str, host: str, scheme: str, pattern: str) -> bool: + full_url_pattern = f"{scheme}://{host}" + pattern = pattern.strip() + if not pattern: + return False + if "*" 
in pattern: + if pattern.startswith("*."): + domain_part = pattern[2:].lower() + host_lower = host.lower() + return scheme in {"http", "https"} and ( + host_lower == domain_part or host_lower.endswith(f".{domain_part}") + ) + if pattern.endswith("/*"): + return url.startswith(pattern[:-1]) + return fnmatch.fnmatch(full_url_pattern if "://" in pattern else host, pattern) + if "://" in pattern: + return url.lower().startswith(pattern.lower()) + host_lower = host.lower() + pattern_lower = pattern.lower() + if host_lower == pattern_lower: + return True + return _is_root_domain(pattern_lower) and host_lower == f"www.{pattern_lower}" + + +def _browser_profile_url_allowed(url: str) -> bool: + if url in {"about:blank", "chrome://new-tab-page/", "chrome://new-tab-page", "chrome://newtab/"}: + return True + try: + parsed = urllib.parse.urlparse(url) + except Exception: + return False + if parsed.scheme in {"data", "blob"}: + return True + host = parsed.hostname + if not host: + return False + if _env_bool("BU_BROWSER_BLOCK_IP_ADDRESSES") is True and _is_ip_address(host): + return False + + allowed_domains = _env_json_string_list("BU_BROWSER_ALLOWED_DOMAINS") + prohibited_domains = _env_json_string_list("BU_BROWSER_PROHIBITED_DOMAINS") + if allowed_domains: + return any(_domain_pattern_matches(url, host, parsed.scheme, pattern) for pattern in allowed_domains) + if prohibited_domains: + return not any(_domain_pattern_matches(url, host, parsed.scheme, pattern) for pattern in prohibited_domains) + return True + + +def _enforce_browser_domain_constraints(method: str, params: dict[str, Any]) -> None: + if method != "Page.navigate": + return + url = params.get("url") + if not isinstance(url, str) or not url: + return + if not _browser_profile_url_allowed(url): + raise RuntimeError(f"BrowserProfile domain constraints blocked navigation to {url}") + + def _annotate_error(msg: str) -> str: for pattern, hint in _HINT_PATTERNS: if pattern.search(msg): @@ -817,6 +904,7 @@ def 
cdp_with_daemon(method: str, session_id: Any = None, **params: Any) -> Any: _ensure_cloud_browser(admin) else: admin.ensure_daemon() + _enforce_browser_domain_constraints(method, params) if method != "Browser.grantPermissions": _apply_browser_permissions(original_cdp) if method != "Browser.setDownloadBehavior": diff --git a/python/tests/test_worker_package.py b/python/tests/test_worker_package.py index a58e4a6f..f3a86fe7 100644 --- a/python/tests/test_worker_package.py +++ b/python/tests/test_worker_package.py @@ -456,6 +456,63 @@ def ensure_daemon(self): assert calls == [("Storage.setCookies", "target-1", {"cookies": []})] +def test_worker_cdp_enforces_browser_domain_constraints_env(monkeypatch) -> None: + calls = [] + monkeypatch.setenv("LLM_BROWSER_BROWSER_MODE", "remote-cdp") + monkeypatch.setenv("BU_BROWSER_ALLOWED_DOMAINS", '["example.com","*.browser-use.com"]') + + class Helpers: + __all__ = ["cdp"] + + def cdp(self, method, session_id=None, **params): + calls.append((method, session_id, params)) + return {"method": method} + + class Admin: + def __init__(self) -> None: + self.ensure_calls = 0 + + def ensure_daemon(self): + self.ensure_calls += 1 + + def expect_blocked(url: str) -> None: + try: + helpers.cdp("Page.navigate", session_id="target-1", url=url) + except RuntimeError as exc: + assert "BrowserProfile domain constraints blocked navigation" in str(exc) + assert url in str(exc) + else: + raise AssertionError(f"navigation should be blocked: {url}") + + helpers = Helpers() + admin = Admin() + worker._patch_browser_harness_cdp(helpers, admin) + + result = helpers.cdp("Page.navigate", session_id="target-1", url="https://www.example.com/path") + + assert result == {"method": "Page.navigate"} + assert calls == [("Page.navigate", "target-1", {"url": "https://www.example.com/path"})] + + expect_blocked("https://iana.org/") + + assert len(calls) == 1 + + monkeypatch.delenv("BU_BROWSER_ALLOWED_DOMAINS", raising=False) + 
monkeypatch.setenv("BU_BROWSER_PROHIBITED_DOMAINS", '["*.tracking.example"]') + + expect_blocked("https://ads.tracking.example/") + + assert len(calls) == 1 + + monkeypatch.delenv("BU_BROWSER_PROHIBITED_DOMAINS", raising=False) + monkeypatch.setenv("BU_BROWSER_BLOCK_IP_ADDRESSES", "true") + + expect_blocked("http://127.0.0.1/") + + assert len(calls) == 1 + assert admin.ensure_calls == 4 + + def test_worker_page_info_fallback_reads_target_url_and_title( tmp_path: Path, monkeypatch ) -> None: From 22f7becb7c0369af5b80f33a8280ea6e4858e75b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20=C5=BDuni=C4=8D?= <36313686+gregpr07@users.noreply.github.com> Date: Wed, 3 Jun 2026 16:22:50 +0000 Subject: [PATCH 10/48] Apply BrowserProfile wait timing --- .../src/browser_script_helpers.py | 24 ++++++++++ crates/browser-use-browser/src/lib.rs | 46 +++++++++++++++++++ python/llm_browser_worker/worker.py | 20 ++++++++ python/tests/test_worker_package.py | 22 +++++++++ 4 files changed, 112 insertions(+) diff --git a/crates/browser-use-browser/src/browser_script_helpers.py b/crates/browser-use-browser/src/browser_script_helpers.py index 14d37601..50d3646b 100644 --- a/crates/browser-use-browser/src/browser_script_helpers.py +++ b/crates/browser-use-browser/src/browser_script_helpers.py @@ -354,6 +354,7 @@ def goto_url(url): __last_domain_skills = [{"url": url, **skill} for skill in skills] result = {**result, "domain_skills": __last_domain_skills} wait_for_load(timeout=15) + _wait_for_browser_profile_page_load() return result @@ -469,6 +470,29 @@ def _timeout_seconds(timeout): return min(timeout, 60.0) +def _env_milliseconds_to_seconds(name): + raw = os.environ.get(name) + if raw is None: + return 0.0 + try: + milliseconds = float(str(raw).strip()) + except ValueError: + return 0.0 + if milliseconds <= 0: + return 0.0 + return milliseconds / 1000.0 + + +def _wait_for_browser_profile_page_load(): + minimum_wait = _env_milliseconds_to_seconds("BU_BROWSER_MINIMUM_WAIT_PAGE_LOAD_MS") + 
if minimum_wait > 0: + _time.sleep(minimum_wait) + + network_idle_wait = _env_milliseconds_to_seconds("BU_BROWSER_NETWORK_IDLE_PAGE_LOAD_MS") + if network_idle_wait > 0: + wait_for_network_idle(timeout=max(network_idle_wait, 1.0), idle_ms=int(network_idle_wait * 1000)) + + def wait_for_load(timeout=15.0): timeout = _timeout_seconds(timeout) deadline = _time.time() + timeout diff --git a/crates/browser-use-browser/src/lib.rs b/crates/browser-use-browser/src/lib.rs index 0a7e6a16..c93e36ea 100644 --- a/crates/browser-use-browser/src/lib.rs +++ b/crates/browser-use-browser/src/lib.rs @@ -7330,6 +7330,52 @@ print("time shadow ok") assert!(output.text.contains("time shadow ok")); } + #[test] + fn browser_script_goto_url_honors_browser_profile_wait_timing_env() { + let temp = tempfile::tempdir().unwrap(); + let _env = EnvRestore::set(&[ + ("BU_BROWSER_MINIMUM_WAIT_PAGE_LOAD_MS", "250"), + ("BU_BROWSER_NETWORK_IDLE_PAGE_LOAD_MS", "750"), + ]); + let output = run_browser_script( + "script-browser-profile-wait-timing", + temp.path(), + temp.path().join("artifacts"), + r#" +sleeps = [] +loads = [] +network_waits = [] +navigations = [] + +def cdp(method, **params): + navigations.append((method, params)) + return {"ok": True} + +def wait_for_load(timeout=15.0): + loads.append(timeout) + return True + +def wait_for_network_idle(timeout=10.0, idle_ms=500): + network_waits.append((timeout, idle_ms)) + return True + +_time.sleep = lambda seconds: sleeps.append(seconds) + +assert goto_url("https://example.com") == {"ok": True} +assert navigations == [("Page.navigate", {"url": "https://example.com"})], navigations +assert loads == [15], loads +assert sleeps == [0.25], sleeps +assert network_waits == [(1.0, 750)], network_waits +print("browser profile wait timing ok") +"#, + 10, + ) + .unwrap(); + + assert!(output.ok, "{:?}\n{}", output.error, output.text); + assert!(output.text.contains("browser profile wait timing ok")); + } + #[test] fn 
browser_script_js_return_detection_ignores_nested_callbacks() { let temp = tempfile::tempdir().unwrap(); diff --git a/python/llm_browser_worker/worker.py b/python/llm_browser_worker/worker.py index 69f2fad2..2054cc58 100644 --- a/python/llm_browser_worker/worker.py +++ b/python/llm_browser_worker/worker.py @@ -669,6 +669,25 @@ def _env_bool(name: str) -> bool | None: return None +def _env_milliseconds_to_seconds(name: str) -> float: + raw = os.environ.get(name) + if raw is None: + return 0.0 + try: + milliseconds = float(raw.strip()) + except ValueError: + return 0.0 + if milliseconds <= 0: + return 0.0 + return milliseconds / 1000.0 + + +def _apply_browser_wait_between_actions() -> None: + wait_seconds = _env_milliseconds_to_seconds("BU_BROWSER_WAIT_BETWEEN_ACTIONS_MS") + if wait_seconds > 0: + time.sleep(wait_seconds) + + def _managed_chrome_viewport_args() -> list[str]: if _env_bool("BU_BROWSER_NO_VIEWPORT") is True: return [] @@ -2011,6 +2030,7 @@ def _run(request: Dict[str, Any]) -> Dict[str, Any]: assert ns is not None with contextlib.redirect_stdout(stdout), contextlib.redirect_stderr(stdout): exec(compile(code, "", "exec"), ns) + _apply_browser_wait_between_actions() _auto_emit_browser_state(ns, request_id) _emit_browser_identity_events(ns, request_id) return { diff --git a/python/tests/test_worker_package.py b/python/tests/test_worker_package.py index f3a86fe7..c45d1725 100644 --- a/python/tests/test_worker_package.py +++ b/python/tests/test_worker_package.py @@ -31,6 +31,28 @@ def test_worker_run_executes_in_persistent_session_namespace(tmp_path: Path) -> assert second["data"] == 2 +def test_worker_run_applies_browser_wait_between_actions_env( + tmp_path: Path, monkeypatch +) -> None: + sleeps = [] + monkeypatch.setenv("BU_BROWSER_WAIT_BETWEEN_ACTIONS_MS", "125") + monkeypatch.setattr(worker.time, "sleep", lambda seconds: sleeps.append(seconds)) + + response = worker._run( + { + "id": "wait-between", + "session_id": "task-wait-between", + "cwd": 
str(tmp_path), + "artifact_dir": str(tmp_path / "artifacts"), + "code": "result = 'ok'", + } + ) + + assert response["ok"] is True + assert response["data"] == "ok" + assert sleeps == [0.125] + + def test_worker_records_artifacts_and_images(tmp_path: Path) -> None: source = tmp_path / "source.png" source.write_bytes(b"png") From 5e7b7002734c2f71532fe8d2c80a3ba4977e2632 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20=C5=BDuni=C4=8D?= <36313686+gregpr07@users.noreply.github.com> Date: Wed, 3 Jun 2026 16:37:56 +0000 Subject: [PATCH 11/48] Apply BrowserProfile runtime env in Rust core --- crates/browser-use-browser/src/lib.rs | 549 +++++++++++++++++++++++++- 1 file changed, 542 insertions(+), 7 deletions(-) diff --git a/crates/browser-use-browser/src/lib.rs b/crates/browser-use-browser/src/lib.rs index c93e36ea..b29b8542 100644 --- a/crates/browser-use-browser/src/lib.rs +++ b/crates/browser-use-browser/src/lib.rs @@ -8,7 +8,7 @@ use std::collections::{HashMap, HashSet, VecDeque}; use std::fs::{self, File}; use std::io::{BufRead, BufReader, BufWriter, Read, Seek, SeekFrom, Write}; -use std::net::{TcpListener, TcpStream}; +use std::net::{IpAddr, TcpListener, TcpStream}; use std::path::{Path, PathBuf}; use std::process::{Child, ChildStderr, ChildStdout, Command, Stdio}; use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; @@ -189,9 +189,15 @@ struct BrowserSession { last_target_id: Option, last_session_id: Option, last_emitted_browser_payload: Option, + browser_profile_runtime: BrowserProfileRuntimeState, logs: VecDeque, } +#[derive(Default)] +struct BrowserProfileRuntimeState { + applied_setup_keys: HashSet, +} + impl Default for BrowserSession { fn default() -> Self { Self { @@ -213,6 +219,7 @@ impl Default for BrowserSession { last_target_id: None, last_session_id: None, last_emitted_browser_payload: None, + browser_profile_runtime: BrowserProfileRuntimeState::default(), logs: VecDeque::new(), } } @@ -1425,7 +1432,9 @@ fn dispatch_connect(session: &mut 
BrowserSession, argv: &[String]) -> Result ManagedProfile::Temp, Some(path) => ManagedProfile::Path(PathBuf::from(path)), }; - let extra_args = option_values(argv, "--arg"); + let profile = managed_browser_profile_from_env(profile); + let mut extra_args = option_values(argv, "--arg"); + extra_args.extend(managed_browser_extra_args_from_env()); session.connect_managed(headless, profile, extra_args) } Some("remote-cdp") => { @@ -1442,6 +1451,354 @@ fn dispatch_connect(session: &mut BrowserSession, argv: &[String]) -> Result, + params: Value, +} + +fn env_trimmed(name: &str) -> Option { + std::env::var(name) + .ok() + .map(|value| value.trim().to_string()) + .filter(|value| !value.is_empty()) +} + +fn env_bool(name: &str) -> Option { + match env_trimmed(name)?.to_ascii_lowercase().as_str() { + "1" | "true" | "yes" | "on" => Some(true), + "0" | "false" | "no" | "off" => Some(false), + _ => None, + } +} + +fn env_json_string_list(name: &str) -> Vec { + let Some(raw) = env_trimmed(name) else { + return Vec::new(); + }; + let Ok(value) = serde_json::from_str::(&raw) else { + return Vec::new(); + }; + let Some(items) = value.as_array() else { + return Vec::new(); + }; + let mut seen = HashSet::new(); + let mut out = Vec::new(); + for item in items { + let Some(value) = item.as_str().map(str::trim).filter(|value| !value.is_empty()) else { + continue; + }; + if seen.insert(value.to_string()) { + out.push(value.to_string()); + } + } + out +} + +fn expand_browser_profile_path(value: &str) -> PathBuf { + if let Some(rest) = value.strip_prefix("~/") { + if let Some(home) = home_dir() { + return home.join(rest); + } + } + PathBuf::from(value) +} + +fn managed_browser_profile_from_env(fallback: ManagedProfile) -> ManagedProfile { + env_trimmed("BU_MANAGED_BROWSER_PROFILE") + .map(|path| ManagedProfile::Path(expand_browser_profile_path(&path))) + .unwrap_or(fallback) +} + +fn managed_browser_extra_args_from_env() -> Vec { + let Some(raw) = env_trimmed("BU_MANAGED_BROWSER_ARGS") 
else { + return Vec::new(); + }; + let Ok(value) = serde_json::from_str::(&raw) else { + return Vec::new(); + }; + value + .as_array() + .into_iter() + .flatten() + .filter_map(Value::as_str) + .filter(|arg| !arg.is_empty()) + .map(ToOwned::to_owned) + .collect() +} + +fn browser_viewport_launch_args() -> Vec { + if env_bool("BU_BROWSER_NO_VIEWPORT") == Some(true) { + return Vec::new(); + } + let Some(raw) = env_trimmed("BU_BROWSER_VIEWPORT") else { + return Vec::new(); + }; + let Ok(value) = serde_json::from_str::(&raw) else { + return Vec::new(); + }; + let Some(width) = value.get("width").and_then(Value::as_i64) else { + return Vec::new(); + }; + let Some(height) = value.get("height").and_then(Value::as_i64) else { + return Vec::new(); + }; + if width <= 0 || height <= 0 { + return Vec::new(); + } + let mut args = vec![format!("--window-size={width},{height}")]; + if let Some(scale) = value + .get("deviceScaleFactor") + .and_then(Value::as_f64) + .filter(|scale| *scale > 0.0) + { + args.push(format!("--force-device-scale-factor={scale}")); + } + args +} + +fn browser_download_behavior() -> Option<(String, Value)> { + if env_bool("BU_BROWSER_ACCEPT_DOWNLOADS") == Some(false) { + return Some(( + "downloads:false".to_string(), + json!({ "behavior": "deny" }), + )); + } + let raw_path = env_trimmed("BU_BROWSER_DOWNLOADS_PATH")?; + let path = expand_browser_profile_path(&raw_path); + let _ = fs::create_dir_all(&path); + Some(( + format!("downloads:true:{}", path.display()), + json!({ + "behavior": "allow", + "downloadPath": path.display().to_string(), + "eventsEnabled": true, + }), + )) +} + +fn browser_storage_state_raw() -> Option { + env_trimmed("BU_BROWSER_STORAGE_STATE") +} + +fn browser_storage_state() -> Option { + serde_json::from_str::(&browser_storage_state_raw()?).ok() +} + +fn browser_storage_cookies(storage_state: &Value) -> Vec { + storage_state + .get("cookies") + .and_then(Value::as_array) + .into_iter() + .flatten() + .filter_map(cookie_to_cdp_param) 
+ .collect() +} + +fn browser_storage_init_scripts(storage_state: &Value) -> Vec { + let Some(origins) = storage_state.get("origins").and_then(Value::as_array) else { + return Vec::new(); + }; + let mut scripts = Vec::new(); + for origin_state in origins { + let Some(origin_state) = origin_state.as_object() else { + continue; + }; + let origin = origin_state.get("origin").and_then(Value::as_str); + let mut statements = Vec::new(); + for storage_name in ["localStorage", "sessionStorage"] { + let Some(items) = origin_state.get(storage_name).and_then(Value::as_array) else { + continue; + }; + for item in items { + let Some(name) = item.get("name").and_then(Value::as_str) else { + continue; + }; + let Some(value) = item.get("value").and_then(Value::as_str) else { + continue; + }; + statements.push(format!( + "window.{storage_name}.setItem({}, {});", + serde_json::to_string(name).unwrap_or_else(|_| "\"\"".to_string()), + serde_json::to_string(value).unwrap_or_else(|_| "\"\"".to_string()) + )); + } + } + if statements.is_empty() { + continue; + } + let body = statements.join("\n "); + if let Some(origin) = origin.filter(|origin| !origin.trim().is_empty()) { + scripts.push(format!( + "try {{\n if (window.location.origin === {}) {{\n {body}\n }}\n}} catch (error) {{}}", + serde_json::to_string(origin).unwrap_or_else(|_| "\"\"".to_string()) + )); + } else { + scripts.push(format!("try {{\n {body}\n}} catch (error) {{}}")); + } + } + scripts +} + +fn browser_profile_setup_calls(session_id: Option<&str>) -> Vec { + let mut calls = Vec::new(); + + let permissions = env_json_string_list("BU_BROWSER_PERMISSIONS"); + if !permissions.is_empty() { + calls.push(BrowserProfileSetupCall { + key: format!("permissions:{}", permissions.join("\0")), + method: "Browser.grantPermissions", + session_id: None, + params: json!({ "permissions": permissions }), + }); + } + + if let Some((key, params)) = browser_download_behavior() { + calls.push(BrowserProfileSetupCall { + key, + method: 
"Browser.setDownloadBehavior", + session_id: None, + params, + }); + } + + if let Some(storage_state) = browser_storage_state() { + if let Some(raw) = browser_storage_state_raw() { + let cookies = browser_storage_cookies(&storage_state); + if !cookies.is_empty() { + calls.push(BrowserProfileSetupCall { + key: format!("storage-cookies:{raw}"), + method: "Storage.setCookies", + session_id: None, + params: json!({ "cookies": cookies }), + }); + } + if let Some(session_id) = session_id { + for (index, source) in browser_storage_init_scripts(&storage_state) + .into_iter() + .enumerate() + { + calls.push(BrowserProfileSetupCall { + key: format!("storage-script:{session_id}:{index}:{raw}"), + method: "Page.addScriptToEvaluateOnNewDocument", + session_id: Some(session_id.to_string()), + params: json!({ "source": source, "runImmediately": true }), + }); + } + } + } + } + + if let (Some(session_id), Some(user_agent)) = (session_id, env_trimmed("BU_BROWSER_USER_AGENT")) + { + calls.push(BrowserProfileSetupCall { + key: format!("user-agent:{session_id}:{user_agent}"), + method: "Network.setUserAgentOverride", + session_id: Some(session_id.to_string()), + params: json!({ "userAgent": user_agent }), + }); + } + + calls +} + +fn is_root_domain_pattern(pattern: &str) -> bool { + !pattern.contains('*') && !pattern.contains("://") && pattern.matches('.').count() == 1 +} + +fn wildcard_match(pattern: &str, value: &str) -> bool { + if !pattern.contains('*') { + return pattern == value; + } + let mut remainder = value; + let mut first = true; + for part in pattern.split('*') { + if part.is_empty() { + continue; + } + if first && !pattern.starts_with('*') { + let Some(stripped) = remainder.strip_prefix(part) else { + return false; + }; + remainder = stripped; + } else if let Some(index) = remainder.find(part) { + remainder = &remainder[index + part.len()..]; + } else { + return false; + } + first = false; + } + pattern.ends_with('*') || remainder.is_empty() +} + +fn 
browser_domain_pattern_matches(url: &str, host: &str, scheme: &str, pattern: &str) -> bool { + let pattern = pattern.trim(); + if pattern.is_empty() { + return false; + } + let host_lower = host.to_ascii_lowercase(); + let pattern_lower = pattern.to_ascii_lowercase(); + if let Some(domain) = pattern_lower.strip_prefix("*.") { + return matches!(scheme, "http" | "https") + && (host_lower == domain || host_lower.ends_with(&format!(".{domain}"))); + } + if pattern_lower.ends_with("/*") { + return url + .to_ascii_lowercase() + .starts_with(pattern_lower.trim_end_matches('*')); + } + if pattern_lower.contains('*') { + let value = if pattern_lower.contains("://") { + format!("{scheme}://{host_lower}") + } else { + host_lower.clone() + }; + return wildcard_match(&pattern_lower, &value); + } + if pattern_lower.contains("://") { + return url.to_ascii_lowercase().starts_with(&pattern_lower); + } + host_lower == pattern_lower + || (is_root_domain_pattern(&pattern_lower) && host_lower == format!("www.{pattern_lower}")) +} + +fn browser_profile_url_allowed(raw_url: &str) -> bool { + if matches!( + raw_url, + "about:blank" | "chrome://new-tab-page/" | "chrome://new-tab-page" | "chrome://newtab/" + ) { + return true; + } + let Ok(url) = reqwest::Url::parse(raw_url) else { + return false; + }; + if matches!(url.scheme(), "data" | "blob") { + return true; + } + let Some(host) = url.host_str() else { + return false; + }; + if env_bool("BU_BROWSER_BLOCK_IP_ADDRESSES") == Some(true) && host.parse::().is_ok() { + return false; + } + + let allowed_domains = env_json_string_list("BU_BROWSER_ALLOWED_DOMAINS"); + let prohibited_domains = env_json_string_list("BU_BROWSER_PROHIBITED_DOMAINS"); + if !allowed_domains.is_empty() { + return allowed_domains + .iter() + .any(|pattern| browser_domain_pattern_matches(raw_url, host, url.scheme(), pattern)); + } + if !prohibited_domains.is_empty() { + return !prohibited_domains + .iter() + .any(|pattern| browser_domain_pattern_matches(raw_url, host, 
url.scheme(), pattern)); + } + true +} + fn dispatch_local( _session: &mut BrowserSession, argv: &[String], @@ -2402,6 +2759,7 @@ impl BrowserSession { "browser is not connected. Run `browser status --json` or `browser connect ...`." ); } + self.prepare_browser_profile_runtime(method, session_id, ¶ms)?; browser_session_prepare_cdp_visuals(self, method, session_id, ¶ms); let Some(connection) = self.connection.as_mut() else { bail!( @@ -2519,6 +2877,47 @@ impl BrowserSession { } } + fn prepare_browser_profile_runtime( + &mut self, + method: &str, + session_id: Option<&str>, + params: &Value, + ) -> Result<()> { + if method == "Page.navigate" { + if let Some(url) = params.get("url").and_then(Value::as_str) { + if !browser_profile_url_allowed(url) { + bail!("BrowserProfile domain constraints blocked navigation to {url}"); + } + } + } + + let setup_calls = browser_profile_setup_calls(session_id); + if setup_calls.is_empty() { + return Ok(()); + } + let Some(connection) = self.connection.as_mut() else { + return Ok(()); + }; + for call in setup_calls { + if self + .browser_profile_runtime + .applied_setup_keys + .contains(&call.key) + { + continue; + } + if connection + .call(&call.method, call.session_id.as_deref(), call.params) + .is_ok() + { + self.browser_profile_runtime + .applied_setup_keys + .insert(call.key); + } + } + Ok(()) + } + fn attach_first_page(&mut self) -> Result<()> { let targets = self.targets()?; let target_id = targets @@ -3554,15 +3953,19 @@ fn launch_managed_browser(launch: ManagedLaunch) -> Result<(ManagedBrowser, Stri "--no-first-run".to_string(), "--no-default-browser-check".to_string(), ]; + let viewport_args = browser_viewport_launch_args(); if launch.headless { args.push("--headless=new".to_string()); - args.push("--window-size=1280,720".to_string()); + if viewport_args.is_empty() && env_bool("BU_BROWSER_NO_VIEWPORT") != Some(true) { + args.push("--window-size=1280,720".to_string()); + } } else { - args.extend([ - "--new-window".to_string(), 
- "--window-size=1512,900".to_string(), - ]); + args.push("--new-window".to_string()); + if viewport_args.is_empty() { + args.push("--window-size=1512,900".to_string()); + } } + args.extend(viewport_args); args.extend(launch.extra_args.clone()); args.push("about:blank".to_string()); let mut child = Command::new(&launch.executable) @@ -7376,6 +7779,138 @@ print("browser profile wait timing ok") assert!(output.text.contains("browser profile wait timing ok")); } + #[test] + fn browser_profile_runtime_setup_calls_read_env() { + let temp = tempfile::tempdir().unwrap(); + let downloads = temp.path().join("downloads"); + let downloads_text = downloads.display().to_string(); + let storage_state = json!({ + "cookies": [{ + "name": "sid", + "value": "secret", + "domain": ".example.com", + "path": "/" + }], + "origins": [{ + "origin": "https://example.com", + "localStorage": [{"name": "theme", "value": "dark"}], + "sessionStorage": [{"name": "step", "value": "one"}] + }] + }) + .to_string(); + let _env = EnvRestore::set(&[ + ( + "BU_BROWSER_PERMISSIONS", + r#"["clipboardReadWrite","notifications","clipboardReadWrite",3]"#, + ), + ("BU_BROWSER_ACCEPT_DOWNLOADS", "true"), + ("BU_BROWSER_DOWNLOADS_PATH", &downloads_text), + ("BU_BROWSER_STORAGE_STATE", &storage_state), + ("BU_BROWSER_USER_AGENT", "BrowserUseRuntime/6.0"), + ]); + + let calls = browser_profile_setup_calls(Some("session-1")); + let methods = calls.iter().map(|call| call.method).collect::>(); + + assert_eq!( + methods, + vec![ + "Browser.grantPermissions", + "Browser.setDownloadBehavior", + "Storage.setCookies", + "Page.addScriptToEvaluateOnNewDocument", + "Network.setUserAgentOverride", + ] + ); + assert_eq!( + calls[0].params["permissions"], + json!(["clipboardReadWrite", "notifications"]) + ); + assert_eq!(calls[1].params["behavior"], "allow"); + assert_eq!(calls[1].params["downloadPath"], downloads_text); + assert!(downloads.exists()); + assert_eq!(calls[2].params["cookies"][0]["name"], "sid"); + 
assert!(calls[3] + .params + .get("source") + .and_then(Value::as_str) + .is_some_and(|source| source.contains("window.localStorage.setItem(\"theme\", \"dark\");") + && source.contains("window.sessionStorage.setItem(\"step\", \"one\");"))); + assert_eq!(calls[4].session_id.as_deref(), Some("session-1")); + assert_eq!(calls[4].params["userAgent"], "BrowserUseRuntime/6.0"); + } + + #[test] + fn managed_browser_launch_reads_browser_profile_env() { + let temp = tempfile::tempdir().unwrap(); + let profile = temp.path().join("profile"); + let profile_text = profile.display().to_string(); + let _env = EnvRestore::set(&[ + ("BU_MANAGED_BROWSER_PROFILE", &profile_text), + ( + "BU_MANAGED_BROWSER_ARGS", + r#"["--proxy-server=http://proxy.example:8080","--user-agent=BrowserUseManaged/1.0",3,""]"#, + ), + ( + "BU_BROWSER_VIEWPORT", + r#"{"width":960,"height":720,"deviceScaleFactor":2}"#, + ), + ("BU_BROWSER_NO_VIEWPORT", "false"), + ]); + + let ManagedProfile::Path(resolved_profile) = + managed_browser_profile_from_env(ManagedProfile::Temp) + else { + panic!("expected managed profile path from env"); + }; + assert_eq!(resolved_profile, profile); + assert_eq!( + managed_browser_extra_args_from_env(), + vec![ + "--proxy-server=http://proxy.example:8080".to_string(), + "--user-agent=BrowserUseManaged/1.0".to_string(), + ] + ); + assert_eq!( + browser_viewport_launch_args(), + vec![ + "--window-size=960,720".to_string(), + "--force-device-scale-factor=2".to_string(), + ] + ); + } + + #[test] + fn browser_profile_runtime_domain_constraints_read_env() { + { + let _env = EnvRestore::set(&[ + ( + "BU_BROWSER_ALLOWED_DOMAINS", + r#"["example.com","*.browser-use.com"]"#, + ), + ("BU_BROWSER_PROHIBITED_DOMAINS", r#"["*.tracking.example"]"#), + ("BU_BROWSER_BLOCK_IP_ADDRESSES", "true"), + ]); + + assert!(browser_profile_url_allowed("https://www.example.com/path")); + assert!(browser_profile_url_allowed("https://docs.browser-use.com/")); + 
assert!(browser_profile_url_allowed("about:blank")); + assert!(!browser_profile_url_allowed("https://iana.org/")); + assert!(!browser_profile_url_allowed("http://127.0.0.1/")); + } + + { + let _env = EnvRestore::set(&[ + ("BU_BROWSER_ALLOWED_DOMAINS", "[]"), + ("BU_BROWSER_PROHIBITED_DOMAINS", r#"["*.tracking.example"]"#), + ("BU_BROWSER_BLOCK_IP_ADDRESSES", "false"), + ]); + + assert!(!browser_profile_url_allowed("https://ads.tracking.example/")); + assert!(browser_profile_url_allowed("https://example.com/")); + } + } + #[test] fn browser_script_js_return_detection_ignores_nested_callbacks() { let temp = tempfile::tempdir().unwrap(); From d0d53506cb0ab800306b9ec0758d59075b588d53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20=C5=BDuni=C4=8D?= <36313686+gregpr07@users.noreply.github.com> Date: Wed, 3 Jun 2026 16:48:37 +0000 Subject: [PATCH 12/48] Make BrowserProfile constraints passive without env --- crates/browser-use-browser/src/lib.rs | 53 +++++++++++++++++++++------ 1 file changed, 41 insertions(+), 12 deletions(-) diff --git a/crates/browser-use-browser/src/lib.rs b/crates/browser-use-browser/src/lib.rs index b29b8542..15c53b58 100644 --- a/crates/browser-use-browser/src/lib.rs +++ b/crates/browser-use-browser/src/lib.rs @@ -1487,7 +1487,11 @@ fn env_json_string_list(name: &str) -> Vec { let mut seen = HashSet::new(); let mut out = Vec::new(); for item in items { - let Some(value) = item.as_str().map(str::trim).filter(|value| !value.is_empty()) else { + let Some(value) = item + .as_str() + .map(str::trim) + .filter(|value| !value.is_empty()) + else { continue; }; if seen.insert(value.to_string()) { @@ -1561,10 +1565,7 @@ fn browser_viewport_launch_args() -> Vec { fn browser_download_behavior() -> Option<(String, Value)> { if env_bool("BU_BROWSER_ACCEPT_DOWNLOADS") == Some(false) { - return Some(( - "downloads:false".to_string(), - json!({ "behavior": "deny" }), - )); + return Some(("downloads:false".to_string(), json!({ "behavior": "deny" }))); } let 
raw_path = env_trimmed("BU_BROWSER_DOWNLOADS_PATH")?; let path = expand_browser_profile_path(&raw_path); @@ -1771,21 +1772,24 @@ fn browser_profile_url_allowed(raw_url: &str) -> bool { ) { return true; } + let block_ip_addresses = env_bool("BU_BROWSER_BLOCK_IP_ADDRESSES") == Some(true); + let allowed_domains = env_json_string_list("BU_BROWSER_ALLOWED_DOMAINS"); + let prohibited_domains = env_json_string_list("BU_BROWSER_PROHIBITED_DOMAINS"); + let constraints_active = + block_ip_addresses || !allowed_domains.is_empty() || !prohibited_domains.is_empty(); let Ok(url) = reqwest::Url::parse(raw_url) else { - return false; + return !constraints_active; }; if matches!(url.scheme(), "data" | "blob") { return true; } let Some(host) = url.host_str() else { - return false; + return !constraints_active; }; - if env_bool("BU_BROWSER_BLOCK_IP_ADDRESSES") == Some(true) && host.parse::().is_ok() { + if block_ip_addresses && host.parse::().is_ok() { return false; } - let allowed_domains = env_json_string_list("BU_BROWSER_ALLOWED_DOMAINS"); - let prohibited_domains = env_json_string_list("BU_BROWSER_PROHIBITED_DOMAINS"); if !allowed_domains.is_empty() { return allowed_domains .iter() @@ -7834,7 +7838,8 @@ print("browser profile wait timing ok") .params .get("source") .and_then(Value::as_str) - .is_some_and(|source| source.contains("window.localStorage.setItem(\"theme\", \"dark\");") + .is_some_and(|source| source + .contains("window.localStorage.setItem(\"theme\", \"dark\");") && source.contains("window.sessionStorage.setItem(\"step\", \"one\");"))); assert_eq!(calls[4].session_id.as_deref(), Some("session-1")); assert_eq!(calls[4].params["userAgent"], "BrowserUseRuntime/6.0"); @@ -7906,11 +7911,35 @@ print("browser profile wait timing ok") ("BU_BROWSER_BLOCK_IP_ADDRESSES", "false"), ]); - assert!(!browser_profile_url_allowed("https://ads.tracking.example/")); + assert!(!browser_profile_url_allowed( + "https://ads.tracking.example/" + )); 
assert!(browser_profile_url_allowed("https://example.com/")); } } + #[test] + fn browser_profile_domain_constraints_are_passive_without_env() { + let _env = EnvRestore::unset(&[ + "BU_BROWSER_ALLOWED_DOMAINS", + "BU_BROWSER_PROHIBITED_DOMAINS", + "BU_BROWSER_BLOCK_IP_ADDRESSES", + ]); + + assert!(browser_profile_url_allowed("")); + assert!(browser_profile_url_allowed("/relative-path")); + + drop(_env); + let _env = EnvRestore::set(&[ + ("BU_BROWSER_ALLOWED_DOMAINS", r#"["example.com"]"#), + ("BU_BROWSER_PROHIBITED_DOMAINS", "[]"), + ("BU_BROWSER_BLOCK_IP_ADDRESSES", "false"), + ]); + + assert!(!browser_profile_url_allowed("")); + assert!(!browser_profile_url_allowed("/relative-path")); + } + #[test] fn browser_script_js_return_detection_ignores_nested_callbacks() { let temp = tempfile::tempdir().unwrap(); From 255145b56a583d507ea834ccaab00064ef7d30cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20=C5=BDuni=C4=8D?= <36313686+gregpr07@users.noreply.github.com> Date: Wed, 3 Jun 2026 18:09:45 +0000 Subject: [PATCH 13/48] Flatten Anthropic error tool results --- .../src/protocols/anthropic_messages.rs | 102 +++++++++++++++++- 1 file changed, 99 insertions(+), 3 deletions(-) diff --git a/crates/browser-use-llm/src/protocols/anthropic_messages.rs b/crates/browser-use-llm/src/protocols/anthropic_messages.rs index a3ba582a..c560db46 100644 --- a/crates/browser-use-llm/src/protocols/anthropic_messages.rs +++ b/crates/browser-use-llm/src/protocols/anthropic_messages.rs @@ -159,15 +159,24 @@ fn build_content_block(part: &ContentPart) -> Result { content, is_error, } => { - let blocks: Result, LlmError> = - content.iter().map(build_content_block).collect(); + let blocks = if *is_error { + vec![json!({ + "type": "text", + "text": flatten_error_tool_result_content(content), + })] + } else { + content + .iter() + .map(build_content_block) + .collect::, LlmError>>()? 
+ }; let mut block = Map::new(); block.insert("type".to_string(), Value::String("tool_result".to_string())); block.insert( "tool_use_id".to_string(), Value::String(tool_call_id.clone()), ); - block.insert("content".to_string(), Value::Array(blocks?)); + block.insert("content".to_string(), Value::Array(blocks)); if *is_error { block.insert("is_error".to_string(), Value::Bool(true)); } @@ -192,6 +201,49 @@ fn build_content_block(part: &ContentPart) -> Result { } } +fn flatten_error_tool_result_content(content: &[ContentPart]) -> String { + let mut chunks = Vec::new(); + collect_error_tool_result_text(content, &mut chunks); + if chunks.is_empty() { + return "Tool call failed.".to_string(); + } + chunks.join("\n") +} + +fn collect_error_tool_result_text(content: &[ContentPart], chunks: &mut Vec) { + for part in content { + match part { + ContentPart::Text { text } | ContentPart::Reasoning { text, .. } => { + if !text.trim().is_empty() { + chunks.push(text.clone()); + } + } + ContentPart::Media { + mime_type, + data, + url, + .. + } => { + let pointer = url + .as_deref() + .map(|url| format!(" at {url}")) + .unwrap_or_else(|| { + data.as_ref() + .map(|_| " inline".to_string()) + .unwrap_or_default() + }); + chunks.push(format!("[{mime_type} media{pointer}]")); + } + ContentPart::ToolCall { name, .. } => { + chunks.push(format!("[nested tool call: {name}]")); + } + ContentPart::ToolResult { content, .. } => { + collect_error_tool_result_text(content, chunks); + } + } + } +} + /// Extract a thinking signature stored under `provider_metadata` if present. 
fn reasoning_signature_from_metadata(part: &ContentPart) -> Option { if let ContentPart::Reasoning { @@ -704,6 +756,50 @@ mod tests { ); } + #[test] + fn build_body_flattens_error_tool_result_content_to_text_blocks() { + let mut req = LlmRequest::new("m", "anthropic"); + req.messages.push(Message::new( + MessageRole::Tool, + vec![ContentPart::ToolResult { + tool_call_id: "toolu_error".into(), + content: vec![ + ContentPart::text("browser script failed"), + ContentPart::Media { + mime_type: "image/png".into(), + data: Some("base64-image".into()), + url: None, + detail: None, + }, + ], + is_error: true, + }], + )); + + let body = AnthropicMessagesProtocol::new().build_body(&req).unwrap(); + assert_eq!( + body["messages"], + json!([ + { + "role": "user", + "content": [ + { + "type": "tool_result", + "tool_use_id": "toolu_error", + "is_error": true, + "content": [ + { + "type": "text", + "text": "browser script failed\n[image/png media inline]" + } + ] + } + ] + } + ]) + ); + } + #[test] fn build_body_maps_reasoning_signature_to_thinking_block() { let mut req = LlmRequest::new("m", "anthropic"); From 5b72bda50acfcea05bd807996c92ae508e4194d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20=C5=BDuni=C4=8D?= <36313686+gregpr07@users.noreply.github.com> Date: Thu, 4 Jun 2026 02:13:47 +0000 Subject: [PATCH 14/48] Honor max turns in terminal sessions --- .../browser-use-agent/src/entrypoint/mod.rs | 19 ++++++- .../browser-use-agent/src/turn/loop_driver.rs | 56 +++++++++++++++---- .../browser-use-agent/src/turn/loop_tests.rs | 36 +++++++++++- 3 files changed, 96 insertions(+), 15 deletions(-) diff --git a/crates/browser-use-agent/src/entrypoint/mod.rs b/crates/browser-use-agent/src/entrypoint/mod.rs index a90b4a68..2ca83c34 100644 --- a/crates/browser-use-agent/src/entrypoint/mod.rs +++ b/crates/browser-use-agent/src/entrypoint/mod.rs @@ -2034,6 +2034,7 @@ async fn drive_run( developer_instructions: Option, previous_model_compaction: Option, cancel: CancellationToken, + 
max_turns: Option, ) -> Result, AgentError> { let state = StoreTurnState::new(Arc::clone(&store), session_id.clone(), recorded); // Enable REAL token accounting + model-based compaction when a sampler is @@ -2078,9 +2079,18 @@ async fn drive_run( let turn_loop = TurnLoop::new(state, driver, observer); let cancel_monitor = spawn_store_cancel_monitor(Arc::clone(&store), session_id.clone(), cancel.clone()); - let result = turn_loop - .run(ctx, turn_has_fresh_input, cancel.clone()) - .await; + let result = match max_turns { + Some(max_turns) => { + turn_loop + .run_with_max_turns(ctx, turn_has_fresh_input, cancel.clone(), max_turns) + .await + } + None => { + turn_loop + .run(ctx, turn_has_fresh_input, cancel.clone()) + .await + } + }; cancel_monitor.abort(); if result.is_ok() { ensure_fallback_capture_recording(&store, session_id.as_str()); @@ -2340,6 +2350,7 @@ async fn run_session_once_with_config_with_cancel( config.options.developer_instructions.clone(), previous_model_compaction, cancel.clone(), + Some(config.options.max_turns), ) .await?; } @@ -2364,6 +2375,7 @@ async fn run_session_once_with_config_with_cancel( None, None, cancel.clone(), + Some(config.options.max_turns), ) .await?; } @@ -3911,6 +3923,7 @@ mod tests { None, None, CancellationToken::new(), + None, ), ) .await diff --git a/crates/browser-use-agent/src/turn/loop_driver.rs b/crates/browser-use-agent/src/turn/loop_driver.rs index a4966368..8a8ba1ac 100644 --- a/crates/browser-use-agent/src/turn/loop_driver.rs +++ b/crates/browser-use-agent/src/turn/loop_driver.rs @@ -118,6 +118,33 @@ impl TurnLoop { ctx: TurnCtx, turn_has_fresh_input: bool, cancel: CancellationToken, + ) -> Result, crate::AgentError> { + self.run_inner(ctx, turn_has_fresh_input, cancel, None) + .await + } + + /// Run the driver with an optional sampling-round limit. + /// + /// Browser Use's Python API exposes this as `Agent.run(max_steps=...)`. 
+ /// The default [`run`](Self::run) remains unbounded for Codex parity, while + /// terminal/browser-use bridge callers can opt into the cap. + pub async fn run_with_max_turns( + &self, + ctx: TurnCtx, + turn_has_fresh_input: bool, + cancel: CancellationToken, + max_turns: usize, + ) -> Result, crate::AgentError> { + self.run_inner(ctx, turn_has_fresh_input, cancel, Some(max_turns.max(1))) + .await + } + + async fn run_inner( + &self, + ctx: TurnCtx, + turn_has_fresh_input: bool, + cancel: CancellationToken, + max_turns: Option, ) -> Result, crate::AgentError> { let turn_id = ctx.session_id.clone(); self.observer.on_lifecycle(TurnLifecycleEvent::TurnStarted { @@ -128,6 +155,7 @@ impl TurnLoop { // drained; with no fresh input we may drain immediately. let mut can_drain = decision::initial_can_drain(turn_has_fresh_input); let mut last_agent_message: Option = None; + let mut turns_run = 0usize; // Unbounded (`turn.rs:214`): NO max-turns counter. The only exits are // Complete, cancellation, or a hard error. @@ -166,6 +194,7 @@ impl TurnLoop { } Err(other) => return Err(other), }; + turns_run += 1; // Carry the latest assistant text forward (codex keeps the last // non-empty agent message as the turn result; `turn.rs:340`). @@ -183,6 +212,23 @@ impl TurnLoop { // ---- 4. act on the step (codex `turn.rs:250-355`) ---- match step { + LoopStep::Complete => { + // Terminal: no follow-up needed and no compaction. Record the + // final agent message and break (`turn.rs:340-355`). + self.observer + .on_lifecycle(TurnLifecycleEvent::TurnComplete { + turn_id, + last_agent_message: last_agent_message.clone(), + }); + return Ok(last_agent_message); + } + _ if max_turns.is_some_and(|limit| turns_run >= limit) => { + self.observer.on_lifecycle(TurnLifecycleEvent::TurnAborted { + turn_id, + reason: TurnAbortReason::Interrupted, + }); + return Ok(last_agent_message); + } LoopStep::CompactThenContinue { can_drain_next } => { // Compact, then continue. 
The compaction BODY is a stub hook // (real model-based compaction WP pending); the CONTROL FLOW @@ -197,16 +243,6 @@ impl TurnLoop { // gate is always open (`turn.rs:250-255`). can_drain = true; } - LoopStep::Complete => { - // Terminal: no follow-up needed and no compaction. Record the - // final agent message and break (`turn.rs:340-355`). - self.observer - .on_lifecycle(TurnLifecycleEvent::TurnComplete { - turn_id, - last_agent_message: last_agent_message.clone(), - }); - return Ok(last_agent_message); - } } } } diff --git a/crates/browser-use-agent/src/turn/loop_tests.rs b/crates/browser-use-agent/src/turn/loop_tests.rs index 5c698db2..de02413e 100644 --- a/crates/browser-use-agent/src/turn/loop_tests.rs +++ b/crates/browser-use-agent/src/turn/loop_tests.rs @@ -28,7 +28,7 @@ use tokio_util::sync::CancellationToken; use crate::decision::{SamplingOutcome, TokenStatus}; use crate::events::TurnCtx; -use crate::task::TurnLifecycleEvent; +use crate::task::{TurnAbortReason, TurnLifecycleEvent}; use crate::turn::{SamplingDriver, TurnLoop, TurnObserver, TurnState}; use crate::AgentError; @@ -712,7 +712,39 @@ async fn loop_is_unbounded_fifty_iterations_complete() { assert_eq!(observer.kinds(), vec!["started", "complete"]); } -// ---- (7) a hard (non-abort) sampling error propagates out of the loop ------ +// ---- (7) bounded run stops after max_turns ------------------------------- + +#[tokio::test] +async fn bounded_loop_aborts_after_max_turns() { + let sampler = ScriptedSamplingDriver::new(vec![ + SamplingScript::Ok(follow_up("step 0")), + SamplingScript::Ok(follow_up("step 1")), + SamplingScript::Ok(complete("should not run")), + ]); + let requests = sampler.requests_handle(); + let state = InMemoryTurnState::new(Vec::new(), token_status(false)); + let observer = RecordingObserver::new(); + + let turn = TurnLoop::new(state, sampler, observer.clone()); + let out = turn + .run_with_max_turns(ctx(), false, CancellationToken::new(), 2) + .await + .expect("bounded loop 
should stop gracefully"); + + assert_eq!(requests.load(Ordering::SeqCst), 2); + assert_eq!(out.as_deref(), Some("step 1")); + assert_eq!(observer.kinds(), vec!["started", "aborted"]); + let events = observer.events.lock().unwrap(); + assert!(matches!( + events.last(), + Some(TurnLifecycleEvent::TurnAborted { + reason: TurnAbortReason::Interrupted, + .. + }) + )); +} + +// ---- (8) a hard (non-abort) sampling error propagates out of the loop ------ #[tokio::test] async fn hard_sampling_error_propagates_and_does_not_complete() { From 2f9eab807237c2a79a0deffc5d1a605f846de55a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20=C5=BDuni=C4=8D?= <36313686+gregpr07@users.noreply.github.com> Date: Thu, 4 Jun 2026 03:25:52 +0000 Subject: [PATCH 15/48] Support Remote CDP browser mode --- .../src/tools/handlers/browser.rs | 145 +++++++++++++----- .../src/tools/handlers/browser_tests.rs | 47 ++++++ crates/browser-use-cli/src/main.rs | 31 +++- 3 files changed, 181 insertions(+), 42 deletions(-) diff --git a/crates/browser-use-agent/src/tools/handlers/browser.rs b/crates/browser-use-agent/src/tools/handlers/browser.rs index f0f6c8a1..c6794f7b 100644 --- a/crates/browser-use-agent/src/tools/handlers/browser.rs +++ b/crates/browser-use-agent/src/tools/handlers/browser.rs @@ -392,6 +392,7 @@ impl RealBackend { .filter(|mode| !mode.is_empty()) .map(|mode| match mode { "cloud" | "browser-use-cloud" | "remote-cloud" => "cloud", + "remote-cdp" | "cdp" => "remote-cdp", "headless" | "headless-chromium" | "managed-headless" => "managed-headless", other => other, }) @@ -411,6 +412,8 @@ impl RealBackend { | ["remote", "start", ..] | ["browser", "remote", "stop", ..] | ["remote", "stop", ..] + | ["browser", "connect", "remote-cdp", ..] + | ["connect", "remote-cdp", ..] 
) } @@ -467,6 +470,20 @@ impl RealBackend { } "browser connect managed --headless" } + "remote-cdp" => { + if connected && current_mode == Some("remote-cdp") { + return Ok(events); + } + let desired_command = remote_cdp_connect_command()?; + let mut started = browser_use_browser::run_browser_command( + session_id, + cwd, + artifact_dir, + &desired_command, + )?; + events.append(&mut started.events); + return Ok(events); + } _ => return Ok(events), }; let mut started = browser_use_browser::run_browser_command( @@ -615,10 +632,12 @@ fn dispatch_browser_preference( selected_browser_mode: Option<&str>, ) -> anyhow::Result { match args.get(1).map(String::as_str) { - None | Some("--json") | Some("show") => browser_preference_json(store), + None | Some("--json") | Some("show") => { + browser_preference_json(store, selected_browser_mode) + } Some("use") => { let mode = args.get(2).map(String::as_str).ok_or_else(|| { - anyhow!("browser preference use requires ") + anyhow!("browser preference use requires ") })?; let normalized = normalize_browser_preference_mode(mode)?; enforce_selected_browser_mode(selected_browser_mode, normalized)?; @@ -626,7 +645,7 @@ fn dispatch_browser_preference( store.set_setting("browser", browser_display_name(normalized))?; Ok(json!({ "status": "ok", - "preference": browser_preference_json(store)?, + "preference": browser_preference_json(store, selected_browser_mode)?, "next_step": "browser connect", })) } @@ -644,7 +663,7 @@ fn dispatch_browser_profile_preference( selected_browser_mode: Option<&str>, ) -> anyhow::Result { match args.get(1).map(String::as_str) { - Some("current") => browser_preference_json(store), + Some("current") => browser_preference_json(store, selected_browser_mode), Some("use") => { enforce_selected_browser_mode(selected_browser_mode, "local")?; let profile_id = args @@ -751,7 +770,7 @@ fn dispatch_browser_profile_preference( })) } Some(other) => bail!("unknown browser profile command: {other}"), - None => 
browser_preference_json(store), + None => browser_preference_json(store, selected_browser_mode), } } @@ -772,10 +791,7 @@ fn resolve_browser_command_for_selected_mode( .transpose()? .flatten() }; - Ok(browser_connect_command_for_mode( - effective_mode, - profile_id.as_deref(), - )) + browser_connect_command_for_mode(effective_mode, profile_id.as_deref()) } else { enforce_browser_command_matches_selected_mode(&args, selected_browser_mode)?; Ok(cmd.to_string()) @@ -812,9 +828,41 @@ fn preferred_browser_mode(store: Option<&Store>) -> anyhow::Result<&'static str> normalize_browser_preference_mode(&mode) } -fn browser_connect_command_for_mode(mode: &str, profile_id: Option<&str>) -> String { +fn remote_cdp_connect_command() -> anyhow::Result { + if let Some(ws) = env_trimmed("BU_CDP_WS") { + return Ok(remote_cdp_connect_command_for_endpoint(&ws)); + } + if let Some(url) = env_trimmed("BU_CDP_URL") { + return Ok(remote_cdp_connect_command_for_endpoint(&url)); + } + bail!("browser mode is locked to Remote CDP, but BU_CDP_URL or BU_CDP_WS is not set") +} + +fn remote_cdp_connect_command_for_endpoint(endpoint: &str) -> String { + let flag = if endpoint.starts_with("ws://") || endpoint.starts_with("wss://") { + "--ws" + } else { + "--url" + }; + format!( + "browser connect remote-cdp {flag} {}", + shell_quote_browser_arg(endpoint) + ) +} + +fn env_trimmed(name: &str) -> Option { + std::env::var(name) + .ok() + .map(|value| value.trim().to_string()) + .filter(|value| !value.is_empty()) +} + +fn browser_connect_command_for_mode( + mode: &str, + profile_id: Option<&str>, +) -> anyhow::Result { match normalize_browser_preference_mode(mode).unwrap_or("local") { - "cloud" => profile_id.filter(|value| !value.is_empty()).map_or_else( + "cloud" => Ok(profile_id.filter(|value| !value.is_empty()).map_or_else( || "browser remote start".to_string(), |profile_id| { format!( @@ -822,10 +870,11 @@ fn browser_connect_command_for_mode(mode: &str, profile_id: Option<&str>) -> Str 
shell_quote_browser_arg(profile_id) ) }, - ), - "managed-headless" => "browser connect managed --headless".to_string(), - "managed-headed" => "browser connect managed --headed".to_string(), - _ => "browser connect local".to_string(), + )), + "managed-headless" => Ok("browser connect managed --headless".to_string()), + "managed-headed" => Ok("browser connect managed --headed".to_string()), + "remote-cdp" => remote_cdp_connect_command(), + _ => Ok("browser connect local".to_string()), } } @@ -879,10 +928,7 @@ fn enforce_browser_command_matches_selected_mode( }; enforce_selected_browser_mode(Some(selected_mode), requested_mode) } - Some("remote-cdp") => bail!( - "browser mode is locked to {} for this run; remote CDP endpoints are not selectable from this terminal browser mode", - browser_display_name(selected_mode), - ), + Some("remote-cdp") => enforce_selected_browser_mode(Some(selected_mode), "remote-cdp"), Some(other) => bail!("unknown browser connect mode: {other}"), }, "local" => enforce_selected_browser_mode(Some(selected_mode), "local"), @@ -906,18 +952,34 @@ fn has_browser_arg(args: &[String], flag: &str) -> bool { args.iter().any(|arg| arg == flag) } -fn browser_preference_json(store: &Store) -> anyhow::Result { - let mode = store - .get_setting(BROWSER_PREF_MODE)? 
- .or_else(|| { - store - .get_setting("browser") - .ok() - .flatten() - .and_then(|value| display_browser_to_mode(&value).map(ToOwned::to_owned)) - }) - .unwrap_or_else(|| "local".to_string()); - let profile_id = store.get_setting(BROWSER_PREF_PROFILE)?; +fn browser_preference_json( + store: &Store, + selected_browser_mode: Option<&str>, +) -> anyhow::Result { + let selected_mode = selected_browser_mode + .map(str::trim) + .filter(|value| !value.is_empty()) + .map(normalize_browser_preference_mode) + .transpose()?; + let mode = selected_mode.map(ToOwned::to_owned).unwrap_or_else(|| { + store + .get_setting(BROWSER_PREF_MODE) + .ok() + .flatten() + .or_else(|| { + store + .get_setting("browser") + .ok() + .flatten() + .and_then(|value| display_browser_to_mode(&value).map(ToOwned::to_owned)) + }) + .unwrap_or_else(|| "local".to_string()) + }); + let profile_id = if selected_mode.is_some() { + None + } else { + store.get_setting(BROWSER_PREF_PROFILE)? + }; let domain_profiles = store .list_settings()? .into_iter() @@ -936,15 +998,20 @@ fn browser_preference_json(store: &Store) -> anyhow::Result { "display": browser_display_name(normalize_browser_preference_mode(&mode)?), "profile_id": profile_id, "domain_profiles": domain_profiles, - "connect_command": match normalize_browser_preference_mode(&mode)? { - "cloud" => "browser remote start", - "managed-headless" => "browser connect managed --headless", - "managed-headed" => "browser connect managed --headed", - _ => "browser connect local", - }, + "connect_command": browser_connect_command_display_for_mode(&mode, profile_id.as_deref())?, })) } +fn browser_connect_command_display_for_mode( + mode: &str, + profile_id: Option<&str>, +) -> anyhow::Result { + match normalize_browser_preference_mode(mode)? 
{ + "remote-cdp" => Ok("browser connect remote-cdp --url ".to_string()), + _ => browser_connect_command_for_mode(mode, profile_id), + } +} + fn remembered_domain_profile(store: &Store, domain: &str) -> anyhow::Result> { store .get_setting(&browser_domain_profile_key(domain))? @@ -1045,6 +1112,7 @@ fn browser_profile_connect_next_step(mode: &str, profile_id: Option<&str>) -> St ), "managed-headless" => "browser connect managed --headless".to_string(), "managed-headed" => "browser connect managed --headed".to_string(), + "remote-cdp" => "browser connect remote-cdp --url ".to_string(), _ => profile_id.map_or_else( || "browser connect local".to_string(), |profile_id| { @@ -1062,6 +1130,7 @@ fn normalize_browser_preference_mode(mode: &str) -> anyhow::Result<&'static str> match normalized.as_str() { "local" | "local-chrome" => Ok("local"), "cloud" | "browser-use-cloud" => Ok("cloud"), + "remote-cdp" | "cdp" => Ok("remote-cdp"), "headless" | "headless-chromium" | "managed-headless" => Ok("managed-headless"), "managed" | "managed-headed" | "headed" => Ok("managed-headed"), other => bail!("unknown browser preference mode: {other}"), @@ -1071,6 +1140,7 @@ fn normalize_browser_preference_mode(mode: &str) -> anyhow::Result<&'static str> fn browser_display_name(mode: &str) -> &'static str { match mode { "cloud" => "Browser Use cloud", + "remote-cdp" => "Remote CDP", "managed-headless" => "Headless Chromium", "managed-headed" => "Managed Chromium", _ => "Local Chrome", @@ -1080,6 +1150,7 @@ fn browser_display_name(mode: &str) -> &'static str { fn display_browser_to_mode(display: &str) -> Option<&'static str> { match display { "Browser Use cloud" => Some("cloud"), + "Remote CDP" => Some("remote-cdp"), "Headless Chromium" => Some("managed-headless"), "Managed Chromium" => Some("managed-headed"), "Local Chrome" => Some("local"), diff --git a/crates/browser-use-agent/src/tools/handlers/browser_tests.rs b/crates/browser-use-agent/src/tools/handlers/browser_tests.rs index 
c63c3a36..ef781e77 100644 --- a/crates/browser-use-agent/src/tools/handlers/browser_tests.rs +++ b/crates/browser-use-agent/src/tools/handlers/browser_tests.rs @@ -12,6 +12,7 @@ //! parallel_safe = false; (4) backend error -> ToolError; (5) an //! orchestrator-driven run with the fake backend. +use std::ffi::OsString; use std::path::PathBuf; use std::sync::{Arc, Mutex}; @@ -58,6 +59,29 @@ struct FakeBackend { fail: bool, } +struct EnvVarGuard { + key: &'static str, + previous: Option, +} + +impl EnvVarGuard { + fn set(key: &'static str, value: &str) -> Self { + let previous = std::env::var_os(key); + std::env::set_var(key, value); + Self { key, previous } + } +} + +impl Drop for EnvVarGuard { + fn drop(&mut self) { + if let Some(previous) = &self.previous { + std::env::set_var(self.key, previous); + } else { + std::env::remove_var(self.key); + } + } +} + impl FakeBackend { fn last(&self) -> LastCall { self.last.lock().unwrap().clone() @@ -318,6 +342,29 @@ async fn bare_browser_connect_resolves_to_selected_cloud_mode() { ); } +#[tokio::test] +async fn bare_browser_connect_resolves_to_selected_remote_cdp_mode() { + let _guard = EnvVarGuard::set( + "BU_CDP_URL", + "ws://127.0.0.1:9222/devtools/browser/session-id", + ); + let backend = Arc::new(FakeBackend::default()); + let tool = + tool_with(Arc::clone(&backend)).with_selected_browser_mode(Some("remote-cdp".to_string())); + + let req = BrowserRequest::command("sess-1", "browser connect"); + let out = run_direct(&tool, &req).await.unwrap(); + + assert_eq!(out.exit_code, 0); + assert_eq!( + backend.last(), + LastCall::Command( + "browser connect remote-cdp --ws ws://127.0.0.1:9222/devtools/browser/session-id" + .to_string() + ) + ); +} + #[tokio::test] async fn selected_browser_mode_rejects_wrong_connection_family() { let backend = Arc::new(FakeBackend::default()); diff --git a/crates/browser-use-cli/src/main.rs b/crates/browser-use-cli/src/main.rs index 42efe996..82b90265 100644 --- 
a/crates/browser-use-cli/src/main.rs +++ b/crates/browser-use-cli/src/main.rs @@ -2583,15 +2583,12 @@ fn python(store: &Store, task_id: &str, code: String) -> Result<()> { fn browser_script(store: &Store, task_id: &str, code: String) -> Result<()> { let task = ensure_task_exists(store, task_id)?; let tool_call_id = format!("browser_script-cli-{task_id}"); - if let Some(cdp_url) = std::env::var("BU_CDP_URL") - .ok() - .filter(|url| !url.trim().is_empty()) - { + if let Some(connect_command) = remote_cdp_connect_command_from_env() { let connect = browser_use_browser::run_browser_command( task_id, &task.cwd, &task.artifact_root, - &format!("browser connect remote-cdp --url {}", cdp_url.trim()), + &connect_command, )?; if connect.content.get("status").and_then(Value::as_str) != Some("connected") { bail!("browser connect remote-cdp failed: {}", connect.content); @@ -2640,6 +2637,30 @@ fn browser_script(store: &Store, task_id: &str, code: String) -> Result<()> { ) } +fn remote_cdp_connect_command_from_env() -> Option { + std::env::var("BU_CDP_WS") + .ok() + .map(|value| value.trim().to_string()) + .filter(|value| !value.is_empty()) + .or_else(|| { + std::env::var("BU_CDP_URL") + .ok() + .map(|value| value.trim().to_string()) + .filter(|value| !value.is_empty()) + }) + .map(|endpoint| { + let flag = if endpoint.starts_with("ws://") || endpoint.starts_with("wss://") { + "--ws" + } else { + "--url" + }; + format!( + "browser connect remote-cdp {flag} {}", + shell_quote_arg(&endpoint) + ) + }) +} + #[derive(Clone, Debug)] struct SyncCookiesArgs { profile: Option, From bbcf2730863dcdc9c031030261e1d239f446c958 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20=C5=BDuni=C4=8D?= <36313686+gregpr07@users.noreply.github.com> Date: Thu, 4 Jun 2026 03:38:44 +0000 Subject: [PATCH 16/48] Harden browser script startup and image payloads --- .../src/tools/handlers/browser.rs | 18 ++++ .../src/tools/handlers/browser_tests.rs | 40 +++++++++ .../src/browser_script_helpers.py | 89 
++++++++++++++++--- crates/browser-use-browser/src/lib.rs | 44 ++++++++- 4 files changed, 177 insertions(+), 14 deletions(-) diff --git a/crates/browser-use-agent/src/tools/handlers/browser.rs b/crates/browser-use-agent/src/tools/handlers/browser.rs index 207f9212..695ae588 100644 --- a/crates/browser-use-agent/src/tools/handlers/browser.rs +++ b/crates/browser-use-agent/src/tools/handlers/browser.rs @@ -77,6 +77,7 @@ pub const BROWSER_SCRIPT_CONTENT_STDOUT_PREFIX: &str = "\n__browser_script_conte const BROWSER_PREF_MODE: &str = "browser.preference.mode"; const BROWSER_PREF_PROFILE: &str = "browser.preference.profile"; const BROWSER_DOMAIN_PROFILE_PREFIX: &str = "browser.domain_profile."; +const BROWSER_SCRIPT_MAX_IMAGE_DIMENSION: u32 = 8_000; /// What the model wants the browser to do. #[derive(Debug, Clone, PartialEq, Eq)] @@ -1357,6 +1358,14 @@ fn browser_script_image_part(image: &Value) -> Result, Strin if !mime_type.starts_with("image/") { return Ok(None); } + if let Some((width, height)) = png_dimensions(&bytes) { + if width > BROWSER_SCRIPT_MAX_IMAGE_DIMENSION || height > BROWSER_SCRIPT_MAX_IMAGE_DIMENSION + { + return Err(format!( + "Warning: image artifact was not attached because its dimensions {width}x{height} exceed provider limit; artifact remains at {path}" + )); + } + } Ok(Some(ContentPart::Media { mime_type: mime_type.to_string(), data: Some(general_purpose::STANDARD.encode(bytes)), @@ -1365,6 +1374,15 @@ fn browser_script_image_part(image: &Value) -> Result, Strin })) } +fn png_dimensions(bytes: &[u8]) -> Option<(u32, u32)> { + if bytes.len() < 24 || &bytes[..8] != b"\x89PNG\r\n\x1a\n" { + return None; + } + let width = u32::from_be_bytes(bytes.get(16..20)?.try_into().ok()?); + let height = u32::from_be_bytes(bytes.get(20..24)?.try_into().ok()?); + Some((width, height)) +} + fn browser_script_tool_message_content(response: &BrowserScriptOutput) -> String { if response.status.as_deref() == Some("running") { return 
browser_script_running_message(response); diff --git a/crates/browser-use-agent/src/tools/handlers/browser_tests.rs b/crates/browser-use-agent/src/tools/handlers/browser_tests.rs index 361f0784..7e463a57 100644 --- a/crates/browser-use-agent/src/tools/handlers/browser_tests.rs +++ b/crates/browser-use-agent/src/tools/handlers/browser_tests.rs @@ -619,6 +619,46 @@ async fn script_unreadable_images_warn_in_stdout() { ); } +#[tokio::test] +async fn script_oversized_png_images_warn_in_stdout_without_media_payload() { + let temp = tempfile::tempdir().expect("tempdir"); + let image_path = temp.path().join("wide.png"); + let mut png = vec![0_u8; 24]; + png[0..8].copy_from_slice(b"\x89PNG\r\n\x1a\n"); + png[12..16].copy_from_slice(b"IHDR"); + png[16..20].copy_from_slice(&8001_u32.to_be_bytes()); + png[20..24].copy_from_slice(&600_u32.to_be_bytes()); + std::fs::write(&image_path, png).expect("write png"); + + let backend = Arc::new(FakeBackend::default()); + backend.script_images.lock().unwrap().push(json!({ + "path": image_path, + "mime_type": "image/png", + "detail": "auto", + "label": "wide", + })); + let tool = tool_with(Arc::clone(&backend)); + + let req = BrowserRequest::execute("sess-1", "capture_screenshot()", false); + let out = run_direct(&tool, &req).await.unwrap(); + assert!( + out.stdout + .contains("dimensions 8001x600 exceed provider limit"), + "stdout: {}", + out.stdout + ); + assert!( + out.stdout.contains("artifact remains at"), + "stdout: {}", + out.stdout + ); + assert!( + !out.stdout.contains(BROWSER_SCRIPT_CONTENT_STDOUT_PREFIX), + "oversized-only images should not emit a media marker: {}", + out.stdout + ); +} + #[tokio::test] async fn default_artifact_dir_comes_from_tool_ctx_artifact_root() { let backend = Arc::new(FakeBackend::default()); diff --git a/crates/browser-use-browser/src/browser_script_helpers.py b/crates/browser-use-browser/src/browser_script_helpers.py index 81bb2199..8d444c73 100644 --- 
a/crates/browser-use-browser/src/browser_script_helpers.py +++ b/crates/browser-use-browser/src/browser_script_helpers.py @@ -387,6 +387,10 @@ def goto_url(url): def page_info(): """Return url, title, viewport, scroll position, page size, and target info.""" + try: + ensure_real_tab() + except Exception: + pass dialog = _send_meta("pending_dialog").get("dialog") if dialog: return {"dialog": dialog} @@ -547,10 +551,21 @@ def _wait_for_browser_profile_page_load(): def wait_for_load(timeout=15.0): timeout = _timeout_seconds(timeout) deadline = _time.time() + timeout + interactive_since = None while _time.time() < deadline: try: - if js("document.readyState") == "complete": + state = js("document.readyState") + if state == "complete": return True + if state == "interactive": + has_body = js("!!document.body && !!location.href && !location.href.startsWith('about:')") + if has_body: + if interactive_since is None: + interactive_since = _time.time() + if _time.time() - interactive_since >= 1.0: + return True + else: + interactive_since = None except Exception: pass _time.sleep(0.3) @@ -614,9 +629,53 @@ def _write_b64_artifact(label, data_b64, suffix=".png", mime_type="image/png"): return str(path) +def _positive_int_env(names, default=None): + for name in names: + raw = os.environ.get(name) + if raw is None: + continue + try: + value = int(str(raw).strip()) + except ValueError: + continue + if value > 0: + return value + if value == 0: + return None + return default + + +def _screenshot_max_dim(max_dim): + if max_dim is not None: + try: + value = int(max_dim) + except (TypeError, ValueError): + return None + return value if value > 0 else None + return _positive_int_env(("BU_BROWSER_SCREENSHOT_MAX_DIM", "BROWSER_USE_SCREENSHOT_MAX_DIM"), 7600) + + +def _downscale_image_artifact(path, max_dim): + if not max_dim: + return None + try: + from PIL import Image + + img = Image.open(path) + original_size = img.size + if max(original_size) > max_dim: + img.thumbnail((max_dim, 
max_dim)) + img.save(path) + return {"width": img.size[0], "height": img.size[1], "downscaled": True, "original_size": original_size} + return {"width": original_size[0], "height": original_size[1], "downscaled": False} + except Exception: + return None + + def capture_screenshot(label="screenshot", full=False, attach=True, max_dim=None, **kwargs): """Save a PNG of the current viewport and return its local artifact path.""" try: + ensure_real_tab() target_id = (current_tab() or {}).get("targetId") if target_id: cdp("Target.activateTarget", session_id=None, targetId=target_id) @@ -631,20 +690,26 @@ def capture_screenshot(label="screenshot", full=False, attach=True, max_dim=None if full: params["captureBeyondViewport"] = True params.update(kwargs) - result = cdp("Page.captureScreenshot", **params) + last_error = None + for attempt in range(3): + try: + result = cdp("Page.captureScreenshot", **params) + break + except Exception as exc: + last_error = exc + if attempt == 2: + raise + _time.sleep(0.35 * (attempt + 1)) + else: + raise last_error if not attach: return result path = _write_b64_artifact(label, result["data"], ".png", "image/png") - if max_dim: - try: - from PIL import Image - - img = Image.open(path) - if max(img.size) > max_dim: - img.thumbnail((max_dim, max_dim)) - img.save(path) - except Exception: - pass + image_info = _downscale_image_artifact(path, _screenshot_max_dim(max_dim)) + if image_info and __images: + __images[-1].update(image_info) + if image_info and __artifacts: + __artifacts[-1].update({key: image_info[key] for key in ("width", "height") if key in image_info}) return path diff --git a/crates/browser-use-browser/src/lib.rs b/crates/browser-use-browser/src/lib.rs index 873d984c..bfdc9de7 100644 --- a/crates/browser-use-browser/src/lib.rs +++ b/crates/browser-use-browser/src/lib.rs @@ -28,7 +28,7 @@ use tungstenite::{connect, Message, WebSocket}; const BU_API: &str = "https://api.browser-use.com/api/v3"; const LOG_LIMIT: usize = 250; const 
SCRIPT_MAX_OUTPUT_CHARS: usize = 120_000; -const BROWSER_SCRIPT_INITIAL_WAIT_MS: u64 = 750; +const BROWSER_SCRIPT_DEFAULT_INITIAL_WAIT_MS: u64 = 7_000; const BROWSER_SCRIPT_DEFAULT_OBSERVE_MS: u64 = 1_000; const BROWSER_SCRIPT_HELPERS: &str = include_str!("browser_script_helpers.py"); @@ -411,7 +411,7 @@ pub fn start_browser_script( timeout_seconds: u64, ) -> Result { let mut run = spawn_browser_script(session_id, cwd, artifact_dir, code, timeout_seconds)?; - let initial_deadline = Instant::now() + Duration::from_millis(BROWSER_SCRIPT_INITIAL_WAIT_MS); + let initial_deadline = Instant::now() + Duration::from_millis(browser_script_initial_wait_ms()); loop { if run.child.try_wait()?.is_some() { return finish_browser_script_run(run, false); @@ -458,6 +458,22 @@ pub fn start_browser_script( } } +fn browser_script_initial_wait_ms() -> u64 { + [ + "BU_BROWSER_SCRIPT_INITIAL_WAIT_MS", + "BROWSER_SCRIPT_INITIAL_WAIT_MS", + ] + .iter() + .find_map(|name| { + std::env::var(name) + .ok() + .and_then(|value| value.trim().parse::().ok()) + .filter(|value| *value > 0) + }) + .map(|value| value.clamp(250, 30_000)) + .unwrap_or(BROWSER_SCRIPT_DEFAULT_INITIAL_WAIT_MS) +} + pub fn observe_browser_script( session_id: &str, run_id: &str, @@ -8601,10 +8617,34 @@ print("http_get parity ok") assert!(output.run_id.is_some()); } + #[test] + fn browser_script_initial_wait_defaults_to_seven_seconds_and_clamps_env() { + { + let _env = EnvRestore::unset(&[ + "BU_BROWSER_SCRIPT_INITIAL_WAIT_MS", + "BROWSER_SCRIPT_INITIAL_WAIT_MS", + ]); + assert_eq!(browser_script_initial_wait_ms(), 7_000); + } + { + let _env = EnvRestore::set(&[("BU_BROWSER_SCRIPT_INITIAL_WAIT_MS", "1500")]); + assert_eq!(browser_script_initial_wait_ms(), 1_500); + } + { + let _env = EnvRestore::set(&[("BU_BROWSER_SCRIPT_INITIAL_WAIT_MS", "50")]); + assert_eq!(browser_script_initial_wait_ms(), 250); + } + { + let _env = EnvRestore::set(&[("BU_BROWSER_SCRIPT_INITIAL_WAIT_MS", "45000")]); + 
assert_eq!(browser_script_initial_wait_ms(), 30_000); + } + } + #[test] fn browser_script_start_observe_finishes_slow_scripts() { let temp = tempfile::tempdir().unwrap(); let session_id = "script-start-observe"; + let _env = EnvRestore::set(&[("BU_BROWSER_SCRIPT_INITIAL_WAIT_MS", "500")]); let started = start_browser_script( session_id, temp.path(), From e6fe176287e6f1e36d7516c5c3cfb0401eb381c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20=C5=BDuni=C4=8D?= <36313686+gregpr07@users.noreply.github.com> Date: Thu, 4 Jun 2026 04:08:27 +0000 Subject: [PATCH 17/48] Gate eval finalization artifacts --- .../browser-use-agent/src/entrypoint/mod.rs | 90 +++++++++++++++++++ crates/browser-use-browser/src/lib.rs | 58 +++++++++++- 2 files changed, 147 insertions(+), 1 deletion(-) diff --git a/crates/browser-use-agent/src/entrypoint/mod.rs b/crates/browser-use-agent/src/entrypoint/mod.rs index 2ca83c34..7c8aa454 100644 --- a/crates/browser-use-agent/src/entrypoint/mod.rs +++ b/crates/browser-use-agent/src/entrypoint/mod.rs @@ -994,7 +994,23 @@ fn drain_agent_mailbox_as_pending_input(store: &SharedStore, session_id: &str) - .collect() } +const DISABLE_FALLBACK_CAPTURE_GIF_ENV: &str = "BU_DISABLE_FALLBACK_CAPTURE_GIF"; + +fn fallback_capture_recording_enabled() -> bool { + std::env::var(DISABLE_FALLBACK_CAPTURE_GIF_ENV) + .map(|value| { + !matches!( + value.trim().to_ascii_lowercase().as_str(), + "1" | "true" | "yes" | "on" + ) + }) + .unwrap_or(true) +} + fn ensure_fallback_capture_recording(store: &SharedStore, session_id: &str) { + if !fallback_capture_recording_enabled() { + return; + } let Ok(store) = store.lock() else { return; }; @@ -2476,8 +2492,66 @@ mod tests { use crate::config_overrides::ProviderRunConfig; use browser_use_store::Store; use std::sync::atomic::{AtomicUsize, Ordering}; + use std::sync::{Mutex as StdMutex, MutexGuard as StdMutexGuard, OnceLock as StdOnceLock}; use tempfile::TempDir; + static ENTRYPOINT_ENV_LOCK: StdOnceLock> = StdOnceLock::new(); + + 
struct EnvRestore { + _guard: StdMutexGuard<'static, ()>, + values: Vec<(&'static str, Option)>, + } + + impl EnvRestore { + fn set(vars: &[(&'static str, &str)]) -> Self { + let guard = ENTRYPOINT_ENV_LOCK + .get_or_init(|| StdMutex::new(())) + .lock() + .expect("env lock poisoned"); + let values = vars + .iter() + .map(|(key, _)| (*key, std::env::var(key).ok())) + .collect::>(); + for (key, value) in vars { + std::env::set_var(key, value); + } + Self { + _guard: guard, + values, + } + } + + fn unset(keys: &[&'static str]) -> Self { + let guard = ENTRYPOINT_ENV_LOCK + .get_or_init(|| StdMutex::new(())) + .lock() + .expect("env lock poisoned"); + let values = keys + .iter() + .map(|key| (*key, std::env::var(key).ok())) + .collect::>(); + for key in keys { + std::env::remove_var(key); + } + Self { + _guard: guard, + values, + } + } + } + + impl Drop for EnvRestore { + fn drop(&mut self) { + for (key, value) in self.values.drain(..) { + if let Some(value) = value { + std::env::set_var(key, value); + } else { + std::env::remove_var(key); + } + } + } + } + /// A tempdir-backed `SharedStore` with a fresh session row (the `events` table /// has a FK on `sessions(id)`, so the session must exist before we append). /// Returns the `TempDir` so the caller keeps the on-disk sqlite db alive. @@ -2503,6 +2577,22 @@ mod tests { ProviderRunConfig::new(ProviderBackend::Fake, "fake-model").with_fake_result("hi from fake") } + #[test] + fn fallback_capture_recording_can_be_disabled_for_eval_runs() { + { + let _env = EnvRestore::unset(&[DISABLE_FALLBACK_CAPTURE_GIF_ENV]); + assert!(fallback_capture_recording_enabled()); + } + { + let _env = EnvRestore::set(&[(DISABLE_FALLBACK_CAPTURE_GIF_ENV, "1")]); + assert!(!fallback_capture_recording_enabled()); + } + { + let _env = EnvRestore::set(&[(DISABLE_FALLBACK_CAPTURE_GIF_ENV, "false")]); + assert!(fallback_capture_recording_enabled()); + } + } + /// Seed a real user turn into the durable log before driving. 
/// /// Appends straight through the store lock (the sync `Store::append_event`) diff --git a/crates/browser-use-browser/src/lib.rs b/crates/browser-use-browser/src/lib.rs index bfdc9de7..73723ee5 100644 --- a/crates/browser-use-browser/src/lib.rs +++ b/crates/browser-use-browser/src/lib.rs @@ -587,6 +587,16 @@ fn spawn_browser_script( code, )?; let mut command = browser_script_python_command(); + if browser_script_session_outputs_enabled() { + let outputs_dir = artifact_dir.as_ref().join("outputs"); + fs::create_dir_all(&outputs_dir).with_context(|| { + format!( + "create browser_script outputs dir {}", + outputs_dir.display() + ) + })?; + command.env("BH_OUTPUTS_DIR", outputs_dir); + } let mut child = command .arg("-c") .arg(prelude) @@ -987,6 +997,10 @@ fn nonempty_os_var(name: &str) -> Option { std::env::var_os(name).filter(|value| !value.is_empty()) } +fn browser_script_session_outputs_enabled() -> bool { + env_bool("BU_BROWSER_SCRIPT_SESSION_OUTPUTS").unwrap_or(false) +} + fn venv_python_path(venv: &Path) -> PathBuf { #[cfg(windows)] { @@ -5636,7 +5650,7 @@ ARTIFACT_DIR.mkdir(parents=True, exist_ok=True) STREAM_PATH.parent.mkdir(parents=True, exist_ok=True) FRAMES_DIR.mkdir(parents=True, exist_ok=True) FRAMES_MANIFEST = FRAMES_DIR / "frames.ndjson" -OUTPUTS_DIR = CWD +OUTPUTS_DIR = pathlib.Path(os.environ.get("BH_OUTPUTS_DIR") or {cwd:?}).expanduser().resolve() OUTPUTS_DIR.mkdir(parents=True, exist_ok=True) __USER_CODE = base64.b64decode({encoded_code:?}).decode() @@ -7679,6 +7693,48 @@ print(session_metadata()["outputs_dir"]) } } + #[test] + fn browser_script_session_outputs_dir_isolates_parallel_cwd_files() { + let _env = EnvRestore::set(&[("BU_BROWSER_SCRIPT_SESSION_OUTPUTS", "1")]); + let temp = tempfile::tempdir().unwrap(); + let artifacts = temp.path().join("artifacts"); + let output = run_browser_script( + "script-session-outputs", + temp.path(), + &artifacts, + r#" +shared = pathlib.Path.cwd() / 'parallel-task-leak.txt' +shared.write_text('from 
another parallel task', encoding='utf-8') +answer = pathlib.Path(outputs_dir()) / 'answer.json' +answer.write_text(json.dumps({'ok': True}), encoding='utf-8') +print(session_metadata()["outputs_dir"]) +"#, + 10, + ) + .unwrap(); + assert!(output.ok, "{:?}", output.error); + let artifact_paths = output + .artifacts + .iter() + .filter_map(|artifact| artifact["path"].as_str()) + .collect::>(); + assert!( + artifact_paths + .iter() + .any(|path| path.ends_with("/outputs/answer.json")), + "expected outputs artifact, got {artifact_paths:?}" + ); + assert!( + artifact_paths + .iter() + .all(|path| !path.ends_with("parallel-task-leak.txt")), + "cwd file leaked into artifacts: {artifact_paths:?}" + ); + assert!(output + .text + .contains(artifacts.join("outputs").to_str().unwrap())); + } + #[test] fn browser_script_summary_comment_maps_output_to_display_summary() { let temp = tempfile::tempdir().unwrap(); From 562b23ca8042e0b7ef33c78de6fa9d2242a2f877 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20=C5=BDuni=C4=8D?= <36313686+gregpr07@users.noreply.github.com> Date: Thu, 4 Jun 2026 05:38:06 +0000 Subject: [PATCH 18/48] Speed up eval browser step defaults --- .../browser-use-agent/src/entrypoint/mod.rs | 42 ++++++++++++++----- crates/browser-use-browser/src/lib.rs | 6 +-- 2 files changed, 34 insertions(+), 14 deletions(-) diff --git a/crates/browser-use-agent/src/entrypoint/mod.rs b/crates/browser-use-agent/src/entrypoint/mod.rs index 7c8aa454..212589b1 100644 --- a/crates/browser-use-agent/src/entrypoint/mod.rs +++ b/crates/browser-use-agent/src/entrypoint/mod.rs @@ -995,16 +995,26 @@ fn drain_agent_mailbox_as_pending_input(store: &SharedStore, session_id: &str) - } const DISABLE_FALLBACK_CAPTURE_GIF_ENV: &str = "BU_DISABLE_FALLBACK_CAPTURE_GIF"; +const ENABLE_FALLBACK_CAPTURE_GIF_ENV: &str = "BU_ENABLE_FALLBACK_CAPTURE_GIF"; + +fn env_bool(name: &str) -> Option { + std::env::var(name) + .ok() + .and_then(|value| match value.trim().to_ascii_lowercase().as_str() { + "1" | 
"true" | "yes" | "on" => Some(true), + "0" | "false" | "no" | "off" => Some(false), + _ => None, + }) +} fn fallback_capture_recording_enabled() -> bool { - std::env::var(DISABLE_FALLBACK_CAPTURE_GIF_ENV) - .map(|value| { - !matches!( - value.trim().to_ascii_lowercase().as_str(), - "1" | "true" | "yes" | "on" - ) - }) - .unwrap_or(true) + if matches!(env_bool(DISABLE_FALLBACK_CAPTURE_GIF_ENV), Some(true)) { + return false; + } + if let Some(enabled) = env_bool(ENABLE_FALLBACK_CAPTURE_GIF_ENV) { + return enabled; + } + matches!(env_bool(DISABLE_FALLBACK_CAPTURE_GIF_ENV), Some(false)) } fn ensure_fallback_capture_recording(store: &SharedStore, session_id: &str) { @@ -2578,13 +2588,23 @@ mod tests { } #[test] - fn fallback_capture_recording_can_be_disabled_for_eval_runs() { + fn fallback_capture_recording_is_opt_in_for_eval_speed() { + { + let _env = EnvRestore::unset(&[ + DISABLE_FALLBACK_CAPTURE_GIF_ENV, + ENABLE_FALLBACK_CAPTURE_GIF_ENV, + ]); + assert!(!fallback_capture_recording_enabled()); + } { - let _env = EnvRestore::unset(&[DISABLE_FALLBACK_CAPTURE_GIF_ENV]); + let _env = EnvRestore::set(&[(ENABLE_FALLBACK_CAPTURE_GIF_ENV, "1")]); assert!(fallback_capture_recording_enabled()); } { - let _env = EnvRestore::set(&[(DISABLE_FALLBACK_CAPTURE_GIF_ENV, "1")]); + let _env = EnvRestore::set(&[ + (ENABLE_FALLBACK_CAPTURE_GIF_ENV, "1"), + (DISABLE_FALLBACK_CAPTURE_GIF_ENV, "1"), + ]); assert!(!fallback_capture_recording_enabled()); } { diff --git a/crates/browser-use-browser/src/lib.rs b/crates/browser-use-browser/src/lib.rs index 73723ee5..6b511ea7 100644 --- a/crates/browser-use-browser/src/lib.rs +++ b/crates/browser-use-browser/src/lib.rs @@ -28,7 +28,7 @@ use tungstenite::{connect, Message, WebSocket}; const BU_API: &str = "https://api.browser-use.com/api/v3"; const LOG_LIMIT: usize = 250; const SCRIPT_MAX_OUTPUT_CHARS: usize = 120_000; -const BROWSER_SCRIPT_DEFAULT_INITIAL_WAIT_MS: u64 = 7_000; +const BROWSER_SCRIPT_DEFAULT_INITIAL_WAIT_MS: u64 = 15_000; const 
BROWSER_SCRIPT_DEFAULT_OBSERVE_MS: u64 = 1_000; const BROWSER_SCRIPT_HELPERS: &str = include_str!("browser_script_helpers.py"); @@ -8674,13 +8674,13 @@ print("http_get parity ok") } #[test] - fn browser_script_initial_wait_defaults_to_seven_seconds_and_clamps_env() { + fn browser_script_initial_wait_defaults_to_fifteen_seconds_and_clamps_env() { { let _env = EnvRestore::unset(&[ "BU_BROWSER_SCRIPT_INITIAL_WAIT_MS", "BROWSER_SCRIPT_INITIAL_WAIT_MS", ]); - assert_eq!(browser_script_initial_wait_ms(), 7_000); + assert_eq!(browser_script_initial_wait_ms(), 15_000); } { let _env = EnvRestore::set(&[("BU_BROWSER_SCRIPT_INITIAL_WAIT_MS", "1500")]); From 3a1c78e6921a4616fec77cb1ecbabfc9a25783b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20=C5=BDuni=C4=8D?= <36313686+gregpr07@users.noreply.github.com> Date: Thu, 4 Jun 2026 05:55:27 +0000 Subject: [PATCH 19/48] Disable continuous browser capture by default --- .../src/browser_script_helpers.py | 2 +- crates/browser-use-browser/src/lib.rs | 16 ++++++++++++++-- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/crates/browser-use-browser/src/browser_script_helpers.py b/crates/browser-use-browser/src/browser_script_helpers.py index bdba0308..083c97ea 100644 --- a/crates/browser-use-browser/src/browser_script_helpers.py +++ b/crates/browser-use-browser/src/browser_script_helpers.py @@ -691,7 +691,7 @@ def capture_screenshot(label="screenshot", full=False, attach=True, max_dim=None def note(caption): """Mark the current moment as important for the recording, with a short human-readable caption (e.g. note("Delta $209 - cheapest fare details")). - Cheap: it just timestamps a caption; the 2fps session capture already has the + Cheap: it just timestamps a caption; when enabled, session capture already has the frame. Call it at each meaningful step so the end-of-run highlight GIF can be captioned. 
Returns the recorded note.""" record = {"ts_ms": int(_time.time() * 1000), "caption": str(caption)} diff --git a/crates/browser-use-browser/src/lib.rs b/crates/browser-use-browser/src/lib.rs index 8248527c..090eea09 100644 --- a/crates/browser-use-browser/src/lib.rs +++ b/crates/browser-use-browser/src/lib.rs @@ -5800,7 +5800,7 @@ __USER_CODE = base64.b64decode({encoded_code:?}).decode() # are written as JPEGs plus a sidecar manifest, kept OUT of STREAM_PATH so the # event drain never sees partial/interleaved lines. try: - CAPTURE_FPS = float(os.environ.get("LLM_BROWSER_CAPTURE_FPS", "2") or "2") + CAPTURE_FPS = float(os.environ.get("LLM_BROWSER_CAPTURE_FPS", "0") or "0") except (TypeError, ValueError): CAPTURE_FPS = 2.0 try: @@ -6870,7 +6870,7 @@ fn session_capture_fps() -> f64 { std::env::var("LLM_BROWSER_CAPTURE_FPS") .ok() .and_then(|v| v.trim().parse::().ok()) - .unwrap_or(2.0) + .unwrap_or(0.0) } fn session_capture_quality() -> i64 { std::env::var("LLM_BROWSER_CAPTURE_QUALITY") @@ -8833,6 +8833,18 @@ print("http_get parity ok") } } + #[test] + fn session_capture_is_opt_in_for_eval_speed() { + { + let _env = EnvRestore::unset(&["LLM_BROWSER_CAPTURE_FPS"]); + assert_eq!(session_capture_fps(), 0.0); + } + { + let _env = EnvRestore::set(&[("LLM_BROWSER_CAPTURE_FPS", "2")]); + assert_eq!(session_capture_fps(), 2.0); + } + } + #[test] fn browser_script_start_observe_finishes_slow_scripts() { let temp = tempfile::tempdir().unwrap(); From ed549a3c611a31f62cb1d3973c5b6035cc967005 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20=C5=BDuni=C4=8D?= <36313686+gregpr07@users.noreply.github.com> Date: Thu, 4 Jun 2026 05:55:32 +0000 Subject: [PATCH 20/48] Reduce remote CDP setup churn --- crates/browser-use-agent/src/prompts/mod.rs | 7 ++++ crates/browser-use-agent/src/prompts/tests.rs | 9 +++++ .../src/tools/handlers/browser.rs | 39 ++++++++++++++++++- .../src/tools/handlers/browser_tests.rs | 31 +++++++++++++++ 4 files changed, 84 insertions(+), 2 deletions(-) diff --git 
a/crates/browser-use-agent/src/prompts/mod.rs b/crates/browser-use-agent/src/prompts/mod.rs index e8250f48..5c479cfe 100644 --- a/crates/browser-use-agent/src/prompts/mod.rs +++ b/crates/browser-use-agent/src/prompts/mod.rs @@ -208,6 +208,13 @@ pub fn browser_mode_instruction(mode: &str) -> String { "Remote start means start and connect; use `browser remote live-url` to retrieve the watch URL." ) .to_string(), + "remote-cdp" | "cdp" => concat!( + "Selected browser mode: Remote CDP. The evaluation harness already provides the browser endpoint. ", + "Do not call `browser connect managed`, `browser connect local`, or `browser remote start`. ", + "Start page work directly with `browser_script` using `goto_url(...)`, then call explicit waits such as `wait_for_load(...)` only when the next page state matters. ", + "Use `browser status --json` only if you need to inspect the current connection." + ) + .to_string(), other => format!( "Selected browser mode: {other}. Use `browser status --json` first, then choose an explicit browser connect command." ), diff --git a/crates/browser-use-agent/src/prompts/tests.rs b/crates/browser-use-agent/src/prompts/tests.rs index 40222ac6..3c4f648b 100644 --- a/crates/browser-use-agent/src/prompts/tests.rs +++ b/crates/browser-use-agent/src/prompts/tests.rs @@ -89,6 +89,15 @@ fn browser_mode_instruction_matches_main_local_connection_guidance() { assert!(prompt.contains("browser local setup")); } +#[test] +fn browser_mode_instruction_guides_remote_cdp_to_direct_page_work() { + let prompt = browser_mode_instruction("remote-cdp"); + assert!(prompt.contains("Selected browser mode: Remote CDP")); + assert!(prompt.contains("already provides the browser endpoint")); + assert!(prompt.contains("Start page work directly with `browser_script`")); + assert!(prompt.contains("Do not call `browser connect managed`")); +} + /// Plan mode was removed. 
The compatibility enum value now renders the Default /// asset so stale configs do not re-enable planning behavior. #[test] diff --git a/crates/browser-use-agent/src/tools/handlers/browser.rs b/crates/browser-use-agent/src/tools/handlers/browser.rs index 695ae588..c6568fd9 100644 --- a/crates/browser-use-agent/src/tools/handlers/browser.rs +++ b/crates/browser-use-agent/src/tools/handlers/browser.rs @@ -414,13 +414,18 @@ impl RealBackend { } fn should_ensure_before_command(&self, command: &str) -> bool { - if self.normalized_browser_mode().is_none() { + let Some(mode) = self.normalized_browser_mode() else { return false; - } + }; let Ok(words) = browser_command_words(command) else { return false; }; let words = words.iter().map(String::as_str).collect::>(); + if mode == "remote-cdp" + && matches!(words.as_slice(), ["browser", "status", ..] | ["status", ..]) + { + return true; + } if browser_command_is_passive(words.as_slice()) { return false; } @@ -842,11 +847,41 @@ fn resolve_browser_command_for_selected_mode( }; browser_connect_command_for_mode(effective_mode, profile_id.as_deref()) } else { + if let Some(command) = + remote_cdp_compatibility_connect_command(&args, selected_browser_mode)? + { + return Ok(command); + } enforce_browser_command_matches_selected_mode(&args, selected_browser_mode)?; Ok(cmd.to_string()) } } +fn remote_cdp_compatibility_connect_command( + args: &[String], + selected_browser_mode: Option<&str>, +) -> anyhow::Result> { + let Some(selected_mode) = selected_browser_mode + .map(str::trim) + .filter(|value| !value.is_empty()) + else { + return Ok(None); + }; + if normalize_browser_preference_mode(selected_mode)? != "remote-cdp" { + return Ok(None); + } + let requests_different_browser_setup = match args { + [command, mode, ..] if command == "connect" => mode != "remote-cdp", + [command, ..] if command == "local" => true, + [command, action, ..] 
if command == "remote" && action == "start" => true, + _ => false, + }; + if requests_different_browser_setup { + return Ok(Some(remote_cdp_connect_command()?)); + } + Ok(None) +} + fn local_connect_profile_preflight( has_stored_profile: bool, backend: &dyn BrowserBackend, diff --git a/crates/browser-use-agent/src/tools/handlers/browser_tests.rs b/crates/browser-use-agent/src/tools/handlers/browser_tests.rs index 7e463a57..7e871b14 100644 --- a/crates/browser-use-agent/src/tools/handlers/browser_tests.rs +++ b/crates/browser-use-agent/src/tools/handlers/browser_tests.rs @@ -365,6 +365,37 @@ async fn bare_browser_connect_resolves_to_selected_remote_cdp_mode() { ); } +#[tokio::test] +async fn selected_remote_cdp_rewrites_wrong_browser_family_commands() { + let _guard = EnvVarGuard::set( + "BU_CDP_URL", + "ws://127.0.0.1:9222/devtools/browser/session-id", + ); + let backend = Arc::new(FakeBackend::default()); + let tool = + tool_with(Arc::clone(&backend)).with_selected_browser_mode(Some("remote-cdp".to_string())); + + for command in [ + "browser connect managed --headed", + "browser connect managed --headless", + "browser connect local", + "browser remote start", + ] { + let req = BrowserRequest::command("sess-1", command); + let out = run_direct(&tool, &req).await.unwrap(); + + assert_eq!(out.exit_code, 0, "{command}"); + assert_eq!( + backend.last(), + LastCall::Command( + "browser connect remote-cdp --ws ws://127.0.0.1:9222/devtools/browser/session-id" + .to_string() + ), + "{command}" + ); + } +} + #[tokio::test] async fn selected_browser_mode_rejects_wrong_connection_family() { let backend = Arc::new(FakeBackend::default()); From f87907dd0c06ca49bd340ff2523171a262b2bf41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20=C5=BDuni=C4=8D?= <36313686+gregpr07@users.noreply.github.com> Date: Thu, 4 Jun 2026 06:15:03 +0000 Subject: [PATCH 21/48] Bound multi-item collection loops --- crates/browser-use-agent/src/prompts/tests.rs | 8 ++++++++ 
prompts/browser-agent-system.md | 2 ++ 2 files changed, 10 insertions(+) diff --git a/crates/browser-use-agent/src/prompts/tests.rs b/crates/browser-use-agent/src/prompts/tests.rs index 3c4f648b..160a2859 100644 --- a/crates/browser-use-agent/src/prompts/tests.rs +++ b/crates/browser-use-agent/src/prompts/tests.rs @@ -98,6 +98,14 @@ fn browser_mode_instruction_guides_remote_cdp_to_direct_page_work() { assert!(prompt.contains("Do not call `browser connect managed`")); } +#[test] +fn system_prompt_bounds_multi_item_collection_loops() { + assert!(BASE_SYSTEM_PROMPT.contains("Multi-item collection rule")); + assert!(BASE_SYSTEM_PROMPT.contains("maintain a checklist")); + assert!(BASE_SYSTEM_PROMPT.contains("Do not keep varying one search term")); + assert!(BASE_SYSTEM_PROMPT.contains("audit the checklist")); +} + /// Plan mode was removed. The compatibility enum value now renders the Default /// asset so stale configs do not re-enable planning behavior. #[test] diff --git a/prompts/browser-agent-system.md b/prompts/browser-agent-system.md index 6699965f..b3af3e65 100644 --- a/prompts/browser-agent-system.md +++ b/prompts/browser-agent-system.md @@ -51,6 +51,8 @@ Python namespace rule: `browser_script` variables do not persist across calls. S Durable helper rule: if you discover a reusable selector, site quirk, private API, or interaction helper, put the smallest useful helper in `.browser-use/agent-workspace/agent_helpers.py` and use it on later calls. The file is auto-loaded when it changes; call `load_agent_helpers()` if you need to force reload. Keep helpers task-focused, CDP-friendly, and free of secrets. Do not build manager layers, retry frameworks, page-object frameworks, or wrapper abstractions unless the task itself absolutely requires it. +Multi-item collection rule: when the task asks for many products, countries, people, records, plans, prices, links, or fields, maintain a checklist of every required row and field. 
Spend work across the checklist, not indefinitely on one difficult item. For each item/source, use a small number of targeted attempts, then either record the best verified value, mark it unavailable/unknown with the source and reason, or move to a better source. Do not keep varying one search term while other required rows are untouched. Before `done`, audit the checklist: every requested row/field must be filled, explicitly unavailable/unknown, or clearly reported as partial with the remaining gaps. + Use the browser to discover and verify. Once the browser reveals stable data endpoints, static links, downloadable assets, XHR/fetch patterns, or predictable pagination URLs, switch to `requests`, `http_get`, `fetch` inside `js`, or `ThreadPoolExecutor` for bulk extraction. For long extraction loops, split work into bounded chunks, use explicit timeouts, checkpoint partial results to files, and resume from checkpoints instead of restarting. Use one global deadline plus per-item micro timeouts, and check the global deadline before every navigation, wait, and sleep. Any loop over multiple pages/items must emit short progress every item or every 2 seconds, whichever comes first. For list/profile extraction, filter candidates before navigating when possible, and poll for record readiness rather than nullable answer fields; if a loaded record has a missing optional field, record it as missing and continue. Extract only task-relevant fields; do not emit full profile text, full DOM text, cookies, localStorage, or entire app caches unless smaller field-level extraction failed. Use `outputs_dir()` for generated result files; files written there are collected as artifacts automatically. Use `copy_artifact(path)` only for files created elsewhere, and `emit_image(path)` for screenshots or visual artifacts. 
When a task expects a large JSON/CSV/list output, write the full file; if the final answer must be inline structured content, return that content with `done(result=...)` and optionally include `result_file=path`, otherwise finish with `done(result_file=path)`. Use helper agents only when the user explicitly asks for sub-agents, delegation, or parallel agent work. Requests for depth, thoroughness, research, investigation, or detailed codebase analysis do not by themselves authorize spawning a helper. When delegation is authorized, give each helper a narrow, self-contained task that materially advances the work, keep urgent blocking work local, avoid duplicate helper work, and continue useful non-overlapping local work while the helper runs. Use the `explorer` role for authorized read-only repository questions and `worker` for authorized implementation work with a bounded write scope. From d4a35481734f763be5e79e2a5079562374cbeb1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20=C5=BDuni=C4=8D?= <36313686+gregpr07@users.noreply.github.com> Date: Thu, 4 Jun 2026 06:23:13 +0000 Subject: [PATCH 22/48] Commit single-site collection to one domain --- crates/browser-use-agent/src/prompts/tests.rs | 8 ++++++++ prompts/browser-agent-system.md | 2 ++ 2 files changed, 10 insertions(+) diff --git a/crates/browser-use-agent/src/prompts/tests.rs b/crates/browser-use-agent/src/prompts/tests.rs index 160a2859..2dd8769f 100644 --- a/crates/browser-use-agent/src/prompts/tests.rs +++ b/crates/browser-use-agent/src/prompts/tests.rs @@ -106,6 +106,14 @@ fn system_prompt_bounds_multi_item_collection_loops() { assert!(BASE_SYSTEM_PROMPT.contains("audit the checklist")); } +#[test] +fn system_prompt_commits_single_site_collection_to_one_domain() { + assert!(BASE_SYSTEM_PROMPT.contains("Single-site collection rule")); + assert!(BASE_SYSTEM_PROMPT.contains("choose one viable domain early")); + assert!(BASE_SYSTEM_PROMPT.contains("Do not stitch rows from multiple domains")); + 
assert!(BASE_SYSTEM_PROMPT.contains("mark it unavailable for that domain")); +} + /// Plan mode was removed. The compatibility enum value now renders the Default /// asset so stale configs do not re-enable planning behavior. #[test] diff --git a/prompts/browser-agent-system.md b/prompts/browser-agent-system.md index b3af3e65..e06235dc 100644 --- a/prompts/browser-agent-system.md +++ b/prompts/browser-agent-system.md @@ -53,6 +53,8 @@ Durable helper rule: if you discover a reusable selector, site quirk, private AP Multi-item collection rule: when the task asks for many products, countries, people, records, plans, prices, links, or fields, maintain a checklist of every required row and field. Spend work across the checklist, not indefinitely on one difficult item. For each item/source, use a small number of targeted attempts, then either record the best verified value, mark it unavailable/unknown with the source and reason, or move to a better source. Do not keep varying one search term while other required rows are untouched. Before `done`, audit the checklist: every requested row/field must be filled, explicitly unavailable/unknown, or clearly reported as partial with the remaining gaps. +Single-site collection rule: when the task asks for data from one website, one vendor, one domain, or "a single website", choose one viable domain early and complete the checklist on that domain. Candidate scouting should be brief: verify the domain has the right category, currency, locale, or authority, then commit. Do not stitch rows from multiple domains, and do not keep vendor-hopping after a viable domain exists. Switch domains only when the current domain clearly cannot satisfy the requested category/currency/authority after a bounded check. If an item is missing on the committed domain, mark it unavailable for that domain and move to the next checklist row. + Use the browser to discover and verify. 
Once the browser reveals stable data endpoints, static links, downloadable assets, XHR/fetch patterns, or predictable pagination URLs, switch to `requests`, `http_get`, `fetch` inside `js`, or `ThreadPoolExecutor` for bulk extraction. For long extraction loops, split work into bounded chunks, use explicit timeouts, checkpoint partial results to files, and resume from checkpoints instead of restarting. Use one global deadline plus per-item micro timeouts, and check the global deadline before every navigation, wait, and sleep. Any loop over multiple pages/items must emit short progress every item or every 2 seconds, whichever comes first. For list/profile extraction, filter candidates before navigating when possible, and poll for record readiness rather than nullable answer fields; if a loaded record has a missing optional field, record it as missing and continue. Extract only task-relevant fields; do not emit full profile text, full DOM text, cookies, localStorage, or entire app caches unless smaller field-level extraction failed. Use `outputs_dir()` for generated result files; files written there are collected as artifacts automatically. Use `copy_artifact(path)` only for files created elsewhere, and `emit_image(path)` for screenshots or visual artifacts. When a task expects a large JSON/CSV/list output, write the full file; if the final answer must be inline structured content, return that content with `done(result=...)` and optionally include `result_file=path`, otherwise finish with `done(result_file=path)`. Use helper agents only when the user explicitly asks for sub-agents, delegation, or parallel agent work. Requests for depth, thoroughness, research, investigation, or detailed codebase analysis do not by themselves authorize spawning a helper. 
When delegation is authorized, give each helper a narrow, self-contained task that materially advances the work, keep urgent blocking work local, avoid duplicate helper work, and continue useful non-overlapping local work while the helper runs. Use the `explorer` role for authorized read-only repository questions and `worker` for authorized implementation work with a bounded write scope. From 33dce80c4b070e8625c6ba01a7cff5d1ad241351 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20=C5=BDuni=C4=8D?= <36313686+gregpr07@users.noreply.github.com> Date: Thu, 4 Jun 2026 06:30:38 +0000 Subject: [PATCH 23/48] Clarify viable single-site collection --- crates/browser-use-agent/src/prompts/tests.rs | 1 + prompts/browser-agent-system.md | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/crates/browser-use-agent/src/prompts/tests.rs b/crates/browser-use-agent/src/prompts/tests.rs index 2dd8769f..efe67c71 100644 --- a/crates/browser-use-agent/src/prompts/tests.rs +++ b/crates/browser-use-agent/src/prompts/tests.rs @@ -110,6 +110,7 @@ fn system_prompt_bounds_multi_item_collection_loops() { fn system_prompt_commits_single_site_collection_to_one_domain() { assert!(BASE_SYSTEM_PROMPT.contains("Single-site collection rule")); assert!(BASE_SYSTEM_PROMPT.contains("choose one viable domain early")); + assert!(BASE_SYSTEM_PROMPT.contains("do not keep searching for a perfect domain")); assert!(BASE_SYSTEM_PROMPT.contains("Do not stitch rows from multiple domains")); assert!(BASE_SYSTEM_PROMPT.contains("mark it unavailable for that domain")); } diff --git a/prompts/browser-agent-system.md b/prompts/browser-agent-system.md index e06235dc..434d673e 100644 --- a/prompts/browser-agent-system.md +++ b/prompts/browser-agent-system.md @@ -53,7 +53,7 @@ Durable helper rule: if you discover a reusable selector, site quirk, private AP Multi-item collection rule: when the task asks for many products, countries, people, records, plans, prices, links, or fields, maintain a checklist 
of every required row and field. Spend work across the checklist, not indefinitely on one difficult item. For each item/source, use a small number of targeted attempts, then either record the best verified value, mark it unavailable/unknown with the source and reason, or move to a better source. Do not keep varying one search term while other required rows are untouched. Before `done`, audit the checklist: every requested row/field must be filled, explicitly unavailable/unknown, or clearly reported as partial with the remaining gaps. -Single-site collection rule: when the task asks for data from one website, one vendor, one domain, or "a single website", choose one viable domain early and complete the checklist on that domain. Candidate scouting should be brief: verify the domain has the right category, currency, locale, or authority, then commit. Do not stitch rows from multiple domains, and do not keep vendor-hopping after a viable domain exists. Switch domains only when the current domain clearly cannot satisfy the requested category/currency/authority after a bounded check. If an item is missing on the committed domain, mark it unavailable for that domain and move to the next checklist row. +Single-site collection rule: when the task asks for data from one website, one vendor, one domain, or "a single website", choose one viable domain early and complete the checklist on that domain. Candidate scouting should be brief: verify the domain has the right category, currency, locale, or authority, then commit. If the task permits unavailable/missing rows, a domain is viable as soon as it has the requested category/source type and at least one requested row or a searchable catalog in the requested currency/locale; do not keep searching for a perfect domain that has every row. Do not stitch rows from multiple domains, and do not keep vendor-hopping after a viable domain exists. 
Switch domains only when the current domain clearly cannot satisfy the requested category/currency/authority after a bounded check. If an item is missing on the committed domain, mark it unavailable for that domain and move to the next checklist row. Use the browser to discover and verify. Once the browser reveals stable data endpoints, static links, downloadable assets, XHR/fetch patterns, or predictable pagination URLs, switch to `requests`, `http_get`, `fetch` inside `js`, or `ThreadPoolExecutor` for bulk extraction. For long extraction loops, split work into bounded chunks, use explicit timeouts, checkpoint partial results to files, and resume from checkpoints instead of restarting. Use one global deadline plus per-item micro timeouts, and check the global deadline before every navigation, wait, and sleep. Any loop over multiple pages/items must emit short progress every item or every 2 seconds, whichever comes first. For list/profile extraction, filter candidates before navigating when possible, and poll for record readiness rather than nullable answer fields; if a loaded record has a missing optional field, record it as missing and continue. Extract only task-relevant fields; do not emit full profile text, full DOM text, cookies, localStorage, or entire app caches unless smaller field-level extraction failed. Use `outputs_dir()` for generated result files; files written there are collected as artifacts automatically. Use `copy_artifact(path)` only for files created elsewhere, and `emit_image(path)` for screenshots or visual artifacts. When a task expects a large JSON/CSV/list output, write the full file; if the final answer must be inline structured content, return that content with `done(result=...)` and optionally include `result_file=path`, otherwise finish with `done(result_file=path)`. 
From 3ead5472fb7c2a75645c8375c61ff02c1165456d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20=C5=BDuni=C4=8D?= <36313686+gregpr07@users.noreply.github.com> Date: Thu, 4 Jun 2026 07:05:13 +0000 Subject: [PATCH 24/48] Nudge bounded agents to finish on final turn --- crates/browser-use-agent/src/turn/loop_driver.rs | 9 +++++++++ crates/browser-use-agent/src/turn/loop_tests.rs | 13 +++++++++++++ 2 files changed, 22 insertions(+) diff --git a/crates/browser-use-agent/src/turn/loop_driver.rs b/crates/browser-use-agent/src/turn/loop_driver.rs index 8a8ba1ac..db8d3011 100644 --- a/crates/browser-use-agent/src/turn/loop_driver.rs +++ b/crates/browser-use-agent/src/turn/loop_driver.rs @@ -79,8 +79,11 @@ use super::{CompactionMode, SamplingDriver, TurnObserver, TurnState}; use crate::decision::{self, LoopStep}; use crate::events::TurnCtx; use crate::task::{TurnAbortReason, TurnLifecycleEvent}; +use browser_use_llm::schema::{ContentPart, Message, MessageRole}; use tokio_util::sync::CancellationToken; +const FINAL_MAX_TURNS_NUDGE: &str = "This is the final allowed step for this run. Stop exploring and call the done tool with the best complete answer you can provide now. Include unknown or unavailable items explicitly instead of continuing to search."; + /// The async, unbounded turn-loop driver. Generic over the three frozen turn /// traits so production wires real impls (`ContextManager`+`Session`, /// `ModelSamplingDriver`, a `StoreSink`-backed observer) while tests inject @@ -175,6 +178,12 @@ impl TurnLoop { // `ContextManager` history; the loop simply threads it through. let mut request = self.state.clone_history_for_prompt().await; request.extend(input); + if max_turns.is_some_and(|limit| turns_run + 1 == limit) { + request.push(Message::new( + MessageRole::Developer, + vec![ContentPart::text(FINAL_MAX_TURNS_NUDGE)], + )); + } // ---- 2. 
run one sampling round-trip ---- let outcome = match self diff --git a/crates/browser-use-agent/src/turn/loop_tests.rs b/crates/browser-use-agent/src/turn/loop_tests.rs index de02413e..61205a3e 100644 --- a/crates/browser-use-agent/src/turn/loop_tests.rs +++ b/crates/browser-use-agent/src/turn/loop_tests.rs @@ -722,6 +722,7 @@ async fn bounded_loop_aborts_after_max_turns() { SamplingScript::Ok(complete("should not run")), ]); let requests = sampler.requests_handle(); + let inputs = sampler.inputs_handle(); let state = InMemoryTurnState::new(Vec::new(), token_status(false)); let observer = RecordingObserver::new(); @@ -733,6 +734,18 @@ async fn bounded_loop_aborts_after_max_turns() { assert_eq!(requests.load(Ordering::SeqCst), 2); assert_eq!(out.as_deref(), Some("step 1")); + let recorded_inputs = inputs.lock().unwrap(); + let Some(Message { + role: MessageRole::Developer, + content, + }) = recorded_inputs[1].last() + else { + panic!("last bounded request should include final-step developer nudge"); + }; + assert!( + matches!(content.first(), Some(ContentPart::Text { text }) if text.contains("final allowed step")), + "final nudge should tell the agent to finish" + ); assert_eq!(observer.kinds(), vec!["started", "aborted"]); let events = observer.events.lock().unwrap(); assert!(matches!( From b48c04b6b16c830c26bf1b94365d642729d03dd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20=C5=BDuni=C4=8D?= <36313686+gregpr07@users.noreply.github.com> Date: Thu, 4 Jun 2026 07:28:34 +0000 Subject: [PATCH 25/48] Shorten browser script observe threshold --- crates/browser-use-browser/src/lib.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/crates/browser-use-browser/src/lib.rs b/crates/browser-use-browser/src/lib.rs index 090eea09..3e632e9b 100644 --- a/crates/browser-use-browser/src/lib.rs +++ b/crates/browser-use-browser/src/lib.rs @@ -28,7 +28,7 @@ use tungstenite::{connect, Message, WebSocket}; const BU_API: &str = 
"https://api.browser-use.com/api/v3"; const LOG_LIMIT: usize = 250; const SCRIPT_MAX_OUTPUT_CHARS: usize = 120_000; -const BROWSER_SCRIPT_DEFAULT_INITIAL_WAIT_MS: u64 = 15_000; +const BROWSER_SCRIPT_DEFAULT_INITIAL_WAIT_MS: u64 = 7_000; const BROWSER_SCRIPT_DEFAULT_OBSERVE_MS: u64 = 1_000; const BROWSER_SCRIPT_HELPERS: &str = include_str!("browser_script_helpers.py"); @@ -8811,13 +8811,13 @@ print("http_get parity ok") } #[test] - fn browser_script_initial_wait_defaults_to_fifteen_seconds_and_clamps_env() { + fn browser_script_initial_wait_defaults_to_seven_seconds_and_clamps_env() { { let _env = EnvRestore::unset(&[ "BU_BROWSER_SCRIPT_INITIAL_WAIT_MS", "BROWSER_SCRIPT_INITIAL_WAIT_MS", ]); - assert_eq!(browser_script_initial_wait_ms(), 15_000); + assert_eq!(browser_script_initial_wait_ms(), 7_000); } { let _env = EnvRestore::set(&[("BU_BROWSER_SCRIPT_INITIAL_WAIT_MS", "1500")]); From 0cb95d7ed57c38864dea828ef81819b98509a236 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20=C5=BDuni=C4=8D?= <36313686+gregpr07@users.noreply.github.com> Date: Thu, 4 Jun 2026 08:04:33 +0000 Subject: [PATCH 26/48] Align done tool with Browser Use result fields --- .../src/tools/handlers/done.rs | 69 ++++++++++++++++--- .../src/tools/handlers/done_tests.rs | 40 +++++++++-- .../browser-use-agent/src/tools/registry.rs | 19 +++-- .../src/tools/registry_tests.rs | 12 +++- crates/browser-use-agent/src/turn/sampling.rs | 33 ++++++--- .../src/turn/sampling_tests.rs | 54 +++++++++++++++ 6 files changed, 194 insertions(+), 33 deletions(-) diff --git a/crates/browser-use-agent/src/tools/handlers/done.rs b/crates/browser-use-agent/src/tools/handlers/done.rs index 267dfaa0..6b38c8cf 100644 --- a/crates/browser-use-agent/src/tools/handlers/done.rs +++ b/crates/browser-use-agent/src/tools/handlers/done.rs @@ -31,9 +31,10 @@ //! //! * **Tool name** — `done` (the completion tool key). Mirrors the codex/legacy //! completion/`done` tool the agent calls to declare it has finished. -//! 
* **Args** — `{ "text"?: string }`: an optional free-text final summary -//! message. Codex's completion carries the final assistant text; we model the -//! summary as the single optional `text` field (omittable on the wire). +//! * **Args** — `{ "result"?: string, "text"?: string, "result_file"?: string }`: +//! an optional user-facing final answer, a legacy `text` alias, and an optional +//! result file pointer. Codex's completion carries the final assistant text; +//! Browser Use prompts call this `result`, so both names are accepted. //! * **no approval / benign** — like `update_plan`, this is a pure state echo: it //! needs no approval and touches no sandbox. We leave //! [`exec_approval_requirement`](Approvable::exec_approval_requirement) at its @@ -64,14 +65,21 @@ pub const DONE_STDOUT_PREFIX: &str = "done:"; /// Typed request for the `done` tool. /// -/// `text` is the optional final summary message the model carries when it -/// declares the task finished. `#[serde(default)]` so it may be omitted on the -/// wire; skipped on serialize when `None` to keep the echoed JSON tidy. +/// `result` is the canonical final answer. `text` remains accepted for legacy +/// callers, and `result_file` can point at a persisted artifact when the answer +/// is intentionally file-backed. All fields are optional so the model may still +/// declare done with no message. #[derive(Clone, Debug, Default, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub struct DoneRequest { - /// The final summary message (optional). + /// Canonical user-facing final answer. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub result: Option, + /// Legacy final summary alias. #[serde(default, skip_serializing_if = "Option::is_none")] pub text: Option, + /// Optional relative or absolute result artifact path. 
+ #[serde(default, skip_serializing_if = "Option::is_none")] + pub result_file: Option, } impl DoneRequest { @@ -79,12 +87,49 @@ impl DoneRequest { pub fn with_text(text: impl Into) -> Self { Self { text: Some(text.into()), + ..Self::default() } } - /// The final summary message, trimmed; empty when no (or blank) text. - pub fn summary(&self) -> &str { - self.text.as_deref().map(str::trim).unwrap_or("") + /// Convenience constructor with the canonical final answer field. + pub fn with_result(result: impl Into) -> Self { + Self { + result: Some(result.into()), + ..Self::default() + } + } + + /// The user-facing final answer, trimmed. + /// + /// `result` wins over legacy `text`. If both are blank and only a + /// `result_file` was supplied, expose a compact file-pointer summary so the + /// host has a visible completion result. + pub fn summary(&self) -> String { + if let Some(result) = self + .result + .as_deref() + .map(str::trim) + .filter(|value| !value.is_empty()) + { + return result.to_string(); + } + if let Some(text) = self + .text + .as_deref() + .map(str::trim) + .filter(|value| !value.is_empty()) + { + return text.to_string(); + } + if let Some(result_file) = self + .result_file + .as_deref() + .map(str::trim) + .filter(|value| !value.is_empty()) + { + return format!("Result file: {result_file}"); + } + String::new() } } @@ -106,7 +151,9 @@ impl DoneTool { /// so the key is rarely consulted; it exists to satisfy [`Approvable`] uniformly. 
#[derive(serde::Serialize, Clone, Debug, Eq, PartialEq, Hash)] pub struct DoneApprovalKey { + result: Option, text: Option, + result_file: Option, } impl Approvable for DoneTool { @@ -114,7 +161,9 @@ impl Approvable for DoneTool { fn approval_keys(&self, req: &DoneRequest) -> Vec { vec![DoneApprovalKey { + result: req.result.clone(), text: req.text.clone(), + result_file: req.result_file.clone(), }] } diff --git a/crates/browser-use-agent/src/tools/handlers/done_tests.rs b/crates/browser-use-agent/src/tools/handlers/done_tests.rs index 430e0267..198e493d 100644 --- a/crates/browser-use-agent/src/tools/handlers/done_tests.rs +++ b/crates/browser-use-agent/src/tools/handlers/done_tests.rs @@ -87,28 +87,56 @@ async fn done_without_text_yields_empty_summary() { assert_eq!(out.stdout, DONE_STDOUT_PREFIX); } -// ---- (3) the wire args deserialize from the model's `{ "text": ... }` ---- +// ---- (3) the wire args deserialize from Browser Use-style and legacy payloads ---- #[test] fn done_wire_args_round_trip() { - // Full form. - let req: DoneRequest = serde_json::from_value(serde_json::json!({ "text": "done now" })) + // Browser Use-style final result form. + let req: DoneRequest = serde_json::from_value(serde_json::json!({ "result": "done now" })) .expect("done deserialize"); - assert_eq!(req.text.as_deref(), Some("done now")); + assert_eq!(req.result.as_deref(), Some("done now")); assert_eq!(req.summary(), "done now"); - // Minimal: `text` omitted -> None (the model may declare done with no message). + // Legacy `text` remains accepted. + let legacy: DoneRequest = serde_json::from_value(serde_json::json!({ "text": "legacy done" })) + .expect("legacy done deserialize"); + assert_eq!(legacy.text.as_deref(), Some("legacy done")); + assert_eq!(legacy.summary(), "legacy done"); + + // `result` wins if both canonical and legacy fields are present. 
+ let both: DoneRequest = + serde_json::from_value(serde_json::json!({ "result": "canonical", "text": "legacy" })) + .expect("combined done deserialize"); + assert_eq!(both.summary(), "canonical"); + + // File-only completion still produces a visible host summary. + let file_only: DoneRequest = + serde_json::from_value(serde_json::json!({ "result_file": "outputs/answer.json" })) + .expect("file done deserialize"); + assert_eq!(file_only.summary(), "Result file: outputs/answer.json"); + + // Minimal: fields omitted -> None (the model may declare done with no message). let empty: DoneRequest = serde_json::from_value(serde_json::json!({})).expect("empty done deserialize"); + assert_eq!(empty.result, None); assert_eq!(empty.text, None); + assert_eq!(empty.result_file, None); assert_eq!(empty.summary(), ""); - // `text` is skipped on serialize when None. + // Empty fields are skipped on serialize. let json = serde_json::to_value(&DoneRequest::default()).unwrap(); + assert!( + json.get("result").is_none(), + "None result is skipped on serialize" + ); assert!( json.get("text").is_none(), "None text is skipped on serialize" ); + assert!( + json.get("result_file").is_none(), + "None result_file is skipped on serialize" + ); } // ---- (4) drive one call through the orchestrator over the seam ---- diff --git a/crates/browser-use-agent/src/tools/registry.rs b/crates/browser-use-agent/src/tools/registry.rs index b0693114..6cd18757 100644 --- a/crates/browser-use-agent/src/tools/registry.rs +++ b/crates/browser-use-agent/src/tools/registry.rs @@ -1103,22 +1103,29 @@ to the single frame that proves the task succeeded." } /// `done`: the completion tool the model calls to declare the task finished, - /// carrying its final summary. Parity: codex/legacy completion (`done`) tool - /// (`{ "text"?: string }`). The handler's - /// [`DoneRequest`](crate::tools::handlers::done::DoneRequest) accepts an - /// optional `text` summary. + /// carrying its final answer. 
The handler accepts Browser Use-style + /// `{ "result"?: string, "result_file"?: string }` and the legacy + /// `{ "text"?: string }` alias. pub fn done() -> ToolDefinition { ToolDefinition { name: "done".to_string(), description: - "Signal that the task is finished, with an optional final summary message." + "Signal that the task is finished, carrying the complete user-facing final answer." .to_string(), input_schema: json!({ "type": "object", "properties": { + "result": { + "type": "string", + "description": "The complete final answer to show the user or evaluator. Include all requested data here when the task asks for inline JSON, CSV, markdown, a table, links, or a schema-shaped response." + }, "text": { "type": "string", - "description": "The final summary message describing what was accomplished." + "description": "Legacy alias for result. Prefer result for new calls." + }, + "result_file": { + "type": "string", + "description": "Optional path to a saved final-result artifact when a file pointer satisfies the task or supplements the inline result." } }, "additionalProperties": false diff --git a/crates/browser-use-agent/src/tools/registry_tests.rs b/crates/browser-use-agent/src/tools/registry_tests.rs index 03060850..27ced53c 100644 --- a/crates/browser-use-agent/src/tools/registry_tests.rs +++ b/crates/browser-use-agent/src/tools/registry_tests.rs @@ -1073,7 +1073,7 @@ async fn done_dispatches_through_the_registry() { let out = reg .dispatch( "done", - &serde_json::json!({ "text": "task finished" }), + &serde_json::json!({ "result": "task finished" }), &ctx("done"), &env(), AskForApproval::Never, @@ -1091,4 +1091,14 @@ async fn done_dispatches_through_the_registry() { ); // done is serial (terminal). 
assert_eq!(reg.parallel_safe("done"), Some(false)); + + let done_def = reg + .model_visible_definitions() + .into_iter() + .find(|definition| definition.name == "done") + .expect("done definition"); + let properties = &done_def.input_schema["properties"]; + assert!(properties.get("result").is_some()); + assert!(properties.get("text").is_some()); + assert!(properties.get("result_file").is_some()); } diff --git a/crates/browser-use-agent/src/turn/sampling.rs b/crates/browser-use-agent/src/turn/sampling.rs index f62ff78a..527ffac7 100644 --- a/crates/browser-use-agent/src/turn/sampling.rs +++ b/crates/browser-use-agent/src/turn/sampling.rs @@ -690,18 +690,31 @@ fn calls_done_tool(tool_calls: &[ContentPart]) -> bool { /// The final summary carried by the model's `done` call, if any. /// -/// Reads the `text` field from the first `done` tool call's JSON arguments -/// (matching the `done` handler's `DoneRequest { text }`). Returns `None` when -/// there is no `done` call or it carried no (non-empty) summary, so the caller -/// only overrides the turn result when there is a real message to surface. +/// Reads the `result` field from the first `done` tool call's JSON arguments, +/// falling back to the legacy `text` alias and then to a compact `result_file` +/// pointer. Returns `None` when there is no `done` call or it carried no +/// non-empty completion payload, so the caller only overrides the turn result +/// when there is a real message to surface. fn done_summary(tool_calls: &[ContentPart]) -> Option { tool_calls.iter().find_map(|p| match p { - ContentPart::ToolCall { name, input, .. } if name == DONE_TOOL_NAME => input - .get("text") - .and_then(|t| t.as_str()) - .map(str::trim) - .filter(|s| !s.is_empty()) - .map(str::to_string), + ContentPart::ToolCall { name, input, .. 
} if name == DONE_TOOL_NAME => { + for field in ["result", "text"] { + if let Some(value) = input + .get(field) + .and_then(|value| value.as_str()) + .map(str::trim) + .filter(|value| !value.is_empty()) + { + return Some(value.to_string()); + } + } + input + .get("result_file") + .and_then(|value| value.as_str()) + .map(str::trim) + .filter(|value| !value.is_empty()) + .map(|path| format!("Result file: {path}")) + } _ => None, }) } diff --git a/crates/browser-use-agent/src/turn/sampling_tests.rs b/crates/browser-use-agent/src/turn/sampling_tests.rs index eaee3693..3f5fa685 100644 --- a/crates/browser-use-agent/src/turn/sampling_tests.rs +++ b/crates/browser-use-agent/src/turn/sampling_tests.rs @@ -219,6 +219,15 @@ fn tool_call(name: &str) -> Result { }) } +fn tool_call_with_input(name: &str, input: serde_json::Value) -> Result { + Ok(LlmEvent::ToolCall { + id: "call-1".to_string(), + name: name.to_string(), + namespace: None, + input, + }) +} + fn finish(reason: FinishReason) -> Result { Ok(LlmEvent::Finish { usage: Usage { @@ -771,6 +780,51 @@ async fn fused_driver_advertises_dispatcher_tool_specs_on_request() { ); } +#[tokio::test] +async fn fused_done_result_becomes_final_message_without_follow_up() { + use crate::turn::dispatch::ToolDispatcher; + use crate::turn::sampling::FusionRecorder; + + let specs = vec![tool_def("done")]; + let dispatcher = Arc::new(ToolDispatcher::with_runner_and_specs( + NoopRunner, /* model_supports */ true, specs, + )); + let (transport, _opens) = ScriptedTransport::new(vec![OpenScript::Stream(vec![ + tool_call_with_input( + "done", + serde_json::json!({ + "result": "full table answer", + "text": "legacy summary" + }), + ), + finish(FinishReason::ToolUse), + ])]); + let sink: Arc = Arc::new(RecordingSink::default()); + let recorder: Arc = Arc::new(NoopRecorder); + let d = ModelSamplingDriver::new(transport, sink, ctx(), 5) + .without_jitter() + .with_fusion(dispatcher, recorder); + + let out = d + .run_sampling_request(user_input(), 
CancellationToken::new()) + .await + .expect("sampling should succeed"); + + assert!( + !out.model_needs_follow_up, + "done must terminate the fused turn instead of requesting another sample" + ); + assert_eq!( + out.last_agent_message.as_deref(), + Some("full table answer"), + "canonical done.result must be surfaced over the legacy text alias" + ); + assert!( + out.defers_mailbox_delivery_to_next_turn, + "terminal done output is the final-answer boundary" + ); +} + #[tokio::test] async fn text_only_driver_sends_no_tool_specs() { // The text-only driver (no dispatcher) must send NO tools — codex sends the From 235480125b3ecd39f268e8a67c2a0403bf4f6b72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20=C5=BDuni=C4=8D?= <36313686+gregpr07@users.noreply.github.com> Date: Thu, 4 Jun 2026 10:05:05 +0000 Subject: [PATCH 27/48] Cap browser script stdout for model context --- .../src/tools/handlers/browser.rs | 20 ++++++++- .../src/tools/handlers/browser_tests.rs | 45 +++++++++++++++++-- 2 files changed, 61 insertions(+), 4 deletions(-) diff --git a/crates/browser-use-agent/src/tools/handlers/browser.rs b/crates/browser-use-agent/src/tools/handlers/browser.rs index c6568fd9..eb3ca1f4 100644 --- a/crates/browser-use-agent/src/tools/handlers/browser.rs +++ b/crates/browser-use-agent/src/tools/handlers/browser.rs @@ -73,6 +73,7 @@ pub const DEFAULT_OBSERVE_TIMEOUT_MS: u64 = 1_000; /// [`ContentPart`]s so provider protocols can send images to vision-capable /// models while preserving a plain text fallback for logs/tests. 
pub const BROWSER_SCRIPT_CONTENT_STDOUT_PREFIX: &str = "\n__browser_script_content__:"; +pub const MAX_INLINE_BROWSER_SCRIPT_STDOUT_BYTES: usize = 16 * 1024; const BROWSER_PREF_MODE: &str = "browser.preference.mode"; const BROWSER_PREF_PROFILE: &str = "browser.preference.profile"; @@ -1337,13 +1338,30 @@ fn map_script_output(out: BrowserScriptOutput) -> ExecOutput { fn browser_script_stdout(response: &BrowserScriptOutput) -> String { let text = browser_script_tool_message_content(response); let (image_parts, warnings) = browser_script_image_parts(response); - let text = append_browser_script_image_warnings(text, &warnings); + let text = + cap_inline_browser_script_stdout(append_browser_script_image_warnings(text, &warnings)); let Some(payload) = browser_script_content_payload(&text, image_parts) else { return text; }; format!("{text}{BROWSER_SCRIPT_CONTENT_STDOUT_PREFIX}{payload}") } +fn cap_inline_browser_script_stdout(text: String) -> String { + if text.len() <= MAX_INLINE_BROWSER_SCRIPT_STDOUT_BYTES { + return text; + } + let mut end = MAX_INLINE_BROWSER_SCRIPT_STDOUT_BYTES; + while end > 0 && !text.is_char_boundary(end) { + end -= 1; + } + let elided = text.len() - end; + let mut out = text[..end].to_string(); + out.push_str(&format!( + "\n... 
[browser_script stdout truncated, {elided} more bytes; full output persisted]" + )); + out +} + fn browser_script_content_payload(text: &str, image_parts: Vec) -> Option { if image_parts.is_empty() { return None; diff --git a/crates/browser-use-agent/src/tools/handlers/browser_tests.rs b/crates/browser-use-agent/src/tools/handlers/browser_tests.rs index 7e871b14..d3f458ff 100644 --- a/crates/browser-use-agent/src/tools/handlers/browser_tests.rs +++ b/crates/browser-use-agent/src/tools/handlers/browser_tests.rs @@ -24,6 +24,7 @@ use serde_json::json; use super::browser::{ browser_command_is_passive, desired_browser_connect_command, BrowserAction, BrowserBackend, BrowserRequest, BrowserTool, BROWSER_SCRIPT_CONTENT_STDOUT_PREFIX, + MAX_INLINE_BROWSER_SCRIPT_STDOUT_BYTES, }; use crate::session::SharedStore; use crate::tools::approval::AskForApproval; @@ -55,6 +56,7 @@ struct FakeBackend { last_session: Mutex>, last_paths: Mutex>, last_timeout_secs: Mutex>, + script_text: Mutex>, script_images: Mutex>, fail: bool, } @@ -99,6 +101,10 @@ impl FakeBackend { *self.last_timeout_secs.lock().unwrap() } + fn set_script_text(&self, text: impl Into) { + *self.script_text.lock().unwrap() = Some(text.into()); + } + fn record_paths(&self, cwd: &std::path::Path, artifact_dir: &std::path::Path) { *self.last_paths.lock().unwrap() = Some((cwd.to_path_buf(), artifact_dir.to_path_buf())); } @@ -114,20 +120,26 @@ impl FakeBackend { } } - fn ok_script(status: Option<&str>, ok: bool) -> BrowserScriptOutput { + fn ok_script_with_text(status: Option<&str>, ok: bool, text: String) -> BrowserScriptOutput { BrowserScriptOutput { ok, status: status.map(|s| s.to_string()), run_id: Some("run-1".to_string()), - text: "script-output".to_string(), + text, ..Default::default() } } fn ok_script_with_images(&self, status: Option<&str>, ok: bool) -> BrowserScriptOutput { + let text = self + .script_text + .lock() + .unwrap() + .clone() + .unwrap_or_else(|| "script-output".to_string()); BrowserScriptOutput 
{ images: self.script_images(), - ..Self::ok_script(status, ok) + ..Self::ok_script_with_text(status, ok, text) } } } @@ -620,6 +632,33 @@ async fn script_images_are_appended_as_structured_stdout_payload() { assert!(media.2.is_none()); } +#[tokio::test] +async fn script_oversized_stdout_is_truncated_for_model_output() { + let backend = Arc::new(FakeBackend::default()); + backend.set_script_text("x".repeat(MAX_INLINE_BROWSER_SCRIPT_STDOUT_BYTES + 5_000)); + let tool = tool_with(Arc::clone(&backend)); + + let req = BrowserRequest::execute("sess-1", "document.body.innerText", false); + let out = run_direct(&tool, &req).await.unwrap(); + + assert_eq!(out.exit_code, 0); + assert!( + out.stdout.len() < MAX_INLINE_BROWSER_SCRIPT_STDOUT_BYTES + 1_000, + "stdout should be capped, got {} bytes", + out.stdout.len() + ); + assert!( + out.stdout.contains("[browser_script stdout truncated"), + "stdout: {}", + out.stdout + ); + assert!( + !out.stdout + .contains(&"x".repeat(MAX_INLINE_BROWSER_SCRIPT_STDOUT_BYTES + 100)), + "uncapped browser_script output leaked into model stdout" + ); +} + #[tokio::test] async fn script_unreadable_images_warn_in_stdout() { let temp = tempfile::tempdir().expect("tempdir"); From 25bad16ef6c268b9b2af464151134b10aea604fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20=C5=BDuni=C4=8D?= <36313686+gregpr07@users.noreply.github.com> Date: Thu, 4 Jun 2026 10:34:30 +0000 Subject: [PATCH 28/48] Lower browser script model output cap --- crates/browser-use-agent/src/tools/handlers/browser.rs | 7 ++++++- .../browser-use-agent/src/tools/handlers/browser_tests.rs | 5 +++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/crates/browser-use-agent/src/tools/handlers/browser.rs b/crates/browser-use-agent/src/tools/handlers/browser.rs index eb3ca1f4..64eb06b2 100644 --- a/crates/browser-use-agent/src/tools/handlers/browser.rs +++ b/crates/browser-use-agent/src/tools/handlers/browser.rs @@ -73,7 +73,12 @@ pub const DEFAULT_OBSERVE_TIMEOUT_MS: u64 = 
1_000; /// [`ContentPart`]s so provider protocols can send images to vision-capable /// models while preserving a plain text fallback for logs/tests. pub const BROWSER_SCRIPT_CONTENT_STDOUT_PREFIX: &str = "\n__browser_script_content__:"; -pub const MAX_INLINE_BROWSER_SCRIPT_STDOUT_BYTES: usize = 16 * 1024; +/// Maximum bytes of browser-script text returned to the next model turn. +/// +/// Full browser-script output is persisted through durable events/artifacts; the +/// inline model view is deliberately smaller because long eval tasks repeatedly +/// carry every prior tool result in later prompts. +pub const MAX_INLINE_BROWSER_SCRIPT_STDOUT_BYTES: usize = 4 * 1024; const BROWSER_PREF_MODE: &str = "browser.preference.mode"; const BROWSER_PREF_PROFILE: &str = "browser.preference.profile"; diff --git a/crates/browser-use-agent/src/tools/handlers/browser_tests.rs b/crates/browser-use-agent/src/tools/handlers/browser_tests.rs index d3f458ff..44611fde 100644 --- a/crates/browser-use-agent/src/tools/handlers/browser_tests.rs +++ b/crates/browser-use-agent/src/tools/handlers/browser_tests.rs @@ -659,6 +659,11 @@ async fn script_oversized_stdout_is_truncated_for_model_output() { ); } +#[test] +fn browser_script_stdout_cap_defaults_to_four_kib_for_eval_cost() { + assert_eq!(MAX_INLINE_BROWSER_SCRIPT_STDOUT_BYTES, 4 * 1024); +} + #[tokio::test] async fn script_unreadable_images_warn_in_stdout() { let temp = tempfile::tempdir().expect("tempdir"); From 297930aba37979a6603dfcd0438398ed106f6709 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20=C5=BDuni=C4=8D?= <36313686+gregpr07@users.noreply.github.com> Date: Thu, 4 Jun 2026 10:52:51 +0000 Subject: [PATCH 29/48] Preserve browser script summaries under stdout cap --- .../src/tools/handlers/browser.rs | 14 +++--- .../src/tools/handlers/browser_tests.rs | 50 +++++++++++++++++++ 2 files changed, 57 insertions(+), 7 deletions(-) diff --git a/crates/browser-use-agent/src/tools/handlers/browser.rs 
b/crates/browser-use-agent/src/tools/handlers/browser.rs index 64eb06b2..5cf5832f 100644 --- a/crates/browser-use-agent/src/tools/handlers/browser.rs +++ b/crates/browser-use-agent/src/tools/handlers/browser.rs @@ -1362,7 +1362,7 @@ fn cap_inline_browser_script_stdout(text: String) -> String { let elided = text.len() - end; let mut out = text[..end].to_string(); out.push_str(&format!( - "\n... [browser_script stdout truncated, {elided} more bytes; full output persisted]" + "\n... [browser_script stdout truncated, {elided} more bytes; full output persisted. Use a narrower browser_script extraction, the emitted summaries, or a saved artifact instead of re-reading broad page text.]" )); out } @@ -1519,12 +1519,6 @@ fn browser_script_failure_message(response: &BrowserScriptOutput) -> String { fn browser_script_structured_message_parts(response: &BrowserScriptOutput) -> Vec { let mut parts = Vec::new(); - if !response.outputs.is_empty() { - parts.push(format!( - "outputs: {}", - Value::Array(response.outputs.clone()) - )); - } if !response.summary.is_empty() { parts.push(format!( "summary: {}", @@ -1534,6 +1528,12 @@ fn browser_script_structured_message_parts(response: &BrowserScriptOutput) -> Ve if !response.data.is_null() && response.data != serde_json::json!({}) { parts.push(format!("data: {}", response.data)); } + if !response.outputs.is_empty() { + parts.push(format!( + "outputs: {}", + Value::Array(response.outputs.clone()) + )); + } parts } diff --git a/crates/browser-use-agent/src/tools/handlers/browser_tests.rs b/crates/browser-use-agent/src/tools/handlers/browser_tests.rs index 44611fde..a3e27282 100644 --- a/crates/browser-use-agent/src/tools/handlers/browser_tests.rs +++ b/crates/browser-use-agent/src/tools/handlers/browser_tests.rs @@ -57,6 +57,8 @@ struct FakeBackend { last_paths: Mutex>, last_timeout_secs: Mutex>, script_text: Mutex>, + script_outputs: Mutex>, + script_summary: Mutex>, script_images: Mutex>, fail: bool, } @@ -138,6 +140,8 @@ impl 
FakeBackend { .clone() .unwrap_or_else(|| "script-output".to_string()); BrowserScriptOutput { + outputs: self.script_outputs.lock().unwrap().clone(), + summary: self.script_summary.lock().unwrap().clone(), images: self.script_images(), ..Self::ok_script_with_text(status, ok, text) } @@ -659,6 +663,52 @@ async fn script_oversized_stdout_is_truncated_for_model_output() { ); } +#[tokio::test] +async fn script_truncated_structured_output_preserves_summary_first() { + let backend = Arc::new(FakeBackend::default()); + backend.script_summary.lock().unwrap().push(json!({ + "kind": "extracted", + "message": "Read 40 candidate rows", + "output_label": "candidate_rows" + })); + backend.script_outputs.lock().unwrap().push(json!({ + "label": "candidate_rows", + "value": "x".repeat(MAX_INLINE_BROWSER_SCRIPT_STDOUT_BYTES + 8_000) + })); + let tool = tool_with(Arc::clone(&backend)); + + let req = BrowserRequest::execute("sess-1", "emit_output(rows, label='candidate_rows')", false); + let out = run_direct(&tool, &req).await.unwrap(); + + assert_eq!(out.exit_code, 0); + assert!( + out.stdout.contains("summary:"), + "summary should remain visible before large raw output: {}", + out.stdout + ); + assert!( + out.stdout.contains("Read 40 candidate rows"), + "stdout: {}", + out.stdout + ); + assert!( + out.stdout.contains("[browser_script stdout truncated"), + "stdout: {}", + out.stdout + ); + assert!( + out.stdout + .contains("Use a narrower browser_script extraction"), + "stdout: {}", + out.stdout + ); + assert!( + out.stdout.find("summary:") < out.stdout.find("outputs:"), + "summary should precede raw outputs: {}", + out.stdout + ); +} + #[test] fn browser_script_stdout_cap_defaults_to_four_kib_for_eval_cost() { assert_eq!(MAX_INLINE_BROWSER_SCRIPT_STDOUT_BYTES, 4 * 1024); From 21a180aa14be723163ff4a84d761848d49a20d3e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20=C5=BDuni=C4=8D?= <36313686+gregpr07@users.noreply.github.com> Date: Thu, 4 Jun 2026 11:14:00 +0000 Subject: [PATCH 
30/48] Add batch browser fetch helpers --- crates/browser-use-agent/src/prompts/tests.rs | 9 +- .../src/browser_script_helpers.py | 231 ++++++++++++++++++ crates/browser-use-browser/src/lib.rs | 70 ++++++ prompts/browser-agent-system.md | 4 +- prompts/browser-script-tool-description.md | 5 +- 5 files changed, 315 insertions(+), 4 deletions(-) diff --git a/crates/browser-use-agent/src/prompts/tests.rs b/crates/browser-use-agent/src/prompts/tests.rs index efe67c71..c015e432 100644 --- a/crates/browser-use-agent/src/prompts/tests.rs +++ b/crates/browser-use-agent/src/prompts/tests.rs @@ -164,13 +164,20 @@ fn browser_tool_descriptions_preserve_interaction_skills() { script.contains("js(function_source, *args)"), "browser_script description lost js argument helper guidance" ); + assert!( + script.contains("http_get_many(urls, **kwargs)") + && script.contains("browser_fetch_many(requests, **kwargs)"), + "browser_script description lost batch/direct fetch helper guidance" + ); // The base system prompt enumerates the page-interaction helpers, including // the screenshot/image helpers used for visual inspection. 
assert!( BASE_SYSTEM_PROMPT.contains("capture_screenshot") && BASE_SYSTEM_PROMPT.contains("emit_image") - && BASE_SYSTEM_PROMPT.contains("js(function_source, *args)"), + && BASE_SYSTEM_PROMPT.contains("js(function_source, *args)") + && BASE_SYSTEM_PROMPT.contains("http_get_many") + && BASE_SYSTEM_PROMPT.contains("browser_fetch_many"), "base system prompt lost its screenshot/image interaction helpers" ); } diff --git a/crates/browser-use-browser/src/browser_script_helpers.py b/crates/browser-use-browser/src/browser_script_helpers.py index 083c97ea..b223ed16 100644 --- a/crates/browser-use-browser/src/browser_script_helpers.py +++ b/crates/browser-use-browser/src/browser_script_helpers.py @@ -15,6 +15,7 @@ import time as _time import urllib.error import urllib.request +from concurrent.futures import ThreadPoolExecutor, as_completed from urllib.parse import urlparse @@ -1016,3 +1017,233 @@ def http_get(url, headers=None, timeout=20.0, binary=None): raise RuntimeError( f"http_get failed for {url}: {exc}. Try a shorter timeout, browser js(fetch(...)), or a configured proxy if the site blocks direct HTTP." ) from exc + + +def http_get_many(urls, headers=None, timeout=20.0, binary=None, max_workers=8, return_errors=True): + """Fetch many independent URLs with http_get while preserving input order. + + By default one failed URL becomes {"ok": False, "url": ..., "error": ...} + instead of failing the whole batch. Set return_errors=False when every URL is + required and the caller should abort on the first failure. 
+ """ + items = list(urls) + if not items: + return [] + workers = max(1, min(int(max_workers or 1), len(items))) + results = [None] * len(items) + + def fetch_one(index, item): + if isinstance(item, dict): + request_url = item["url"] + request_headers = dict(headers or {}) + request_headers.update(item.get("headers") or {}) + request_timeout = item.get("timeout", timeout) + request_binary = item.get("binary", binary) + else: + request_url = str(item) + request_headers = headers + request_timeout = timeout + request_binary = binary + return index, request_url, http_get( + request_url, + headers=request_headers, + timeout=request_timeout, + binary=request_binary, + ) + + with ThreadPoolExecutor(max_workers=workers) as pool: + futures = [pool.submit(fetch_one, index, item) for index, item in enumerate(items)] + for future in as_completed(futures): + try: + index, _url, response = future.result() + results[index] = response + except Exception as exc: + index = futures.index(future) + item = items[index] + request_url = item.get("url") if isinstance(item, dict) else str(item) + if not return_errors: + raise + results[index] = {"ok": False, "url": request_url, "error": str(exc)} + return results + + +def _normalize_browser_fetch_request( + url, + method="GET", + headers=None, + body=None, + json_body=None, + timeout=20.0, + binary=None, +): + request_headers = dict(headers or {}) + request_body = body + if json_body is not None: + request_body = json.dumps(json_body) + if not any(k.lower() == "content-type" for k in request_headers): + request_headers["Content-Type"] = "application/json" + if isinstance(request_body, (dict, list)): + request_body = json.dumps(request_body) + if not any(k.lower() == "content-type" for k in request_headers): + request_headers["Content-Type"] = "application/json" + if isinstance(request_body, bytes): + request_body = request_body.decode("latin1") + return { + "url": str(url), + "method": str(method or "GET").upper(), + "headers": 
request_headers, + "body": request_body, + "timeout_ms": int(float(timeout) * 1000), + "binary": bool(binary), + } + + +def _browser_fetch_response(result, return_error=False): + if not isinstance(result, dict): + if return_error: + return {"ok": False, "url": None, "error": f"invalid browser_fetch result: {result!r}"} + raise RuntimeError(f"invalid browser_fetch result: {result!r}") + if not result.get("ok"): + if return_error: + return { + "ok": False, + "url": result.get("url"), + "error": result.get("error", "browser_fetch failed"), + } + raise RuntimeError(f"browser_fetch failed for {result.get('url')}: {result.get('error')}") + headers = result.get("headers") or {} + status = result.get("status") + url = result.get("url") + if result.get("binary"): + body = base64.b64decode(result.get("body_b64") or "") + return _HttpGetBytes(body, status, headers, url) + return _HttpGetText(result.get("body") or "", status, headers, url) + + +def browser_fetch( + url, + method="GET", + headers=None, + body=None, + json_body=None, + timeout=20.0, + binary=None, +): + """Fetch from the current page context with browser cookies/session state.""" + request = _normalize_browser_fetch_request( + url, + method=method, + headers=headers, + body=body, + json_body=json_body, + timeout=timeout, + binary=binary, + ) + return browser_fetch_many([request], timeout=timeout, return_errors=False)[0] + + +def browser_fetch_many(requests, timeout=20.0, max_concurrency=6, return_errors=True): + """Fetch many URLs from the current page context, preserving order. + + Each item may be a URL string or a dict with url/method/headers/body/json_body/ + timeout/binary. This is useful after the page reveals stable endpoints but + direct http_get lacks cookies, auth headers, or browser-only access. 
+ """ + normalized = [] + for item in list(requests): + if isinstance(item, dict): + normalized.append( + _normalize_browser_fetch_request( + item["url"], + method=item.get("method", "GET"), + headers=item.get("headers"), + body=item.get("body"), + json_body=item.get("json_body"), + timeout=item.get("timeout", timeout), + binary=item.get("binary"), + ) + ) + else: + normalized.append(_normalize_browser_fetch_request(item, timeout=timeout)) + if not normalized: + return [] + + expression = f""" +(async () => {{ + const requests = {json.dumps(normalized)}; + const maxConcurrency = Math.max(1, Math.min({int(max_concurrency or 1)}, requests.length)); + function arrayBufferToBase64(buffer) {{ + const bytes = new Uint8Array(buffer); + let binary = ""; + const chunkSize = 0x8000; + for (let i = 0; i < bytes.length; i += chunkSize) {{ + const chunk = bytes.subarray(i, i + chunkSize); + binary += String.fromCharCode.apply(null, chunk); + }} + return btoa(binary); + }} + async function fetchOne(request) {{ + const controller = new AbortController(); + const timeoutMs = Math.max(1, Number(request.timeout_ms || 20000)); + const timer = setTimeout(() => controller.abort(), timeoutMs); + try {{ + const options = {{ + method: request.method || "GET", + headers: request.headers || {{}}, + credentials: "include", + signal: controller.signal + }}; + if (request.body !== null && request.body !== undefined) {{ + options.body = request.body; + }} + const response = await fetch(request.url, options); + const headers = {{}}; + response.headers.forEach((value, key) => {{ headers[key] = value; }}); + if (request.binary) {{ + const buffer = await response.arrayBuffer(); + return {{ + ok: true, + response_ok: response.ok, + status: response.status, + statusText: response.statusText, + url: response.url, + headers, + binary: true, + body_b64: arrayBufferToBase64(buffer) + }}; + }} + const body = await response.text(); + return {{ + ok: true, + response_ok: response.ok, + status: 
response.status, + statusText: response.statusText, + url: response.url, + headers, + binary: false, + body + }}; + }} catch (error) {{ + return {{ + ok: false, + url: request.url, + error: String(error && (error.message || error)) + }}; + }} finally {{ + clearTimeout(timer); + }} + }} + const results = new Array(requests.length); + let next = 0; + async function worker() {{ + while (next < requests.length) {{ + const index = next++; + results[index] = await fetchOne(requests[index]); + }} + }} + await Promise.all(Array.from({{length: maxConcurrency}}, worker)); + return results; +}})() +""" + raw_results = _runtime_evaluate(expression, await_promise=True, return_by_value=True) + return [_browser_fetch_response(result, return_error=return_errors) for result in raw_results] diff --git a/crates/browser-use-browser/src/lib.rs b/crates/browser-use-browser/src/lib.rs index 3e632e9b..63a17a33 100644 --- a/crates/browser-use-browser/src/lib.rs +++ b/crates/browser-use-browser/src/lib.rs @@ -8772,6 +8772,76 @@ print("http_get parity ok") assert!(output.text.contains("http_get parity ok")); } + #[test] + fn browser_script_http_get_many_preserves_order_and_errors() { + let temp = tempfile::tempdir().unwrap(); + let output = run_browser_script( + "script-http-get-many", + temp.path(), + temp.path().join("artifacts"), + r#" +import http.server +import socketserver +import threading + +class Handler(http.server.BaseHTTPRequestHandler): + def log_message(self, fmt, *args): + pass + + def do_GET(self): + if self.path in ("/one", "/two"): + assert self.headers.get("X-Shared") == "yes", dict(self.headers) + if self.path == "/one": + assert self.headers.get("X-Item") == "one", dict(self.headers) + body = self.path.strip("/").encode() + self.send_response(200) + self.send_header("Content-Type", "text/plain; charset=utf-8") + self.send_header("Content-Length", str(len(body))) + self.end_headers() + self.wfile.write(body) + return + self.send_response(404) + self.end_headers() + 
+server = socketserver.ThreadingTCPServer(("127.0.0.1", 0), Handler) +thread = threading.Thread(target=server.serve_forever, daemon=True) +thread.start() +base = f"http://127.0.0.1:{server.server_address[1]}" +try: + results = http_get_many( + [base + "/two", {"url": base + "/one", "headers": {"X-Item": "one"}}, base + "/missing"], + headers={"X-Shared": "yes"}, + max_workers=3, + ) + assert len(results) == 3, results + assert results[0] == "two", results + assert results[0].status_code == 200 + assert results[1] == "one", results + assert results[1].url.endswith("/one") + assert results[2]["ok"] is False, results[2] + assert results[2]["url"].endswith("/missing"), results[2] + try: + http_get_many([base + "/missing"], return_errors=False) + except RuntimeError: + pass + else: + raise AssertionError("return_errors=False should raise") +finally: + server.shutdown() + server.server_close() + +assert callable(browser_fetch) +assert callable(browser_fetch_many) +print("http_get_many parity ok") +"#, + 10, + ) + .unwrap(); + + assert!(output.ok, "{:?}\n{}", output.error, output.text); + assert!(output.text.contains("http_get_many parity ok")); + } + #[test] fn browser_script_timeout_returns_tool_failure() { let temp = tempfile::tempdir().unwrap(); diff --git a/prompts/browser-agent-system.md b/prompts/browser-agent-system.md index 434d673e..06a96f65 100644 --- a/prompts/browser-agent-system.md +++ b/prompts/browser-agent-system.md @@ -4,7 +4,7 @@ Raw CDP is the center of page interaction. Treat `cdp("Domain.method", ...)` ins The `browser` tool behaves like a CLI for browser runtime management. Use it for `browser status --json`, `browser connect local`, `browser local setup`, `browser connect managed`, `browser remote start`, `browser doctor`, explicit recovery, profile summaries, runtime logs, and ownership checks. It does not interact with pages. -The `browser_script` tool runs fresh Python in a browser-connected environment. 
Browser/CDP state persists in Rust; Python variables do not persist across calls. Important helpers include `cdp`, `new_tab`, `goto_url`, `page_info`, `js`, `capture_screenshot`, `screenshot`, `screenshot_clip`, `emit_image`, `click_at_xy`, `fill_input`, `type_text`, `press_key`, `scroll`, `wait_for_load`, `wait_for_element`, `wait_for_network_idle`, `current_tab`, `list_tabs`, `switch_tab`, `ensure_real_tab`, `upload_file`, `drain_events`, `http_get`, `copy_artifact`, `artifact_root`, `outputs_dir`, `session_metadata`, `audit_artifact`, `agent_workspace`, `load_agent_helpers`, `domain_skills_for_url`, and `last_domain_skills`. Use `js(function_source, *args)` when passing JSON-serializable Python values into JavaScript; use `target_id=` as a keyword for iframe targets. +The `browser_script` tool runs fresh Python in a browser-connected environment. Browser/CDP state persists in Rust; Python variables do not persist across calls. Important helpers include `cdp`, `new_tab`, `goto_url`, `page_info`, `js`, `capture_screenshot`, `screenshot`, `screenshot_clip`, `emit_image`, `click_at_xy`, `fill_input`, `type_text`, `press_key`, `scroll`, `wait_for_load`, `wait_for_element`, `wait_for_network_idle`, `current_tab`, `list_tabs`, `switch_tab`, `ensure_real_tab`, `upload_file`, `drain_events`, `http_get`, `http_get_many`, `browser_fetch`, `browser_fetch_many`, `copy_artifact`, `artifact_root`, `outputs_dir`, `session_metadata`, `audit_artifact`, `agent_workspace`, `load_agent_helpers`, `domain_skills_for_url`, and `last_domain_skills`. Use `js(function_source, *args)` when passing JSON-serializable Python values into JavaScript; use `target_id=` as a keyword for iframe targets. `browser_script` has a start/listen lifecycle. A fast call returns final output immediately. A longer call returns `status: running` plus `run_id`; observe it with `action="observe"` until final status. If observe returns no new output for its wait window, back off instead of polling constantly. 
Images/artifacts emitted by the running script are returned by observe as soon as they exist. Use `action="cancel"` with the `run_id` only when the running script is no longer useful. @@ -55,6 +55,6 @@ Multi-item collection rule: when the task asks for many products, countries, peo Single-site collection rule: when the task asks for data from one website, one vendor, one domain, or "a single website", choose one viable domain early and complete the checklist on that domain. Candidate scouting should be brief: verify the domain has the right category, currency, locale, or authority, then commit. If the task permits unavailable/missing rows, a domain is viable as soon as it has the requested category/source type and at least one requested row or a searchable catalog in the requested currency/locale; do not keep searching for a perfect domain that has every row. Do not stitch rows from multiple domains, and do not keep vendor-hopping after a viable domain exists. Switch domains only when the current domain clearly cannot satisfy the requested category/currency/authority after a bounded check. If an item is missing on the committed domain, mark it unavailable for that domain and move to the next checklist row. -Use the browser to discover and verify. Once the browser reveals stable data endpoints, static links, downloadable assets, XHR/fetch patterns, or predictable pagination URLs, switch to `requests`, `http_get`, `fetch` inside `js`, or `ThreadPoolExecutor` for bulk extraction. For long extraction loops, split work into bounded chunks, use explicit timeouts, checkpoint partial results to files, and resume from checkpoints instead of restarting. Use one global deadline plus per-item micro timeouts, and check the global deadline before every navigation, wait, and sleep. Any loop over multiple pages/items must emit short progress every item or every 2 seconds, whichever comes first. 
For list/profile extraction, filter candidates before navigating when possible, and poll for record readiness rather than nullable answer fields; if a loaded record has a missing optional field, record it as missing and continue. Extract only task-relevant fields; do not emit full profile text, full DOM text, cookies, localStorage, or entire app caches unless smaller field-level extraction failed. Use `outputs_dir()` for generated result files; files written there are collected as artifacts automatically. Use `copy_artifact(path)` only for files created elsewhere, and `emit_image(path)` for screenshots or visual artifacts. When a task expects a large JSON/CSV/list output, write the full file; if the final answer must be inline structured content, return that content with `done(result=...)` and optionally include `result_file=path`, otherwise finish with `done(result_file=path)`. +Use the browser to discover and verify. Once the browser reveals stable data endpoints, static links, downloadable assets, XHR/fetch patterns, or predictable pagination URLs, switch to `http_get_many` for independent public URLs or `browser_fetch_many` when browser cookies/session state are needed. Use single `http_get`/`browser_fetch` calls for one-off checks. For long extraction loops, split work into bounded chunks, use explicit timeouts, checkpoint partial results to files, and resume from checkpoints instead of restarting. Use one global deadline plus per-item micro timeouts, and check the global deadline before every navigation, wait, and sleep. Any loop over multiple pages/items must emit short progress every item or every 2 seconds, whichever comes first. For list/profile extraction, filter candidates before navigating when possible, and poll for record readiness rather than nullable answer fields; if a loaded record has a missing optional field, record it as missing and continue. 
Extract only task-relevant fields; do not emit full profile text, full DOM text, cookies, localStorage, or entire app caches unless smaller field-level extraction failed. Use `outputs_dir()` for generated result files; files written there are collected as artifacts automatically. Use `copy_artifact(path)` only for files created elsewhere, and `emit_image(path)` for screenshots or visual artifacts. When a task expects a large JSON/CSV/list output, write the full file; if the final answer must be inline structured content, return that content with `done(result=...)` and optionally include `result_file=path`, otherwise finish with `done(result_file=path)`. Use helper agents only when the user explicitly asks for sub-agents, delegation, or parallel agent work. Requests for depth, thoroughness, research, investigation, or detailed codebase analysis do not by themselves authorize spawning a helper. When delegation is authorized, give each helper a narrow, self-contained task that materially advances the work, keep urgent blocking work local, avoid duplicate helper work, and continue useful non-overlapping local work while the helper runs. Use the `explorer` role for authorized read-only repository questions and `worker` for authorized implementation work with a bounded write scope. diff --git a/prompts/browser-script-tool-description.md b/prompts/browser-script-tool-description.md index d7f623bc..31aa8052 100644 --- a/prompts/browser-script-tool-description.md +++ b/prompts/browser-script-tool-description.md @@ -51,6 +51,9 @@ ensure_real_tab() upload_file(...) drain_events() http_get(url, **kwargs) +http_get_many(urls, **kwargs) +browser_fetch(url, **kwargs) +browser_fetch_many(requests, **kwargs) copy_artifact(path, kind="file") emit_output(value, label=None) @@ -109,7 +112,7 @@ emit_output(rows, label="employee_rows") - Use `js(...)` for DOM inspection and raw `cdp(...)` for lower-level browser actions. 
- Use `js(function_source, *args)` when passing JSON-serializable Python values into JavaScript; use `target_id=` as a keyword for iframe targets. - For real user forms, act like a browser user: screenshot, click the visible field/control, type with `type_text(...)`, `press_key(...)`, or `fill_input(...)`, then screenshot or otherwise verify. Use coordinate clicks for checkboxes, radios, buttons, dropdowns, and custom controls. Do not assign `element.value`, `element.checked`, `selectedIndex`, React private state, or MutationObserver restore loops on live forms. Do not synthesize `input`, `change`, `click`, or keyboard events in page JavaScript to make a form look filled. Those anti-patterns can desynchronize framework state from the visible DOM. -- Use `http_get(...)` for static pages and APIs after the browser reveals stable endpoints. It returns the response body as a string by default, or bytes with `binary=True`; the returned body also exposes `.status_code`, `.headers`, `.url`, `.text`, `.content`, and `.json()` for convenience. If direct HTTP hits bot or login protection, retry with site-specific headers/cookies, `js(fetch(...))` in the browser, or the configured Browser Use fetch proxy. +- Use `http_get(...)` for one static page/API URL after the browser reveals a stable endpoint, and `http_get_many(...)` for several independent public URLs. Use `browser_fetch(...)` or `browser_fetch_many(...)` when the page's cookies, auth headers, or browser session are needed. Returned bodies are strings by default, bytes with `binary=True`, and expose `.status_code`, `.headers`, `.url`, `.text`, `.content`, and `.json()` for convenience. Batch helpers preserve input order and return per-URL error records by default so one bad link does not waste the whole extraction chunk. If direct HTTP hits bot or login protection, retry with `browser_fetch(...)`, site-specific headers/cookies, or the configured Browser Use fetch proxy. - Extract only fields needed for the task. 
Do not emit full profile text, full DOM text, cookies, localStorage, or entire app caches unless you are debugging and the smaller field-level extraction failed. - Save complete generated result files under `outputs_dir()` or relative paths in the current working directory. Files written there are collected as artifacts automatically; `copy_artifact(...)` is for files created elsewhere. - For large structured results, write the full JSON/CSV/text to a file. If the task asks for an exact inline final format, return that content with `done(result=...)` and optionally include `result_file=path`; otherwise finish with `done(result_file=path)`. From 8af682f79dc9c0cd3aa30b10f12b745dd1d2de9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20=C5=BDuni=C4=8D?= <36313686+gregpr07@users.noreply.github.com> Date: Thu, 4 Jun 2026 11:31:10 +0000 Subject: [PATCH 31/48] Add batch fetch recipe guidance --- crates/browser-use-agent/src/prompts/tests.rs | 6 ++++ prompts/browser-script-tool-description.md | 31 +++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/crates/browser-use-agent/src/prompts/tests.rs b/crates/browser-use-agent/src/prompts/tests.rs index c015e432..ef0d4d63 100644 --- a/crates/browser-use-agent/src/prompts/tests.rs +++ b/crates/browser-use-agent/src/prompts/tests.rs @@ -169,6 +169,12 @@ fn browser_tool_descriptions_preserve_interaction_skills() { && script.contains("browser_fetch_many(requests, **kwargs)"), "browser_script description lost batch/direct fetch helper guidance" ); + assert!( + script.contains("Batch recipe after discovering stable links or endpoints") + && script.contains("responses = http_get_many(urls, timeout=12, max_workers=8)") + && script.contains("Fetched ${$.ok_count}/${$.total} independent URLs"), + "browser_script description lost its concrete batch-fetch adoption recipe" + ); // The base system prompt enumerates the page-interaction helpers, including // the screenshot/image helpers used for visual inspection. 
diff --git a/prompts/browser-script-tool-description.md b/prompts/browser-script-tool-description.md index 31aa8052..96519975 100644 --- a/prompts/browser-script-tool-description.md +++ b/prompts/browser-script-tool-description.md @@ -113,6 +113,37 @@ emit_output(rows, label="employee_rows") - Use `js(function_source, *args)` when passing JSON-serializable Python values into JavaScript; use `target_id=` as a keyword for iframe targets. - For real user forms, act like a browser user: screenshot, click the visible field/control, type with `type_text(...)`, `press_key(...)`, or `fill_input(...)`, then screenshot or otherwise verify. Use coordinate clicks for checkboxes, radios, buttons, dropdowns, and custom controls. Do not assign `element.value`, `element.checked`, `selectedIndex`, React private state, or MutationObserver restore loops on live forms. Do not synthesize `input`, `change`, `click`, or keyboard events in page JavaScript to make a form look filled. Those anti-patterns can desynchronize framework state from the visible DOM. - Use `http_get(...)` for one static page/API URL after the browser reveals a stable endpoint, and `http_get_many(...)` for several independent public URLs. Use `browser_fetch(...)` or `browser_fetch_many(...)` when the page's cookies, auth headers, or browser session are needed. Returned bodies are strings by default, bytes with `binary=True`, and expose `.status_code`, `.headers`, `.url`, `.text`, `.content`, and `.json()` for convenience. Batch helpers preserve input order and return per-URL error records by default so one bad link does not waste the whole extraction chunk. If direct HTTP hits bot or login protection, retry with `browser_fetch(...)`, site-specific headers/cookies, or the configured Browser Use fetch proxy. 
+- Batch recipe after discovering stable links or endpoints: + +```python +# browser_summary: +# { +# "fetch_progress": { +# "kind": "extracted", +# "message": "Fetched ${$.ok_count}/${$.total} independent URLs" +# }, +# "records": { +# "kind": "extracted", +# "message": "Extracted ${$.length} records from fetched pages" +# } +# } + +urls = [...] +responses = http_get_many(urls, timeout=12, max_workers=8) +ok = [r for r in responses if not isinstance(r, dict) and getattr(r, "status_code", 0) < 400] +emit_output({"total": len(responses), "ok_count": len(ok)}, label="fetch_progress") + +records = [] +for url, response in zip(urls, responses): + if isinstance(response, dict) and response.get("error"): + records.append({"url": url, "status": "error", "error": response["error"]}) + continue + text = response.text + records.append({"url": url, "status": response.status_code, "title": text[:200]}) + +emit_output(records, label="records") +``` + - Extract only fields needed for the task. Do not emit full profile text, full DOM text, cookies, localStorage, or entire app caches unless you are debugging and the smaller field-level extraction failed. - Save complete generated result files under `outputs_dir()` or relative paths in the current working directory. Files written there are collected as artifacts automatically; `copy_artifact(...)` is for files created elsewhere. - For large structured results, write the full JSON/CSV/text to a file. If the task asks for an exact inline final format, return that content with `done(result=...)` and optionally include `result_file=path`; otherwise finish with `done(result_file=path)`. 
From 34535915445298e4684f3486fcf811c8f8908d6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20=C5=BDuni=C4=8D?= <36313686+gregpr07@users.noreply.github.com> Date: Thu, 4 Jun 2026 12:21:56 +0000 Subject: [PATCH 32/48] Guide text-heavy extraction away from screenshots --- crates/browser-use-agent/src/prompts/tests.rs | 15 +++++++++++++++ prompts/browser-agent-system.md | 4 ++-- prompts/browser-script-tool-description.md | 2 +- prompts/dataset-case-user.md | 2 +- 4 files changed, 19 insertions(+), 4 deletions(-) diff --git a/crates/browser-use-agent/src/prompts/tests.rs b/crates/browser-use-agent/src/prompts/tests.rs index ef0d4d63..45566eef 100644 --- a/crates/browser-use-agent/src/prompts/tests.rs +++ b/crates/browser-use-agent/src/prompts/tests.rs @@ -115,6 +115,21 @@ fn system_prompt_commits_single_site_collection_to_one_domain() { assert!(BASE_SYSTEM_PROMPT.contains("mark it unavailable for that domain")); } +#[test] +fn prompts_avoid_screenshots_for_text_heavy_extraction() { + assert!(BASE_SYSTEM_PROMPT.contains( + "For text-heavy research, document reading, search, pricing, tables, and list extraction" + )); + assert!(BASE_SYSTEM_PROMPT.contains("screenshots add latency")); + assert!(BASE_SYSTEM_PROMPT.contains("If you have three or more independent URLs")); + + let script = browser_script_tool_description(); + assert!(script.contains( + "For text-heavy research, document reading, search, pricing, tables, and list extraction" + )); + assert!(script.contains("screenshots add latency")); +} + /// Plan mode was removed. The compatibility enum value now renders the Default /// asset so stale configs do not re-enable planning behavior. 
#[test] diff --git a/prompts/browser-agent-system.md b/prompts/browser-agent-system.md index 06a96f65..7041246c 100644 --- a/prompts/browser-agent-system.md +++ b/prompts/browser-agent-system.md @@ -29,12 +29,12 @@ Browser-harness workflow: - First navigation should usually be `new_tab(url)`, not `goto_url(url)`, because `goto_url` mutates the active tab. `new_tab(url)` and `goto_url(url)` have zero implicit wait: they send the CDP navigation command and then return without waiting for readyState, network idle, selectors, paint, or sleeps. If you chain more work in the same script after navigation, explicitly wait or poll before reading/clicking. If navigation is the last action before yielding to the model, the LLM call itself may provide enough elapsed time; the next call must still inspect state before assuming the page loaded. - When a task is site-specific and a matching domain skill exists, read it before inventing selectors, private API routes, or flows. Use `domain_skills_for_url(url, include_content=True)` before or immediately after navigation; `goto_url(url)` also records matching skill metadata in the tool result. -- Use screenshots as labeled temporal checkpoints. Screenshots are often the fastest way to understand the page, spot blockers, read visible state, and verify what changed. Capture visual state before and after meaningful browser actions: initial load, clicks, scrolls, route changes, menus, dialogs, downloads, uploads, form submissions, and final verification. +- Use screenshots as labeled temporal checkpoints when visual state matters. For text-heavy research, document reading, search, pricing, tables, and list extraction, default to `page_info()`, `js(...)`, targeted DOM text, `http_get_many`, or `browser_fetch_many`; screenshots add latency and usually do not help. 
Capture visual state before and after meaningful browser actions only when layout, coordinates, blockers, menus, dialogs, downloads, uploads, form submissions, or final visual verification matter. - Prefer coordinate clicks for visible targets. Use `screenshot` or `capture_screenshot`, inspect the pixels, `click_at_xy(x, y)`, then screenshot again to verify. Chrome hit-testing handles iframes, shadow DOM, and cross-origin content better than selector abstractions. - For forms, behave like a browser user. Inspect visually with screenshots first; use read-only JS only when pixels are insufficient to identify labels or stable selectors. Click into visible text fields before typing. Use `type_text(...)`, `press_key(...)`, or `fill_input(...)` for text, and real coordinate clicks for checkboxes, radios, buttons, dropdowns, and custom controls. Never bulk-fill a live form by setting DOM values, setting checked state, dispatching synthetic form events, or running a restore loop; this can desynchronize framework state from the visible DOM. - Prefer capturing the action timeline inside one `browser_script` tool call when possible: `screenshot("before_click")`, perform the action, wait for the state change, then `screenshot("after_click")`. - Do not call `screenshot` repeatedly on an unchanged viewport. Once you have a screenshot, either take an action, inspect with CDP/JS, navigate, scroll, call `screenshot_clip(...)` for a different region, wait for an async transition, or finish. Every screenshot should have a purpose: observe current state, verify an action, inspect a changed region, or preserve final evidence. -- Use raw `cdp(...)`, `page_info()`, `wait_for_element(...)`, `wait_for_network_idle(...)`, and `js(...)` when coordinates are the wrong tool or you need structured data. +- Use raw `cdp(...)`, `page_info()`, `wait_for_element(...)`, `wait_for_network_idle(...)`, and `js(...)` when coordinates are the wrong tool or you need structured data. 
If you have three or more independent URLs, files, documents, or API endpoints to inspect, batch them in one `browser_script` call with `http_get_many` or `browser_fetch_many` instead of visiting them one at a time. - `js(...)` returns Python values. After `text = js("document.body.innerText")`, use Python slicing like `text[:1000]`; only use JavaScript methods such as `.slice(...)` inside the JavaScript expression itself. - After actions that trigger loads, SPA transitions, XHR/fetch, menus, dialogs, downloads, uploads, or other visible state changes, be patient by making several cheap observations, not one long blind wait. Prefer short waits, then inspect again with `page_info()` or a screenshot. A wait returning false is not a task failure; inspect the current page and continue from the best available state or decide whether it is stuck. - If redirected to an auth wall or credential prompt, stop and ask the user. Do not infer or type credentials from screenshots. diff --git a/prompts/browser-script-tool-description.md b/prompts/browser-script-tool-description.md index 96519975..ac6e0eac 100644 --- a/prompts/browser-script-tool-description.md +++ b/prompts/browser-script-tool-description.md @@ -76,7 +76,7 @@ Usage guidance: - Do not combine `Input.dispatchKeyEvent` carrying printable `text` with a manual `char` event for the same character; that double-inserts text in Chrome. - If the task is site-specific, call `domain_skills_for_url(url, include_content=True)` before inventing selectors, private API routes, or flows. `goto_url(url)` also returns matching `domain_skills` metadata when a skill root is available. - Be patient with loading pages by making several cheap observations, not one long blind wait. Prefer short waits such as `wait_for_load(1)`, `wait_for_element(selector, timeout=2)`, or `wait_for_network_idle(2)`, then inspect again. 
If a wait returns false, that is not a task failure; inspect the current page and continue from the best available state or decide whether it is stuck. -- Use screenshots as labeled temporal checkpoints: initial load, before/after meaningful clicks, scrolls, route changes, dialogs, uploads, downloads, and final verification. +- Use screenshots as labeled temporal checkpoints when visual state matters: before/after meaningful clicks, scrolls, route changes, dialogs, uploads, downloads, and visual final verification. For text-heavy research, document reading, search, pricing, tables, and list extraction, prefer `page_info()`, `js(...)`, targeted DOM text, `http_get_many`, or `browser_fetch_many`; screenshots add latency and usually do not help. - The common screenshot call is `screenshot(label)`, for example `screenshot("before_submit")`. - Screenshot/image artifacts are sent as `input_image` content to the next model turn. The user does not see those pixels inline in the terminal; describe what you see or provide the saved artifact path when the user asks for the screenshot. - If a script emits screenshots/images and then fails, the next model turn still receives the images alongside the failure diagnosis. Use those pixels to decide the next smaller retry. diff --git a/prompts/dataset-case-user.md b/prompts/dataset-case-user.md index c46f6bcd..a49e3880 100644 --- a/prompts/dataset-case-user.md +++ b/prompts/dataset-case-user.md @@ -6,7 +6,7 @@ Task ID: {{task_id}} Task: {{task}} -Use `browser` for browser connection/status/recovery and `browser_script` for browser interaction. Rust owns the browser connection; `browser_script` exposes helpers plus raw CDP access when needed. Prefer robust CDP/DOM observations over guessing. Attach screenshots after meaningful visual transitions or whenever visible state matters. +Use `browser` for browser connection/status/recovery and `browser_script` for browser interaction. 
Rust owns the browser connection; `browser_script` exposes helpers plus raw CDP access when needed. Prefer robust CDP/DOM observations over guessing. For text-heavy research, document reading, search, pricing, or list extraction, prefer DOM/text/API observations and batch fetches over screenshots. Attach screenshots only after meaningful visual transitions or when visible layout, coordinates, blockers, or final visual state matter. Filesystem contract: if the task asks you to save files, write them in the current working directory using relative paths. For large JSON/CSV/list results, save the full result to `result.json` or `result.csv` so it is available as an artifact. If the requested final answer is not an exact inline format, return a compact final answer with the output path, record count, schema/columns, and one sample row instead of pasting a giant blob. From ce498f7380ac9e4b72251d75a51b777809539299 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20=C5=BDuni=C4=8D?= <36313686+gregpr07@users.noreply.github.com> Date: Thu, 4 Jun 2026 12:34:56 +0000 Subject: [PATCH 33/48] Add dataset eval timebox guidance --- crates/browser-use-agent/src/prompts/tests.rs | 10 ++++++++++ prompts/dataset-case-user.md | 2 ++ 2 files changed, 12 insertions(+) diff --git a/crates/browser-use-agent/src/prompts/tests.rs b/crates/browser-use-agent/src/prompts/tests.rs index 45566eef..d236b0a1 100644 --- a/crates/browser-use-agent/src/prompts/tests.rs +++ b/crates/browser-use-agent/src/prompts/tests.rs @@ -130,6 +130,16 @@ fn prompts_avoid_screenshots_for_text_heavy_extraction() { assert!(script.contains("screenshots add latency")); } +#[test] +fn dataset_prompt_enforces_timeboxed_finalization() { + let prompt = include_str!("../../../../prompts/dataset-case-user.md"); + + assert!(prompt.contains("Timebox contract")); + assert!(prompt.contains("soft deadline")); + assert!(prompt.contains("hard deadline")); + assert!(prompt.contains("Never keep running until the external runner 
timeout")); +} + /// Plan mode was removed. The compatibility enum value now renders the Default /// asset so stale configs do not re-enable planning behavior. #[test] diff --git a/prompts/dataset-case-user.md b/prompts/dataset-case-user.md index a49e3880..d668d259 100644 --- a/prompts/dataset-case-user.md +++ b/prompts/dataset-case-user.md @@ -18,6 +18,8 @@ Remote browser contract: browser automation may run on a different machine from Long extraction contract: if the task needs many pages, rows, files, or detail records, work in bounded chunks. Discover the endpoint or pagination pattern first, then fetch in batches with explicit timeouts, checkpoint partial results in the current working directory, and print compact progress counts. A timed-out all-in-one crawl with no saved artifact is not progress; resume from checkpoints when a chunk fails. +Timebox contract: dataset runs have a short wall-clock budget. For long research, document, or extraction tasks, set a soft deadline before starting broad collection, about 7 minutes from now, and a hard deadline about 8.5 minutes from now. Check the deadline before each new page, document, query, or file. After the soft deadline, stop broad research and fill remaining fields from the strongest verified evidence or mark them unknown/unavailable. Before the hard deadline, call `done(...)` with the completed or partial result. Never keep running until the external runner timeout with no saved result. + Completion contract: the final answer must contain the requested answer or a clear pointer to the artifact that contains it. For artifact-heavy results, include the artifact path, record count, schema/columns, and one sample row. A bare acknowledgement such as `Done.` is not useful unless the task explicitly asked for no visible answer. Before finalizing extraction results, briefly check that the returned items are the same kind of thing the task asked for and that hard filters were not softened to satisfy quantity. 
If an item is only adjacent, similar, or uncertain, exclude it or mark it uncertain rather than silently treating it as a match. From 89bb71ebe1652d6c5b1a22b44ef02c83305d0ad3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20=C5=BDuni=C4=8D?= <36313686+gregpr07@users.noreply.github.com> Date: Thu, 4 Jun 2026 15:36:46 +0000 Subject: [PATCH 34/48] Record Rust LLM request messages for traces --- crates/browser-use-agent/src/turn/sampling.rs | 112 +++++++++++++++++- .../src/turn/sampling_tests.rs | 65 ++++++++++ 2 files changed, 175 insertions(+), 2 deletions(-) diff --git a/crates/browser-use-agent/src/turn/sampling.rs b/crates/browser-use-agent/src/turn/sampling.rs index 527ffac7..f41a124f 100644 --- a/crates/browser-use-agent/src/turn/sampling.rs +++ b/crates/browser-use-agent/src/turn/sampling.rs @@ -439,7 +439,7 @@ impl ModelSamplingDriver { } } - fn emit_turn_request(&self, attempt: u32, composition: &Value) { + fn emit_turn_request(&self, attempt: u32, composition: &Value, llm_input: &Value) { self.sink.emit(PendingEvent::new( self.ctx.session_id.clone(), names::MODEL_TURN_REQUEST, @@ -449,6 +449,7 @@ impl ModelSamplingDriver { "turn_idx": self.ctx.turn_idx, "attempt": attempt, "composition": composition, + "llm_input": llm_input, }), )); } @@ -877,9 +878,10 @@ impl SamplingDriver // exist here (they are never persisted as message events). Uses the same // byte->token estimator the agent uses elsewhere, so it stays consistent. 
let composition = request_composition(&req); + let llm_input = request_observability_input(&req); let mut attempt: u32 = 0; loop { - self.emit_turn_request(attempt, &composition); + self.emit_turn_request(attempt, &composition, &llm_input); // ---- open the stream (codex: `client.stream(&prompt).await`) ---- let mut stream = match self.transport.open_stream(&req) { Ok(s) => s, @@ -1078,3 +1080,109 @@ fn request_composition(req: &LlmRequest) -> Value { "tools": tools, }) } + +const OBSERVABILITY_MAX_MESSAGES: usize = 80; +const OBSERVABILITY_MAX_TEXT_CHARS: usize = 80_000; +const OBSERVABILITY_MAX_STRING_CHARS: usize = 4_000; + +fn request_observability_input(req: &LlmRequest) -> Value { + let mut remaining_text_chars = OBSERVABILITY_MAX_TEXT_CHARS; + let message_count = req.messages.len(); + let omitted_earlier_messages = message_count.saturating_sub(OBSERVABILITY_MAX_MESSAGES); + let messages: Vec = req + .messages + .iter() + .skip(omitted_earlier_messages) + .map(|message| observability_json_value(message, &mut remaining_text_chars)) + .collect(); + let system: Vec = req + .system + .iter() + .map(|part| observability_json_value(part, &mut remaining_text_chars)) + .collect(); + + serde_json::json!({ + "system": system, + "messages": messages, + "message_count": message_count, + "omitted_earlier_messages": omitted_earlier_messages, + "truncated": remaining_text_chars == 0, + }) +} + +fn observability_json_value( + value: &T, + remaining_text_chars: &mut usize, +) -> Value { + serde_json::to_value(value) + .map(|value| sanitize_observability_value(value, remaining_text_chars)) + .unwrap_or(Value::Null) +} + +fn sanitize_observability_value(value: Value, remaining_text_chars: &mut usize) -> Value { + match value { + Value::Object(map) => { + let mut out = serde_json::Map::with_capacity(map.len()); + for (key, value) in map { + if is_observability_secret_key(&key) { + out.insert(key, Value::String("[redacted]".to_string())); + } else if key == "data" { + 
out.insert(key, Value::String("[redacted inline data]".to_string())); + } else { + out.insert( + key, + sanitize_observability_value(value, remaining_text_chars), + ); + } + } + Value::Object(out) + } + Value::Array(values) => Value::Array( + values + .into_iter() + .map(|value| sanitize_observability_value(value, remaining_text_chars)) + .collect(), + ), + Value::String(text) => { + Value::String(truncate_observability_string(&text, remaining_text_chars)) + } + other => other, + } +} + +fn is_observability_secret_key(key: &str) -> bool { + let key = key.to_ascii_lowercase(); + key.contains("api_key") + || key.contains("apikey") + || key.contains("authorization") + || key.contains("auth_token") + || key.contains("password") + || key.contains("secret") + || key.contains("token") + || key.contains("cookie") +} + +fn truncate_observability_string(text: &str, remaining_text_chars: &mut usize) -> String { + if text.is_empty() { + return String::new(); + } + if *remaining_text_chars == 0 { + return "[truncated: request observability text budget exhausted]".to_string(); + } + + let limit = OBSERVABILITY_MAX_STRING_CHARS.min(*remaining_text_chars); + let mut out = String::new(); + let mut chars = text.chars(); + for _ in 0..limit { + let Some(ch) = chars.next() else { + *remaining_text_chars = (*remaining_text_chars).saturating_sub(out.chars().count()); + return out; + }; + out.push(ch); + } + *remaining_text_chars = (*remaining_text_chars).saturating_sub(out.chars().count()); + if chars.next().is_some() { + out.push_str("...[truncated]"); + } + out +} diff --git a/crates/browser-use-agent/src/turn/sampling_tests.rs b/crates/browser-use-agent/src/turn/sampling_tests.rs index 3f5fa685..5ab9d4c4 100644 --- a/crates/browser-use-agent/src/turn/sampling_tests.rs +++ b/crates/browser-use-agent/src/turn/sampling_tests.rs @@ -562,6 +562,71 @@ async fn driver_passes_populated_per_call_request_to_open_stream() { ); } +#[tokio::test] +async fn 
turn_request_event_carries_sanitized_llm_input_messages() { + let (transport, _opens) = + ScriptedTransport::new(vec![OpenScript::Stream(vec![finish(FinishReason::Stop)])]); + let sink = Arc::new(RecordingSink::default()); + let d = driver(transport, sink.clone(), 5); + + let input = vec![ + Message::new( + MessageRole::User, + vec![ + ContentPart::text("Find the account page."), + ContentPart::Media { + mime_type: "image/png".to_string(), + data: Some("iVBORw0KGgoAAAANSUhEUgAAAAEAAAAB".to_string()), + url: None, + detail: Some("low".to_string()), + }, + ], + ), + Message::new( + MessageRole::Assistant, + vec![ContentPart::ToolCall { + id: "call-1".to_string(), + name: "browser_script".to_string(), + input: serde_json::json!({ + "code": "goto_url('https://example.com')", + "api_key": "secret-value", + }), + provider_metadata: None, + }], + ), + ]; + + let _ = d + .run_sampling_request(input, CancellationToken::new()) + .await + .expect("sampling should succeed"); + + let events = sink.drain(); + let request = events + .iter() + .find(|event| event.event_type == names::MODEL_TURN_REQUEST) + .expect("turn request event emitted"); + let llm_input = &request.payload["llm_input"]; + assert_eq!(llm_input["message_count"], serde_json::json!(2)); + assert_eq!(llm_input["omitted_earlier_messages"], serde_json::json!(0)); + assert_eq!( + llm_input["messages"][0]["content"][0]["text"], + serde_json::json!("Find the account page.") + ); + assert_eq!( + llm_input["messages"][0]["content"][1]["data"], + serde_json::json!("[redacted inline data]") + ); + assert_eq!( + llm_input["messages"][1]["content"][0]["input"]["api_key"], + serde_json::json!("[redacted]") + ); + assert!(!llm_input["system"][0]["text"] + .as_str() + .unwrap_or_default() + .is_empty()); +} + #[tokio::test] async fn driver_prepends_selected_browser_mode_instruction_to_messages() { let (transport, seen) = From 040d36c75e79a62f45df200c268650f56e7fc009 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Gregor=20=C5=BDuni=C4=8D?= <36313686+gregpr07@users.noreply.github.com> Date: Thu, 4 Jun 2026 15:49:06 +0000 Subject: [PATCH 35/48] Enable Anthropic prompt cache breakpoints --- crates/browser-use-agent/src/turn/sampling.rs | 7 ++-- .../src/turn/sampling_tests.rs | 7 +++- .../src/protocols/anthropic_messages.rs | 41 ++++++++++++++++++- 3 files changed, 49 insertions(+), 6 deletions(-) diff --git a/crates/browser-use-agent/src/turn/sampling.rs b/crates/browser-use-agent/src/turn/sampling.rs index f41a124f..5ad749b6 100644 --- a/crates/browser-use-agent/src/turn/sampling.rs +++ b/crates/browser-use-agent/src/turn/sampling.rs @@ -59,7 +59,7 @@ use std::time::Instant; use browser_use_llm::route::{ModelClient, Route}; use browser_use_llm::schema::{ - ContentPart, FinishReason, LlmError, LlmErrorReason, LlmEvent, LlmRequest, Message, + CacheHint, ContentPart, FinishReason, LlmError, LlmErrorReason, LlmEvent, LlmRequest, Message, MessageRole, SystemPart, TextPhase, Usage, }; use futures_util::{Stream, StreamExt}; @@ -1037,8 +1037,9 @@ impl SamplingDriver /// unit-reachable while the fused driver still advertises the catalog. 
fn build_request(ctx: &TurnCtx, input: Vec) -> LlmRequest { let mut req = LlmRequest::new(ctx.model.clone(), ctx.provider.clone()); - req.system - .push(SystemPart::new(ctx.base_instructions.clone())); + let mut base_system = SystemPart::new(ctx.base_instructions.clone()); + base_system.cache = Some(CacheHint::Ephemeral); + req.system.push(base_system); req.messages = input; if let Some(instruction) = ctx.browser_mode_instruction.as_deref() { req.messages.insert( diff --git a/crates/browser-use-agent/src/turn/sampling_tests.rs b/crates/browser-use-agent/src/turn/sampling_tests.rs index 5ab9d4c4..b6eaeb1e 100644 --- a/crates/browser-use-agent/src/turn/sampling_tests.rs +++ b/crates/browser-use-agent/src/turn/sampling_tests.rs @@ -12,7 +12,7 @@ use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::{Arc, Mutex}; use browser_use_llm::schema::{ - ContentPart, FinishReason, LlmError, LlmErrorReason, LlmEvent, LlmRequest, Message, + CacheHint, ContentPart, FinishReason, LlmError, LlmErrorReason, LlmEvent, LlmRequest, Message, MessageRole, TextPhase, Usage, }; use browser_use_protocol::EventRecord; @@ -560,6 +560,11 @@ async fn driver_passes_populated_per_call_request_to_open_stream() { ctx().provider.into(), "request carries the turn's provider" ); + assert_eq!( + req.system.first().and_then(|part| part.cache), + Some(CacheHint::Ephemeral), + "stable base system prompt should be cacheable for providers that support prompt caching" + ); } #[tokio::test] diff --git a/crates/browser-use-llm/src/protocols/anthropic_messages.rs b/crates/browser-use-llm/src/protocols/anthropic_messages.rs index c560db46..6dbe4109 100644 --- a/crates/browser-use-llm/src/protocols/anthropic_messages.rs +++ b/crates/browser-use-llm/src/protocols/anthropic_messages.rs @@ -67,7 +67,13 @@ impl Protocol for AnthropicMessagesProtocol { // Tool definitions. 
if !req.tools.is_empty() { - let tools: Vec = req.tools.iter().map(build_tool).collect(); + let mut tools: Vec = req.tools.iter().map(build_tool).collect(); + if let Some(Value::Object(last_tool)) = tools.last_mut() { + last_tool.insert( + "cache_control".to_string(), + cache_control(CacheHint::Ephemeral), + ); + } body.insert("tools".to_string(), Value::Array(tools)); } @@ -677,7 +683,8 @@ mod tests { "type": "object", "properties": { "city": { "type": "string" } }, "required": ["city"], - } + }, + "cache_control": { "type": "ephemeral" } } ]) ); @@ -686,6 +693,36 @@ mod tests { assert_eq!(body["tool_choice"], json!({ "type": "auto" })); } + #[test] + fn build_body_marks_cache_control_breakpoints() { + let mut req = LlmRequest::new("claude-sonnet-4-6", "anthropic"); + let mut system = SystemPart::new("Stable system prompt."); + system.cache = Some(CacheHint::Ephemeral); + req.system.push(system); + for name in ["first_tool", "last_tool"] { + req.tools.push(ToolDefinition { + name: name.into(), + description: String::new(), + input_schema: json!({ "type": "object" }), + output_schema: None, + namespace: None, + namespace_description: None, + }); + } + + let body = AnthropicMessagesProtocol::new().build_body(&req).unwrap(); + + assert_eq!( + body["system"][0]["cache_control"], + json!({ "type": "ephemeral" }) + ); + assert!(body["tools"][0].get("cache_control").is_none()); + assert_eq!( + body["tools"][1]["cache_control"], + json!({ "type": "ephemeral" }) + ); + } + #[test] fn build_body_respects_max_tokens_and_omits_empty_sections() { let mut req = LlmRequest::new("m", "anthropic"); From 116e730d967a415987eb24df0da0c42723a07ea1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20=C5=BDuni=C4=8D?= <36313686+gregpr07@users.noreply.github.com> Date: Thu, 4 Jun 2026 15:56:05 +0000 Subject: [PATCH 36/48] Mirror Python Anthropic message cache breakpoint --- .../browser-use-agent/src/turn/loop_tests.rs | 1 + crates/browser-use-agent/src/turn/sampling.rs | 11 +++++++ 
.../src/turn/sampling_tests.rs | 11 +++++-- .../src/protocols/anthropic_messages.rs | 33 +++++++++++++++++-- crates/browser-use-llm/src/schema/messages.rs | 14 +++++++- 5 files changed, 64 insertions(+), 6 deletions(-) diff --git a/crates/browser-use-agent/src/turn/loop_tests.rs b/crates/browser-use-agent/src/turn/loop_tests.rs index 61205a3e..de3657e2 100644 --- a/crates/browser-use-agent/src/turn/loop_tests.rs +++ b/crates/browser-use-agent/src/turn/loop_tests.rs @@ -738,6 +738,7 @@ async fn bounded_loop_aborts_after_max_turns() { let Some(Message { role: MessageRole::Developer, content, + .. }) = recorded_inputs[1].last() else { panic!("last bounded request should include final-step developer nudge"); diff --git a/crates/browser-use-agent/src/turn/sampling.rs b/crates/browser-use-agent/src/turn/sampling.rs index 5ad749b6..8ebceb60 100644 --- a/crates/browser-use-agent/src/turn/sampling.rs +++ b/crates/browser-use-agent/src/turn/sampling.rs @@ -1041,6 +1041,7 @@ fn build_request(ctx: &TurnCtx, input: Vec) -> LlmRequest { base_system.cache = Some(CacheHint::Ephemeral); req.system.push(base_system); req.messages = input; + mark_last_message_cacheable(&mut req.messages); if let Some(instruction) = ctx.browser_mode_instruction.as_deref() { req.messages.insert( 0, @@ -1053,6 +1054,16 @@ fn build_request(ctx: &TurnCtx, input: Vec) -> LlmRequest { req } +fn mark_last_message_cacheable(messages: &mut [Message]) { + if let Some(message) = messages + .iter_mut() + .rev() + .find(|message| !matches!(message.role, MessageRole::System | MessageRole::Developer)) + { + message.cache = Some(CacheHint::Ephemeral); + } +} + /// Token attribution for the per-turn request, computed from the REAL assembled /// [`LlmRequest`]. The system prompt and tool schemas are not message events, so /// this is the only place the `/context` view can learn their size. 
Counts use diff --git a/crates/browser-use-agent/src/turn/sampling_tests.rs b/crates/browser-use-agent/src/turn/sampling_tests.rs index b6eaeb1e..302b754e 100644 --- a/crates/browser-use-agent/src/turn/sampling_tests.rs +++ b/crates/browser-use-agent/src/turn/sampling_tests.rs @@ -544,9 +544,11 @@ async fn driver_passes_populated_per_call_request_to_open_stream() { ); // And it must be EXACTLY the input the driver was asked to sample, with the // turn's model/provider identity from `ctx()`. + let mut expected_messages = input.clone(); + expected_messages.last_mut().unwrap().cache = Some(CacheHint::Ephemeral); assert_eq!( - req.messages, input, - "open_stream must receive the driver's per-call input messages verbatim" + req.messages, expected_messages, + "open_stream must receive the driver's per-call input messages with the current-state cache hint" ); // `req.model`/`req.provider` are the `ModelId`/`ProviderId` newtypes; compare // against the same `.into()` conversion `LlmRequest::new` applies to `ctx()`. 
@@ -565,6 +567,11 @@ async fn driver_passes_populated_per_call_request_to_open_stream() { Some(CacheHint::Ephemeral), "stable base system prompt should be cacheable for providers that support prompt caching" ); + assert_eq!( + req.messages.last().and_then(|message| message.cache), + Some(CacheHint::Ephemeral), + "latest browser-state message should be cacheable like the Python Anthropic serializer" + ); } #[tokio::test] diff --git a/crates/browser-use-llm/src/protocols/anthropic_messages.rs b/crates/browser-use-llm/src/protocols/anthropic_messages.rs index 6dbe4109..f93d24ea 100644 --- a/crates/browser-use-llm/src/protocols/anthropic_messages.rs +++ b/crates/browser-use-llm/src/protocols/anthropic_messages.rs @@ -121,15 +121,36 @@ fn build_message(message: &Message) -> Result { } }; - let content: Result, LlmError> = - message.content.iter().map(build_content_block).collect(); + let mut content: Vec = message + .content + .iter() + .map(build_content_block) + .collect::, LlmError>>()?; + apply_cache_control_to_last_content_block(&mut content, message.cache); Ok(json!({ "role": role, - "content": content?, + "content": content, })) } +fn apply_cache_control_to_last_content_block(content: &mut [Value], cache: Option) { + let Some(cache) = cache else { + return; + }; + + for block in content.iter_mut().rev() { + let Some(obj) = block.as_object_mut() else { + continue; + }; + if obj.get("type").and_then(Value::as_str) == Some("image") { + continue; + } + obj.insert("cache_control".to_string(), cache_control(cache)); + break; + } +} + /// Translate a canonical [`ContentPart`] into an Anthropic content block. 
fn build_content_block(part: &ContentPart) -> Result { match part { @@ -699,6 +720,8 @@ mod tests { let mut system = SystemPart::new("Stable system prompt."); system.cache = Some(CacheHint::Ephemeral); req.system.push(system); + req.messages + .push(Message::user_text("Current browser state.").with_cache(CacheHint::Ephemeral)); for name in ["first_tool", "last_tool"] { req.tools.push(ToolDefinition { name: name.into(), @@ -716,6 +739,10 @@ mod tests { body["system"][0]["cache_control"], json!({ "type": "ephemeral" }) ); + assert_eq!( + body["messages"][0]["content"][0]["cache_control"], + json!({ "type": "ephemeral" }) + ); assert!(body["tools"][0].get("cache_control").is_none()); assert_eq!( body["tools"][1]["cache_control"], diff --git a/crates/browser-use-llm/src/schema/messages.rs b/crates/browser-use-llm/src/schema/messages.rs index d38fe7dc..bb01f0a2 100644 --- a/crates/browser-use-llm/src/schema/messages.rs +++ b/crates/browser-use-llm/src/schema/messages.rs @@ -86,17 +86,29 @@ pub enum CacheHint { #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct Message { pub role: MessageRole, + /// Optional prompt-cache hint; only honored by protocols that support + /// inline cache markers (Anthropic / Bedrock). + #[serde(default, skip_serializing_if = "Option::is_none")] + pub cache: Option, #[serde(default)] pub content: Vec, } impl Message { pub fn new(role: MessageRole, content: Vec) -> Self { - Self { role, content } + Self { + role, + cache: None, + content, + } } pub fn user_text(s: impl Into) -> Self { Self::new(MessageRole::User, vec![ContentPart::text(s)]) } + pub fn with_cache(mut self, cache: CacheHint) -> Self { + self.cache = Some(cache); + self + } } /// A tool the model may call. The handler is never on the wire — only schema. 
From af70af042c62e5d2dd07304cee07edb331ca9bc5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20=C5=BDuni=C4=8D?= <36313686+gregpr07@users.noreply.github.com> Date: Thu, 4 Jun 2026 16:11:08 +0000 Subject: [PATCH 37/48] Normalize Anthropic cache usage accounting --- .../src/context/tests_accounting.rs | 2 + crates/browser-use-agent/src/events/map.rs | 24 +++++- .../browser-use-agent/src/events/map_tests.rs | 7 +- crates/browser-use-agent/src/goals/tests.rs | 1 + .../src/protocols/anthropic_messages.rs | 75 ++++++++++++++++++- .../src/protocols/openai_chat.rs | 3 + .../src/protocols/openai_responses.rs | 5 ++ crates/browser-use-llm/src/route/client.rs | 1 + crates/browser-use-llm/src/schema/event.rs | 20 +++-- crates/browser-use-llm/src/schema/mod.rs | 2 + crates/browser-use-providers/src/lib.rs | 56 +++++++++----- 11 files changed, 163 insertions(+), 33 deletions(-) diff --git a/crates/browser-use-agent/src/context/tests_accounting.rs b/crates/browser-use-agent/src/context/tests_accounting.rs index 7daea781..d0c7e04d 100644 --- a/crates/browser-use-agent/src/context/tests_accounting.rs +++ b/crates/browser-use-agent/src/context/tests_accounting.rs @@ -79,6 +79,7 @@ fn from_llm_usage_uses_server_total_when_present() { let u = Usage { input_tokens: 100, cached_input_tokens: 40, + cache_creation_input_tokens: 0, output_tokens: 30, reasoning_output_tokens: 10, total_tokens: 123, @@ -96,6 +97,7 @@ fn from_llm_usage_total_fallback_excludes_cached() { let u = Usage { input_tokens: 100, cached_input_tokens: 40, + cache_creation_input_tokens: 0, output_tokens: 30, reasoning_output_tokens: 10, total_tokens: 0, diff --git a/crates/browser-use-agent/src/events/map.rs b/crates/browser-use-agent/src/events/map.rs index 60e84d3e..7243f4c2 100644 --- a/crates/browser-use-agent/src/events/map.rs +++ b/crates/browser-use-agent/src/events/map.rs @@ -95,6 +95,7 @@ pub fn usage_to_model_usage(u: &Usage) -> ModelUsage { ModelUsage { input_tokens: Some(u.input_tokens as i64), 
input_cached_tokens: Some(u.cached_input_tokens as i64), + input_cache_creation_tokens: positive_i64(u.cache_creation_input_tokens), output_tokens: Some(u.output_tokens as i64), reasoning_output_tokens: Some(u.reasoning_output_tokens as i64), total_tokens: Some(total as i64), @@ -102,6 +103,10 @@ pub fn usage_to_model_usage(u: &Usage) -> ModelUsage { } } +fn positive_i64(value: u64) -> Option { + (value > 0).then_some(value as i64) +} + /// Codex-shaped token-usage object (mirrors core `model_usage_to_codex_token_usage`): /// `{ input_tokens, cached_input_tokens, output_tokens, reasoning_output_tokens, /// total_tokens }`, where a missing `total_tokens` falls back to the sum of the @@ -109,6 +114,7 @@ pub fn usage_to_model_usage(u: &Usage) -> ModelUsage { fn codex_token_usage(usage: &ModelUsage) -> Value { let input_tokens = usage.input_tokens.unwrap_or(0); let cached_input_tokens = usage.input_cached_tokens.unwrap_or(0); + let cache_creation_input_tokens = usage.input_cache_creation_tokens.unwrap_or(0); let output_tokens = usage.output_tokens.unwrap_or(0); let reasoning_output_tokens = usage.reasoning_output_tokens.unwrap_or(0); let total_tokens = usage.total_tokens.unwrap_or_else(|| { @@ -116,20 +122,26 @@ fn codex_token_usage(usage: &ModelUsage) -> Value { .saturating_add(output_tokens) .saturating_add(reasoning_output_tokens) }); - json!({ + let mut value = json!({ "input_tokens": input_tokens, "cached_input_tokens": cached_input_tokens, "output_tokens": output_tokens, "reasoning_output_tokens": reasoning_output_tokens, "total_tokens": total_tokens, - }) + }); + if cache_creation_input_tokens > 0 { + value["input_cache_creation_tokens"] = json!(cache_creation_input_tokens); + } + value } /// Field-wise sum of two codex token-usage objects (mirrors core /// `add_codex_token_usage`). Missing keys are treated as `0`. 
fn add_codex_token_usage(previous: &Value, addition: &Value) -> Value { let get = |value: &Value, key: &str| value.get(key).and_then(Value::as_i64).unwrap_or(0); - json!({ + let cache_creation_input_tokens = + get(previous, "input_cache_creation_tokens") + get(addition, "input_cache_creation_tokens"); + let mut value = json!({ "input_tokens": get(previous, "input_tokens") + get(addition, "input_tokens"), "cached_input_tokens": get(previous, "cached_input_tokens") + get(addition, "cached_input_tokens"), @@ -137,7 +149,11 @@ fn add_codex_token_usage(previous: &Value, addition: &Value) -> Value { "reasoning_output_tokens": get(previous, "reasoning_output_tokens") + get(addition, "reasoning_output_tokens"), "total_tokens": get(previous, "total_tokens") + get(addition, "total_tokens"), - }) + }); + if cache_creation_input_tokens > 0 { + value["input_cache_creation_tokens"] = json!(cache_creation_input_tokens); + } + value } /// Build the `token_count` payload (core parity: diff --git a/crates/browser-use-agent/src/events/map_tests.rs b/crates/browser-use-agent/src/events/map_tests.rs index a530bd8a..667a685d 100644 --- a/crates/browser-use-agent/src/events/map_tests.rs +++ b/crates/browser-use-agent/src/events/map_tests.rs @@ -108,6 +108,7 @@ fn finish_maps_to_token_count_from_usage() { let usage = Usage { input_tokens: 100, cached_input_tokens: 10, + cache_creation_input_tokens: 0, output_tokens: 20, reasoning_output_tokens: 5, total_tokens: 125, @@ -206,6 +207,7 @@ fn usage_total_zero_falls_back_to_computed_total() { let u = Usage { input_tokens: 100, cached_input_tokens: 40, + cache_creation_input_tokens: 0, output_tokens: 20, reasoning_output_tokens: 5, total_tokens: 0, // provider didn't report an inclusive total @@ -230,6 +232,7 @@ fn usage_total_nonzero_is_preserved() { let u = Usage { input_tokens: 100, cached_input_tokens: 40, + cache_creation_input_tokens: 12, output_tokens: 20, reasoning_output_tokens: 5, total_tokens: 250, // explicit total wins over 
computed_total @@ -240,8 +243,8 @@ fn usage_total_nonzero_is_preserved() { assert_eq!(mu.input_cached_tokens, Some(40)); assert_eq!(mu.output_tokens, Some(20)); assert_eq!(mu.reasoning_output_tokens, Some(5)); - // Cost / cache-creation fields are unknown at this layer. - assert_eq!(mu.input_cache_creation_tokens, None); + assert_eq!(mu.input_cache_creation_tokens, Some(12)); + // Cost fields are unknown at this layer. assert_eq!(mu.cost_usd, None); assert_eq!(mu.cost_source, None); } diff --git a/crates/browser-use-agent/src/goals/tests.rs b/crates/browser-use-agent/src/goals/tests.rs index b0647326..a61f70ea 100644 --- a/crates/browser-use-agent/src/goals/tests.rs +++ b/crates/browser-use-agent/src/goals/tests.rs @@ -56,6 +56,7 @@ fn usage(input: u64, cached: u64, output: u64) -> Usage { Usage { input_tokens: input, cached_input_tokens: cached, + cache_creation_input_tokens: 0, output_tokens: output, reasoning_output_tokens: 0, total_tokens: 0, diff --git a/crates/browser-use-llm/src/protocols/anthropic_messages.rs b/crates/browser-use-llm/src/protocols/anthropic_messages.rs index f93d24ea..8148cbc9 100644 --- a/crates/browser-use-llm/src/protocols/anthropic_messages.rs +++ b/crates/browser-use-llm/src/protocols/anthropic_messages.rs @@ -524,14 +524,33 @@ impl AnthropicMessagesStream { /// Merge any usage fields present in `usage` into the running total. 
fn apply_usage(&mut self, usage: &Value) { if let Some(v) = usage.get("input_tokens").and_then(Value::as_u64) { - self.usage.input_tokens = v; + self.set_uncached_input_tokens(v); } if let Some(v) = usage.get("output_tokens").and_then(Value::as_u64) { self.usage.output_tokens = v; } if let Some(v) = usage.get("cache_read_input_tokens").and_then(Value::as_u64) { - self.usage.cached_input_tokens = v; + self.set_cached_input_tokens(v); } + if let Some(v) = usage + .get("cache_creation_input_tokens") + .and_then(Value::as_u64) + { + self.usage.cache_creation_input_tokens = v; + } + } + + fn set_uncached_input_tokens(&mut self, raw_input_tokens: u64) { + self.usage.input_tokens = raw_input_tokens.saturating_add(self.usage.cached_input_tokens); + } + + fn set_cached_input_tokens(&mut self, cached_input_tokens: u64) { + let raw_input_tokens = self + .usage + .input_tokens + .saturating_sub(self.usage.cached_input_tokens); + self.usage.cached_input_tokens = cached_input_tokens; + self.usage.input_tokens = raw_input_tokens.saturating_add(cached_input_tokens); } /// Flush open blocks and emit `StepFinish` + `Finish` (idempotent). 
@@ -1099,6 +1118,58 @@ mod tests { assert_eq!(events, expected); } + #[test] + fn decoder_normalizes_anthropic_cached_usage_to_inclusive_input() { + let frames = vec![ + frame( + "message_start", + json!({ + "type": "message_start", + "message": { + "id": "msg_cache", + "role": "assistant", + "content": [], + "usage": { + "input_tokens": 12, + "cache_creation_input_tokens": 44, + "output_tokens": 0 + } + } + }), + ), + frame( + "message_delta", + json!({ + "type": "message_delta", + "delta": { "stop_reason": "end_turn", "stop_sequence": null }, + "usage": { + "output_tokens": 3088, + "cache_read_input_tokens": 183250 + } + }), + ), + frame("message_stop", json!({ "type": "message_stop" })), + ]; + + let events = drive(&frames); + let usage = Usage { + input_tokens: 183262, + cached_input_tokens: 183250, + cache_creation_input_tokens: 44, + output_tokens: 3088, + ..Default::default() + }; + + assert!(events.contains(&LlmEvent::StepFinish { + usage, + finish_reason: Some(FinishReason::Stop), + })); + assert!(events.contains(&LlmEvent::Finish { + usage, + finish_reason: Some(FinishReason::Stop), + })); + } + #[test] fn decoder_handles_thinking_block_and_signature() { let frames = vec![ diff --git a/crates/browser-use-llm/src/protocols/openai_chat.rs b/crates/browser-use-llm/src/protocols/openai_chat.rs index efe88804..308f370b 100644 --- a/crates/browser-use-llm/src/protocols/openai_chat.rs +++ b/crates/browser-use-llm/src/protocols/openai_chat.rs @@ -701,6 +701,7 @@ fn parse_usage(usage: &Value) -> Usage { Usage { input_tokens: u("prompt_tokens"), cached_input_tokens: cached, + cache_creation_input_tokens: 0, output_tokens: u("completion_tokens"), reasoning_output_tokens: reasoning, total_tokens: u("total_tokens"), @@ -974,6 +975,7 @@ mod tests { let usage = Usage { input_tokens: 10, cached_input_tokens: 0, + cache_creation_input_tokens: 0, output_tokens: 5, reasoning_output_tokens: 0, total_tokens: 15, @@ -1048,6 +1050,7 @@ mod tests { let usage = Usage { 
input_tokens: 3, cached_input_tokens: 0, + cache_creation_input_tokens: 0, output_tokens: 1, reasoning_output_tokens: 0, total_tokens: 4, // computed: 3 + 1 diff --git a/crates/browser-use-llm/src/protocols/openai_responses.rs b/crates/browser-use-llm/src/protocols/openai_responses.rs index 76a0cf33..b71deb6a 100644 --- a/crates/browser-use-llm/src/protocols/openai_responses.rs +++ b/crates/browser-use-llm/src/protocols/openai_responses.rs @@ -712,6 +712,7 @@ fn parse_usage(usage: Option<&Value>) -> Option { Some(Usage { input_tokens: input, cached_input_tokens: cached, + cache_creation_input_tokens: 0, output_tokens: output, reasoning_output_tokens: reasoning, total_tokens: total, @@ -971,6 +972,7 @@ mod tests { usage: Usage { input_tokens: 11, cached_input_tokens: 4, + cache_creation_input_tokens: 0, output_tokens: 7, reasoning_output_tokens: 2, total_tokens: 18, @@ -981,6 +983,7 @@ mod tests { usage: Usage { input_tokens: 11, cached_input_tokens: 4, + cache_creation_input_tokens: 0, output_tokens: 7, reasoning_output_tokens: 2, total_tokens: 18, @@ -1082,6 +1085,7 @@ mod tests { usage: Usage { input_tokens: 1, cached_input_tokens: 0, + cache_creation_input_tokens: 0, output_tokens: 1, reasoning_output_tokens: 0, total_tokens: 2, @@ -1092,6 +1096,7 @@ mod tests { usage: Usage { input_tokens: 1, cached_input_tokens: 0, + cache_creation_input_tokens: 0, output_tokens: 1, reasoning_output_tokens: 0, total_tokens: 2, diff --git a/crates/browser-use-llm/src/route/client.rs b/crates/browser-use-llm/src/route/client.rs index 121bdabc..511ff2b7 100644 --- a/crates/browser-use-llm/src/route/client.rs +++ b/crates/browser-use-llm/src/route/client.rs @@ -848,6 +848,7 @@ mod tests { let usage = Usage { input_tokens: 11, cached_input_tokens: 0, + cache_creation_input_tokens: 0, output_tokens: 7, reasoning_output_tokens: 0, total_tokens: 18, diff --git a/crates/browser-use-llm/src/schema/event.rs b/crates/browser-use-llm/src/schema/event.rs index 037579f1..f4aeb228 100644 --- 
a/crates/browser-use-llm/src/schema/event.rs +++ b/crates/browser-use-llm/src/schema/event.rs @@ -10,15 +10,20 @@ use serde_json::Value; use super::ids::FinishReason; use super::messages::ContentPart; -/// Token usage with an explicitly **non-overlapping** breakdown, so consumers -/// never have to subtract. `total_tokens` is the inclusive total reported (or -/// computed) for the turn. +/// Token usage normalized for Browser Use cost accounting. #[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)] pub struct Usage { + /// Regular input plus cache-read input. Anthropic cache-write input is kept + /// separate in `cache_creation_input_tokens` so it can be billed at the + /// cache-write rate without also being charged as base input. #[serde(default)] pub input_tokens: u64, + /// Cache-read input tokens. These are included in `input_tokens`. #[serde(default)] pub cached_input_tokens: u64, + /// Cache-write input tokens. These are not included in `input_tokens`. + #[serde(default)] + pub cache_creation_input_tokens: u64, #[serde(default)] pub output_tokens: u64, #[serde(default)] @@ -29,10 +34,13 @@ pub struct Usage { impl Usage { /// Sum of the breakdown fields (use when a provider does not report an - /// inclusive total). `cached_input_tokens` is a subset of `input_tokens` - /// and is therefore not added again. + /// inclusive total). `cached_input_tokens` is included in `input_tokens`, + /// while cache-creation tokens are a separate Anthropic billing bucket. 
pub fn computed_total(&self) -> u64 { - self.input_tokens + self.output_tokens + self.reasoning_output_tokens + self.input_tokens + + self.cache_creation_input_tokens + + self.output_tokens + + self.reasoning_output_tokens } } diff --git a/crates/browser-use-llm/src/schema/mod.rs b/crates/browser-use-llm/src/schema/mod.rs index 50b0159d..e883f65c 100644 --- a/crates/browser-use-llm/src/schema/mod.rs +++ b/crates/browser-use-llm/src/schema/mod.rs @@ -107,6 +107,7 @@ mod tests { usage: Usage { input_tokens: 10, cached_input_tokens: 4, + cache_creation_input_tokens: 0, output_tokens: 6, reasoning_output_tokens: 2, total_tokens: 18, @@ -125,6 +126,7 @@ mod tests { let u = Usage { input_tokens: 100, cached_input_tokens: 40, + cache_creation_input_tokens: 0, output_tokens: 20, reasoning_output_tokens: 5, total_tokens: 0, diff --git a/crates/browser-use-providers/src/lib.rs b/crates/browser-use-providers/src/lib.rs index e0ae165d..622cc967 100644 --- a/crates/browser-use-providers/src/lib.rs +++ b/crates/browser-use-providers/src/lib.rs @@ -7051,10 +7051,33 @@ fn parse_usage(usage: Option<&Value>, model: &str) -> Option { .or_else(|| usage.get("total_cost")) .or_else(|| usage.get("cost_usd")) .and_then(value_f64); - let input_tokens = usage + let raw_input_tokens = usage .get("input_tokens") .or_else(|| usage.get("prompt_tokens")) .and_then(Value::as_i64); + let cached_input_tokens = usage + .get("input_tokens_details") + .and_then(|details| details.get("cached_tokens")) + .or_else(|| { + usage + .get("prompt_tokens_details") + .and_then(|details| details.get("cached_tokens")) + }) + .or_else(|| usage.get("cache_read_input_tokens")) + .and_then(Value::as_i64); + let cache_creation_tokens = usage + .get("cache_creation_input_tokens") + .or_else(|| usage.get("prompt_cache_creation_tokens")) + .and_then(Value::as_i64); + let input_tokens = raw_input_tokens.map(|tokens| { + if usage.get("cache_read_input_tokens").is_some() + || 
usage.get("cache_creation_input_tokens").is_some() + { + tokens + cached_input_tokens.unwrap_or(0) + } else { + tokens + } + }); let output_tokens = usage .get("output_tokens") .or_else(|| usage.get("completion_tokens")) @@ -7068,26 +7091,21 @@ fn parse_usage(usage: Option<&Value>, model: &str) -> Option { .and_then(|details| details.get("reasoning_tokens")) }) .and_then(Value::as_i64); - let total_tokens = usage - .get("total_tokens") - .and_then(Value::as_i64) - .or_else(|| Some(input_tokens? + output_tokens?)); + let has_anthropic_cache_fields = usage.get("cache_read_input_tokens").is_some() + || usage.get("cache_creation_input_tokens").is_some(); + let computed_total_tokens = input_tokens? + cache_creation_tokens.unwrap_or(0) + output_tokens?; + let total_tokens = if has_anthropic_cache_fields { + Some(computed_total_tokens) + } else { + usage + .get("total_tokens") + .and_then(Value::as_i64) + .or(Some(computed_total_tokens)) + }; let usage = ModelUsage { input_tokens, - input_cached_tokens: usage - .get("input_tokens_details") - .and_then(|details| details.get("cached_tokens")) - .or_else(|| { - usage - .get("prompt_tokens_details") - .and_then(|details| details.get("cached_tokens")) - }) - .or_else(|| usage.get("cache_read_input_tokens")) - .and_then(Value::as_i64), - input_cache_creation_tokens: usage - .get("cache_creation_input_tokens") - .or_else(|| usage.get("prompt_cache_creation_tokens")) - .and_then(Value::as_i64), + input_cached_tokens: cached_input_tokens, + input_cache_creation_tokens: cache_creation_tokens, output_tokens, reasoning_output_tokens, total_tokens, From a823fc07c3df39d3b604fcd8d127afe5e75087b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20=C5=BDuni=C4=8D?= <36313686+gregpr07@users.noreply.github.com> Date: Thu, 4 Jun 2026 16:54:49 +0000 Subject: [PATCH 38/48] Emit full LLM observability input --- crates/browser-use-agent/src/turn/sampling.rs | 69 +++---------------- .../src/turn/sampling_tests.rs | 52 +++++++++++++- 2 files 
changed, 59 insertions(+), 62 deletions(-) diff --git a/crates/browser-use-agent/src/turn/sampling.rs b/crates/browser-use-agent/src/turn/sampling.rs index 8ebceb60..a2495404 100644 --- a/crates/browser-use-agent/src/turn/sampling.rs +++ b/crates/browser-use-agent/src/turn/sampling.rs @@ -1093,58 +1093,35 @@ fn request_composition(req: &LlmRequest) -> Value { }) } -const OBSERVABILITY_MAX_MESSAGES: usize = 80; -const OBSERVABILITY_MAX_TEXT_CHARS: usize = 80_000; -const OBSERVABILITY_MAX_STRING_CHARS: usize = 4_000; - fn request_observability_input(req: &LlmRequest) -> Value { - let mut remaining_text_chars = OBSERVABILITY_MAX_TEXT_CHARS; let message_count = req.messages.len(); - let omitted_earlier_messages = message_count.saturating_sub(OBSERVABILITY_MAX_MESSAGES); - let messages: Vec = req - .messages - .iter() - .skip(omitted_earlier_messages) - .map(|message| observability_json_value(message, &mut remaining_text_chars)) - .collect(); - let system: Vec = req - .system - .iter() - .map(|part| observability_json_value(part, &mut remaining_text_chars)) - .collect(); + let messages: Vec = req.messages.iter().map(observability_json_value).collect(); + let system: Vec = req.system.iter().map(observability_json_value).collect(); serde_json::json!({ "system": system, "messages": messages, "message_count": message_count, - "omitted_earlier_messages": omitted_earlier_messages, - "truncated": remaining_text_chars == 0, + "omitted_earlier_messages": 0, + "truncated": false, }) } -fn observability_json_value( - value: &T, - remaining_text_chars: &mut usize, -) -> Value { +fn observability_json_value(value: &T) -> Value { serde_json::to_value(value) - .map(|value| sanitize_observability_value(value, remaining_text_chars)) + .map(sanitize_observability_value) .unwrap_or(Value::Null) } -fn sanitize_observability_value(value: Value, remaining_text_chars: &mut usize) -> Value { +fn sanitize_observability_value(value: Value) -> Value { match value { Value::Object(map) => { let mut 
out = serde_json::Map::with_capacity(map.len()); for (key, value) in map { if is_observability_secret_key(&key) { out.insert(key, Value::String("[redacted]".to_string())); - } else if key == "data" { - out.insert(key, Value::String("[redacted inline data]".to_string())); } else { - out.insert( - key, - sanitize_observability_value(value, remaining_text_chars), - ); + out.insert(key, sanitize_observability_value(value)); } } Value::Object(out) @@ -1152,12 +1129,9 @@ fn sanitize_observability_value(value: Value, remaining_text_chars: &mut usize) Value::Array(values) => Value::Array( values .into_iter() - .map(|value| sanitize_observability_value(value, remaining_text_chars)) + .map(sanitize_observability_value) .collect(), ), - Value::String(text) => { - Value::String(truncate_observability_string(&text, remaining_text_chars)) - } other => other, } } @@ -1173,28 +1147,3 @@ fn is_observability_secret_key(key: &str) -> bool { || key.contains("token") || key.contains("cookie") } - -fn truncate_observability_string(text: &str, remaining_text_chars: &mut usize) -> String { - if text.is_empty() { - return String::new(); - } - if *remaining_text_chars == 0 { - return "[truncated: request observability text budget exhausted]".to_string(); - } - - let limit = OBSERVABILITY_MAX_STRING_CHARS.min(*remaining_text_chars); - let mut out = String::new(); - let mut chars = text.chars(); - for _ in 0..limit { - let Some(ch) = chars.next() else { - *remaining_text_chars = (*remaining_text_chars).saturating_sub(out.chars().count()); - return out; - }; - out.push(ch); - } - *remaining_text_chars = (*remaining_text_chars).saturating_sub(out.chars().count()); - if chars.next().is_some() { - out.push_str("...[truncated]"); - } - out -} diff --git a/crates/browser-use-agent/src/turn/sampling_tests.rs b/crates/browser-use-agent/src/turn/sampling_tests.rs index 302b754e..c9e88bd3 100644 --- a/crates/browser-use-agent/src/turn/sampling_tests.rs +++ 
b/crates/browser-use-agent/src/turn/sampling_tests.rs @@ -575,7 +575,7 @@ async fn driver_passes_populated_per_call_request_to_open_stream() { } #[tokio::test] -async fn turn_request_event_carries_sanitized_llm_input_messages() { +async fn turn_request_event_carries_full_llm_input_messages() { let (transport, _opens) = ScriptedTransport::new(vec![OpenScript::Stream(vec![finish(FinishReason::Stop)])]); let sink = Arc::new(RecordingSink::default()); @@ -627,18 +627,66 @@ async fn turn_request_event_carries_sanitized_llm_input_messages() { ); assert_eq!( llm_input["messages"][0]["content"][1]["data"], - serde_json::json!("[redacted inline data]") + serde_json::json!("iVBORw0KGgoAAAANSUhEUgAAAAEAAAAB") ); assert_eq!( llm_input["messages"][1]["content"][0]["input"]["api_key"], serde_json::json!("[redacted]") ); + assert_eq!(llm_input["truncated"], serde_json::json!(false)); assert!(!llm_input["system"][0]["text"] .as_str() .unwrap_or_default() .is_empty()); } +#[tokio::test] +async fn turn_request_event_carries_all_observability_messages_without_text_budget() { + let (transport, _opens) = + ScriptedTransport::new(vec![OpenScript::Stream(vec![finish(FinishReason::Stop)])]); + let sink = Arc::new(RecordingSink::default()); + let d = driver(transport, sink.clone(), 5); + + let long_text = "observe-this-text".repeat(6_000); + let mut input: Vec = (0..85) + .map(|index| { + Message::new( + MessageRole::User, + vec![ContentPart::text(format!("msg-{index}"))], + ) + }) + .collect(); + input.push(Message::new( + MessageRole::User, + vec![ContentPart::text(long_text.clone())], + )); + + let _ = d + .run_sampling_request(input, CancellationToken::new()) + .await + .expect("sampling should succeed"); + + let events = sink.drain(); + let request = events + .iter() + .find(|event| event.event_type == names::MODEL_TURN_REQUEST) + .expect("turn request event emitted"); + let llm_input = &request.payload["llm_input"]; + let messages = llm_input["messages"].as_array().expect("messages 
array"); + assert_eq!(llm_input["message_count"], serde_json::json!(86)); + assert_eq!(llm_input["omitted_earlier_messages"], serde_json::json!(0)); + assert_eq!(messages.len(), 86); + assert_eq!( + messages[85]["content"][0]["text"], + serde_json::json!(long_text) + ); + assert_eq!(llm_input["truncated"], serde_json::json!(false)); + + let serialized = serde_json::to_string(llm_input).expect("llm_input serializes"); + assert!(!serialized.contains("request observability text budget exhausted")); + assert!(!serialized.contains("...[truncated]")); +} + #[tokio::test] async fn driver_prepends_selected_browser_mode_instruction_to_messages() { let (transport, seen) = From 5b5b6ac4b99efedd2465d82a5196c04f17699683 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20=C5=BDuni=C4=8D?= <36313686+gregpr07@users.noreply.github.com> Date: Thu, 4 Jun 2026 17:43:56 +0000 Subject: [PATCH 39/48] Improve Rust agent replay and cache breakpoints --- .../src/session/reconstruct.rs | 51 +++++++++++++----- .../src/session/reconstruct_tests.rs | 52 +++++++++++++++++++ crates/browser-use-agent/src/turn/sampling.rs | 47 ++++++++++++++--- .../src/turn/sampling_tests.rs | 38 ++++++++++++++ 4 files changed, 167 insertions(+), 21 deletions(-) diff --git a/crates/browser-use-agent/src/session/reconstruct.rs b/crates/browser-use-agent/src/session/reconstruct.rs index be92a74b..6292fb48 100644 --- a/crates/browser-use-agent/src/session/reconstruct.rs +++ b/crates/browser-use-agent/src/session/reconstruct.rs @@ -1154,15 +1154,22 @@ fn response_input_item_output_content(item: &Value) -> Value { fn value_to_tool_output_text(value: &Value) -> String { match value { Value::String(text) => text.clone(), - Value::Array(parts) => parts - .iter() - .filter_map(|part| { - part.get("text") - .and_then(Value::as_str) - .map(ToOwned::to_owned) - }) - .collect::>() - .join(""), + Value::Array(parts) => { + let text = parts + .iter() + .filter_map(|part| { + part.get("text") + .and_then(Value::as_str) + 
.map(ToOwned::to_owned) + }) + .collect::>() + .join(""); + if text.trim().is_empty() { + value.to_string() + } else { + text + } + } Value::Null => String::new(), other => other.to_string(), } @@ -1466,15 +1473,33 @@ fn tool_output_event_content(payload: &Value) -> Value { fn tool_output_event_text(payload: &Value) -> String { if let Some(text) = payload.get("text").and_then(Value::as_str) { - return text.to_string(); + if !text.trim().is_empty() { + return text.to_string(); + } } if let Some(output) = payload.get("output") { - return value_to_tool_output_text(output); + let text = value_to_tool_output_text(output); + if !text.trim().is_empty() { + return text; + } } if let Some(content) = payload.get("content") { - return value_to_tool_output_text(content); + let text = value_to_tool_output_text(content); + if !text.trim().is_empty() { + return text; + } + } + let mut parts = Vec::new(); + for key in ["summary", "data", "outputs"] { + let Some(value) = payload.get(key) else { + continue; + }; + if value.is_null() || value == &serde_json::json!({}) || value == &serde_json::json!([]) { + continue; + } + parts.push(format!("{key}: {}", value_to_tool_output_text(value))); } - String::new() + parts.join("\n") } fn synthetic_tool_result_text(name: &str) -> String { diff --git a/crates/browser-use-agent/src/session/reconstruct_tests.rs b/crates/browser-use-agent/src/session/reconstruct_tests.rs index 7b528066..bfa53b15 100644 --- a/crates/browser-use-agent/src/session/reconstruct_tests.rs +++ b/crates/browser-use-agent/src/session/reconstruct_tests.rs @@ -114,6 +114,58 @@ fn turn_with_tool_call_and_output() { ); } +#[test] +fn tool_output_event_uses_structured_browser_script_fallback_text() { + let events = vec![ + event(1, "session.input", json!({ "text": "open page" })), + event( + 2, + "model.tool_call", + json!({ + "id": "call_browser", + "name": "browser_script", + "arguments": { "code": "emit_output(page_info(), label='page_info')" } + }), + ), + event( + 3, + 
"tool.output", + json!({ + "tool_call_id": "call_browser", + "name": "browser_script", + "text": "", + "summary": [{ + "kind": "page", + "output_label": "page_info", + "title": "Example Domain", + "url": "https://example.com" + }], + "outputs": [{ + "label": "page_info", + "value": { + "title": "Example Domain", + "url": "https://example.com" + } + }] + }), + ), + event(4, "session.done", json!({})), + ]; + + let messages = provider_messages_from_events(&events); + assert_eq!(messages.len(), 3, "messages: {messages:#?}"); + let tool = &messages[2]; + assert_eq!(tool.get("role").and_then(Value::as_str), Some("tool")); + let content = tool + .get("content") + .and_then(Value::as_str) + .expect("structured fallback content"); + assert!(content.contains("summary:")); + assert!(content.contains("outputs:")); + assert!(content.contains("Example Domain")); + assert!(!content.trim().is_empty()); +} + #[test] fn tool_output_event_preserves_image_content() { let events = vec![ diff --git a/crates/browser-use-agent/src/turn/sampling.rs b/crates/browser-use-agent/src/turn/sampling.rs index a2495404..8b96e288 100644 --- a/crates/browser-use-agent/src/turn/sampling.rs +++ b/crates/browser-use-agent/src/turn/sampling.rs @@ -1041,7 +1041,7 @@ fn build_request(ctx: &TurnCtx, input: Vec) -> LlmRequest { base_system.cache = Some(CacheHint::Ephemeral); req.system.push(base_system); req.messages = input; - mark_last_message_cacheable(&mut req.messages); + mark_message_cache_breakpoints(&mut req.messages); if let Some(instruction) = ctx.browser_mode_instruction.as_deref() { req.messages.insert( 0, @@ -1054,13 +1054,44 @@ fn build_request(ctx: &TurnCtx, input: Vec) -> LlmRequest { req } -fn mark_last_message_cacheable(messages: &mut [Message]) { - if let Some(message) = messages - .iter_mut() - .rev() - .find(|message| !matches!(message.role, MessageRole::System | MessageRole::Developer)) - { - message.cache = Some(CacheHint::Ephemeral); +fn mark_message_cache_breakpoints(messages: &mut 
[Message]) { + const LOOKBACK_TARGET_BLOCKS: usize = 16; + const MAX_MESSAGE_BREAKPOINTS: usize = 2; + + for message in messages.iter_mut() { + message.cache = None; + } + + let eligible: Vec<(usize, usize)> = messages + .iter() + .enumerate() + .filter(|(_, message)| { + !matches!(message.role, MessageRole::System | MessageRole::Developer) + }) + .map(|(index, message)| (index, message.content.len().max(1))) + .collect(); + let Some((last_index, _)) = eligible.last().copied() else { + return; + }; + + let mut selected = vec![last_index]; + let mut blocks_since_last = 0usize; + for (index, block_count) in eligible.into_iter().rev().skip(1) { + blocks_since_last = blocks_since_last.saturating_add(block_count); + if blocks_since_last >= LOOKBACK_TARGET_BLOCKS { + selected.push(index); + break; + } + } + selected.sort_unstable(); + selected.dedup(); + if selected.len() > MAX_MESSAGE_BREAKPOINTS { + selected.drain(0..selected.len() - MAX_MESSAGE_BREAKPOINTS); + } + for index in selected { + if let Some(message) = messages.get_mut(index) { + message.cache = Some(CacheHint::Ephemeral); + } } } diff --git a/crates/browser-use-agent/src/turn/sampling_tests.rs b/crates/browser-use-agent/src/turn/sampling_tests.rs index c9e88bd3..cebcb803 100644 --- a/crates/browser-use-agent/src/turn/sampling_tests.rs +++ b/crates/browser-use-agent/src/turn/sampling_tests.rs @@ -574,6 +574,44 @@ async fn driver_passes_populated_per_call_request_to_open_stream() { ); } +#[tokio::test] +async fn open_stream_marks_an_earlier_cache_breakpoint_for_long_histories() { + let (transport, seen) = + RecordingTransport::new(vec![text_delta("ok"), finish(FinishReason::Stop)]); + let sink: Arc = Arc::new(RecordingSink::default()); + let d = ModelSamplingDriver::new(transport, sink, ctx(), 5).without_jitter(); + + let input: Vec = (0..25) + .map(|index| { + Message::new( + MessageRole::User, + vec![ContentPart::text(format!("browser state {index}"))], + ) + }) + .collect(); + let _ = d + 
.run_sampling_request(input, CancellationToken::new()) + .await + .expect("sampling should succeed"); + + let captured = seen.lock().unwrap(); + let req = &captured[0]; + let cache_indices: Vec = req + .messages + .iter() + .enumerate() + .filter_map(|(index, message)| { + (message.cache == Some(CacheHint::Ephemeral)).then_some(index) + }) + .collect(); + + assert_eq!( + cache_indices, + vec![8, 24], + "long browser histories should keep the latest message cacheable and add one earlier breakpoint inside Anthropic's lookback window" + ); +} + #[tokio::test] async fn turn_request_event_carries_full_llm_input_messages() { let (transport, _opens) = From 9102fb2ec4db1d5d3643d443b72cf3952fa6e591 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20=C5=BDuni=C4=8D?= <36313686+gregpr07@users.noreply.github.com> Date: Thu, 4 Jun 2026 17:57:20 +0000 Subject: [PATCH 40/48] Emit Rust LLM tool definitions for tracing --- crates/browser-use-agent/src/turn/sampling.rs | 36 ++++++++-- .../src/turn/sampling_tests.rs | 67 +++++++++++++++++-- 2 files changed, 91 insertions(+), 12 deletions(-) diff --git a/crates/browser-use-agent/src/turn/sampling.rs b/crates/browser-use-agent/src/turn/sampling.rs index 8b96e288..add236c0 100644 --- a/crates/browser-use-agent/src/turn/sampling.rs +++ b/crates/browser-use-agent/src/turn/sampling.rs @@ -54,6 +54,7 @@ use std::collections::HashMap; use std::future::Future; use std::pin::Pin; +use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; use std::time::Instant; @@ -334,6 +335,7 @@ pub struct ModelSamplingDriver< transport: T, sink: Arc, ctx: TurnCtx, + next_turn_idx: AtomicUsize, /// Retry budget (codex `provider.stream_max_retries()`). max_retries: u32, /// Whether to apply I/O-layer jitter to the post-decision backoff sleep. 
@@ -369,6 +371,7 @@ impl ModelSamplingDriver { Self { transport, sink, + next_turn_idx: AtomicUsize::new(ctx.turn_idx), ctx, max_retries, jitter: true, @@ -397,6 +400,7 @@ impl ModelSamplingDriver { ModelSamplingDriver { transport: self.transport, sink: self.sink, + next_turn_idx: self.next_turn_idx, ctx: self.ctx, max_retries: self.max_retries, jitter: self.jitter, @@ -433,20 +437,33 @@ impl ModelSamplingDriver { } /// Map an [`LlmEvent`] to UI events and emit them through the sink. - fn emit_event(&self, ev: &LlmEvent) { - for pending in events::map_llm_event(&self.ctx, ev) { + fn ctx_for_turn(&self, turn_idx: usize) -> TurnCtx { + let mut ctx = self.ctx.clone(); + ctx.turn_idx = turn_idx; + ctx + } + + fn emit_event(&self, ev: &LlmEvent, turn_idx: usize) { + let ctx = self.ctx_for_turn(turn_idx); + for pending in events::map_llm_event(&ctx, ev) { self.sink.emit(pending); } } - fn emit_turn_request(&self, attempt: u32, composition: &Value, llm_input: &Value) { + fn emit_turn_request( + &self, + turn_idx: usize, + attempt: u32, + composition: &Value, + llm_input: &Value, + ) { self.sink.emit(PendingEvent::new( self.ctx.session_id.clone(), names::MODEL_TURN_REQUEST, serde_json::json!({ "model": &self.ctx.model, "provider": &self.ctx.provider, - "turn_idx": self.ctx.turn_idx, + "turn_idx": turn_idx, "attempt": attempt, "composition": composition, "llm_input": llm_input, @@ -544,9 +561,10 @@ impl ModelSamplingDriver { acc: &mut TurnAccumulator, ev: LlmEvent, started_at: Instant, + turn_idx: usize, ) -> StreamProgress { // Emit UI events first (map is pure; emit is the only side effect). - self.emit_event(&ev); + self.emit_event(&ev, turn_idx); match ev { LlmEvent::TextDelta { id, delta } => { let has_content = !delta.trim().is_empty(); @@ -864,6 +882,7 @@ impl SamplingDriver // the populated conversation, not an empty body. 
let input = self.input_with_goal_context(input); let mut req = build_request(&self.ctx, input); + let turn_idx = self.next_turn_idx.fetch_add(1, Ordering::Relaxed); // Advertise the tool catalog. When a dispatcher is attached (the fused // path), it carries the registry's model-visible definitions; we copy them // verbatim (order-stable) into `req.tools` so the model can actually emit @@ -881,7 +900,7 @@ impl SamplingDriver let llm_input = request_observability_input(&req); let mut attempt: u32 = 0; loop { - self.emit_turn_request(attempt, &composition, &llm_input); + self.emit_turn_request(turn_idx, attempt, &composition, &llm_input); // ---- open the stream (codex: `client.stream(&prompt).await`) ---- let mut stream = match self.transport.open_stream(&req) { Ok(s) => s, @@ -913,7 +932,7 @@ impl SamplingDriver match maybe_event { Some(Ok(ev)) => { let check_mailbox_preemption = checks_mailbox_preemption_after_event(&ev); - match self.consume_event(&mut acc, ev, started_at) { + match self.consume_event(&mut acc, ev, started_at, turn_idx) { StreamProgress::Continue => { if check_mailbox_preemption && self.has_mailbox_preemption().await { preempted_for_mailbox = true; @@ -1128,10 +1147,13 @@ fn request_observability_input(req: &LlmRequest) -> Value { let message_count = req.messages.len(); let messages: Vec = req.messages.iter().map(observability_json_value).collect(); let system: Vec = req.system.iter().map(observability_json_value).collect(); + let tools: Vec = req.tools.iter().map(observability_json_value).collect(); serde_json::json!({ "system": system, "messages": messages, + "tools": tools, + "tools_count": tools.len(), "message_count": message_count, "omitted_earlier_messages": 0, "truncated": false, diff --git a/crates/browser-use-agent/src/turn/sampling_tests.rs b/crates/browser-use-agent/src/turn/sampling_tests.rs index cebcb803..0d3e52c4 100644 --- a/crates/browser-use-agent/src/turn/sampling_tests.rs +++ 
b/crates/browser-use-agent/src/turn/sampling_tests.rs @@ -321,6 +321,40 @@ async fn finish_accounts_usage_only_when_goal_is_active() { ); } +#[tokio::test] +async fn repeated_sampling_requests_emit_monotonic_turn_indices() { + let (transport, _opens) = ScriptedTransport::new(vec![ + OpenScript::Stream(vec![finish(FinishReason::Stop)]), + OpenScript::Stream(vec![finish(FinishReason::Stop)]), + ]); + let sink = Arc::new(RecordingSink::default()); + let d = driver(transport, sink.clone(), 5); + + let _ = d + .run_sampling_request(user_input(), CancellationToken::new()) + .await + .expect("first sampling request should succeed"); + let _ = d + .run_sampling_request(user_input(), CancellationToken::new()) + .await + .expect("second sampling request should succeed"); + + let events = sink.drain(); + let turn_request_indices: Vec = events + .iter() + .filter(|event| event.event_type == names::MODEL_TURN_REQUEST) + .map(|event| event.payload["turn_idx"].as_i64().expect("turn_idx")) + .collect(); + let token_count_indices: Vec = events + .iter() + .filter(|event| event.event_type == names::TOKEN_COUNT) + .map(|event| event.payload["turn_idx"].as_i64().expect("turn_idx")) + .collect(); + + assert_eq!(turn_request_indices, vec![0, 1]); + assert_eq!(token_count_indices, vec![0, 1]); +} + #[tokio::test] async fn active_goal_context_is_injected_with_codex_envelope() { let (transport, seen) = @@ -873,7 +907,7 @@ impl crate::turn::sampling::FusionRecorder for NoopRecorder { fn tool_def(name: &str) -> browser_use_llm::schema::ToolDefinition { browser_use_llm::schema::ToolDefinition { name: name.to_string(), - description: String::new(), + description: format!("{name} model-visible tool description"), input_schema: serde_json::json!({"type": "object"}), output_schema: None, namespace: None, @@ -911,9 +945,10 @@ async fn fused_driver_advertises_dispatcher_tool_specs_on_request() { // about the request the driver built. 
let (transport, seen) = RecordingTransport::new(vec![text_delta("ok"), finish(FinishReason::Stop)]); - let sink: Arc = Arc::new(RecordingSink::default()); + let sink = Arc::new(RecordingSink::default()); + let sink_for_driver: Arc = sink.clone(); let recorder: Arc = Arc::new(NoopRecorder); - let d = ModelSamplingDriver::new(transport, sink, ctx(), 5) + let d = ModelSamplingDriver::new(transport, sink_for_driver, ctx(), 5) .without_jitter() .with_fusion(dispatcher, recorder); @@ -935,12 +970,34 @@ async fn fused_driver_advertises_dispatcher_tool_specs_on_request() { !req.tools.is_empty(), "fused driver must advertise the dispatcher's tool specs — req.tools is EMPTY" ); - let names: Vec<&str> = req.tools.iter().map(|t| t.name.as_str()).collect(); + let tool_names: Vec<&str> = req.tools.iter().map(|t| t.name.as_str()).collect(); assert_eq!( - names, + tool_names, vec!["browser", "python", "shell"], "req.tools must carry the registered tool names, in the registry's order" ); + + let events = sink.drain(); + let request = events + .iter() + .find(|event| event.event_type == names::MODEL_TURN_REQUEST) + .expect("turn request event emitted"); + let llm_tools = request.payload["llm_input"]["tools"] + .as_array() + .expect("llm_input tools array"); + assert_eq!( + request.payload["llm_input"]["tools_count"], + serde_json::json!(3) + ); + assert_eq!(llm_tools[0]["name"], serde_json::json!("browser")); + assert_eq!( + llm_tools[0]["description"], + serde_json::json!("browser model-visible tool description") + ); + assert_eq!( + llm_tools[0]["input_schema"], + serde_json::json!({"type": "object"}) + ); } #[tokio::test] From 53e1364b7d35695cffa2a3b78168ad2a21847ac7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20=C5=BDuni=C4=8D?= <36313686+gregpr07@users.noreply.github.com> Date: Thu, 4 Jun 2026 18:30:55 +0000 Subject: [PATCH 41/48] Cache Anthropic tool definitions in provider path --- crates/browser-use-providers/src/lib.rs | 53 ++++++++++++++++++++++++- 1 file changed, 51 
insertions(+), 2 deletions(-) diff --git a/crates/browser-use-providers/src/lib.rs b/crates/browser-use-providers/src/lib.rs index 622cc967..6830647d 100644 --- a/crates/browser-use-providers/src/lib.rs +++ b/crates/browser-use-providers/src/lib.rs @@ -5306,7 +5306,7 @@ fn chat_tool_description(tool: &ToolSpec) -> String { } fn tool_specs_to_anthropic_tools(tools: &[ToolSpec], is_oauth: bool) -> Vec { - tools + let mut anthropic_tools: Vec = tools .iter() .map(|tool| { json!({ @@ -5315,7 +5315,11 @@ fn tool_specs_to_anthropic_tools(tools: &[ToolSpec], is_oauth: bool) -> Vec String { @@ -9445,6 +9449,51 @@ mod tests { Ok(()) } + #[test] + fn anthropic_messages_request_marks_last_tool_cacheable() -> Result<()> { + let provider = AnthropicMessagesProvider::new("anthropic-key", "claude-test"); + let body = provider.messages_request_body( + &ProviderTurn { + instructions: Some("Stable system prompt".to_string()), + messages: vec![json!({"role": "user", "content": "finish"})], + tools: vec![ + ToolSpec { + name: "browser".to_string(), + namespace: None, + namespace_description: None, + description: "Inspect a page".to_string(), + input_schema: json!({"type": "object"}), + output_schema: None, + freeform: None, + }, + ToolSpec { + name: "done".to_string(), + namespace: None, + namespace_description: None, + description: "Finish the task".to_string(), + input_schema: json!({"type": "object"}), + output_schema: None, + freeform: None, + }, + ], + ..ProviderTurn::default() + }, + false, + true, + )?; + + assert!(body["tools"][0].get("cache_control").is_none()); + assert_eq!( + body["tools"][1]["cache_control"], + json!({"type": "ephemeral"}) + ); + assert_eq!( + body["system"][0]["cache_control"], + json!({"type": "ephemeral"}) + ); + Ok(()) + } + #[test] fn openai_compatible_chat_retries_5xx_inside_provider_like_codex_request_layer() -> Result<()> { let success = json!({ From adeb4cd1099dd5bedc2ffa58470882d7751f2dd7 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Gregor=20=C5=BDuni=C4=8D?= <36313686+gregpr07@users.noreply.github.com> Date: Thu, 4 Jun 2026 19:06:01 +0000 Subject: [PATCH 42/48] Enable Anthropic automatic conversation caching --- crates/browser-use-providers/src/lib.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/crates/browser-use-providers/src/lib.rs b/crates/browser-use-providers/src/lib.rs index 6830647d..77b476c1 100644 --- a/crates/browser-use-providers/src/lib.rs +++ b/crates/browser-use-providers/src/lib.rs @@ -2450,6 +2450,7 @@ impl AnthropicMessagesProvider { let mut body = json!({ "model": self.model, "max_tokens": 16000, + "cache_control": { "type": "ephemeral" }, "system": anthropic_system_blocks_with_developer_context(&instructions, &turn.messages, is_oauth), "messages": messages_to_anthropic_messages(&turn.messages, is_oauth)?, }); @@ -9483,6 +9484,7 @@ mod tests { )?; assert!(body["tools"][0].get("cache_control").is_none()); + assert_eq!(body["cache_control"], json!({"type": "ephemeral"})); assert_eq!( body["tools"][1]["cache_control"], json!({"type": "ephemeral"}) From 98d912b416e20c2babfe68dfc972b2bc0e1f3185 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20=C5=BDuni=C4=8D?= <36313686+gregpr07@users.noreply.github.com> Date: Thu, 4 Jun 2026 19:10:21 +0000 Subject: [PATCH 43/48] Apply runtime config overrides to CLI runs --- .../browser-use-agent/src/config_overrides.rs | 57 +++++++++++++++---- crates/browser-use-cli/src/main.rs | 12 ++-- 2 files changed, 53 insertions(+), 16 deletions(-) diff --git a/crates/browser-use-agent/src/config_overrides.rs b/crates/browser-use-agent/src/config_overrides.rs index ef4746d8..30498c45 100644 --- a/crates/browser-use-agent/src/config_overrides.rs +++ b/crates/browser-use-agent/src/config_overrides.rs @@ -733,41 +733,57 @@ pub fn apply_child_request_runtime_config( config: &mut ProviderRunConfig, request: &ChildAgentRunRequest, ) -> Result<()> { - let overrides = &request.config_overrides; + apply_runtime_config_overrides(&mut 
config.options, &request.config_overrides) +} + +/// Apply config keys that mutate in-memory runtime options. +/// +/// The raw override list is still retained for downstream consumers that read +/// less common config keys directly, but options that are consulted before those +/// consumers run must be materialized here. +pub fn apply_runtime_config_overrides( + options: &mut AgentRunOptions, + overrides: &ConfigOverrides, +) -> Result<()> { + if let Some(value) = config_override_u64(overrides, "max_turns") { + options.max_turns = usize::try_from(value) + .context("max_turns does not fit in usize")? + .max(1); + } if let Some(value) = config_override_str(overrides, "browser_mode") { - config.options.browser_mode = Some(value); + options.browser_mode = Some(value); } if let Some(value) = config_override_str(overrides, "base_instructions") { - config.options.base_instructions = Some(value); + options.base_instructions = Some(value); } if let Some(value) = config_override_str(overrides, "developer_instructions") { - config.options.developer_instructions = Some(value); + options.developer_instructions = Some(value); } if let Some(value) = config_override_str(overrides, "compact_prompt") { - config.options.compact_prompt = Some(value); + options.compact_prompt = Some(value); } if let Some(value) = config_override_u64(overrides, "python_tool_timeout_seconds") { - config.options.python_tool_timeout_seconds = value; + options.python_tool_timeout_seconds = value; } if let Some(value) = config_override_bool(overrides, "model_compaction_enabled") { - config.options.model_compaction_enabled = value; + options.model_compaction_enabled = value; } if let Some(value) = config_override_i64(overrides, "model_auto_compact_token_limit") { - config.options.model_auto_compact_token_limit = Some(value); + options.model_auto_compact_token_limit = Some(value); } if let Some(value) = config_override_str(overrides, "model_auto_compact_token_limit_scope") { - 
config.options.model_auto_compact_token_limit_scope = + options.model_auto_compact_token_limit_scope = parse_auto_compact_token_limit_scope(&value)?; } if let Some(value) = config_override_str(overrides, "approval_policy") .or_else(|| config_override_str(overrides, "ask_for_approval")) { - config.options.approval_policy = parse_approval_policy(&value)?; + options.approval_policy = parse_approval_policy(&value)?; } if let Some(value) = config_override_bool(overrides, "use_guardian") .or_else(|| config_override_bool(overrides, "guardian")) { - config.options.use_guardian = value; + options.use_guardian = value; } Ok(()) } @@ -1816,6 +1832,25 @@ command = "profile-server" assert!(options.agent_roles.is_empty()); } + #[test] + fn runtime_config_overrides_materialize_max_turns_and_browser_mode() { + let overrides = parse_config_overrides(&ov(&[ + "max_turns=100", + "browser_mode=\"remote-cdp\"", + "python_tool_timeout_seconds=45", + "model_compaction_enabled=false", + ])) + .unwrap(); + let mut options = AgentRunOptions::default(); + + apply_runtime_config_overrides(&mut options, &overrides).unwrap(); + + assert_eq!(options.max_turns, 100); + assert_eq!(options.browser_mode.as_deref(), Some("remote-cdp")); + assert_eq!(options.python_tool_timeout_seconds, 45); + assert!(!options.model_compaction_enabled); + } + #[test] fn provider_run_config_new_uses_explicit_source_and_default_options() { let config = ProviderRunConfig::new(ProviderBackend::Anthropic, "claude-x"); diff --git a/crates/browser-use-cli/src/main.rs b/crates/browser-use-cli/src/main.rs index 3100df78..ec0371b8 100644 --- a/crates/browser-use-cli/src/main.rs +++ b/crates/browser-use-cli/src/main.rs @@ -13,11 +13,12 @@ use browser_use_agent::config_model::{ model_catalog_for_cwd_with_options, }; use browser_use_agent::config_overrides::{ - apply_child_request_runtime_config, load_mcp_servers_for_profile, parse_config_overrides, - resolve_agent_roles_for_profile, resolve_approval_policy_for_profile, - 
resolve_collab_for_profile, resolve_guardian_for_profile, resolve_multi_agent_v2_for_profile, - AgentRunOptions, ChildAgentRunCompletion, ChildAgentRunRequest, ChildAgentRunner, - ConfigOverrides, ProviderBackend, ProviderRunConfig, RunConfigValueSource, + apply_child_request_runtime_config, apply_runtime_config_overrides, + load_mcp_servers_for_profile, parse_config_overrides, resolve_agent_roles_for_profile, + resolve_approval_policy_for_profile, resolve_collab_for_profile, resolve_guardian_for_profile, + resolve_multi_agent_v2_for_profile, AgentRunOptions, ChildAgentRunCompletion, + ChildAgentRunRequest, ChildAgentRunner, ConfigOverrides, ProviderBackend, ProviderRunConfig, + RunConfigValueSource, }; use browser_use_agent::context::{ append_user_shell_command_context_event, typed_user_input_payload_from_items_for_cwd, @@ -1763,6 +1764,7 @@ fn cli_agent_options( options = options.with_mcp_servers(mcp_servers); } if !config_overrides.is_empty() { + apply_runtime_config_overrides(&mut options, &config_overrides)?; options = options.with_config_overrides(config_overrides); } Ok(options) From 8fff5fa2ba327e927d9b734238fcd46186261030 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20=C5=BDuni=C4=8D?= <36313686+gregpr07@users.noreply.github.com> Date: Thu, 4 Jun 2026 19:21:56 +0000 Subject: [PATCH 44/48] Avoid duplicate durable prompt replay --- .../browser-use-agent/src/entrypoint/mod.rs | 160 ++++++++++++++++-- 1 file changed, 143 insertions(+), 17 deletions(-) diff --git a/crates/browser-use-agent/src/entrypoint/mod.rs b/crates/browser-use-agent/src/entrypoint/mod.rs index 212589b1..169bc3b5 100644 --- a/crates/browser-use-agent/src/entrypoint/mod.rs +++ b/crates/browser-use-agent/src/entrypoint/mod.rs @@ -251,6 +251,10 @@ struct StoreTurnState { /// `Some`, it REPLACES the durable-log prompt; later recorded turns are /// appended after it. `None` until the first compaction. 
compacted: Mutex>>, + /// Unit/offline seams rely on the in-memory recorder because they do not emit + /// durable model/tool events. Production emits those events synchronously, so + /// replaying both durable history and this recorder tail duplicates turns. + include_recorded_tail_in_prompt: bool, } impl StoreTurnState { @@ -278,9 +282,21 @@ impl StoreTurnState { previous_model_compaction: None, compaction_sampler: None, compacted: Mutex::new(None), + include_recorded_tail_in_prompt: true, } } + /// Use only the durable event log when rebuilding prompts. + /// + /// The live facade persists model/tool events as they happen, before the next + /// sampling iteration. The in-memory fusion recorder is therefore redundant + /// for production prompt replay and would duplicate the same assistant/tool + /// turns that the event log already reconstructs. + fn with_durable_prompt_replay(mut self) -> Self { + self.include_recorded_tail_in_prompt = false; + self + } + /// Enable REAL token accounting + model-based compaction against a context /// window, driven by `sampler` for the no-tools summary pass. 
fn with_compaction( @@ -332,10 +348,16 @@ impl StoreTurnState { history_from_store(&self.store, &session_id) } }; - msgs.extend(self.recorded.lock().unwrap().iter().cloned()); + self.append_recorded_tail_if_enabled(&mut msgs); msgs } + fn append_recorded_tail_if_enabled(&self, msgs: &mut Vec) { + if self.include_recorded_tail_in_prompt { + msgs.extend(self.recorded.lock().unwrap().iter().cloned()); + } + } + fn current_prompt_items_for_compaction(&self, mode: CompactionMode) -> Vec { let events = events_from_store(&self.store, self.session_id.as_str()); match mode { @@ -353,13 +375,15 @@ impl StoreTurnState { events: &[browser_use_protocol::EventRecord], ) -> Vec { let mut items = provider_messages_from_events(events); - items.extend( - self.recorded - .lock() - .unwrap() - .iter() - .map(message_to_provider_item), - ); + if self.include_recorded_tail_in_prompt { + items.extend( + self.recorded + .lock() + .unwrap() + .iter() + .map(message_to_provider_item), + ); + } items } @@ -1578,10 +1602,10 @@ fn enrich_token_count_payload( impl TurnState for StoreTurnState { async fn clone_history_for_prompt(&self) -> Vec { // Once compacted, the prompt base is the compacted override (codex's - // replaced history); otherwise it is the lowered durable log. The recorded - // buffer (this run's assistant turns + the fused driver's dispatched tool - // outputs) is appended either way, so tool outputs always re-enter the next - // prompt (the fusion seam is preserved across compaction). + // replaced history); otherwise it is the lowered durable log. Offline + // tests append the recorder tail so tool outputs re-enter the next prompt + // without a durable sink. Production disables that tail because the same + // model/tool events have already been persisted and replay from the log. 
if self.compacted.lock().unwrap().is_some() { return self.assemble_prompt_blocking(); } @@ -1591,10 +1615,7 @@ impl TurnState for StoreTurnState { let mut msgs = tokio::task::spawn_blocking(move || history_from_store(&store, &session_id)) .await .unwrap_or_default(); - // The recorded buffer carries this run's assistant turns AND the fused - // driver's dispatched tool outputs (both append through the same `Arc`), so - // the next prompt sees everything produced so far. - msgs.extend(self.recorded.lock().unwrap().iter().cloned()); + self.append_recorded_tail_if_enabled(&mut msgs); msgs } @@ -2062,7 +2083,8 @@ async fn drive_run( cancel: CancellationToken, max_turns: Option, ) -> Result, AgentError> { - let state = StoreTurnState::new(Arc::clone(&store), session_id.clone(), recorded); + let state = StoreTurnState::new(Arc::clone(&store), session_id.clone(), recorded) + .with_durable_prompt_replay(); // Enable REAL token accounting + model-based compaction when a sampler is // available (the real backend path). The Fake/no-credential path passes `None` // and keeps the inert (never-compacts) behavior. @@ -2706,6 +2728,32 @@ mod tests { ) } + fn count_tool_call_ids(messages: &[Message], call_id: &str) -> usize { + messages + .iter() + .flat_map(|message| message.content.iter()) + .filter(|part| { + matches!( + part, + ContentPart::ToolCall { id, .. } if id == call_id + ) + }) + .count() + } + + fn count_tool_result_ids(messages: &[Message], call_id: &str) -> usize { + messages + .iter() + .flat_map(|message| message.content.iter()) + .filter(|part| { + matches!( + part, + ContentPart::ToolResult { tool_call_id, .. 
} if tool_call_id == call_id + ) + }) + .count() + } + fn seed_workspace_context(store: &SharedStore, session_id: &str, content: &str) { let store = store.lock().expect("store mutex poisoned"); store @@ -3022,6 +3070,84 @@ mod tests { assert!(!state.token_status().await.token_limit_reached); } + #[tokio::test] + async fn durable_prompt_replay_ignores_duplicate_fusion_tail() { + let (_dir, store, session_id) = store_with_session(); + seed_user_input(&store, &session_id, "use the browser").await; + { + let store = store.lock().expect("store mutex poisoned"); + store + .append_event( + &session_id, + "model.tool_call", + serde_json::json!({ + "id": "call_browser", + "name": "browser_script", + "arguments": { "code": "return document.title" }, + }), + ) + .expect("seed durable tool call"); + store + .append_event( + &session_id, + "tool.output", + serde_json::json!({ + "tool_call_id": "call_browser", + "name": "browser_script", + "text": "Example Domain", + }), + ) + .expect("seed durable tool output"); + } + + let recorded = Arc::new(Mutex::new(vec![ + Message::new( + MessageRole::Assistant, + vec![ContentPart::ToolCall { + id: "call_browser".to_string(), + name: "browser_script".to_string(), + input: serde_json::json!({ "code": "return document.title" }), + provider_metadata: None, + }], + ), + Message::new( + MessageRole::Tool, + vec![ContentPart::ToolResult { + tool_call_id: "call_browser".to_string(), + content: vec![ContentPart::text("Example Domain")], + is_error: false, + }], + ), + ])); + + let default_state = StoreTurnState::new( + Arc::clone(&store), + SessionId(session_id.clone()), + Arc::clone(&recorded), + ); + let default_prompt = default_state.clone_history_for_prompt().await; + assert_eq!( + count_tool_call_ids(&default_prompt, "call_browser"), + 2, + "test fixture should reproduce the old durable+recorder duplication" + ); + + let durable_state = + StoreTurnState::new(Arc::clone(&store), SessionId(session_id), recorded) + 
.with_durable_prompt_replay(); + let prompt = durable_state.clone_history_for_prompt().await; + assert_eq!( + count_tool_call_ids(&prompt, "call_browser"), + 1, + "production prompt replay must not duplicate durable tool calls" + ); + assert_eq!( + count_tool_result_ids(&prompt, "call_browser"), + 1, + "production prompt replay must not duplicate durable tool outputs" + ); + } + #[tokio::test] async fn pending_active_followup_drains_into_history_once() { let (_dir, store, session_id) = store_with_session(); From fa0728aa09e922d7c2502c137e25229c21386af0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20=C5=BDuni=C4=8D?= <36313686+gregpr07@users.noreply.github.com> Date: Thu, 4 Jun 2026 19:36:37 +0000 Subject: [PATCH 45/48] Nudge long bounded runs to finalize --- .../browser-use-agent/src/turn/loop_driver.rs | 16 +++++++- .../browser-use-agent/src/turn/loop_tests.rs | 37 +++++++++++++++++++ 2 files changed, 52 insertions(+), 1 deletion(-) diff --git a/crates/browser-use-agent/src/turn/loop_driver.rs b/crates/browser-use-agent/src/turn/loop_driver.rs index db8d3011..7c6dae6d 100644 --- a/crates/browser-use-agent/src/turn/loop_driver.rs +++ b/crates/browser-use-agent/src/turn/loop_driver.rs @@ -83,6 +83,7 @@ use browser_use_llm::schema::{ContentPart, Message, MessageRole}; use tokio_util::sync::CancellationToken; const FINAL_MAX_TURNS_NUDGE: &str = "This is the final allowed step for this run. Stop exploring and call the done tool with the best complete answer you can provide now. Include unknown or unavailable items explicitly instead of continuing to search."; +const PROGRESS_MAX_TURNS_NUDGE: &str = "Progress checkpoint: If you have enough evidence, a saved artifact, or a complete-enough answer, stop further exploration and call the done tool now. Continue only for clearly missing required information that is likely to change the final answer."; /// The async, unbounded turn-loop driver. 
Generic over the three frozen turn /// traits so production wires real impls (`ContextManager`+`Session`, @@ -178,11 +179,17 @@ impl TurnLoop { // `ContextManager` history; the loop simply threads it through. let mut request = self.state.clone_history_for_prompt().await; request.extend(input); - if max_turns.is_some_and(|limit| turns_run + 1 == limit) { + let next_turn = turns_run + 1; + if max_turns.is_some_and(|limit| next_turn == limit) { request.push(Message::new( MessageRole::Developer, vec![ContentPart::text(FINAL_MAX_TURNS_NUDGE)], )); + } else if max_turns.is_some_and(|limit| should_emit_progress_nudge(limit, next_turn)) { + request.push(Message::new( + MessageRole::Developer, + vec![ContentPart::text(PROGRESS_MAX_TURNS_NUDGE)], + )); } // ---- 2. run one sampling round-trip ---- @@ -256,3 +263,10 @@ impl TurnLoop { } } } + +fn should_emit_progress_nudge(max_turns: usize, next_turn: usize) -> bool { + if max_turns < 40 || next_turn >= max_turns { + return false; + } + next_turn >= max_turns / 2 && next_turn % 10 == 0 +} diff --git a/crates/browser-use-agent/src/turn/loop_tests.rs b/crates/browser-use-agent/src/turn/loop_tests.rs index de3657e2..895e091f 100644 --- a/crates/browser-use-agent/src/turn/loop_tests.rs +++ b/crates/browser-use-agent/src/turn/loop_tests.rs @@ -758,6 +758,43 @@ async fn bounded_loop_aborts_after_max_turns() { )); } +#[tokio::test] +async fn bounded_loop_adds_progress_nudge_for_long_runs() { + let mut scripts: Vec = (0..50) + .map(|i| SamplingScript::Ok(follow_up(&format!("step {i}")))) + .collect(); + scripts.push(SamplingScript::Ok(complete("done after checkpoint"))); + + let sampler = ScriptedSamplingDriver::new(scripts); + let requests = sampler.requests_handle(); + let inputs = sampler.inputs_handle(); + let state = InMemoryTurnState::new(Vec::new(), token_status(false)); + let observer = RecordingObserver::new(); + + let turn = TurnLoop::new(state, sampler, observer.clone()); + let out = turn + .run_with_max_turns(ctx(), 
false, CancellationToken::new(), 100) + .await + .expect("bounded long run should complete"); + + assert_eq!(requests.load(Ordering::SeqCst), 51); + assert_eq!(out.as_deref(), Some("done after checkpoint")); + let recorded_inputs = inputs.lock().unwrap(); + let Some(Message { + role: MessageRole::Developer, + content, + .. + }) = recorded_inputs[49].last() + else { + panic!("turn 50 should include the progress developer nudge"); + }; + assert!( + matches!(content.first(), Some(ContentPart::Text { text }) if text.contains("Progress checkpoint")), + "progress nudge should tell the agent to finalize once enough evidence exists" + ); + assert_eq!(observer.kinds(), vec!["started", "complete"]); +} + // ---- (8) a hard (non-abort) sampling error propagates out of the loop ------ #[tokio::test] From e109049a8a10637d0b3c2bc49407b015183a3664 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20=C5=BDuni=C4=8D?= <36313686+gregpr07@users.noreply.github.com> Date: Thu, 4 Jun 2026 20:04:44 +0000 Subject: [PATCH 46/48] Return structured errors from single browser_fetch --- .../src/browser_script_helpers.py | 11 +++++- crates/browser-use-browser/src/lib.rs | 37 +++++++++++++++++++ prompts/browser-script-tool-description.md | 2 +- 3 files changed, 47 insertions(+), 3 deletions(-) diff --git a/crates/browser-use-browser/src/browser_script_helpers.py b/crates/browser-use-browser/src/browser_script_helpers.py index b223ed16..b6bc7135 100644 --- a/crates/browser-use-browser/src/browser_script_helpers.py +++ b/crates/browser-use-browser/src/browser_script_helpers.py @@ -1128,8 +1128,15 @@ def browser_fetch( json_body=None, timeout=20.0, binary=None, + return_error=True, ): - """Fetch from the current page context with browser cookies/session state.""" + """Fetch from the current page context with browser cookies/session state. + + By default a failed page-context fetch returns + {"ok": False, "url": ..., "error": ...} instead of failing the entire + browser_script call. 
Pass return_error=False when the caller wants a hard + exception for required URLs. + """ request = _normalize_browser_fetch_request( url, method=method, @@ -1139,7 +1146,7 @@ def browser_fetch( timeout=timeout, binary=binary, ) - return browser_fetch_many([request], timeout=timeout, return_errors=False)[0] + return browser_fetch_many([request], timeout=timeout, return_errors=return_error)[0] def browser_fetch_many(requests, timeout=20.0, max_concurrency=6, return_errors=True): diff --git a/crates/browser-use-browser/src/lib.rs b/crates/browser-use-browser/src/lib.rs index 63a17a33..15bafc51 100644 --- a/crates/browser-use-browser/src/lib.rs +++ b/crates/browser-use-browser/src/lib.rs @@ -8842,6 +8842,43 @@ print("http_get_many parity ok") assert!(output.text.contains("http_get_many parity ok")); } + #[test] + fn browser_script_browser_fetch_single_returns_structured_errors_by_default() { + let temp = tempfile::tempdir().unwrap(); + let output = run_browser_script( + "script-browser-fetch-single-error", + temp.path(), + temp.path().join("artifacts"), + r#" +def fake_runtime_evaluate(expression, await_promise=False, return_by_value=False): + return [{"ok": False, "url": "https://example.test/api", "error": "Failed to fetch"}] + +globals()["_runtime_evaluate"] = fake_runtime_evaluate + +result = browser_fetch("https://example.test/api") +assert result["ok"] is False, result +assert result["url"] == "https://example.test/api", result +assert "Failed to fetch" in result["error"], result + +try: + browser_fetch("https://example.test/api", return_error=False) +except RuntimeError as exc: + assert "browser_fetch failed" in str(exc), exc +else: + raise AssertionError("return_error=False should raise") + +print("browser_fetch single structured error ok") +"#, + 10, + ) + .unwrap(); + + assert!(output.ok, "{:?}\n{}", output.error, output.text); + assert!(output + .text + .contains("browser_fetch single structured error ok")); + } + #[test] fn 
browser_script_timeout_returns_tool_failure() { let temp = tempfile::tempdir().unwrap(); diff --git a/prompts/browser-script-tool-description.md b/prompts/browser-script-tool-description.md index ac6e0eac..86c13298 100644 --- a/prompts/browser-script-tool-description.md +++ b/prompts/browser-script-tool-description.md @@ -112,7 +112,7 @@ emit_output(rows, label="employee_rows") - Use `js(...)` for DOM inspection and raw `cdp(...)` for lower-level browser actions. - Use `js(function_source, *args)` when passing JSON-serializable Python values into JavaScript; use `target_id=` as a keyword for iframe targets. - For real user forms, act like a browser user: screenshot, click the visible field/control, type with `type_text(...)`, `press_key(...)`, or `fill_input(...)`, then screenshot or otherwise verify. Use coordinate clicks for checkboxes, radios, buttons, dropdowns, and custom controls. Do not assign `element.value`, `element.checked`, `selectedIndex`, React private state, or MutationObserver restore loops on live forms. Do not synthesize `input`, `change`, `click`, or keyboard events in page JavaScript to make a form look filled. Those anti-patterns can desynchronize framework state from the visible DOM. -- Use `http_get(...)` for one static page/API URL after the browser reveals a stable endpoint, and `http_get_many(...)` for several independent public URLs. Use `browser_fetch(...)` or `browser_fetch_many(...)` when the page's cookies, auth headers, or browser session are needed. Returned bodies are strings by default, bytes with `binary=True`, and expose `.status_code`, `.headers`, `.url`, `.text`, `.content`, and `.json()` for convenience. Batch helpers preserve input order and return per-URL error records by default so one bad link does not waste the whole extraction chunk. If direct HTTP hits bot or login protection, retry with `browser_fetch(...)`, site-specific headers/cookies, or the configured Browser Use fetch proxy. 
+- Use `http_get(...)` for one static page/API URL after the browser reveals a stable endpoint, and `http_get_many(...)` for several independent public URLs. Use `browser_fetch(...)` or `browser_fetch_many(...)` when the page's cookies, auth headers, or browser session are needed. Returned bodies are strings by default, bytes with `binary=True`, and expose `.status_code`, `.headers`, `.url`, `.text`, `.content`, and `.json()` for convenience. `browser_fetch(...)` and the batch helpers return error records by default so one bad endpoint does not waste the whole extraction chunk; pass `return_error=False` or `return_errors=False` only when a hard failure is intended. If direct HTTP hits bot or login protection, retry with `browser_fetch(...)`, site-specific headers/cookies, or the configured Browser Use fetch proxy. - Batch recipe after discovering stable links or endpoints: ```python From 45c3659a516c0d3ee0d61caeb601eb6da2587c7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20=C5=BDuni=C4=8D?= <36313686+gregpr07@users.noreply.github.com> Date: Fri, 5 Jun 2026 00:01:25 +0000 Subject: [PATCH 47/48] Handle busy browser recovery without tool failure --- crates/browser-use-browser/src/lib.rs | 148 ++++++++++++++++++++++++++ 1 file changed, 148 insertions(+) diff --git a/crates/browser-use-browser/src/lib.rs b/crates/browser-use-browser/src/lib.rs index 0de4e068..46e3643c 100644 --- a/crates/browser-use-browser/src/lib.rs +++ b/crates/browser-use-browser/src/lib.rs @@ -565,6 +565,97 @@ fn active_browser_script_next_step(active_scripts: &Value) -> Option { .map(ToOwned::to_owned) } +fn is_browser_recovery_command(argv: &[String]) -> bool { + argv.first().map(String::as_str) == Some("recover") +} + +fn busy_recovery_status_json( + session_id: &str, + argv: &[String], + mut status: Value, + script_registry: &BrowserScriptRunRegistry, +) -> Value { + let requested_command = format!("browser {}", argv.join(" ")); + let live_active_scripts = + 
active_browser_script_runs_json_with_registry(session_id, script_registry); + let active_scripts = if live_active_scripts + .as_array() + .is_some_and(|scripts| !scripts.is_empty()) + { + live_active_scripts + } else { + status + .get("active_scripts") + .cloned() + .unwrap_or(live_active_scripts) + }; + let next_step = busy_recovery_next_step(&active_scripts, &requested_command); + + if let Some(object) = status.as_object_mut() { + object.insert("status".to_string(), Value::String("busy".to_string())); + object.insert("busy".to_string(), Value::Bool(true)); + object.insert("recovery_deferred".to_string(), Value::Bool(true)); + object.insert( + "reason".to_string(), + Value::String( + "Browser recovery was requested while an active browser_script owned the browser session." + .to_string(), + ), + ); + object.insert( + "requested_command".to_string(), + Value::String(requested_command.clone()), + ); + object.insert("active_scripts".to_string(), active_scripts); + object.insert("next_step".to_string(), Value::String(next_step.clone())); + object.insert( + "model_instruction".to_string(), + Value::String(format!( + "The browser session is busy, not failed. Follow next_step, then retry {requested_command}." + )), + ); + return status; + } + + json!({ + "status": "busy", + "busy": true, + "recovery_deferred": true, + "reason": "Browser recovery was requested while an active browser_script owned the browser session.", + "requested_command": requested_command, + "active_scripts": active_scripts, + "next_step": next_step, + "model_instruction": format!( + "The browser session is busy, not failed. Follow next_step, then retry {requested_command}." + ), + }) +} + +fn busy_recovery_next_step(active_scripts: &Value, requested_command: &str) -> String { + let Some(script) = active_scripts + .as_array() + .and_then(|scripts| scripts.first()) + else { + return format!( + "Wait for the in-flight browser_script to return, run browser status --json, then retry {requested_command}." 
+ ); + }; + let Some(run_id) = script.get("run_id").and_then(Value::as_str) else { + return format!( + "Wait for the in-flight browser_script to return, run browser status --json, then retry {requested_command}." + ); + }; + let status = script + .get("status") + .and_then(Value::as_str) + .unwrap_or("running"); + if matches!(status, "finished" | "timed_out") { + format!("browser_script action=observe run_id={run_id}; then retry {requested_command}.") + } else { + format!("browser_script action=observe run_id={run_id}; if it is still stuck without progress, browser_script action=cancel run_id={run_id}; then retry {requested_command}.") + } +} + pub fn run_browser_command( session_id: &str, cwd: impl AsRef, @@ -665,6 +756,12 @@ pub fn run_browser_command_with_options_and_registries( content, }); } + if is_browser_recovery_command(&argv) { + return Ok(BrowserCommandOutput { + events: Vec::new(), + content: busy_recovery_status_json(session_id, &argv, content, script_registry), + }); + } bail!( "browser session is busy with an active browser_script; observe or cancel that script before running browser {}", argv.join(" ") @@ -9209,6 +9306,57 @@ mod tests { assert_eq!(registry.active_session_count(), 1); } + #[test] + fn browser_recovery_while_checked_out_returns_busy_guidance() { + let temp = tempfile::tempdir().unwrap(); + let registry = BrowserSessionRegistry::new(); + let script_registry = BrowserScriptRunRegistry::new(); + let session_id = "checked-out-recover"; + registry + .checked_out_statuses + .lock() + .expect("browser checked-out session registry poisoned") + .insert( + session_id.to_string(), + json!({ + "mode": "remote-cloud", + "connection": "connected", + "active_scripts": [{ + "run_id": "script-1", + "status": "running", + "next_step": "browser_script action=observe run_id=script-1" + }], + "page": { + "target_id": "target-1", + "session_id": "session-1" + } + }), + ); + + let output = run_browser_command_with_options_and_registries( + session_id, + 
temp.path(), + temp.path().join("artifacts"), + "browser recover reconnect-websocket", + BrowserCommandOptions::default(), + &script_registry, + &registry, + ) + .expect("busy recovery should return structured guidance"); + + assert_eq!(output.content["status"], "busy"); + assert_eq!(output.content["busy"], true); + assert_eq!(output.content["recovery_deferred"], true); + assert_eq!( + output.content["requested_command"], + "browser recover reconnect-websocket" + ); + assert_eq!(output.content["active_scripts"][0]["run_id"], "script-1"); + let next_step = output.content["next_step"].as_str().unwrap(); + assert!(next_step.contains("browser_script action=observe run_id=script-1")); + assert!(next_step.contains("retry browser recover reconnect-websocket")); + } + #[test] fn browser_help_is_cli_like() { let help = browser_help(); From 98c530cd8acb5b586f77e7bacab036d4f0f30bf2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20=C5=BDuni=C4=8D?= <36313686+gregpr07@users.noreply.github.com> Date: Fri, 5 Jun 2026 01:54:45 +0000 Subject: [PATCH 48/48] Retry transient browser bridge calls --- .../src/browser_script_helpers.py | 42 ++++++++++++- crates/browser-use-browser/src/lib.rs | 59 +++++++++++++++++++ 2 files changed, 99 insertions(+), 2 deletions(-) diff --git a/crates/browser-use-browser/src/browser_script_helpers.py b/crates/browser-use-browser/src/browser_script_helpers.py index 16b24a11..4e95ad0d 100644 --- a/crates/browser-use-browser/src/browser_script_helpers.py +++ b/crates/browser-use-browser/src/browser_script_helpers.py @@ -12,6 +12,7 @@ import os import pathlib import sys +import threading import time as _time import urllib.error import urllib.request @@ -24,13 +25,50 @@ __last_domain_skills = [] +_bridge_call_lock = threading.RLock() +_TRANSIENT_BRIDGE_ERRORS = ( + "browser is not connected or is busy", + "browser session is busy", + "browser bridge closed before response", + "cdp runtime.evaluate timed out", + "runtime.evaluate timed out", + "temporarily
unavailable", +) + + +def _is_transient_bridge_error(exc): + message = str(exc).lower() + return any(part in message for part in _TRANSIENT_BRIDGE_ERRORS) + + +def _bridge_with_retry(payload, *, attempts=4): + delay = 0.25 + last_exc = None + for attempt in range(attempts): + try: + with _bridge_call_lock: + return _bridge(payload) + except (OSError, TimeoutError, RuntimeError) as exc: + last_exc = exc + if attempt + 1 >= attempts or not _is_transient_bridge_error(exc): + raise + print( + f"browser_script bridge retry {attempt + 2}/{attempts} after transient error: {exc}", + file=sys.stderr, + flush=True, + ) + _time.sleep(delay) + delay = min(delay * 2, 2.0) + raise last_exc + + def _send_meta(meta, **params): - return _bridge({"kind": "meta", "meta": meta, **params}) + return _bridge_with_retry({"kind": "meta", "meta": meta, **params}) def cdp(method, session_id=None, **params): """Raw CDP. Example: cdp("Page.navigate", url="https://example.com").""" - return _bridge({"kind": "cdp", "method": method, "session_id": session_id, "params": params}) + return _bridge_with_retry({"kind": "cdp", "method": method, "session_id": session_id, "params": params}) def cdp_batch(calls): diff --git a/crates/browser-use-browser/src/lib.rs b/crates/browser-use-browser/src/lib.rs index 46e3643c..8d8e12ac 100644 --- a/crates/browser-use-browser/src/lib.rs +++ b/crates/browser-use-browser/src/lib.rs @@ -10635,6 +10635,65 @@ print("browser_fetch single structured error ok") .contains("browser_fetch single structured error ok")); } + #[test] + fn browser_script_bridge_retries_transient_busy_errors() { + let temp = tempfile::tempdir().unwrap(); + let output = run_browser_script( + "script-bridge-retry-busy", + temp.path(), + temp.path().join("artifacts"), + r#" +attempts = {"n": 0} + +class FakeSock: + def __init__(self, payload): + self.payload = bytearray(payload) + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc, tb): + return False + + def sendall(self, 
data): + pass + + def recv(self, n): + if not self.payload: + return b"" + chunk = self.payload[:n] + del self.payload[:n] + return bytes(chunk) + +original_create_connection = socket.create_connection + +def fake_create_connection(*args, **kwargs): + attempts["n"] += 1 + if attempts["n"] < 3: + return FakeSock(b'{"ok":false,"error":"browser is not connected or is busy; run `browser status --json`"}\n') + return FakeSock(b'{"ok":true,"result":{"targetInfos":[]}}\n') + +socket.create_connection = fake_create_connection +try: + result = cdp("Target.getTargets") +finally: + socket.create_connection = original_create_connection + +assert result == {"targetInfos": []}, result +assert attempts["n"] == 3, attempts +print("bridge retry ok") +"#, + 10, + ) + .unwrap(); + + assert!(output.ok, "{:?}\n{}", output.error, output.text); + assert!(output.text.contains("bridge retry ok")); + assert!(output + .text + .contains("browser_script bridge retry 2/4 after transient error")); + } + #[test] fn browser_script_timeout_returns_tool_failure() { let temp = tempfile::tempdir().unwrap();