diff --git a/crates/browser-use-agent/src/entrypoint/provider.rs b/crates/browser-use-agent/src/entrypoint/provider.rs index 3281a54c..7dce60b0 100644 --- a/crates/browser-use-agent/src/entrypoint/provider.rs +++ b/crates/browser-use-agent/src/entrypoint/provider.rs @@ -1139,9 +1139,9 @@ fn resolve_provider_with_python( /// The registry registers the backend-free handlers — `shell`, `apply_patch`, /// `view_image`, `update_plan`, `done`, `tool_search` (catalog populated from the registered tools' defs), /// `web_search` (ENABLED; the Responses builder encodes it as the hosted -/// `web_search_preview` tool), `search` (a locally-executed DuckDuckGo search, -/// distinct from the hosted `web_search`) — plus the two product-surface tools -/// that drive real subsystems: +/// `web_search_preview` tool), `search` (a client-executed call to the +/// browser-use search API, distinct from the hosted `web_search`) — plus the +/// two product-surface tools that drive real subsystems: /// * `browser` ([`BrowserTool::new`]): standalone — the production /// [`RealBackend`](crate::tools::handlers::browser::RealBackend) wraps the /// `browser-use-browser` crate and manages CDP sessions internally (keyed by @@ -1237,7 +1237,7 @@ fn build_tool_dispatcher_with_cwd_and_goal_store( use crate::tools::handlers::done::{DoneRequest, DoneTool}; use crate::tools::handlers::mcp::McpToolCallRequest; use crate::tools::handlers::python::{PythonRequest, PythonTool}; - use crate::tools::handlers::search::{SearchRequest, SearchTool}; + use crate::tools::handlers::search::{SearchRequest, SearchTool, SEARCH_PARALLEL_SAFE}; use crate::tools::handlers::shell::{ ExecCommandRequest, ExecCommandTool, ShellRequest, ShellTool, WriteStdinRequest, WriteStdinTool, @@ -1322,10 +1322,16 @@ fn build_tool_dispatcher_with_cwd_and_goal_store( true, WebSearchTool::new(WebSearchConfig::enabled()), ); - // `search`: locally-executed DuckDuckGo (Lite) web search — the client runs - // the HTTP request and parses the results itself (distinct from the hosted - // `web_search` above). Read-only, so parallel_safe = true. - reg.register::<_, SearchRequest>("search", definitions::search(), true, SearchTool::new()); + // `search`: web search via the browser-use search API — the client makes + // the API call (auth: `BROWSER_USE_API_KEY`) and formats the results itself + // (distinct from the hosted `web_search` above). Serial: a conservative + // scheduling default for a billed API call. + reg.register::<_, SearchRequest>( + "search", + definitions::search(), + SEARCH_PARALLEL_SAFE, + SearchTool::new(), + ); let browser_backend = browser_backend_for_runtime_or_config( config, runtime_handle.as_ref(), @@ -3281,11 +3287,11 @@ mod tests { assert!(names.contains(&"done")); assert!(names.contains(&"update_plan")); // Both web searches are wired into the production dispatcher: the hosted - // `web_search` and the locally-executed DuckDuckGo `search`. + // `web_search` and the browser-use search API `search`. assert!(names.contains(&"web_search")); assert!( names.contains(&"search"), - "the locally-executed `search` tool must be reachable by the live model" + "the `search` tool must be reachable by the live model" ); } diff --git a/crates/browser-use-agent/src/tools/handlers/mod.rs b/crates/browser-use-agent/src/tools/handlers/mod.rs index bcbeea2b..cfafb0a7 100644 --- a/crates/browser-use-agent/src/tools/handlers/mod.rs +++ b/crates/browser-use-agent/src/tools/handlers/mod.rs @@ -59,9 +59,8 @@ pub use mcp::{ }; pub use python::{PythonApprovalKey, PythonBackend, PythonRequest, PythonTool}; pub use search::{ - classify_response, extract_real_url, format_results, normalize_whitespace, parse_lite_results, - HttpSearchBackend, SearchApprovalKey, SearchBackend, SearchError, SearchRequest, SearchResult, - SearchTool, + classify_response, format_results, normalize_whitespace, parse_results, HttpSearchBackend, + SearchApprovalKey, SearchBackend, SearchError, SearchRequest, SearchResult, SearchTool, }; pub use shell::{ ExecCommandApprovalKey, ExecCommandRequest, ExecCommandTool, ShellApprovalKey, ShellRequest, diff --git a/crates/browser-use-agent/src/tools/handlers/search.rs b/crates/browser-use-agent/src/tools/handlers/search.rs index fabf1b97..0e226a7b 100644 --- a/crates/browser-use-agent/src/tools/handlers/search.rs +++ b/crates/browser-use-agent/src/tools/handlers/search.rs @@ -1,50 +1,50 @@ -//! `search` tool: a LOCALLY-executed DuckDuckGo (Lite) web search. +//! `search` tool: a web search via the browser-use search API. //! -//! This is the async re-implementation of the legacy Python `search` action -//! (a `browser_use` `Controller` action that fetched -//! `lite.duckduckgo.com/lite/` over HTTP and parsed the result HTML). Only the -//! *search logic* is ported — the surrounding `Controller` / DB / session -//! scaffolding (and the unrelated `request_human_control` action) are dropped. -//! Like the other handlers it implements the full trait stack +//! The client POSTs the query to `search.browser-use.com` — a thin proxy in +//! front of [Parallel](https://parallel.ai)'s Search API with browser-use auth +//! and billing — and formats the returned JSON results for the model. This +//! replaced the DuckDuckGo Lite scrape the tool was originally ported from: +//! the engine changed, the tool surface (name, request shape, output layout) +//! did not. Like the other handlers it implements the full trait stack //! ([`Approvable`] + [`Sandboxable`] + [`ToolRuntime`]) so it can be driven by -//! the [`ToolOrchestrator`](crate::tools::orchestrator::ToolOrchestrator), -//! mirroring the `tool_search` tool's structure: a non-FS, -//! fetch-parse-and-return tool that spawns no process. +//! the [`ToolOrchestrator`](crate::tools::orchestrator::ToolOrchestrator). //! //! # Relationship to [`web_search`](super::web_search) //! //! [`web_search`](super::web_search) is the HOSTED, provider-executed web search -//! (the provider runs the search server-side; the client only declares + passes -//! through the result — it performs *no* local HTTP). This `search` tool is the -//! opposite: it performs a REAL local HTTP GET against DuckDuckGo Lite and parses -//! the returned HTML itself, exactly as the Python action did. The two are -//! complementary, not duplicates: `web_search` needs a capable provider; `search` -//! works against any provider because the client does the work. +//! (the model provider runs the search server-side; the client only declares + +//! passes through the result — it performs *no* local HTTP). This `search` tool +//! is the opposite: the client performs the API call itself, so it works +//! against any model provider. +//! +//! # API contract (verified against the `search` service source) +//! +//! * `POST {base}/search` with JSON `{"query": "…"}` and the +//! [`X-Browser-Use-API-Key`](SEARCH_API_KEY_HEADER) header (a `bu_…` key, +//! read from [`BROWSER_USE_API_KEY`](SEARCH_API_KEY_ENV) — the same variable +//! the rest of the workspace uses for browser-use cloud auth). The base URL +//! defaults to the production service and can be overridden via +//! [`BROWSER_USE_SEARCH_URL`](SEARCH_BASE_URL_ENV) (e.g. a local dev +//! instance, which runs as an open proxy without auth). +//! * `200` → `{"results": [{"title"?, "url", "published_date"?, "content"}]}`; +//! `title` / `published_date` are omitted when the source lacks them, and +//! `content` is multi-line markdown (whitespace-normalized here). +//! * Errors: `400` invalid query, `401` missing/invalid API key, `402` +//! insufficient balance, `422` upstream rejected the request, `502` upstream +//! failed, `503` auth/billing backend unavailable. //! //! # Network seam (testability) //! -//! The HTTP fetch lives behind the [`SearchBackend`] trait, with the real +//! The HTTP call lives behind the [`SearchBackend`] trait, with the real //! [`HttpSearchBackend`] (a `reqwest` client) injected by default and a fake //! substitutable in tests. This mirrors how the `browser` / `python` / `mcp` -//! handlers inject their backends (`BrowserTool::with_backend`, -//! `McpTool::new(Arc)`), so the tool's parsing/formatting logic is -//! unit-tested deterministically with fixture HTML — no network is touched. -//! -//! # HTML parsing -//! -//! The Python original used BeautifulSoup. This crate intentionally carries no -//! HTML-parser dependency (the existing browser tooling reads the DOM from a real -//! browser over CDP, never by parsing HTML strings), so to keep the dependency -//! footprint unchanged we extract the few fields we need with targeted `regex` -//! over the *specific, stable* DuckDuckGo Lite markup — the same fixed selectors -//! BeautifulSoup keyed on (`a.result-link`, `td.result-snippet`). The extraction -//! is faithful to the Python logic and fully fixture-tested in `search_tests.rs`. +//! handlers inject their backends, so the tool's parsing/formatting logic is +//! unit-tested deterministically with fixture JSON — no network is touched. use std::sync::{Arc, OnceLock}; use std::time::Duration; use regex::Regex; -use reqwest::header::{ACCEPT, ACCEPT_LANGUAGE, USER_AGENT}; use crate::tools::runtime::{ Approvable, ExecOutput, SandboxAttempt, Sandboxable, ToolCtx, ToolError, ToolRuntime, @@ -54,21 +54,33 @@ use crate::tools::sandbox::{SandboxPermissions, SandboxPreference}; /// The tool name surfaced to the model. pub const SEARCH_TOOL_NAME: &str = "search"; -/// The DuckDuckGo Lite search endpoint the real backend fetches. -const DDG_LITE_BASE_URL: &str = "https://lite.duckduckgo.com/lite/"; +/// Whether search calls may run concurrently with other parallel-safe tools. +/// +/// Kept serial: a conservative scheduling default for a billed API call +/// (carried over from the previous engine's rate-limit concerns). +pub const SEARCH_PARALLEL_SAFE: bool = false; + +/// The browser-use search service base URL. +const SEARCH_BASE_URL: &str = "https://search.browser-use.com"; -/// Browser-like `User-Agent` (ported verbatim from the Python action's headers). -const DDG_USER_AGENT: &str = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) \ -AppleWebKit/537.36 (KHTML, like Gecko) Chrome/144.0.0.0 Safari/537.36"; +/// Environment variable overriding the search service base URL (e.g. a local +/// dev instance, `http://localhost:8080`, which runs as an open proxy without +/// auth). Defaults to [`SEARCH_BASE_URL`]. +const SEARCH_BASE_URL_ENV: &str = "BROWSER_USE_SEARCH_URL"; -/// `Accept` header (ported verbatim from the Python action's headers). -const DDG_ACCEPT: &str = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"; +/// Environment variable holding the `bu_…` browser-use API key. The same +/// variable the rest of the workspace uses for browser-use cloud auth +/// (`.env.example`, `browser-use-browser`). +const SEARCH_API_KEY_ENV: &str = "BROWSER_USE_API_KEY"; -/// `Accept-Language` header (ported verbatim from the Python action's headers). -const DDG_ACCEPT_LANGUAGE: &str = "en-US,en;q=0.9"; +/// Auth header the search service expects (service `internal/api/server.go` / +/// its README: `X-Browser-Use-API-Key: bu_…`). +const SEARCH_API_KEY_HEADER: &str = "X-Browser-Use-API-Key"; -/// Request timeout (the Python action used `timeout=30.0`). -const SEARCH_REQUEST_TIMEOUT_SECS: u64 = 30; +/// Client-side request timeout. The service's own upstream (Parallel) timeout +/// is 30s (`UPSTREAM_TIMEOUT`); 60s gives it room to answer — including with a +/// `502` — before we cut the connection. +const SEARCH_REQUEST_TIMEOUT_SECS: u64 = 60; /// Max characters of a result title in the formatted output. Titles are trimmed /// (with an ellipsis counted within the cap) to keep the model-facing text token @@ -80,20 +92,22 @@ const MAX_DESCRIPTION_CHARS: usize = 125; /// A single parsed search result. /// -/// Mirrors the Python action's `{title, url, description}` dict. +/// Mirrors the service's result object; the wire `content` (multi-line +/// markdown) is whitespace-normalized into the single-line `description`. #[derive(Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub struct SearchResult { - /// The result's title (the `a.result-link` text). + /// The result's title; empty when the source provided none. pub title: String, - /// The result's destination URL (the DuckDuckGo redirect, unwrapped). + /// The result's destination URL. pub url: String, - /// The result's snippet (the following `td.result-snippet` text), if any. + /// `YYYY-MM-DD` publication date, when the source provides one. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub published_date: Option, + /// The result's content/snippet, normalized to a single line. pub description: String, } /// Typed request for the `search` tool. -/// -/// Mirrors the Python `SearchParams { query }`. #[derive(Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub struct SearchRequest { /// The search query to look up on the web. @@ -109,60 +123,79 @@ impl SearchRequest { } } -/// An error from the search backend's HTTP fetch. +/// An error from the search backend's HTTP call. /// -/// Reproduces the failure cases the Python `_search_duckduckgo` raised: a -/// challenge/CAPTCHA page, a non-2xx HTTP status, and a transport error. +/// The named variants mirror the service's documented statuses so the model +/// sees an actionable message instead of a bare code. #[derive(Debug, thiserror::Error)] pub enum SearchError { - /// DuckDuckGo returned a challenge/anti-bot page (HTTP 202, or the body - /// mentions "anomaly"). - #[error( - "DuckDuckGo is showing a challenge/CAPTCHA – too many requests or suspicious activity." - )] - Challenge, - /// The server returned a client/server error status. + /// No API key was configured; the request was not attempted. + #[error("BROWSER_USE_API_KEY is not set – the browser-use search API requires an API key")] + MissingApiKey, + /// The service rejected the API key (HTTP 401). + #[error("invalid or missing browser-use API key (HTTP 401)")] + Unauthorized, + /// The project balance is exhausted (HTTP 402). + #[error("insufficient browser-use balance (HTTP 402)")] + InsufficientBalance, + /// Any other client/server error status (400, 422, 502, 503, …). #[error("HTTP {status}: {snippet}")] Http { /// The HTTP status code. status: u16, - /// The first 200 chars of the response body (matching the Python - /// `response.text[:200]`). + /// The first 200 chars of the response body. snippet: String, }, + /// A `200` response whose body was not the documented JSON shape. + #[error("unexpected response body: {0}")] + Decode(String), /// A transport-level error (connection, timeout, decoding). #[error("{0}")] Request(String), } -/// The network seam: fetch the raw DuckDuckGo Lite HTML for a query. +/// The network seam: fetch the raw search-API response body for a query. /// /// Implemented for real by [`HttpSearchBackend`] and by a fake in tests, so the /// tool's parsing/formatting can be exercised without a real network — mirroring /// the `browser` / `python` / `mcp` backend seams. #[async_trait::async_trait] pub trait SearchBackend: Send + Sync { - /// Fetch the DuckDuckGo Lite result HTML for `query`. + /// Fetch the search service's JSON response body for `query`. async fn fetch(&self, query: &str) -> Result; } -/// The real [`SearchBackend`]: a `reqwest` client against DuckDuckGo Lite. +/// The real [`SearchBackend`]: a `reqwest` client against the browser-use +/// search service. pub struct HttpSearchBackend { client: reqwest::Client, base_url: String, + api_key: Option, } impl HttpSearchBackend { - /// Construct the backend with a default client and the DuckDuckGo Lite - /// endpoint. + /// Construct the backend from the environment: the base URL from + /// [`BROWSER_USE_SEARCH_URL`](SEARCH_BASE_URL_ENV) (defaulting to the + /// production [`SEARCH_BASE_URL`]) and the API key from + /// [`BROWSER_USE_API_KEY`](SEARCH_API_KEY_ENV). pub fn new() -> Self { + let base_url = std::env::var(SEARCH_BASE_URL_ENV) + .ok() + .map(|url| url.trim().trim_end_matches('/').to_string()) + .filter(|url| !url.is_empty()) + .unwrap_or_else(|| SEARCH_BASE_URL.to_string()); + let api_key = std::env::var(SEARCH_API_KEY_ENV) + .ok() + .map(|key| key.trim().to_string()) + .filter(|key| !key.is_empty()); let client = reqwest::Client::builder() .timeout(Duration::from_secs(SEARCH_REQUEST_TIMEOUT_SECS)) .build() .unwrap_or_else(|_| reqwest::Client::new()); Self { client, - base_url: DDG_LITE_BASE_URL.to_string(), + base_url, + api_key, } } } @@ -176,18 +209,22 @@ impl Default for HttpSearchBackend { #[async_trait::async_trait] impl SearchBackend for HttpSearchBackend { async fn fetch(&self, query: &str) -> Result { - // `reqwest`'s `.query()` produces application/x-www-form-urlencoded - // output (space -> `+`); the encoded byte set differs from Python's - // `quote_plus` on a few characters (e.g. `~`, `*`), but DuckDuckGo - // decodes both to the same query, so results are equivalent. Redirects - // are followed by default, matching `follow_redirects=True`. - let response = self + // The production service always requires a key: fail fast with an + // actionable message instead of a guaranteed 401 round-trip. A custom + // endpoint (BROWSER_USE_SEARCH_URL, e.g. a local dev instance) may be + // an open proxy, so keyless requests are allowed through there. + if self.api_key.is_none() && self.base_url == SEARCH_BASE_URL { + return Err(SearchError::MissingApiKey); + } + + let mut request = self .client - .get(&self.base_url) - .query(&[("q", query)]) - .header(USER_AGENT, DDG_USER_AGENT) - .header(ACCEPT, DDG_ACCEPT) - .header(ACCEPT_LANGUAGE, DDG_ACCEPT_LANGUAGE) + .post(format!("{}/search", self.base_url)) + .json(&serde_json::json!({ "query": query })); + if let Some(api_key) = self.api_key.as_deref() { + request = request.header(SEARCH_API_KEY_HEADER, api_key); + } + let response = request .send() .await .map_err(|err| SearchError::Request(err.to_string()))?; @@ -203,18 +240,20 @@ impl SearchBackend for HttpSearchBackend { } } -/// Classify an HTTP response the way the Python action did: a challenge page -/// (status 202 or an "anomaly" body) first, then any `>= 400` status as an -/// error, otherwise success. +/// Classify an HTTP response per the service's documented statuses: `401` and +/// `402` get named, actionable errors; any other `>= 400` (400 invalid query, +/// 422 upstream rejected, 502 upstream failed, 503 auth backend down) carries +/// the status plus the first 200 chars of the body; everything else is success. pub fn classify_response(status: u16, body: &str) -> Result<(), SearchError> { - if status == 202 || body.to_ascii_lowercase().contains("anomaly") { - return Err(SearchError::Challenge); - } - if status >= 400 { - let snippet: String = body.chars().take(200).collect(); - return Err(SearchError::Http { status, snippet }); + match status { + 401 => Err(SearchError::Unauthorized), + 402 => Err(SearchError::InsufficientBalance), + s if s >= 400 => { + let snippet: String = body.chars().take(200).collect(); + Err(SearchError::Http { status: s, snippet }) + } + _ => Ok(()), } - Ok(()) } /// The async `search` tool. @@ -281,8 +320,8 @@ impl Approvable for SearchTool { } // `exec_approval_requirement` is intentionally left at its trait default - // (`None`): the search is a benign, read-only HTTP GET (the Python action had - // no approval gate either). Returning `None` lets the orchestrator apply + // (`None`): the search is a benign, read-only query against the browser-use + // search API. Returning `None` lets the orchestrator apply // `default_exec_approval_requirement`, which yields `Skip` under any // non-prompting policy. The outbound request mirrors the crate's existing // network usage (the MCP HTTP client, analytics) which is likewise ungated. @@ -306,10 +345,7 @@ impl Sandboxable for SearchTool { #[async_trait::async_trait] impl ToolRuntime for SearchTool { fn parallel_safe(&self, _req: &SearchRequest) -> bool { - // A read-only HTTP GET + pure parse mutates no shared state, so it is safe - // to run concurrently with other tools — matching the parallel-safe - // stance of `tool_search` / `web_search`. - true + SEARCH_PARALLEL_SAFE } async fn run( @@ -329,13 +365,16 @@ impl ToolRuntime for SearchTool { )); } - // A fetch failure is surfaced to the model as a soft error (nonzero exit - // with the message on stderr), mirroring the Python action's - // `ActionResult(error="Search failed: …")` and the MCP handler's - // model-facing error mapping — not a hard tool error. - match self.backend.fetch(query).await { - Ok(html) => { - let results = parse_lite_results(&html); + // A fetch/parse failure is surfaced to the model as a soft error + // (nonzero exit with the message on stderr), mirroring the MCP + // handler's model-facing error mapping — not a hard tool error. + match self + .backend + .fetch(query) + .await + .and_then(|body| parse_results(&body)) + { + Ok(results) => { let stdout = if results.is_empty() { format!("No results found for \"{query}\".") } else { @@ -357,17 +396,65 @@ impl ToolRuntime for SearchTool { } // --------------------------------------------------------------------------- -// Pure helpers (parsing + formatting) — ported from the Python action. +// Pure helpers (parsing + formatting). // --------------------------------------------------------------------------- +/// Wire shape of the service's `200` response: `{"results": [...]}`. +#[derive(serde::Deserialize)] +struct SearchResponseWire { + #[serde(default)] + results: Vec, +} + +/// Wire shape of one result. `title` / `published_date` are omitted when the +/// source lacks them; everything defaults so one sparse result cannot fail the +/// whole response. +#[derive(serde::Deserialize)] +struct SearchResultWire { + #[serde(default)] + title: String, + #[serde(default)] + url: String, + #[serde(default)] + published_date: Option, + #[serde(default)] + content: String, +} + +/// Parse the search service's JSON response body into results. +/// +/// The wire `content` arrives as multi-line markdown; it is whitespace- +/// normalized into the single-line `description`. Results without a `url` are +/// dropped (the model cannot follow them). A body that is not the documented +/// JSON shape is a [`SearchError::Decode`]. +pub fn parse_results(body: &str) -> Result, SearchError> { + let wire: SearchResponseWire = + serde_json::from_str(body).map_err(|err| SearchError::Decode(err.to_string()))?; + + Ok(wire + .results + .into_iter() + .filter(|result| !result.url.trim().is_empty()) + .map(|result| SearchResult { + title: normalize_whitespace(&result.title), + url: result.url.trim().to_string(), + published_date: result + .published_date + .map(|date| date.trim().to_string()) + .filter(|date| !date.is_empty()), + description: normalize_whitespace(&result.content), + }) + .collect()) +} + /// Format parsed results into the readable text block the model sees. /// -/// Faithful to the Python action's `extracted_content` layout: a header (count + -/// the "you already have the results" guidance), then a numbered list with each -/// result's title, `URL:` line, and optional snippet, blank-line separated. The -/// title and description are truncated ([`MAX_TITLE_CHARS`] / -/// [`MAX_DESCRIPTION_CHARS`]) for token efficiency; URLs are kept intact so they -/// remain usable. +/// A header (count + the "you already have the results" guidance), then a +/// numbered list with each result's title (publication date appended when +/// known), `URL:` line, and optional snippet, blank-line separated. The title +/// and description are truncated ([`MAX_TITLE_CHARS`] / +/// [`MAX_DESCRIPTION_CHARS`]) for token efficiency; URLs are kept intact so +/// they remain usable. pub fn format_results(query: &str, results: &[SearchResult]) -> String { let mut lines: Vec = Vec::with_capacity(results.len() * 4 + 1); lines.push(format!( @@ -377,11 +464,17 @@ pub fn format_results(query: &str, results: &[SearchResult]) -> String { results.len() )); for (i, result) in results.iter().enumerate() { - lines.push(format!( - "{}. {}", - i + 1, - truncate_chars(&result.title, MAX_TITLE_CHARS) - )); + // Fall back to the URL when the source provided no title. + let title = if result.title.is_empty() { + result.url.as_str() + } else { + result.title.as_str() + }; + let mut title_line = format!("{}. {}", i + 1, truncate_chars(title, MAX_TITLE_CHARS)); + if let Some(date) = result.published_date.as_deref() { + title_line.push_str(&format!(" ({date})")); + } + lines.push(title_line); lines.push(format!(" URL: {}", result.url)); if !result.description.is_empty() { lines.push(format!( @@ -407,368 +500,14 @@ fn truncate_chars(text: &str, max: usize) -> String { format!("{}…", prefix.trim_end()) } -/// Unwrap a DuckDuckGo redirect URL to its real destination. -/// -/// Ported from the Python `_extract_real_url`: -/// * protocol-relative `//host/…` gets an `https:` scheme; -/// * a `duckduckgo.com/l/?uddg=…` redirect is unwrapped to its `uddg` target -/// (form-decoded, matching `parse_qs` + `unquote`); -/// * ad links (`duckduckgo.com/y.js`) and non-`http(s)` schemes are dropped -/// (returns `None`). -pub fn extract_real_url(ddg_url: &str) -> Option { - if ddg_url.is_empty() { - return None; - } - - let with_scheme = if let Some(rest) = ddg_url.strip_prefix("//") { - format!("https://{rest}") - } else { - ddg_url.to_string() - }; - - let mut url = with_scheme.clone(); - if with_scheme.contains("duckduckgo.com/l/") && with_scheme.contains("uddg=") { - if let Some(target) = query_param(&with_scheme, "uddg") { - url = target; - } - } - - // Ad links – skip. - if url.contains("duckduckgo.com/y.js") { - return None; - } - - // Only allow http/https to prevent unsafe URLs (javascript:, data:, …). - if !(url.starts_with("https://") || url.starts_with("http://")) { - return None; - } - - Some(url) -} - /// Collapse runs of whitespace into a single space and trim the ends. -/// -/// Ported from the Python `_normalize_whitespace` -/// (`re.sub(r"\s+", " ", text).strip()`). pub fn normalize_whitespace(text: &str) -> String { whitespace_regex() .replace_all(text.trim(), " ") .into_owned() } -/// Parse search results out of a DuckDuckGo Lite HTML response. -/// -/// Ported from the Python `_parse_lite_results`: for each `a.result-link`, take -/// its (entity-decoded) text as the title and unwrap its `href`; skip empty / -/// "more info" / duplicate / `duckduckgo.com` results; and attach the snippet -/// from the first following `td.result-snippet` that precedes the next result -/// link. -pub fn parse_lite_results(html: &str) -> Vec { - let anchors = collect_anchors(html); - let snippets = collect_snippets(html); - - let mut results: Vec = Vec::new(); - let mut seen: std::collections::HashSet = std::collections::HashSet::new(); - - for (idx, anchor) in anchors.iter().enumerate() { - if anchor.title.is_empty() || anchor.title.eq_ignore_ascii_case("more info") { - continue; - } - - let Some(url) = extract_real_url(&anchor.href) else { - continue; - }; - if seen.contains(&url) || is_duckduckgo_result_host(&url) { - continue; - } - seen.insert(url.clone()); - - // The snippet is the first `result-snippet` after this anchor and before - // the next one (matching the Python sibling-walk that stops at the next - // result link). - let next_pos = anchors.get(idx + 1).map_or(usize::MAX, |a| a.pos); - let description = snippets - .iter() - .find(|s| s.pos > anchor.pos && s.pos < next_pos) - .map(|s| s.text.clone()) - .unwrap_or_default(); - - results.push(SearchResult { - title: anchor.title.clone(), - url, - description, - }); - } - - results -} - -fn is_duckduckgo_result_host(url: &str) -> bool { - reqwest::Url::parse(url) - .ok() - .and_then(|url| url.host_str().map(str::to_ascii_lowercase)) - .is_some_and(|host| host == "duckduckgo.com" || host.ends_with(".duckduckgo.com")) -} - -/// A raw `a.result-link` extracted from the HTML, with its byte offset. -struct RawAnchor { - pos: usize, - href: String, - title: String, -} - -/// A raw `td.result-snippet` extracted from the HTML, with its byte offset. -struct RawSnippet { - pos: usize, - text: String, -} - -/// Extract every `a.result-link` anchor (offset, href, title) in document order. -fn collect_anchors(html: &str) -> Vec { - anchor_regex() - .captures_iter(html) - .filter_map(|caps| { - let whole = caps.get(0)?; - let attrs = caps.get(1).map_or("", |m| m.as_str()); - let inner = caps.get(2).map_or("", |m| m.as_str()); - if !has_class(attrs, "result-link") { - return None; - } - Some(RawAnchor { - pos: whole.start(), - href: attr_value(attrs, AttrName::Href).unwrap_or_default(), - // Strip tags, decode entities, then trim. DuckDuckGo Lite titles - // are plain text, so this matches the Python `get_text(strip=True)` - // title extraction; on any inline markup it yields the cleaner - // space-preserving text rather than BeautifulSoup's node-join. - title: text_from_html(inner, "").trim().to_string(), - }) - }) - .collect() -} - -/// Extract every `td.result-snippet` (offset, normalized text) in document order. -fn collect_snippets(html: &str) -> Vec { - td_regex() - .captures_iter(html) - .filter_map(|caps| { - let whole = caps.get(0)?; - let attrs = caps.get(1).map_or("", |m| m.as_str()); - let inner = caps.get(2).map_or("", |m| m.as_str()); - if !has_class(attrs, "result-snippet") { - return None; - } - Some(RawSnippet { - pos: whole.start(), - // `get_text(separator=" ")` then normalize whitespace. - text: normalize_whitespace(&text_from_html(inner, " ")), - }) - }) - .collect() -} - -/// Strip HTML tags (replacing each with `separator`) and decode entities. -fn text_from_html(html: &str, separator: &str) -> String { - let without_tags = tag_regex().replace_all(html, separator); - decode_entities(&without_tags) -} - -/// Whether a tag's attribute string declares `class` containing `class_name`. -fn has_class(attrs: &str, class_name: &str) -> bool { - attr_value(attrs, AttrName::Class) - .is_some_and(|value| value.split_whitespace().any(|c| c == class_name)) -} - -/// The attributes we extract from a tag. -#[derive(Clone, Copy)] -enum AttrName { - Href, - Class, -} - -/// Extract a quoted attribute value from a tag's attribute string. -fn attr_value(attrs: &str, name: AttrName) -> Option { - let re = match name { - AttrName::Href => href_regex(), - AttrName::Class => class_regex(), - }; - re.captures(attrs) - .and_then(|caps| caps.get(1)) - .map(|m| m.as_str().to_string()) -} - -/// Read a single query parameter's value, form-decoded (matching `parse_qs`: -/// `+` becomes a space and `%XX` is percent-decoded). -fn query_param(url: &str, key: &str) -> Option { - let (_, query) = url.split_once('?')?; - // Drop any fragment before splitting pairs. - let query = query.split('#').next().unwrap_or(query); - for pair in query.split('&') { - let (k, v) = pair.split_once('=').unwrap_or((pair, "")); - if k == key { - return Some(percent_decode_form(v)); - } - } - None -} - -/// Form-decode a query component: `+` -> space, `%XX` -> byte, then UTF-8. -fn percent_decode_form(value: &str) -> String { - let spaced = value.replace('+', " "); - let bytes = spaced.as_bytes(); - let mut out: Vec = Vec::with_capacity(bytes.len()); - let mut i = 0; - while i < bytes.len() { - if bytes[i] == b'%' && i + 2 < bytes.len() { - if let (Some(hi), Some(lo)) = (hex_val(bytes[i + 1]), hex_val(bytes[i + 2])) { - out.push(hi * 16 + lo); - i += 3; - continue; - } - } - out.push(bytes[i]); - i += 1; - } - String::from_utf8_lossy(&out).into_owned() -} - -/// Hex digit value of an ASCII byte, or `None`. -fn hex_val(byte: u8) -> Option { - match byte { - b'0'..=b'9' => Some(byte - b'0'), - b'a'..=b'f' => Some(byte - b'a' + 10), - b'A'..=b'F' => Some(byte - b'A' + 10), - _ => None, - } -} - -/// Decode the common HTML character references in one pass. -/// -/// Covers the named references that appear in DuckDuckGo snippets plus all -/// numeric references (`&#NN;` / `&#xHH;`); unknown named references are left -/// intact (BeautifulSoup decodes the full set — this is the practical subset). -fn decode_entities(text: &str) -> String { - entity_regex() - .replace_all(text, |caps: ®ex::Captures<'_>| { - let body = &caps[1]; - if let Some(hex) = body.strip_prefix("#x").or_else(|| body.strip_prefix("#X")) { - return decode_codepoint(u32::from_str_radix(hex, 16).ok()) - .unwrap_or_else(|| caps[0].to_string()); - } - if let Some(dec) = body.strip_prefix('#') { - return decode_codepoint(dec.parse::().ok()) - .unwrap_or_else(|| caps[0].to_string()); - } - match body { - "amp" => "&", - "lt" => "<", - "gt" => ">", - "quot" => "\"", - "apos" => "'", - "nbsp" => " ", - // Typographic punctuation. - "hellip" => "…", - "mdash" => "—", - "ndash" => "–", - "rsquo" => "\u{2019}", - "lsquo" => "\u{2018}", - "rdquo" => "\u{201D}", - "ldquo" => "\u{201C}", - "laquo" => "«", - "raquo" => "»", - "middot" => "·", - "bull" => "•", - // Common symbols. - "copy" => "©", - "reg" => "®", - "trade" => "™", - "times" => "×", - "divide" => "÷", - "deg" => "°", - "euro" => "€", - "pound" => "£", - "cent" => "¢", - "sect" => "§", - // Common Western-European accented letters. - "aacute" => "á", - "agrave" => "à", - "acirc" => "â", - "auml" => "ä", - "aring" => "å", - "ccedil" => "ç", - "eacute" => "é", - "egrave" => "è", - "ecirc" => "ê", - "euml" => "ë", - "iacute" => "í", - "iuml" => "ï", - "ntilde" => "ñ", - "oacute" => "ó", - "ocirc" => "ô", - "ouml" => "ö", - "uacute" => "ú", - "uuml" => "ü", - "szlig" => "ß", - // Unknown named reference: leave the original text intact - // (BeautifulSoup decodes the full HTML5 set; this is the - // practical subset DuckDuckGo emits, plus all numeric refs). - _ => return caps[0].to_string(), - } - .to_string() - }) - .into_owned() -} - -/// Map a numeric character-reference code point to its string, if valid. -fn decode_codepoint(code: Option) -> Option { - code.and_then(char::from_u32).map(|c| c.to_string()) -} - -// --- Cached regexes (compiled once; patterns are constant) ----------------- -// -// The tag regexes use `[^>]*` for the attribute span, which assumes attribute -// values contain no literal `>` — true for the fixed DuckDuckGo Lite markup -// (see the module doc). On non-conforming markup a `>` inside an attribute -// value would truncate the match (dropping that result), never panic. - -fn anchor_regex() -> &'static Regex { - static RE: OnceLock = OnceLock::new(); - RE.get_or_init(|| Regex::new(r"(?is)]*)>(.*?)").expect("valid anchor regex")) -} - -fn td_regex() -> &'static Regex { - static RE: OnceLock = OnceLock::new(); - RE.get_or_init(|| Regex::new(r"(?is)]*)>(.*?)").expect("valid td regex")) -} - -fn tag_regex() -> &'static Regex { - static RE: OnceLock = OnceLock::new(); - RE.get_or_init(|| Regex::new(r"(?s)<[^>]*>").expect("valid tag regex")) -} - -fn href_regex() -> &'static Regex { - static RE: OnceLock = OnceLock::new(); - RE.get_or_init(|| { - Regex::new(r#"(?i)(?:^|\s)href\s*=\s*["']([^"']*)["']"#).expect("valid href regex") - }) -} - -fn class_regex() -> &'static Regex { - static RE: OnceLock = OnceLock::new(); - RE.get_or_init(|| { - Regex::new(r#"(?i)(?:^|\s)class\s*=\s*["']([^"']*)["']"#).expect("valid class regex") - }) -} - fn whitespace_regex() -> &'static Regex { static RE: OnceLock = OnceLock::new(); RE.get_or_init(|| Regex::new(r"\s+").expect("valid whitespace regex")) } - -fn entity_regex() -> &'static Regex { - static RE: OnceLock = OnceLock::new(); - RE.get_or_init(|| { - Regex::new(r"&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*);") - .expect("valid entity regex") - }) -} diff --git a/crates/browser-use-agent/src/tools/handlers/search_tests.rs b/crates/browser-use-agent/src/tools/handlers/search_tests.rs index 421103b8..ebfcd54e 100644 --- a/crates/browser-use-agent/src/tools/handlers/search_tests.rs +++ b/crates/browser-use-agent/src/tools/handlers/search_tests.rs @@ -1,14 +1,14 @@ //! Tests for the async `search` tool ([`SearchTool`]). //! -//! No real network is touched: the pure parsing/formatting/URL helpers are -//! exercised against fixture HTML, and the `run` path is driven through a fake +//! No real network is touched: the pure parsing/formatting helpers are +//! exercised against fixture JSON, and the `run` path is driven through a fake //! [`SearchBackend`] (mirroring `update_plan_tests` / `tool_search_tests`). use std::sync::Arc; use super::search::{ - classify_response, extract_real_url, format_results, normalize_whitespace, parse_lite_results, - SearchBackend, SearchError, SearchRequest, SearchResult, SearchTool, SEARCH_TOOL_NAME, + classify_response, format_results, normalize_whitespace, parse_results, SearchBackend, + SearchError, SearchRequest, SearchResult, SearchTool, SEARCH_PARALLEL_SAFE, SEARCH_TOOL_NAME, }; use crate::tools::approval::AskForApproval; use crate::tools::orchestrator::{ToolOrchestrator, TurnEnv}; @@ -59,60 +59,54 @@ fn turn_env() -> TurnEnv { } } -/// A fake backend returning a canned HTML body (no network). -struct HtmlBackend(String); +/// A fake backend returning a canned response body (no network). +struct StubBackend(String); #[async_trait::async_trait] -impl SearchBackend for HtmlBackend { +impl SearchBackend for StubBackend { async fn fetch(&self, _query: &str) -> Result { Ok(self.0.clone()) } } -/// A fake backend returning a challenge error (no network). -struct ChallengeBackend; +/// A fake backend failing with a 401 (no network). +struct UnauthorizedBackend; #[async_trait::async_trait] -impl SearchBackend for ChallengeBackend { +impl SearchBackend for UnauthorizedBackend { async fn fetch(&self, _query: &str) -> Result { - Err(SearchError::Challenge) + Err(SearchError::Unauthorized) } } -/// A small, realistic DuckDuckGo Lite results fixture exercising: a redirect -/// URL, an entity in the snippet, a "More info" link (skipped), a duplicate -/// (deduped), a `duckduckgo.com` target (skipped), a direct (non-redirect) link, -/// and a result without a snippet. -const FIXTURE: &str = r#" - - - - - - - - - - - - - - - - - - - - - -
1. The Rust Programming Language
 A language empowering everyone to build reliable & efficient software — fast.
www.rust-lang.org
2. Rust (duplicate target)
duplicate should be dropped
DuckDuckGo About
a duckduckgo.com target, should be dropped
More info
Direct Link No Redirect
direct link snippet
No Snippet Result
- -"#; +/// A realistic search-service response fixture exercising: a full result +/// (title + date + multi-line markdown content), a result without a +/// `published_date`, a result without a `title` (URL fallback), and a result +/// without a `url` (dropped). +const FIXTURE: &str = r##"{ + "results": [ + { + "title": "Genpact and Parallel Web Systems Partner to Drive Tangible Efficiency from AI Systems", + "url": "https://www.prnewswire.com/news-releases/genpact-parallel-302736563.html", + "published_date": "2026-04-08", + "content": "# Genpact and Parallel\n## Share this article\nIntegrating Parallel's API helps\nGenpact automate research workflows." + }, + { + "title": "Parallel raises $100M", + "url": "https://www.linkedin.com/posts/example-activity", + "content": "Nov 12, 2025 · The startup secured a $100 million Series A round." + }, + { + "url": "https://untitled.example.com/page", + "published_date": "2026-05-19", + "content": "A result whose source provided no title." + }, + { + "title": "No URL – must be dropped", + "content": "this result has no url and is filtered out" + } + ] +}"##; // ---- pure helpers: normalize_whitespace ----------------------------------- @@ -123,143 +117,56 @@ fn normalize_whitespace_collapses_and_trims() { assert_eq!(normalize_whitespace(" "), ""); } -// ---- pure helpers: extract_real_url --------------------------------------- - -#[test] -fn extract_real_url_unwraps_ddg_redirect() { - let raw = "//duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com%2Fpage%3Fx%3D1&rut=abc"; - assert_eq!( - extract_real_url(raw), - Some("https://example.com/page?x=1".to_string()) - ); -} - -#[test] -fn extract_real_url_decodes_plus_as_space() { - // `parse_qs` semantics: `+` in a query value decodes to a space. - let raw = "//duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com%2Fa+b"; - assert_eq!( - extract_real_url(raw), - Some("https://example.com/a b".to_string()) - ); -} +// ---- pure helpers: parse_results ------------------------------------------- #[test] -fn extract_real_url_adds_scheme_to_protocol_relative() { - assert_eq!( - extract_real_url("//example.com/x"), - Some("https://example.com/x".to_string()) - ); -} +fn parse_results_maps_wire_results() { + let results = parse_results(FIXTURE).unwrap(); -#[test] -fn extract_real_url_passes_through_plain_http() { - assert_eq!( - extract_real_url("https://example.com/"), - Some("https://example.com/".to_string()) - ); - assert_eq!( - extract_real_url("http://example.com/"), - Some("http://example.com/".to_string()) - ); -} + // The url-less result is dropped; the other three are kept in order. + assert_eq!(results.len(), 3); -#[test] -fn extract_real_url_drops_ads_and_unsafe_and_empty() { - // Ad links. + // Full result: title, url, date, and content normalized to one line. assert_eq!( - extract_real_url("//duckduckgo.com/y.js?ad_provider=x"), - None + results[0].title, + "Genpact and Parallel Web Systems Partner to Drive Tangible Efficiency from AI Systems" ); - // Non-http(s) schemes. - assert_eq!(extract_real_url("javascript:alert(1)"), None); - assert_eq!(extract_real_url("data:text/html,hi"), None); - // Empty. - assert_eq!(extract_real_url(""), None); -} - -// ---- pure helpers: parse_lite_results ------------------------------------- - -#[test] -fn parse_lite_results_extracts_decodes_dedupes_and_filters() { - let results = parse_lite_results(FIXTURE); - - // Kept, in order: rust-lang (redirect), direct link, no-snippet result. - // Dropped: duplicate target, duckduckgo.com target, "More info" title. - let titles: Vec<&str> = results.iter().map(|r| r.title.as_str()).collect(); assert_eq!( - titles, - vec![ - "The Rust Programming Language", - "Direct Link No Redirect", - "No Snippet Result", - ] + results[0].url, + "https://www.prnewswire.com/news-releases/genpact-parallel-302736563.html" ); - - // First result: redirect unwrapped + snippet entity-decoded + normalized. - assert_eq!(results[0].url, "https://www.rust-lang.org/"); + assert_eq!(results[0].published_date.as_deref(), Some("2026-04-08")); assert_eq!( results[0].description, - "A language empowering everyone to build reliable & efficient software — fast." + "# Genpact and Parallel ## Share this article Integrating Parallel's API helps Genpact automate research workflows." ); - // Direct (non-redirect) link is passed through with its own snippet. - assert_eq!(results[1].url, "https://direct.example.com/page"); - assert_eq!(results[1].description, "direct link snippet"); - - // A result with no following snippet gets an empty description. - assert_eq!(results[2].url, "https://no-snippet.example.com/"); - assert_eq!(results[2].description, ""); -} + // Date is optional. + assert_eq!(results[1].title, "Parallel raises $100M"); + assert_eq!(results[1].published_date, None); -#[test] -fn parse_lite_results_handles_empty_and_resultless_html() { - assert!(parse_lite_results("").is_empty()); - assert!(parse_lite_results("no results here").is_empty()); + // Title is optional (empty when the source provided none). + assert_eq!(results[2].title, ""); + assert_eq!(results[2].url, "https://untitled.example.com/page"); + assert_eq!(results[2].published_date.as_deref(), Some("2026-05-19")); } -/// Inline markup inside a title/snippet, real whitespace runs, and a broadened -/// named entity: exercises `text_from_html` tag-stripping (both separators), -/// `normalize_whitespace` via the parse path, and the entity table. #[test] -fn parse_lite_results_strips_inline_markup_and_collapses_whitespace() { - let html = "\ - \ - \ -
The Rust Book
Tokio is an\n async runtime for café & more.
"; - let results = parse_lite_results(html); - assert_eq!(results.len(), 1); - // Title: tags stripped (separator ""), single-spaced. - assert_eq!(results[0].title, "The Rust Book"); - assert_eq!(results[0].url, "https://book.example.com/"); - // Snippet: tags -> space, é/& decoded, whitespace runs collapsed. - assert_eq!( - results[0].description, - "Tokio is an async runtime for café & more." - ); +fn parse_results_handles_empty_and_missing_results() { + assert!(parse_results(r#"{"results": []}"#).unwrap().is_empty()); + // `results` defaults when absent. + assert!(parse_results("{}").unwrap().is_empty()); } #[test] -fn parse_lite_results_filters_duckduckgo_hosts_without_dropping_mentions_elsewhere() { - let html = r#" - - - - - -
Valid Mention
kept
DuckDuckGo About
dropped
- "#; - - let results = parse_lite_results(html); - - assert_eq!(results.len(), 1); - assert_eq!(results[0].title, "Valid Mention"); - assert_eq!( - results[0].url, - "https://example.com/articles/duckduckgo.com-review" - ); +fn parse_results_rejects_malformed_bodies() { + for body in ["not json", "", r#"{"results": "nope"}"#, "[1,2,3]"] { + let err = parse_results(body).unwrap_err(); + assert!( + matches!(err, SearchError::Decode(_)), + "expected Decode for {body:?}, got {err:?}" + ); + } } // ---- pure helpers: format_results ----------------------------------------- @@ -270,11 +177,13 @@ fn format_results_renders_header_and_numbered_entries() { SearchResult { title: "First".to_string(), url: "https://a.example/".to_string(), + published_date: Some("2026-04-08".to_string()), description: "first snippet".to_string(), }, SearchResult { title: "Second".to_string(), url: "https://b.example/".to_string(), + published_date: None, description: String::new(), }, ]; @@ -288,18 +197,37 @@ fn format_results_renders_header_and_numbered_entries() { out.contains("do NOT navigate to a search engine"), "got: {out}" ); - assert!(out.contains("1. First"), "got: {out}"); + // The publication date is appended to the title line when known. + assert!(out.contains("1. First (2026-04-08)"), "got: {out}"); assert!(out.contains(" URL: https://a.example/"), "got: {out}"); assert!(out.contains(" first snippet"), "got: {out}"); - assert!(out.contains("2. Second"), "got: {out}"); + // No date -> bare title line. + assert!(out.contains("2. Second\n"), "got: {out}"); assert!(out.contains(" URL: https://b.example/"), "got: {out}"); } +#[test] +fn format_results_falls_back_to_url_for_untitled_results() { + let results = vec![SearchResult { + title: String::new(), + url: "https://untitled.example.com/page".to_string(), + published_date: None, + description: "snippet".to_string(), + }]; + let out = format_results("q", &results); + // The fallback title is the URL, subject to the same 30-char cap. + assert!( + out.contains("1. https://untitled.example.com/…"), + "untitled result should show its URL as the title: {out}" + ); +} + #[test] fn format_results_truncates_long_title_and_description() { let results = vec![SearchResult { title: "ThisIsAVeryLongResultTitleThatExceedsThirtyCharacters".to_string(), url: "https://example.com/keep/this/whole/url".to_string(), + published_date: None, description: "d".repeat(250), }]; let out = format_results("q", &results); @@ -333,43 +261,46 @@ fn format_results_truncates_long_title_and_description() { // ---- pure helpers: classify_response -------------------------------------- #[test] -fn classify_response_flags_challenge_status_and_anomaly_body() { +fn classify_response_names_auth_and_billing_errors() { assert!(matches!( - classify_response(202, "anything"), - Err(SearchError::Challenge) + classify_response(401, "unauthorized"), + Err(SearchError::Unauthorized) )); assert!(matches!( - classify_response(200, "...Anomaly detected..."), - Err(SearchError::Challenge) + classify_response(402, "payment required"), + Err(SearchError::InsufficientBalance) )); } #[test] -fn classify_response_flags_http_errors_with_snippet() { +fn classify_response_flags_other_errors_with_snippet() { + // 400 invalid query, 422 upstream rejected, 502/503 upstream down — all + // carry the status + body snippet. + for status in [400u16, 422, 502, 503] { + match classify_response(status, "boom") { + Err(SearchError::Http { + status: got, + snippet, + }) => { + assert_eq!(got, status); + assert_eq!(snippet, "boom"); + } + other => panic!("expected Http for {status}, got {other:?}"), + } + } + // The snippet is truncated to 200 chars. let body = "x".repeat(500); - match classify_response(503, &body) { - Err(SearchError::Http { status, snippet }) => { - assert_eq!(status, 503); - assert_eq!( - snippet.chars().count(), - 200, - "snippet truncated to 200 chars" - ); + match classify_response(500, &body) { + Err(SearchError::Http { snippet, .. }) => { + assert_eq!(snippet.chars().count(), 200, "snippet truncated"); } other => panic!("expected Http error, got {other:?}"), } } #[test] -fn classify_response_flags_4xx_and_pins_the_400_boundary() { - // 4xx is the case the port must handle (not just 5xx). - match classify_response(404, "not found") { - Err(SearchError::Http { status, snippet }) => { - assert_eq!(status, 404); - assert_eq!(snippet, "not found"); - } - other => panic!("expected Http error, got {other:?}"), - } +fn classify_response_accepts_ok_and_pins_the_400_boundary() { + assert!(classify_response(200, r#"{"results":[]}"#).is_ok()); // The 399-ok / 400-error boundary pins against an off-by-one in `>= 400`. assert!(classify_response(399, "ok").is_ok()); assert!(matches!( @@ -378,20 +309,15 @@ fn classify_response_flags_4xx_and_pins_the_400_boundary() { )); } -#[test] -fn classify_response_accepts_ok() { - assert!(classify_response(200, "fine").is_ok()); -} - // ---- run() through the fake backend --------------------------------------- #[tokio::test] -async fn run_formats_results_from_backend_html() { - let tool = SearchTool::with_backend(Arc::new(HtmlBackend(FIXTURE.to_string()))); +async fn run_formats_results_from_backend_json() { + let tool = SearchTool::with_backend(Arc::new(StubBackend(FIXTURE.to_string()))); let launch = none_launch(); let attempt = none_attempt(&launch); let out = tool - .run(&SearchRequest::new("rust"), &attempt, &ctx()) + .run(&SearchRequest::new("parallel"), &attempt, &ctx()) .await .unwrap(); @@ -399,19 +325,28 @@ async fn run_formats_results_from_backend_html() { assert!(out.stderr.is_empty()); assert!( out.stdout - .contains("Search results for \"rust\" (3 results):"), + .contains("Search results for \"parallel\" (3 results):"), "got: {}", out.stdout ); - // This title (29 chars) is within the 30-char cap, so it appears in full. + // Title truncated to 30 chars (incl. ellipsis) with the date appended. assert!( - out.stdout.contains("The Rust Programming Language"), + out.stdout + .contains("1. Genpact and Parallel Web Syst… (2026-04-08)"), "got: {}", out.stdout ); // URLs are kept intact. assert!( - out.stdout.contains("https://www.rust-lang.org/"), + out.stdout + .contains("https://www.prnewswire.com/news-releases/genpact-parallel-302736563.html"), + "got: {}", + out.stdout + ); + // Multi-line markdown content arrives normalized to one line. + assert!( + out.stdout + .contains("# Genpact and Parallel ## Share this article"), "got: {}", out.stdout ); @@ -419,9 +354,7 @@ async fn run_formats_results_from_backend_html() { #[tokio::test] async fn run_reports_no_results() { - let tool = SearchTool::with_backend(Arc::new(HtmlBackend( - "nothing".to_string(), - ))); + let tool = SearchTool::with_backend(Arc::new(StubBackend(r#"{"results":[]}"#.to_string()))); let launch = none_launch(); let attempt = none_attempt(&launch); let out = tool @@ -435,7 +368,7 @@ async fn run_reports_no_results() { #[tokio::test] async fn run_rejects_empty_query() { - let tool = SearchTool::with_backend(Arc::new(HtmlBackend(String::new()))); + let tool = SearchTool::with_backend(Arc::new(StubBackend(String::new()))); let launch = none_launch(); let attempt = none_attempt(&launch); let err = tool @@ -450,11 +383,11 @@ async fn run_rejects_empty_query() { #[tokio::test] async fn run_surfaces_backend_failure_as_soft_error() { - let tool = SearchTool::with_backend(Arc::new(ChallengeBackend)); + let tool = SearchTool::with_backend(Arc::new(UnauthorizedBackend)); let launch = none_launch(); let attempt = none_attempt(&launch); let out = tool - .run(&SearchRequest::new("rust"), &attempt, &ctx()) + .run(&SearchRequest::new("parallel"), &attempt, &ctx()) .await .unwrap(); @@ -463,7 +396,25 @@ async fn run_surfaces_backend_failure_as_soft_error() { assert_eq!(out.exit_code, 1); assert!(out.stdout.is_empty()); assert!( - out.stderr.contains("Search failed:") && out.stderr.contains("challenge"), + out.stderr.contains("Search failed:") && out.stderr.contains("API key"), + "got: {}", + out.stderr + ); +} + +#[tokio::test] +async fn run_surfaces_malformed_body_as_soft_error() { + let tool = SearchTool::with_backend(Arc::new(StubBackend("not json".to_string()))); + let launch = none_launch(); + let attempt = none_attempt(&launch); + let out = tool + .run(&SearchRequest::new("parallel"), &attempt, &ctx()) + .await + .unwrap(); + + assert_eq!(out.exit_code, 1); + assert!( + out.stderr.contains("Search failed:") && out.stderr.contains("unexpected response body"), "got: {}", out.stderr ); @@ -473,8 +424,8 @@ async fn run_surfaces_backend_failure_as_soft_error() { #[test] fn approval_accessors() { - let tool = SearchTool::with_backend(Arc::new(HtmlBackend(String::new()))); - let req = SearchRequest::new("rust"); + let tool = SearchTool::with_backend(Arc::new(StubBackend(String::new()))); + let req = SearchRequest::new("parallel"); assert_eq!(tool.approval_keys(&req).len(), 1, "one key per call"); assert_eq!( tool.sandbox_permissions(&req), @@ -484,15 +435,18 @@ fn approval_accessors() { } #[test] -fn search_is_parallel_safe() { - let tool = SearchTool::with_backend(Arc::new(HtmlBackend(String::new()))); - assert!(tool.parallel_safe(&SearchRequest::new("rust"))); +fn search_is_serial_by_default() { + // A conservative scheduling default for a billed API call. + let tool = SearchTool::with_backend(Arc::new(StubBackend(String::new()))); + let req = SearchRequest::new("parallel"); + assert_eq!(tool.parallel_safe(&req), SEARCH_PARALLEL_SAFE); + assert!(!tool.parallel_safe(&req), "search must be serial"); } #[test] fn tool_name_is_search() { assert_eq!(SEARCH_TOOL_NAME, "search"); - let tool = SearchTool::with_backend(Arc::new(HtmlBackend(String::new()))); + let tool = SearchTool::with_backend(Arc::new(StubBackend(String::new()))); assert_eq!(tool.name(), "search"); } @@ -510,12 +464,12 @@ fn request_round_trips_wire_shape() { #[tokio::test] async fn orchestrated_search_completes_under_none() { let orch = ToolOrchestrator::new(NoneSandboxProvider, AutoApprover); - let tool = SearchTool::with_backend(Arc::new(HtmlBackend(FIXTURE.to_string()))); + let tool = SearchTool::with_backend(Arc::new(StubBackend(FIXTURE.to_string()))); let result = orch .run( &tool, - &SearchRequest::new("rust"), + &SearchRequest::new("parallel"), &ctx(), &turn_env(), AskForApproval::Never, @@ -525,35 +479,41 @@ async fn orchestrated_search_completes_under_none() { assert_eq!(result.sandbox_used, SandboxType::None); assert_eq!(result.output.exit_code, 0); - // Within the 30-char title cap, so it appears in full. assert!( - result - .output - .stdout - .contains("The Rust Programming Language"), + result.output.stdout.contains("Genpact and Parallel"), "got: {}", result.output.stdout ); } -// ---- live smoke (ignored: hits the real DuckDuckGo endpoint) -------------- +// ---- live smoke (ignored: hits the real browser-use search API) ----------- -/// End-to-end check against the REAL DuckDuckGo Lite endpoint via the default -/// [`HttpSearchBackend`]. Ignored by default (network + non-deterministic, and -/// DuckDuckGo may rate-limit/serve a challenge). Run it manually with: +/// End-to-end check against the REAL `search.browser-use.com` service via the +/// default [`HttpSearchBackend`]. Ignored by default (network, billing, and a +/// `BROWSER_USE_API_KEY` requirement). Run it manually with: /// /// ```text /// cargo test -p browser-use-agent --lib -- --ignored --nocapture search_live_smoke /// ``` #[tokio::test] -#[ignore = "hits the live DuckDuckGo Lite endpoint"] +#[ignore = "hits the live browser-use search API (requires BROWSER_USE_API_KEY, \ + or BROWSER_USE_SEARCH_URL pointing at an open dev instance)"] async fn search_live_smoke() { + let has_key = std::env::var("BROWSER_USE_API_KEY").is_ok_and(|key| !key.trim().is_empty()); + let has_url = std::env::var("BROWSER_USE_SEARCH_URL").is_ok_and(|url| !url.trim().is_empty()); + if !has_key && !has_url { + eprintln!( + "skipping live smoke: neither BROWSER_USE_API_KEY nor BROWSER_USE_SEARCH_URL is set" + ); + return; + } + let tool = SearchTool::new(); let launch = none_launch(); let attempt = none_attempt(&launch); let out = tool .run( - &SearchRequest::new("rust programming language"), + &SearchRequest::new("Parallel Web Systems latest announcements"), &attempt, &ctx(), ) @@ -564,8 +524,8 @@ async fn search_live_smoke() { "exit_code={}\n--- stdout ---\n{}\n--- stderr ---\n{}", out.exit_code, out.stdout, out.stderr ); - // A challenge/CAPTCHA is a legitimate live outcome (exit 1 + message); only - // assert hard on the success shape so the test documents both paths. + // An auth/billing rejection is a legitimate live outcome (exit 1 + message); + // only assert hard on the success shape so the test documents both paths. if out.exit_code == 0 { assert!( out.stdout.contains("Search results for") || out.stdout.contains("No results found"), diff --git a/crates/browser-use-agent/src/tools/registry.rs b/crates/browser-use-agent/src/tools/registry.rs index 869a96ff..39808dd8 100644 --- a/crates/browser-use-agent/src/tools/registry.rs +++ b/crates/browser-use-agent/src/tools/registry.rs @@ -1162,16 +1162,14 @@ to the single frame that proves the task succeeded." } } - /// `search`: a LOCALLY-executed DuckDuckGo (Lite) web search. Unlike the - /// hosted [`web_search`](definitions::web_search), the client performs the - /// HTTP request itself and returns the parsed results as text. Ported from - /// the Python `search` action's description. + /// `search`: a web search via the browser-use search API + /// (`search.browser-use.com`). Unlike the hosted [`web_search`], the client + /// performs the API call itself and returns the parsed results as text. pub fn search() -> ToolDefinition { ToolDefinition { name: "search".to_string(), - description: "Search the web using DuckDuckGo and return results directly as text – \ - no browser navigation occurs. The returned results are final and complete. \ - NEVER open a search engine website after calling this action." + description: "Search the web for a free-text query and return results as text. \ + No browser needed; prefer this over opening a browser to a search engine." .to_string(), input_schema: json!({ "type": "object", @@ -1959,9 +1957,9 @@ Agent-role guidance below only helps choose which agent to use after spawning is /// `WireArgs` types. The browser/python/mcp handlers need an injected backend /// (they would otherwise reach the OS), so those are supplied by the caller. /// -/// `parallel_safe` per tool: `exec_command` / `tool_search` / `web_search` / -/// `search` = `true`; `shell` / `apply_patch` / `view_image` / `browser` / -/// `python` / `update_plan` / `done` = `false` (serial). `mcp` is registered +/// `parallel_safe` per tool: `exec_command` / `tool_search` / `web_search` = +/// `true`; `shell` / `apply_patch` / `view_image` / `browser` / `python` / +/// `search` / `update_plan` / `done` = `false` (serial). `mcp` is registered /// `false` here /// (a serial default); its per-request read-only hint still drives the handler's /// own [`ToolRuntime::parallel_safe`](crate::tools::ToolRuntime::parallel_safe). @@ -1988,7 +1986,7 @@ where use crate::tools::handlers::done::DoneRequest; use crate::tools::handlers::mcp::McpToolCallRequest; use crate::tools::handlers::python::PythonRequest; - use crate::tools::handlers::search::SearchRequest; + use crate::tools::handlers::search::{SearchRequest, SEARCH_PARALLEL_SAFE}; use crate::tools::handlers::shell::{ ExecCommandRequest, ExecCommandTool, ShellRequest, WriteStdinRequest, WriteStdinTool, }; @@ -2040,9 +2038,14 @@ where tool_search, ); reg.register::<_, WebSearchRequest>("web_search", definitions::web_search(), true, web_search); - // `search`: locally-executed DuckDuckGo search. Read-only HTTP GET + - // pure parse, so parallel-safe like `web_search` / `tool_search`. - reg.register::<_, SearchRequest>("search", definitions::search(), true, search); + // `search`: web search via the browser-use search API. Serial: a + // conservative scheduling default for a billed API call. + reg.register::<_, SearchRequest>( + "search", + definitions::search(), + SEARCH_PARALLEL_SAFE, + search, + ); // `done`: the completion tool. Serial (terminal; must not be reordered). reg.register::<_, DoneRequest>("done", definitions::done(), false, done); diff --git a/crates/browser-use-agent/src/tools/registry_tests.rs b/crates/browser-use-agent/src/tools/registry_tests.rs index 3b92411d..6fe52519 100644 --- a/crates/browser-use-agent/src/tools/registry_tests.rs +++ b/crates/browser-use-agent/src/tools/registry_tests.rs @@ -28,7 +28,9 @@ use crate::tools::handlers::mcp::{ McpCallResult, McpClient, McpTool, McpToolCallRequest, McpWireArgs, }; use crate::tools::handlers::python::{PythonBackend, PythonRequest, PythonTool}; -use crate::tools::handlers::search::{SearchBackend, SearchError, SearchTool}; +use crate::tools::handlers::search::{ + SearchBackend, SearchError, SearchTool, SEARCH_PARALLEL_SAFE, +}; use crate::tools::handlers::shell::{ShellRequest, ShellTool}; use crate::tools::handlers::tool_search::{ToolSearchEntry, ToolSearchRequest, ToolSearchTool}; use crate::tools::handlers::update_plan::{UpdatePlanRequest, UpdatePlanTool}; @@ -486,8 +488,8 @@ impl McpClient for FakeMcpClient { } } -/// A fake search backend: returns a canned DuckDuckGo Lite HTML fragment with a -/// single result echoing the query, so no network is touched (mirrors +/// A fake search backend: returns a canned search-API JSON body with a single +/// result echoing the query, so no network is touched (mirrors /// `search_tests.rs`). struct FakeSearchBackend; @@ -495,10 +497,7 @@ struct FakeSearchBackend; impl SearchBackend for FakeSearchBackend { async fn fetch(&self, query: &str) -> Result { Ok(format!( - "\ - \ - \ -
Result for {query}
snippet for {query}
" + r#"{{"results":[{{"title":"Result for {query}","url":"https://example.com/","content":"snippet for {query}"}}]}}"# )) } } @@ -576,13 +575,36 @@ fn default_registry_registers_all_tools() { } } +#[test] +fn search_definition_is_concise_and_guides_away_from_browser() { + let desc = definitions::search().description; + assert!( + desc.contains("Search the web"), + "describes a web search: {desc}" + ); + assert!( + desc.contains("No browser needed"), + "search description should make clear no browser is needed: {desc}" + ); + assert!( + desc.contains("prefer this over opening a browser"), + "search description should prefer this tool over browser search-engine navigation: {desc}" + ); + // Keep it concise (Codex-style one-liner), unlike a multi-sentence blurb. + assert!( + desc.len() < 160, + "search description should stay concise, got {} chars: {desc}", + desc.len() + ); +} + #[test] fn parallel_safe_flags_match_registration() { let reg = full_registry(); // Pure / read-only tools are parallel-safe. assert_eq!(reg.parallel_safe("tool_search"), Some(true)); assert_eq!(reg.parallel_safe("web_search"), Some(true)); - assert_eq!(reg.parallel_safe("search"), Some(true)); + assert_eq!(reg.parallel_safe("search"), Some(SEARCH_PARALLEL_SAFE)); // Everything else is serial. for name in [ "shell", @@ -591,6 +613,7 @@ fn parallel_safe_flags_match_registration() { "browser", "python", "mcp", + "search", "update_plan", "done", ] {