diff --git a/crates/browser-use-agent/src/config_overrides.rs b/crates/browser-use-agent/src/config_overrides.rs index cfef0b5f..ccfdd2ed 100644 --- a/crates/browser-use-agent/src/config_overrides.rs +++ b/crates/browser-use-agent/src/config_overrides.rs @@ -733,41 +733,57 @@ pub fn apply_child_request_runtime_config( config: &mut ProviderRunConfig, request: &ChildAgentRunRequest, ) -> Result<()> { - let overrides = &request.config_overrides; + apply_runtime_config_overrides(&mut config.options, &request.config_overrides) +} + +/// Apply config keys that mutate in-memory runtime options. +/// +/// The raw override list is still retained for downstream consumers that read +/// less common config keys directly, but options that are consulted before those +/// consumers run must be materialized here. +pub fn apply_runtime_config_overrides( + options: &mut AgentRunOptions, + overrides: &ConfigOverrides, +) -> Result<()> { + if let Some(value) = config_override_u64(overrides, "max_turns") { + options.max_turns = usize::try_from(value) + .context("max_turns does not fit in usize")? 
+ .max(1); + } if let Some(value) = config_override_str(overrides, "browser_mode") { - config.options.browser_mode = Some(value); + options.browser_mode = Some(value); } if let Some(value) = config_override_str(overrides, "base_instructions") { - config.options.base_instructions = Some(value); + options.base_instructions = Some(value); } if let Some(value) = config_override_str(overrides, "developer_instructions") { - config.options.developer_instructions = Some(value); + options.developer_instructions = Some(value); } if let Some(value) = config_override_str(overrides, "compact_prompt") { - config.options.compact_prompt = Some(value); + options.compact_prompt = Some(value); } if let Some(value) = config_override_u64(overrides, "python_tool_timeout_seconds") { - config.options.python_tool_timeout_seconds = value; + options.python_tool_timeout_seconds = value; } if let Some(value) = config_override_bool(overrides, "model_compaction_enabled") { - config.options.model_compaction_enabled = value; + options.model_compaction_enabled = value; } if let Some(value) = config_override_i64(overrides, "model_auto_compact_token_limit") { - config.options.model_auto_compact_token_limit = Some(value); + options.model_auto_compact_token_limit = Some(value); } if let Some(value) = config_override_str(overrides, "model_auto_compact_token_limit_scope") { - config.options.model_auto_compact_token_limit_scope = + options.model_auto_compact_token_limit_scope = parse_auto_compact_token_limit_scope(&value)?; } if let Some(value) = config_override_str(overrides, "approval_policy") .or_else(|| config_override_str(overrides, "ask_for_approval")) { - config.options.approval_policy = parse_approval_policy(&value)?; + options.approval_policy = parse_approval_policy(&value)?; } if let Some(value) = config_override_bool(overrides, "use_guardian") .or_else(|| config_override_bool(overrides, "guardian")) { - config.options.use_guardian = value; + options.use_guardian = value; } Ok(()) } @@ -1816,6 
+1832,25 @@ command = "profile-server" assert!(options.agent_roles.is_empty()); } + #[test] + fn runtime_config_overrides_materialize_max_turns_and_browser_mode() { + let overrides = parse_config_overrides(&ov(&[ + "max_turns=100", + "browser_mode=\"remote-cdp\"", + "python_tool_timeout_seconds=45", + "model_compaction_enabled=false", + ])) + .unwrap(); + let mut options = AgentRunOptions::default(); + + apply_runtime_config_overrides(&mut options, &overrides).unwrap(); + + assert_eq!(options.max_turns, 100); + assert_eq!(options.browser_mode.as_deref(), Some("remote-cdp")); + assert_eq!(options.python_tool_timeout_seconds, 45); + assert!(!options.model_compaction_enabled); + } + #[test] fn provider_run_config_new_uses_explicit_source_and_default_options() { let config = ProviderRunConfig::new(ProviderBackend::Anthropic, "claude-x"); diff --git a/crates/browser-use-agent/src/context/tests_accounting.rs b/crates/browser-use-agent/src/context/tests_accounting.rs index 7daea781..d0c7e04d 100644 --- a/crates/browser-use-agent/src/context/tests_accounting.rs +++ b/crates/browser-use-agent/src/context/tests_accounting.rs @@ -79,6 +79,7 @@ fn from_llm_usage_uses_server_total_when_present() { let u = Usage { input_tokens: 100, cached_input_tokens: 40, + cache_creation_input_tokens: 0, output_tokens: 30, reasoning_output_tokens: 10, total_tokens: 123, @@ -96,6 +97,7 @@ fn from_llm_usage_total_fallback_excludes_cached() { let u = Usage { input_tokens: 100, cached_input_tokens: 40, + cache_creation_input_tokens: 0, output_tokens: 30, reasoning_output_tokens: 10, total_tokens: 0, diff --git a/crates/browser-use-agent/src/entrypoint/mod.rs b/crates/browser-use-agent/src/entrypoint/mod.rs index 85d2a316..3f984cd6 100644 --- a/crates/browser-use-agent/src/entrypoint/mod.rs +++ b/crates/browser-use-agent/src/entrypoint/mod.rs @@ -270,6 +270,10 @@ struct LiveTurnState { /// `Some`, it REPLACES the durable-log prompt; later recorded turns are /// appended after it. 
`None` until the first compaction. compacted: Mutex>>, + /// Unit/offline seams rely on the in-memory recorder because they do not emit + /// durable model/tool events. Production emits those events synchronously, so + /// replaying both durable history and this recorder tail duplicates turns. + include_recorded_tail_in_prompt: bool, } impl LiveTurnState { @@ -298,9 +302,21 @@ impl LiveTurnState { previous_model_compaction: None, compaction_sampler: None, compacted: Mutex::new(None), + include_recorded_tail_in_prompt: true, } } + /// Use only the durable event log when rebuilding prompts. + /// + /// The live facade persists model/tool events as they happen, before the next + /// sampling iteration. The in-memory fusion recorder is therefore redundant + /// for production prompt replay and would duplicate the same assistant/tool + /// turns that the event log already reconstructs. + fn with_durable_prompt_replay(mut self) -> Self { + self.include_recorded_tail_in_prompt = false; + self + } + /// Enable REAL token accounting + model-based compaction against a context /// window, driven by `sampler` for the no-tools summary pass. 
fn with_compaction( @@ -359,10 +375,16 @@ impl LiveTurnState { Some(compacted) => compacted.clone(), None => self.durable_history_blocking(), }; - msgs.extend(self.recorded.lock().unwrap().iter().cloned()); + self.append_recorded_tail_if_enabled(&mut msgs); msgs } + fn append_recorded_tail_if_enabled(&self, msgs: &mut Vec) { + if self.include_recorded_tail_in_prompt { + msgs.extend(self.recorded.lock().unwrap().iter().cloned()); + } + } + fn runtime_session_id(&self) -> Option { RuntimeSessionId::from_string(self.session_id.as_str().to_string()).ok() } @@ -396,13 +418,15 @@ impl LiveTurnState { events: &[browser_use_protocol::EventRecord], ) -> Vec { let mut items = provider_messages_from_events(events); - items.extend( - self.recorded - .lock() - .unwrap() - .iter() - .map(message_to_provider_item), - ); + if self.include_recorded_tail_in_prompt { + items.extend( + self.recorded + .lock() + .unwrap() + .iter() + .map(message_to_provider_item), + ); + } items } @@ -1289,6 +1313,29 @@ fn runtime_mailbox_items_as_pending_input( .collect() } +const DISABLE_FALLBACK_CAPTURE_GIF_ENV: &str = "BU_DISABLE_FALLBACK_CAPTURE_GIF"; +const ENABLE_FALLBACK_CAPTURE_GIF_ENV: &str = "BU_ENABLE_FALLBACK_CAPTURE_GIF"; + +fn env_bool(name: &str) -> Option { + std::env::var(name) + .ok() + .and_then(|value| match value.trim().to_ascii_lowercase().as_str() { + "1" | "true" | "yes" | "on" => Some(true), + "0" | "false" | "no" | "off" => Some(false), + _ => None, + }) +} + +fn fallback_capture_recording_enabled() -> bool { + if matches!(env_bool(DISABLE_FALLBACK_CAPTURE_GIF_ENV), Some(true)) { + return false; + } + if let Some(enabled) = env_bool(ENABLE_FALLBACK_CAPTURE_GIF_ENV) { + return enabled; + } + matches!(env_bool(DISABLE_FALLBACK_CAPTURE_GIF_ENV), Some(false)) +} + fn append_runtime_prompt_projection_event( runtime_handle: &RuntimeHandle, session_id: &str, @@ -1347,6 +1394,9 @@ fn runtime_or_store_events( } fn ensure_fallback_capture_recording(store: &SharedStore, session_id: 
&str) { + if !fallback_capture_recording_enabled() { + return; + } let Ok(store) = store.lock() else { return; }; @@ -1937,10 +1987,10 @@ fn enrich_token_count_payload( impl TurnState for LiveTurnState { async fn clone_history_for_prompt(&self) -> Vec { // Once compacted, the prompt base is the compacted override (codex's - // replaced history); otherwise it is the lowered durable log. The recorded - // buffer (this run's assistant turns + the fused driver's dispatched tool - // outputs) is appended either way, so tool outputs always re-enter the next - // prompt (the fusion seam is preserved across compaction). + // replaced history); otherwise it is the lowered durable log. Offline + // tests append the recorder tail so tool outputs re-enter the next prompt + // without a durable sink. Production disables that tail because the same + // model/tool events have already been persisted and replay from the log. if self.compacted.lock().unwrap().is_some() { return self.assemble_prompt_blocking(); } @@ -1963,10 +2013,7 @@ impl TurnState for LiveTurnState { }) .await .unwrap_or_default(); - // The recorded buffer carries this run's assistant turns AND the fused - // driver's dispatched tool outputs (both append through the same `Arc`), so - // the next prompt sees everything produced so far. 
- msgs.extend(self.recorded.lock().unwrap().iter().cloned()); + self.append_recorded_tail_if_enabled(&mut msgs); msgs } @@ -2480,6 +2527,7 @@ struct RuntimeTurnLoopDriver { previous_model_compaction: Option, runtime_handle: RuntimeHandle, cancel: CancellationToken, + max_turns: Option, } impl RuntimeTurnLoopDriver { @@ -2499,13 +2547,15 @@ impl RuntimeTurnLoopDriver { previous_model_compaction, runtime_handle, cancel, + max_turns, } = self; let mailbox_delivery_phase = initial_runtime_mailbox_delivery_phase(Some(&runtime_handle), session_id.as_str()); let state = LiveTurnState::new(Arc::clone(&store), session_id.clone(), recorded) .with_runtime_handle(Some(runtime_handle.clone())) - .with_mailbox_delivery_phase(mailbox_delivery_phase); + .with_mailbox_delivery_phase(mailbox_delivery_phase) + .with_durable_prompt_replay(); // Enable REAL token accounting + model-based compaction when a sampler is // available (the real backend path). The Fake/no-credential path passes `None` // and keeps the inert (never-compacts) behavior. 
@@ -2549,9 +2599,18 @@ impl RuntimeTurnLoopDriver { let observer = StoreObserver::new(sink, session_id.as_str().to_string()); let turn_loop = TurnLoop::new(state, driver, observer); - let result = turn_loop - .run(ctx, turn_has_fresh_input, cancel.clone()) - .await; + let result = match max_turns { + Some(max_turns) => { + turn_loop + .run_with_max_turns(ctx, turn_has_fresh_input, cancel.clone(), max_turns) + .await + } + None => { + turn_loop + .run(ctx, turn_has_fresh_input, cancel.clone()) + .await + } + }; if result.is_ok() { ensure_fallback_capture_recording(&store, session_id.as_str()); } @@ -2579,6 +2638,7 @@ async fn drive_run( previous_model_compaction: Option, runtime_handle: RuntimeHandle, cancel: CancellationToken, + max_turns: Option, ) -> Result, AgentError> { RuntimeTurnLoopDriver { store, @@ -2595,6 +2655,7 @@ async fn drive_run( previous_model_compaction, runtime_handle, cancel, + max_turns, } .run() .await @@ -3010,6 +3071,7 @@ async fn run_session_once_with_config_with_cancel( previous_model_compaction, runtime_handle.clone(), cancel.clone(), + Some(config.options.max_turns), ) .await?; } @@ -3035,6 +3097,7 @@ async fn run_session_once_with_config_with_cancel( None, runtime_handle.clone(), cancel.clone(), + Some(config.options.max_turns), ) .await?; } @@ -3127,8 +3190,66 @@ mod tests { use browser_use_store::Store; use serde_json::Value; use std::sync::atomic::{AtomicUsize, Ordering}; + use std::sync::{Mutex as StdMutex, MutexGuard as StdMutexGuard, OnceLock as StdOnceLock}; use tempfile::TempDir; + static ENTRYPOINT_ENV_LOCK: StdOnceLock> = StdOnceLock::new(); + + struct EnvRestore { + _guard: StdMutexGuard<'static, ()>, + values: Vec<(&'static str, Option)>, + } + + impl EnvRestore { + fn set(vars: &[(&'static str, &str)]) -> Self { + let guard = ENTRYPOINT_ENV_LOCK + .get_or_init(|| StdMutex::new(())) + .lock() + .expect("env lock poisoned"); + let values = vars + .iter() + .map(|(key, _)| (*key, std::env::var(key).ok())) + .collect::>(); + 
for (key, value) in vars { + std::env::set_var(key, value); + } + Self { + _guard: guard, + values, + } + } + + fn unset(keys: &[&'static str]) -> Self { + let guard = ENTRYPOINT_ENV_LOCK + .get_or_init(|| StdMutex::new(())) + .lock() + .expect("env lock poisoned"); + let values = keys + .iter() + .map(|key| (*key, std::env::var(key).ok())) + .collect::>(); + for key in keys { + std::env::remove_var(key); + } + Self { + _guard: guard, + values, + } + } + } + + impl Drop for EnvRestore { + fn drop(&mut self) { + for (key, value) in self.values.drain(..) { + if let Some(value) = value { + std::env::set_var(key, value); + } else { + std::env::remove_var(key); + } + } + } + } + /// A tempdir-backed `SharedStore` with a fresh session row (the `events` table /// has a FK on `sessions(id)`, so the session must exist before we append). /// Returns the `TempDir` so the caller keeps the on-disk sqlite db alive. @@ -3260,6 +3381,32 @@ mod tests { ProviderRunConfig::new(ProviderBackend::Fake, "fake-model").with_fake_result("hi from fake") } + #[test] + fn fallback_capture_recording_is_opt_in_for_eval_speed() { + { + let _env = EnvRestore::unset(&[ + DISABLE_FALLBACK_CAPTURE_GIF_ENV, + ENABLE_FALLBACK_CAPTURE_GIF_ENV, + ]); + assert!(!fallback_capture_recording_enabled()); + } + { + let _env = EnvRestore::set(&[(ENABLE_FALLBACK_CAPTURE_GIF_ENV, "1")]); + assert!(fallback_capture_recording_enabled()); + } + { + let _env = EnvRestore::set(&[ + (ENABLE_FALLBACK_CAPTURE_GIF_ENV, "1"), + (DISABLE_FALLBACK_CAPTURE_GIF_ENV, "1"), + ]); + assert!(!fallback_capture_recording_enabled()); + } + { + let _env = EnvRestore::set(&[(DISABLE_FALLBACK_CAPTURE_GIF_ENV, "false")]); + assert!(fallback_capture_recording_enabled()); + } + } + /// Seed a real user turn into the durable log before driving. 
/// /// Appends straight through the store lock (the sync `Store::append_event`) @@ -3353,6 +3500,32 @@ mod tests { ) } + fn count_tool_call_ids(messages: &[Message], call_id: &str) -> usize { + messages + .iter() + .flat_map(|message| message.content.iter()) + .filter(|part| { + matches!( + part, + ContentPart::ToolCall { id, .. } if id == call_id + ) + }) + .count() + } + + fn count_tool_result_ids(messages: &[Message], call_id: &str) -> usize { + messages + .iter() + .flat_map(|message| message.content.iter()) + .filter(|part| { + matches!( + part, + ContentPart::ToolResult { tool_call_id, .. } if tool_call_id == call_id + ) + }) + .count() + } + fn seed_workspace_context(store: &SharedStore, session_id: &str, content: &str) { let store = store.lock().expect("store mutex poisoned"); store @@ -3727,6 +3900,83 @@ mod tests { assert!(!state.token_status().await.token_limit_reached); } + #[tokio::test] + async fn durable_prompt_replay_ignores_duplicate_fusion_tail() { + let (_dir, store, session_id) = store_with_session(); + seed_user_input(&store, &session_id, "use the browser").await; + { + let store = store.lock().expect("store mutex poisoned"); + store + .append_event( + &session_id, + "model.tool_call", + serde_json::json!({ + "id": "call_browser", + "name": "browser_script", + "arguments": { "code": "return document.title" }, + }), + ) + .expect("seed durable tool call"); + store + .append_event( + &session_id, + "tool.output", + serde_json::json!({ + "tool_call_id": "call_browser", + "name": "browser_script", + "text": "Example Domain", + }), + ) + .expect("seed durable tool output"); + } + + let recorded = Arc::new(Mutex::new(vec![ + Message::new( + MessageRole::Assistant, + vec![ContentPart::ToolCall { + id: "call_browser".to_string(), + name: "browser_script".to_string(), + input: serde_json::json!({ "code": "return document.title" }), + provider_metadata: None, + }], + ), + Message::new( + MessageRole::Tool, + vec![ContentPart::ToolResult { + 
tool_call_id: "call_browser".to_string(), + content: vec![ContentPart::text("Example Domain")], + is_error: false, + }], + ), + ])); + + let default_state = LiveTurnState::new( + Arc::clone(&store), + SessionId(session_id.clone()), + Arc::clone(&recorded), + ); + let default_prompt = default_state.clone_history_for_prompt().await; + assert_eq!( + count_tool_call_ids(&default_prompt, "call_browser"), + 2, + "test fixture should reproduce the old durable+recorder duplication" + ); + + let durable_state = LiveTurnState::new(Arc::clone(&store), SessionId(session_id), recorded) + .with_durable_prompt_replay(); + let prompt = durable_state.clone_history_for_prompt().await; + assert_eq!( + count_tool_call_ids(&prompt, "call_browser"), + 1, + "production prompt replay must not duplicate durable tool calls" + ); + assert_eq!( + count_tool_result_ids(&prompt, "call_browser"), + 1, + "production prompt replay must not duplicate durable tool outputs" + ); + } + #[tokio::test] async fn runtime_turn_state_reads_history_from_runtime_journal_first() { let (_dir, store, session_id) = store_with_session(); @@ -5475,6 +5725,7 @@ mod tests { None, runtime_handle, cancel, + None, ), ) .await diff --git a/crates/browser-use-agent/src/events/map.rs b/crates/browser-use-agent/src/events/map.rs index 60e84d3e..7243f4c2 100644 --- a/crates/browser-use-agent/src/events/map.rs +++ b/crates/browser-use-agent/src/events/map.rs @@ -95,6 +95,7 @@ pub fn usage_to_model_usage(u: &Usage) -> ModelUsage { ModelUsage { input_tokens: Some(u.input_tokens as i64), input_cached_tokens: Some(u.cached_input_tokens as i64), + input_cache_creation_tokens: positive_i64(u.cache_creation_input_tokens), output_tokens: Some(u.output_tokens as i64), reasoning_output_tokens: Some(u.reasoning_output_tokens as i64), total_tokens: Some(total as i64), @@ -102,6 +103,10 @@ pub fn usage_to_model_usage(u: &Usage) -> ModelUsage { } } +fn positive_i64(value: u64) -> Option { + (value > 0).then_some(value as i64) +} + /// 
Codex-shaped token-usage object (mirrors core `model_usage_to_codex_token_usage`): /// `{ input_tokens, cached_input_tokens, output_tokens, reasoning_output_tokens, /// total_tokens }`, where a missing `total_tokens` falls back to the sum of the @@ -109,6 +114,7 @@ pub fn usage_to_model_usage(u: &Usage) -> ModelUsage { fn codex_token_usage(usage: &ModelUsage) -> Value { let input_tokens = usage.input_tokens.unwrap_or(0); let cached_input_tokens = usage.input_cached_tokens.unwrap_or(0); + let cache_creation_input_tokens = usage.input_cache_creation_tokens.unwrap_or(0); let output_tokens = usage.output_tokens.unwrap_or(0); let reasoning_output_tokens = usage.reasoning_output_tokens.unwrap_or(0); let total_tokens = usage.total_tokens.unwrap_or_else(|| { @@ -116,20 +122,26 @@ fn codex_token_usage(usage: &ModelUsage) -> Value { .saturating_add(output_tokens) .saturating_add(reasoning_output_tokens) }); - json!({ + let mut value = json!({ "input_tokens": input_tokens, "cached_input_tokens": cached_input_tokens, "output_tokens": output_tokens, "reasoning_output_tokens": reasoning_output_tokens, "total_tokens": total_tokens, - }) + }); + if cache_creation_input_tokens > 0 { + value["input_cache_creation_tokens"] = json!(cache_creation_input_tokens); + } + value } /// Field-wise sum of two codex token-usage objects (mirrors core /// `add_codex_token_usage`). Missing keys are treated as `0`. 
fn add_codex_token_usage(previous: &Value, addition: &Value) -> Value { let get = |value: &Value, key: &str| value.get(key).and_then(Value::as_i64).unwrap_or(0); - json!({ + let cache_creation_input_tokens = + get(previous, "input_cache_creation_tokens") + get(addition, "input_cache_creation_tokens"); + let mut value = json!({ "input_tokens": get(previous, "input_tokens") + get(addition, "input_tokens"), "cached_input_tokens": get(previous, "cached_input_tokens") + get(addition, "cached_input_tokens"), @@ -137,7 +149,11 @@ fn add_codex_token_usage(previous: &Value, addition: &Value) -> Value { "reasoning_output_tokens": get(previous, "reasoning_output_tokens") + get(addition, "reasoning_output_tokens"), "total_tokens": get(previous, "total_tokens") + get(addition, "total_tokens"), - }) + }); + if cache_creation_input_tokens > 0 { + value["input_cache_creation_tokens"] = json!(cache_creation_input_tokens); + } + value } /// Build the `token_count` payload (core parity: diff --git a/crates/browser-use-agent/src/events/map_tests.rs b/crates/browser-use-agent/src/events/map_tests.rs index a530bd8a..667a685d 100644 --- a/crates/browser-use-agent/src/events/map_tests.rs +++ b/crates/browser-use-agent/src/events/map_tests.rs @@ -108,6 +108,7 @@ fn finish_maps_to_token_count_from_usage() { let usage = Usage { input_tokens: 100, cached_input_tokens: 10, + cache_creation_input_tokens: 0, output_tokens: 20, reasoning_output_tokens: 5, total_tokens: 125, @@ -206,6 +207,7 @@ fn usage_total_zero_falls_back_to_computed_total() { let u = Usage { input_tokens: 100, cached_input_tokens: 40, + cache_creation_input_tokens: 0, output_tokens: 20, reasoning_output_tokens: 5, total_tokens: 0, // provider didn't report an inclusive total @@ -230,6 +232,7 @@ fn usage_total_nonzero_is_preserved() { let u = Usage { input_tokens: 100, cached_input_tokens: 40, + cache_creation_input_tokens: 12, output_tokens: 20, reasoning_output_tokens: 5, total_tokens: 250, // explicit total wins over 
computed_total @@ -240,8 +243,8 @@ fn usage_total_nonzero_is_preserved() { assert_eq!(mu.input_cached_tokens, Some(40)); assert_eq!(mu.output_tokens, Some(20)); assert_eq!(mu.reasoning_output_tokens, Some(5)); - // Cost / cache-creation fields are unknown at this layer. - assert_eq!(mu.input_cache_creation_tokens, None); + assert_eq!(mu.input_cache_creation_tokens, Some(12)); + // Cost fields are unknown at this layer. assert_eq!(mu.cost_usd, None); assert_eq!(mu.cost_source, None); } diff --git a/crates/browser-use-agent/src/goals/tests.rs b/crates/browser-use-agent/src/goals/tests.rs index b0647326..a61f70ea 100644 --- a/crates/browser-use-agent/src/goals/tests.rs +++ b/crates/browser-use-agent/src/goals/tests.rs @@ -56,6 +56,7 @@ fn usage(input: u64, cached: u64, output: u64) -> Usage { Usage { input_tokens: input, cached_input_tokens: cached, + cache_creation_input_tokens: 0, output_tokens: output, reasoning_output_tokens: 0, total_tokens: 0, diff --git a/crates/browser-use-agent/src/prompts/mod.rs b/crates/browser-use-agent/src/prompts/mod.rs index 647c11b5..ee30ac53 100644 --- a/crates/browser-use-agent/src/prompts/mod.rs +++ b/crates/browser-use-agent/src/prompts/mod.rs @@ -209,6 +209,13 @@ pub fn browser_mode_instruction(mode: &str) -> String { "Remote start means start and connect; use `browser remote live-url` to retrieve the watch URL." ) .to_string(), + "remote-cdp" | "cdp" => concat!( + "Selected browser mode: Remote CDP. The evaluation harness already provides the browser endpoint. ", + "Do not call `browser connect managed`, `browser connect local`, or `browser remote start`. ", + "Start page work directly with `browser_script` using `goto_url(...)`, then call explicit waits such as `wait_for_load(...)` only when the next page state matters. ", + "Use `browser status --json` only if you need to inspect the current connection." + ) + .to_string(), other => format!( "Selected browser mode: {other}. 
Use `browser status --json` first, then choose an explicit browser connect command." ), diff --git a/crates/browser-use-agent/src/prompts/tests.rs b/crates/browser-use-agent/src/prompts/tests.rs index d929c9ac..b0fb0cac 100644 --- a/crates/browser-use-agent/src/prompts/tests.rs +++ b/crates/browser-use-agent/src/prompts/tests.rs @@ -92,6 +92,57 @@ fn browser_mode_instruction_matches_main_local_connection_guidance() { assert!(prompt.contains("/profile")); } +#[test] +fn browser_mode_instruction_guides_remote_cdp_to_direct_page_work() { + let prompt = browser_mode_instruction("remote-cdp"); + assert!(prompt.contains("Selected browser mode: Remote CDP")); + assert!(prompt.contains("already provides the browser endpoint")); + assert!(prompt.contains("Start page work directly with `browser_script`")); + assert!(prompt.contains("Do not call `browser connect managed`")); +} + +#[test] +fn system_prompt_bounds_multi_item_collection_loops() { + assert!(BASE_SYSTEM_PROMPT.contains("Multi-item collection rule")); + assert!(BASE_SYSTEM_PROMPT.contains("maintain a checklist")); + assert!(BASE_SYSTEM_PROMPT.contains("Do not keep varying one search term")); + assert!(BASE_SYSTEM_PROMPT.contains("audit the checklist")); +} + +#[test] +fn system_prompt_commits_single_site_collection_to_one_domain() { + assert!(BASE_SYSTEM_PROMPT.contains("Single-site collection rule")); + assert!(BASE_SYSTEM_PROMPT.contains("choose one viable domain early")); + assert!(BASE_SYSTEM_PROMPT.contains("do not keep searching for a perfect domain")); + assert!(BASE_SYSTEM_PROMPT.contains("Do not stitch rows from multiple domains")); + assert!(BASE_SYSTEM_PROMPT.contains("mark it unavailable for that domain")); +} + +#[test] +fn prompts_avoid_screenshots_for_text_heavy_extraction() { + assert!(BASE_SYSTEM_PROMPT.contains( + "For text-heavy research, document reading, search, pricing, tables, and list extraction" + )); + assert!(BASE_SYSTEM_PROMPT.contains("screenshots add latency")); + 
assert!(BASE_SYSTEM_PROMPT.contains("If you have three or more independent URLs")); + + let script = browser_script_tool_description(); + assert!(script.contains( + "For text-heavy research, document reading, search, pricing, tables, and list extraction" + )); + assert!(script.contains("screenshots add latency")); +} + +#[test] +fn dataset_prompt_enforces_timeboxed_finalization() { + let prompt = include_str!("../../../../prompts/dataset-case-user.md"); + + assert!(prompt.contains("Timebox contract")); + assert!(prompt.contains("soft deadline")); + assert!(prompt.contains("hard deadline")); + assert!(prompt.contains("Never keep running until the external runner timeout")); +} + /// Plan mode was removed. The compatibility enum value now renders the Default /// asset so stale configs do not re-enable planning behavior. #[test] @@ -141,13 +192,26 @@ fn browser_tool_descriptions_preserve_interaction_skills() { script.contains("js(function_source, *args)"), "browser_script description lost js argument helper guidance" ); + assert!( + script.contains("http_get_many(urls, **kwargs)") + && script.contains("browser_fetch_many(requests, **kwargs)"), + "browser_script description lost batch/direct fetch helper guidance" + ); + assert!( + script.contains("Batch recipe after discovering stable links or endpoints") + && script.contains("responses = http_get_many(urls, timeout=12, max_workers=8)") + && script.contains("Fetched ${$.ok_count}/${$.total} independent URLs"), + "browser_script description lost its concrete batch-fetch adoption recipe" + ); // The base system prompt enumerates the page-interaction helpers, including // the screenshot/image helpers used for visual inspection. 
assert!( BASE_SYSTEM_PROMPT.contains("capture_screenshot") && BASE_SYSTEM_PROMPT.contains("emit_image") - && BASE_SYSTEM_PROMPT.contains("js(function_source, *args)"), + && BASE_SYSTEM_PROMPT.contains("js(function_source, *args)") + && BASE_SYSTEM_PROMPT.contains("http_get_many") + && BASE_SYSTEM_PROMPT.contains("browser_fetch_many"), "base system prompt lost its screenshot/image interaction helpers" ); } diff --git a/crates/browser-use-agent/src/session/reconstruct.rs b/crates/browser-use-agent/src/session/reconstruct.rs index 7e6279c8..d4bd636e 100644 --- a/crates/browser-use-agent/src/session/reconstruct.rs +++ b/crates/browser-use-agent/src/session/reconstruct.rs @@ -1154,15 +1154,22 @@ fn response_input_item_output_content(item: &Value) -> Value { fn value_to_tool_output_text(value: &Value) -> String { match value { Value::String(text) => text.clone(), - Value::Array(parts) => parts - .iter() - .filter_map(|part| { - part.get("text") - .and_then(Value::as_str) - .map(ToOwned::to_owned) - }) - .collect::>() - .join(""), + Value::Array(parts) => { + let text = parts + .iter() + .filter_map(|part| { + part.get("text") + .and_then(Value::as_str) + .map(ToOwned::to_owned) + }) + .collect::>() + .join(""); + if text.trim().is_empty() { + value.to_string() + } else { + text + } + } Value::Null => String::new(), other => other.to_string(), } @@ -1473,15 +1480,33 @@ fn tool_output_event_content(payload: &Value) -> Value { fn tool_output_event_text(payload: &Value) -> String { if let Some(text) = payload.get("text").and_then(Value::as_str) { - return text.to_string(); + if !text.trim().is_empty() { + return text.to_string(); + } } if let Some(output) = payload.get("output") { - return value_to_tool_output_text(output); + let text = value_to_tool_output_text(output); + if !text.trim().is_empty() { + return text; + } } if let Some(content) = payload.get("content") { - return value_to_tool_output_text(content); + let text = value_to_tool_output_text(content); + if 
!text.trim().is_empty() { + return text; + } + } + let mut parts = Vec::new(); + for key in ["summary", "data", "outputs"] { + let Some(value) = payload.get(key) else { + continue; + }; + if value.is_null() || value == &serde_json::json!({}) || value == &serde_json::json!([]) { + continue; + } + parts.push(format!("{key}: {}", value_to_tool_output_text(value))); } - String::new() + parts.join("\n") } fn synthetic_tool_result_text(name: &str) -> String { diff --git a/crates/browser-use-agent/src/session/reconstruct_tests.rs b/crates/browser-use-agent/src/session/reconstruct_tests.rs index 224cc3c1..0d76691d 100644 --- a/crates/browser-use-agent/src/session/reconstruct_tests.rs +++ b/crates/browser-use-agent/src/session/reconstruct_tests.rs @@ -140,6 +140,58 @@ fn turn_with_tool_call_and_output() { ); } +#[test] +fn tool_output_event_uses_structured_browser_script_fallback_text() { + let events = vec![ + event(1, "session.input", json!({ "text": "open page" })), + event( + 2, + "model.tool_call", + json!({ + "id": "call_browser", + "name": "browser_script", + "arguments": { "code": "emit_output(page_info(), label='page_info')" } + }), + ), + event( + 3, + "tool.output", + json!({ + "tool_call_id": "call_browser", + "name": "browser_script", + "text": "", + "summary": [{ + "kind": "page", + "output_label": "page_info", + "title": "Example Domain", + "url": "https://example.com" + }], + "outputs": [{ + "label": "page_info", + "value": { + "title": "Example Domain", + "url": "https://example.com" + } + }] + }), + ), + event(4, "session.done", json!({})), + ]; + + let messages = provider_messages_from_events(&events); + assert_eq!(messages.len(), 3, "messages: {messages:#?}"); + let tool = &messages[2]; + assert_eq!(tool.get("role").and_then(Value::as_str), Some("tool")); + let content = tool + .get("content") + .and_then(Value::as_str) + .expect("structured fallback content"); + assert!(content.contains("summary:")); + assert!(content.contains("outputs:")); + 
assert!(content.contains("Example Domain")); + assert!(!content.trim().is_empty()); +} + #[test] fn tool_output_event_preserves_image_content() { let events = vec![ diff --git a/crates/browser-use-agent/src/tools/handlers/browser.rs b/crates/browser-use-agent/src/tools/handlers/browser.rs index 48675136..473a896e 100644 --- a/crates/browser-use-agent/src/tools/handlers/browser.rs +++ b/crates/browser-use-agent/src/tools/handlers/browser.rs @@ -73,9 +73,17 @@ pub const DEFAULT_OBSERVE_TIMEOUT_MS: u64 = 1_000; /// [`ContentPart`]s so provider protocols can send images to vision-capable /// models while preserving a plain text fallback for logs/tests. pub const BROWSER_SCRIPT_CONTENT_STDOUT_PREFIX: &str = "\n__browser_script_content__:"; +/// Maximum bytes of browser-script text returned to the next model turn. +/// +/// Full browser-script output is persisted through durable events/artifacts; the +/// inline model view is deliberately smaller because long eval tasks repeatedly +/// carry every prior tool result in later prompts. +pub const MAX_INLINE_BROWSER_SCRIPT_STDOUT_BYTES: usize = 4 * 1024; const BROWSER_PREF_MODE: &str = "browser.preference.mode"; const BROWSER_PREF_PROFILE: &str = "browser.preference.profile"; +const BROWSER_DOMAIN_PROFILE_PREFIX: &str = "browser.domain_profile."; +const BROWSER_SCRIPT_MAX_IMAGE_DIMENSION: u32 = 8_000; const BROWSER_PREF_PROFILE_LABEL: &str = "browser.preference.profile_label"; /// What the model wants the browser to do. 
@@ -455,6 +463,7 @@ impl RealBackend { Some( match mode { "cloud" | "browser-use-cloud" | "remote-cloud" => "cloud", + "remote-cdp" | "cdp" => "remote-cdp", "headless" | "headless-chromium" | "managed-headless" => "managed-headless", other => other, } @@ -463,13 +472,18 @@ impl RealBackend { } fn should_ensure_before_command(&self, command: &str) -> bool { - if self.normalized_browser_mode().is_none() { + let Some(mode) = self.normalized_browser_mode() else { return false; - } + }; let Ok(words) = browser_command_words(command) else { return false; }; let words = words.iter().map(String::as_str).collect::>(); + if mode == "remote-cdp" + && matches!(words.as_slice(), ["browser", "status", ..] | ["status", ..]) + { + return true; + } if browser_command_is_passive(words.as_slice()) { return false; } @@ -479,6 +493,8 @@ impl RealBackend { | ["remote", "start", ..] | ["browser", "remote", "stop", ..] | ["remote", "stop", ..] + | ["browser", "connect", "remote-cdp", ..] + | ["connect", "remote-cdp", ..] ) } @@ -526,16 +542,24 @@ impl RealBackend { status.content.get("connection").and_then(Value::as_str) == Some("connected"); let current_mode = status.content.get("mode").and_then(Value::as_str); let owner = status.content.get("owner").and_then(Value::as_str); - let Some(desired_command) = + let desired_command = if mode == "remote-cdp" { + if connected && current_mode == Some("remote-cdp") { + None + } else { + Some(remote_cdp_connect_command()?) 
+ } + } else { desired_browser_connect_command(mode.as_str(), connected, current_mode, owner) - else { + .map(str::to_string) + }; + let Some(desired_command) = desired_command else { return Ok(events); }; let mut started = browser_use_browser::run_browser_command_with_options_and_registries( session_id, cwd, artifact_dir, - desired_command, + &desired_command, browser_use_browser::BrowserCommandOptions::default(), &self.script_registry, &self.session_registry, @@ -765,10 +789,12 @@ fn dispatch_browser_preference( selected_browser_mode: Option<&str>, ) -> anyhow::Result { match args.get(1).map(String::as_str) { - None | Some("--json") | Some("show") => browser_preference_json(store), + None | Some("--json") | Some("show") => { + browser_preference_json(store, selected_browser_mode) + } Some("use") => { let mode = args.get(2).map(String::as_str).ok_or_else(|| { - anyhow!("browser preference use requires ") + anyhow!("browser preference use requires ") })?; let normalized = normalize_browser_preference_mode(mode)?; enforce_selected_browser_mode(selected_browser_mode, normalized)?; @@ -776,7 +802,7 @@ fn dispatch_browser_preference( store.set_setting("browser", browser_display_name(normalized))?; Ok(json!({ "status": "ok", - "preference": browser_preference_json(store)?, + "preference": browser_preference_json(store, selected_browser_mode)?, "next_step": "browser connect", })) } @@ -794,7 +820,7 @@ fn dispatch_browser_profile_preference( selected_browser_mode: Option<&str>, ) -> anyhow::Result { match args.get(1).map(String::as_str) { - Some("current") => browser_preference_json(store), + Some("current") => browser_preference_json(store, selected_browser_mode), Some("use") => { enforce_selected_browser_mode(selected_browser_mode, "local")?; let profile_id = args @@ -885,7 +911,7 @@ fn dispatch_browser_profile_preference( })) } Some(other) => bail!("unknown browser profile command: {other}"), - None => browser_preference_json(store), + None => 
browser_preference_json(store, selected_browser_mode), } } @@ -906,16 +932,43 @@ fn resolve_browser_command_for_selected_mode( .transpose()? .flatten() }; - Ok(browser_connect_command_for_mode( - effective_mode, - profile_id.as_deref(), - )) + browser_connect_command_for_mode(effective_mode, profile_id.as_deref()) } else { + if let Some(command) = + remote_cdp_compatibility_connect_command(&args, selected_browser_mode)? + { + return Ok(command); + } enforce_browser_command_matches_selected_mode(&args, selected_browser_mode)?; Ok(cmd.to_string()) } } +fn remote_cdp_compatibility_connect_command( + args: &[String], + selected_browser_mode: Option<&str>, +) -> anyhow::Result> { + let Some(selected_mode) = selected_browser_mode + .map(str::trim) + .filter(|value| !value.is_empty()) + else { + return Ok(None); + }; + if normalize_browser_preference_mode(selected_mode)? != "remote-cdp" { + return Ok(None); + } + let requests_different_browser_setup = match args { + [command, mode, ..] if command == "connect" => mode != "remote-cdp", + [command, ..] if command == "local" => true, + [command, action, ..] 
if command == "remote" && action == "start" => true, + _ => false, + }; + if requests_different_browser_setup { + return Ok(Some(remote_cdp_connect_command()?)); + } + Ok(None) +} + fn local_connect_default_profile_preflight( has_default_profile: bool, backend: &dyn BrowserBackend, @@ -1376,9 +1429,41 @@ fn preferred_browser_mode(store: Option<&Store>) -> anyhow::Result<&'static str> normalize_browser_preference_mode(&mode) } -fn browser_connect_command_for_mode(mode: &str, profile_id: Option<&str>) -> String { +fn remote_cdp_connect_command() -> anyhow::Result { + if let Some(ws) = env_trimmed("BU_CDP_WS") { + return Ok(remote_cdp_connect_command_for_endpoint(&ws)); + } + if let Some(url) = env_trimmed("BU_CDP_URL") { + return Ok(remote_cdp_connect_command_for_endpoint(&url)); + } + bail!("browser mode is locked to Remote CDP, but BU_CDP_URL or BU_CDP_WS is not set") +} + +fn remote_cdp_connect_command_for_endpoint(endpoint: &str) -> String { + let flag = if endpoint.starts_with("ws://") || endpoint.starts_with("wss://") { + "--ws" + } else { + "--url" + }; + format!( + "browser connect remote-cdp {flag} {}", + shell_quote_browser_arg(endpoint) + ) +} + +fn env_trimmed(name: &str) -> Option { + std::env::var(name) + .ok() + .map(|value| value.trim().to_string()) + .filter(|value| !value.is_empty()) +} + +fn browser_connect_command_for_mode( + mode: &str, + profile_id: Option<&str>, +) -> anyhow::Result { match normalize_browser_preference_mode(mode).unwrap_or("local") { - "cloud" => profile_id.filter(|value| !value.is_empty()).map_or_else( + "cloud" => Ok(profile_id.filter(|value| !value.is_empty()).map_or_else( || "browser remote start".to_string(), |profile_id| { format!( @@ -1386,10 +1471,11 @@ fn browser_connect_command_for_mode(mode: &str, profile_id: Option<&str>) -> Str shell_quote_browser_arg(profile_id) ) }, - ), - "managed-headless" => "browser connect managed --headless".to_string(), - "managed-headed" => "browser connect managed --headed".to_string(), 
- _ => "browser connect local".to_string(), + )), + "managed-headless" => Ok("browser connect managed --headless".to_string()), + "managed-headed" => Ok("browser connect managed --headed".to_string()), + "remote-cdp" => remote_cdp_connect_command(), + _ => Ok("browser connect local".to_string()), } } @@ -1443,10 +1529,7 @@ fn enforce_browser_command_matches_selected_mode( }; enforce_selected_browser_mode(Some(selected_mode), requested_mode) } - Some("remote-cdp") => bail!( - "browser mode is locked to {} for this run; remote CDP endpoints are not selectable from this terminal browser mode", - browser_display_name(selected_mode), - ), + Some("remote-cdp") => enforce_selected_browser_mode(Some(selected_mode), "remote-cdp"), Some(other) => bail!("unknown browser connect mode: {other}"), }, "local" => enforce_selected_browser_mode(Some(selected_mode), "local"), @@ -1470,31 +1553,79 @@ fn has_browser_arg(args: &[String], flag: &str) -> bool { args.iter().any(|arg| arg == flag) } -fn browser_preference_json(store: &Store) -> anyhow::Result { - let mode = store - .get_setting(BROWSER_PREF_MODE)? - .or_else(|| { - store - .get_setting("browser") - .ok() - .flatten() - .and_then(|value| display_browser_to_mode(&value).map(ToOwned::to_owned)) +fn browser_preference_json( + store: &Store, + selected_browser_mode: Option<&str>, +) -> anyhow::Result { + let selected_mode = selected_browser_mode + .map(str::trim) + .filter(|value| !value.is_empty()) + .map(normalize_browser_preference_mode) + .transpose()?; + let mode = selected_mode.map(ToOwned::to_owned).unwrap_or_else(|| { + store + .get_setting(BROWSER_PREF_MODE) + .ok() + .flatten() + .or_else(|| { + store + .get_setting("browser") + .ok() + .flatten() + .and_then(|value| display_browser_to_mode(&value).map(ToOwned::to_owned)) + }) + .unwrap_or_else(|| "local".to_string()) + }); + let profile_id = if selected_mode.is_some() { + None + } else { + store.get_setting(BROWSER_PREF_PROFILE)? 
+ }; + let profile_label = if selected_mode.is_some() { + None + } else { + store.get_setting(BROWSER_PREF_PROFILE_LABEL)? + }; + let domain_profiles = store + .list_settings()? + .into_iter() + .filter_map(|(key, value)| { + key.strip_prefix(BROWSER_DOMAIN_PROFILE_PREFIX) + .and_then(|domain| { + serde_json::from_str::(&value) + .ok() + .map(|value| (domain.to_string(), value)) + }) }) - .unwrap_or_else(|| "local".to_string()); + .map(|(domain, value)| json!({ "domain": domain, "preference": value })) + .collect::>(); Ok(json!({ "mode": normalize_browser_preference_mode(&mode)?, "display": browser_display_name(normalize_browser_preference_mode(&mode)?), - "profile_id": store.get_setting(BROWSER_PREF_PROFILE)?, - "profile_label": store.get_setting(BROWSER_PREF_PROFILE_LABEL)?, - "connect_command": match normalize_browser_preference_mode(&mode)? { - "cloud" => "browser remote start", - "managed-headless" => "browser connect managed --headless", - "managed-headed" => "browser connect managed --headed", - _ => "browser connect local", - }, + "profile_id": profile_id, + "profile_label": profile_label, + "domain_profiles": domain_profiles, + "connect_command": browser_connect_command_display_for_mode(&mode, profile_id.as_deref())?, })) } +fn browser_connect_command_display_for_mode( + mode: &str, + profile_id: Option<&str>, +) -> anyhow::Result { + match normalize_browser_preference_mode(mode)? { + "remote-cdp" => Ok("browser connect remote-cdp --url ".to_string()), + _ => browser_connect_command_for_mode(mode, profile_id), + } +} + +fn remembered_domain_profile(store: &Store, domain: &str) -> anyhow::Result> { + store + .get_setting(&browser_domain_profile_key(domain))? 
+ .map(|raw| serde_json::from_str::(&raw).map_err(Into::into)) + .transpose() +} + fn browser_command_words(cmd: &str) -> anyhow::Result> { let mut words = Vec::new(); let mut current = String::new(); @@ -1557,11 +1688,56 @@ fn shell_quote_browser_arg(value: &str) -> String { } } +fn normalize_domain(domain: &str) -> String { + domain + .trim() + .trim_start_matches("https://") + .trim_start_matches("http://") + .trim_start_matches("www.") + .trim_matches('/') + .to_ascii_lowercase() +} + +fn browser_domain_profile_key(domain: &str) -> String { + format!( + "{BROWSER_DOMAIN_PROFILE_PREFIX}{}", + normalize_domain(domain) + ) +} + +fn browser_profile_connect_next_step(mode: &str, profile_id: Option<&str>) -> String { + let profile_id = profile_id.filter(|value| !value.is_empty()); + match normalize_browser_preference_mode(mode).unwrap_or("local") { + "cloud" => profile_id.map_or_else( + || "browser remote start".to_string(), + |profile_id| { + format!( + "browser remote start --profile-id {}", + shell_quote_browser_arg(profile_id) + ) + }, + ), + "managed-headless" => "browser connect managed --headless".to_string(), + "managed-headed" => "browser connect managed --headed".to_string(), + "remote-cdp" => "browser connect remote-cdp --url ".to_string(), + _ => profile_id.map_or_else( + || "browser connect local".to_string(), + |profile_id| { + format!( + "If this profile is already open with remote debugging, run `browser connect local`; otherwise run `browser local setup --profile {}` and then `browser connect local`.", + shell_quote_browser_arg(profile_id) + ) + }, + ), + } +} + fn normalize_browser_preference_mode(mode: &str) -> anyhow::Result<&'static str> { let normalized = mode.to_ascii_lowercase().replace(['_', ' '], "-"); match normalized.as_str() { "local" | "local-chrome" => Ok("local"), "cloud" | "browser-use-cloud" | "remote-cloud" => Ok("cloud"), + "remote-cdp" | "cdp" => Ok("remote-cdp"), "headless" | "headless-chromium" | "managed-headless" => 
Ok("managed-headless"), "managed" | "managed-headed" | "headed" => Ok("managed-headed"), other => bail!("unknown browser preference mode: {other}"), @@ -1571,6 +1747,7 @@ fn normalize_browser_preference_mode(mode: &str) -> anyhow::Result<&'static str> fn browser_display_name(mode: &str) -> &'static str { match mode { "cloud" => "Browser Use Cloud", + "remote-cdp" => "Remote CDP", "managed-headless" => "Headless Chromium", "managed-headed" => "Managed Chromium", _ => "Local Chrome", @@ -1579,7 +1756,8 @@ fn browser_display_name(mode: &str) -> &'static str { fn display_browser_to_mode(display: &str) -> Option<&'static str> { match display { - "Browser Use Cloud" => Some("cloud"), + "Browser Use Cloud" | "Browser Use cloud" => Some("cloud"), + "Remote CDP" => Some("remote-cdp"), "Headless Chromium" => Some("managed-headless"), "Managed Chromium" => Some("managed-headed"), "Local Chrome" => Some("local"), @@ -1671,13 +1849,30 @@ fn map_script_output(out: BrowserScriptOutput) -> ExecOutput { fn browser_script_stdout(response: &BrowserScriptOutput) -> String { let text = browser_script_tool_message_content(response); let (image_parts, warnings) = browser_script_image_parts(response); - let text = append_browser_script_image_warnings(text, &warnings); + let text = + cap_inline_browser_script_stdout(append_browser_script_image_warnings(text, &warnings)); let Some(payload) = browser_script_content_payload(&text, image_parts) else { return text; }; format!("{text}{BROWSER_SCRIPT_CONTENT_STDOUT_PREFIX}{payload}") } +fn cap_inline_browser_script_stdout(text: String) -> String { + if text.len() <= MAX_INLINE_BROWSER_SCRIPT_STDOUT_BYTES { + return text; + } + let mut end = MAX_INLINE_BROWSER_SCRIPT_STDOUT_BYTES; + while end > 0 && !text.is_char_boundary(end) { + end -= 1; + } + let elided = text.len() - end; + let mut out = text[..end].to_string(); + out.push_str(&format!( + "\n... [browser_script stdout truncated, {elided} more bytes; full output persisted. 
Use a narrower browser_script extraction, the emitted summaries, or a saved artifact instead of re-reading broad page text.]" + )); + out +} + fn browser_script_content_payload(text: &str, image_parts: Vec) -> Option { if image_parts.is_empty() { return None; @@ -1727,6 +1922,14 @@ fn browser_script_image_part(image: &Value) -> Result, Strin if !mime_type.starts_with("image/") { return Ok(None); } + if let Some((width, height)) = png_dimensions(&bytes) { + if width > BROWSER_SCRIPT_MAX_IMAGE_DIMENSION || height > BROWSER_SCRIPT_MAX_IMAGE_DIMENSION + { + return Err(format!( + "Warning: image artifact was not attached because its dimensions {width}x{height} exceed provider limit; artifact remains at {path}" + )); + } + } Ok(Some(ContentPart::Media { mime_type: mime_type.to_string(), data: Some(general_purpose::STANDARD.encode(bytes)), @@ -1735,6 +1938,15 @@ fn browser_script_image_part(image: &Value) -> Result, Strin })) } +fn png_dimensions(bytes: &[u8]) -> Option<(u32, u32)> { + if bytes.len() < 24 || &bytes[..8] != b"\x89PNG\r\n\x1a\n" { + return None; + } + let width = u32::from_be_bytes(bytes.get(16..20)?.try_into().ok()?); + let height = u32::from_be_bytes(bytes.get(20..24)?.try_into().ok()?); + Some((width, height)) +} + fn browser_script_tool_message_content(response: &BrowserScriptOutput) -> String { if response.status.as_deref() == Some("running") { return browser_script_running_message(response); @@ -1813,12 +2025,6 @@ fn browser_script_failure_message(response: &BrowserScriptOutput) -> String { fn browser_script_structured_message_parts(response: &BrowserScriptOutput) -> Vec { let mut parts = Vec::new(); - if !response.outputs.is_empty() { - parts.push(format!( - "outputs: {}", - Value::Array(response.outputs.clone()) - )); - } if !response.summary.is_empty() { parts.push(format!( "summary: {}", @@ -1828,6 +2034,12 @@ fn browser_script_structured_message_parts(response: &BrowserScriptOutput) -> Ve if !response.data.is_null() && response.data != 
serde_json::json!({}) { parts.push(format!("data: {}", response.data)); } + if !response.outputs.is_empty() { + parts.push(format!( + "outputs: {}", + Value::Array(response.outputs.clone()) + )); + } parts } diff --git a/crates/browser-use-agent/src/tools/handlers/browser_tests.rs b/crates/browser-use-agent/src/tools/handlers/browser_tests.rs index 4337e006..141c0b8f 100644 --- a/crates/browser-use-agent/src/tools/handlers/browser_tests.rs +++ b/crates/browser-use-agent/src/tools/handlers/browser_tests.rs @@ -12,6 +12,7 @@ //! parallel_safe = false; (4) backend error -> ToolError; (5) an //! orchestrator-driven run with the fake backend. +use std::ffi::OsString; use std::path::PathBuf; use std::sync::{Arc, Mutex}; @@ -23,6 +24,7 @@ use serde_json::json; use super::browser::{ browser_command_is_passive, desired_browser_connect_command, BrowserAction, BrowserBackend, BrowserRequest, BrowserTool, BROWSER_SCRIPT_CONTENT_STDOUT_PREFIX, + MAX_INLINE_BROWSER_SCRIPT_STDOUT_BYTES, }; use crate::session::SharedStore; use crate::tools::approval::AskForApproval; @@ -59,10 +61,36 @@ struct FakeBackend { last_session: Mutex>, last_paths: Mutex>, last_timeout_secs: Mutex>, + script_text: Mutex>, + script_outputs: Mutex>, + script_summary: Mutex>, script_images: Mutex>, fail: bool, } +struct EnvVarGuard { + key: &'static str, + previous: Option, +} + +impl EnvVarGuard { + fn set(key: &'static str, value: &str) -> Self { + let previous = std::env::var_os(key); + std::env::set_var(key, value); + Self { key, previous } + } +} + +impl Drop for EnvVarGuard { + fn drop(&mut self) { + if let Some(previous) = &self.previous { + std::env::set_var(self.key, previous); + } else { + std::env::remove_var(self.key); + } + } +} + impl FakeBackend { fn last(&self) -> LastCall { self.last.lock().unwrap().clone() @@ -96,6 +124,10 @@ impl FakeBackend { *self.last_timeout_secs.lock().unwrap() } + fn set_script_text(&self, text: impl Into) { + *self.script_text.lock().unwrap() = Some(text.into()); + } 
+ fn record_paths(&self, cwd: &std::path::Path, artifact_dir: &std::path::Path) { *self.last_paths.lock().unwrap() = Some((cwd.to_path_buf(), artifact_dir.to_path_buf())); } @@ -190,19 +222,31 @@ impl FakeBackend { } fn ok_script(status: Option<&str>, ok: bool) -> BrowserScriptOutput { + Self::ok_script_with_text(status, ok, "script-output".to_string()) + } + + fn ok_script_with_text(status: Option<&str>, ok: bool, text: String) -> BrowserScriptOutput { BrowserScriptOutput { ok, status: status.map(|s| s.to_string()), run_id: Some("run-1".to_string()), - text: "script-output".to_string(), + text, ..Default::default() } } fn ok_script_with_images(&self, status: Option<&str>, ok: bool) -> BrowserScriptOutput { + let text = self + .script_text + .lock() + .unwrap() + .clone() + .unwrap_or_else(|| "script-output".to_string()); BrowserScriptOutput { + outputs: self.script_outputs.lock().unwrap().clone(), + summary: self.script_summary.lock().unwrap().clone(), images: self.script_images(), - ..Self::ok_script(status, ok) + ..Self::ok_script_with_text(status, ok, text) } } } @@ -427,6 +471,60 @@ async fn bare_browser_connect_resolves_to_selected_cloud_mode() { ); } +#[tokio::test] +async fn bare_browser_connect_resolves_to_selected_remote_cdp_mode() { + let _guard = EnvVarGuard::set( + "BU_CDP_URL", + "ws://127.0.0.1:9222/devtools/browser/session-id", + ); + let backend = Arc::new(FakeBackend::default()); + let tool = + tool_with(Arc::clone(&backend)).with_selected_browser_mode(Some("remote-cdp".to_string())); + + let req = BrowserRequest::command("sess-1", "browser connect"); + let out = run_direct(&tool, &req).await.unwrap(); + + assert_eq!(out.exit_code, 0); + assert_eq!( + backend.last(), + LastCall::Command( + "browser connect remote-cdp --ws ws://127.0.0.1:9222/devtools/browser/session-id" + .to_string() + ) + ); +} + +#[tokio::test] +async fn selected_remote_cdp_rewrites_wrong_browser_family_commands() { + let _guard = EnvVarGuard::set( + "BU_CDP_URL", + 
"ws://127.0.0.1:9222/devtools/browser/session-id", + ); + let backend = Arc::new(FakeBackend::default()); + let tool = + tool_with(Arc::clone(&backend)).with_selected_browser_mode(Some("remote-cdp".to_string())); + + for command in [ + "browser connect managed --headed", + "browser connect managed --headless", + "browser connect local", + "browser remote start", + ] { + let req = BrowserRequest::command("sess-1", command); + let out = run_direct(&tool, &req).await.unwrap(); + + assert_eq!(out.exit_code, 0, "{command}"); + assert_eq!( + backend.last(), + LastCall::Command( + "browser connect remote-cdp --ws ws://127.0.0.1:9222/devtools/browser/session-id" + .to_string() + ), + "{command}" + ); + } +} + #[tokio::test] async fn selected_browser_mode_rejects_wrong_connection_family() { let backend = Arc::new(FakeBackend::default()); @@ -829,6 +927,84 @@ async fn script_images_are_appended_as_structured_stdout_payload() { assert!(media.2.is_none()); } +#[tokio::test] +async fn script_oversized_stdout_is_truncated_for_model_output() { + let backend = Arc::new(FakeBackend::default()); + backend.set_script_text("x".repeat(MAX_INLINE_BROWSER_SCRIPT_STDOUT_BYTES + 5_000)); + let tool = tool_with(Arc::clone(&backend)); + + let req = BrowserRequest::execute("sess-1", "document.body.innerText", false); + let out = run_direct(&tool, &req).await.unwrap(); + + assert_eq!(out.exit_code, 0); + assert!( + out.stdout.len() < MAX_INLINE_BROWSER_SCRIPT_STDOUT_BYTES + 1_000, + "stdout should be capped, got {} bytes", + out.stdout.len() + ); + assert!( + out.stdout.contains("[browser_script stdout truncated"), + "stdout: {}", + out.stdout + ); + assert!( + !out.stdout + .contains(&"x".repeat(MAX_INLINE_BROWSER_SCRIPT_STDOUT_BYTES + 100)), + "uncapped browser_script output leaked into model stdout" + ); +} + +#[tokio::test] +async fn script_truncated_structured_output_preserves_summary_first() { + let backend = Arc::new(FakeBackend::default()); + 
backend.script_summary.lock().unwrap().push(json!({ + "kind": "extracted", + "message": "Read 40 candidate rows", + "output_label": "candidate_rows" + })); + backend.script_outputs.lock().unwrap().push(json!({ + "label": "candidate_rows", + "value": "x".repeat(MAX_INLINE_BROWSER_SCRIPT_STDOUT_BYTES + 8_000) + })); + let tool = tool_with(Arc::clone(&backend)); + + let req = BrowserRequest::execute("sess-1", "emit_output(rows, label='candidate_rows')", false); + let out = run_direct(&tool, &req).await.unwrap(); + + assert_eq!(out.exit_code, 0); + assert!( + out.stdout.contains("summary:"), + "summary should remain visible before large raw output: {}", + out.stdout + ); + assert!( + out.stdout.contains("Read 40 candidate rows"), + "stdout: {}", + out.stdout + ); + assert!( + out.stdout.contains("[browser_script stdout truncated"), + "stdout: {}", + out.stdout + ); + assert!( + out.stdout + .contains("Use a narrower browser_script extraction"), + "stdout: {}", + out.stdout + ); + assert!( + out.stdout.find("summary:") < out.stdout.find("outputs:"), + "summary should precede raw outputs: {}", + out.stdout + ); +} + +#[test] +fn browser_script_stdout_cap_defaults_to_four_kib_for_eval_cost() { + assert_eq!(MAX_INLINE_BROWSER_SCRIPT_STDOUT_BYTES, 4 * 1024); +} + #[tokio::test] async fn script_unreadable_images_warn_in_stdout() { let temp = tempfile::tempdir().expect("tempdir"); @@ -859,6 +1035,46 @@ async fn script_unreadable_images_warn_in_stdout() { ); } +#[tokio::test] +async fn script_oversized_png_images_warn_in_stdout_without_media_payload() { + let temp = tempfile::tempdir().expect("tempdir"); + let image_path = temp.path().join("wide.png"); + let mut png = vec![0_u8; 24]; + png[0..8].copy_from_slice(b"\x89PNG\r\n\x1a\n"); + png[12..16].copy_from_slice(b"IHDR"); + png[16..20].copy_from_slice(&8001_u32.to_be_bytes()); + png[20..24].copy_from_slice(&600_u32.to_be_bytes()); + std::fs::write(&image_path, png).expect("write png"); + + let backend = 
Arc::new(FakeBackend::default()); + backend.script_images.lock().unwrap().push(json!({ + "path": image_path, + "mime_type": "image/png", + "detail": "auto", + "label": "wide", + })); + let tool = tool_with(Arc::clone(&backend)); + + let req = BrowserRequest::execute("sess-1", "capture_screenshot()", false); + let out = run_direct(&tool, &req).await.unwrap(); + assert!( + out.stdout + .contains("dimensions 8001x600 exceed provider limit"), + "stdout: {}", + out.stdout + ); + assert!( + out.stdout.contains("artifact remains at"), + "stdout: {}", + out.stdout + ); + assert!( + !out.stdout.contains(BROWSER_SCRIPT_CONTENT_STDOUT_PREFIX), + "oversized-only images should not emit a media marker: {}", + out.stdout + ); +} + #[tokio::test] async fn default_artifact_dir_comes_from_tool_ctx_artifact_root() { let backend = Arc::new(FakeBackend::default()); diff --git a/crates/browser-use-agent/src/tools/handlers/done.rs b/crates/browser-use-agent/src/tools/handlers/done.rs index 267dfaa0..6b38c8cf 100644 --- a/crates/browser-use-agent/src/tools/handlers/done.rs +++ b/crates/browser-use-agent/src/tools/handlers/done.rs @@ -31,9 +31,10 @@ //! //! * **Tool name** — `done` (the completion tool key). Mirrors the codex/legacy //! completion/`done` tool the agent calls to declare it has finished. -//! * **Args** — `{ "text"?: string }`: an optional free-text final summary -//! message. Codex's completion carries the final assistant text; we model the -//! summary as the single optional `text` field (omittable on the wire). +//! * **Args** — `{ "result"?: string, "text"?: string, "result_file"?: string }`: +//! an optional user-facing final answer, a legacy `text` alias, and an optional +//! result file pointer. Codex's completion carries the final assistant text; +//! Browser Use prompts call this `result`, so both names are accepted. //! * **no approval / benign** — like `update_plan`, this is a pure state echo: it //! needs no approval and touches no sandbox. We leave //! 
[`exec_approval_requirement`](Approvable::exec_approval_requirement) at its @@ -64,14 +65,21 @@ pub const DONE_STDOUT_PREFIX: &str = "done:"; /// Typed request for the `done` tool. /// -/// `text` is the optional final summary message the model carries when it -/// declares the task finished. `#[serde(default)]` so it may be omitted on the -/// wire; skipped on serialize when `None` to keep the echoed JSON tidy. +/// `result` is the canonical final answer. `text` remains accepted for legacy +/// callers, and `result_file` can point at a persisted artifact when the answer +/// is intentionally file-backed. All fields are optional so the model may still +/// declare done with no message. #[derive(Clone, Debug, Default, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub struct DoneRequest { - /// The final summary message (optional). + /// Canonical user-facing final answer. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub result: Option, + /// Legacy final summary alias. #[serde(default, skip_serializing_if = "Option::is_none")] pub text: Option, + /// Optional relative or absolute result artifact path. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub result_file: Option, } impl DoneRequest { @@ -79,12 +87,49 @@ impl DoneRequest { pub fn with_text(text: impl Into) -> Self { Self { text: Some(text.into()), + ..Self::default() } } - /// The final summary message, trimmed; empty when no (or blank) text. - pub fn summary(&self) -> &str { - self.text.as_deref().map(str::trim).unwrap_or("") + /// Convenience constructor with the canonical final answer field. + pub fn with_result(result: impl Into) -> Self { + Self { + result: Some(result.into()), + ..Self::default() + } + } + + /// The user-facing final answer, trimmed. + /// + /// `result` wins over legacy `text`. If both are blank and only a + /// `result_file` was supplied, expose a compact file-pointer summary so the + /// host has a visible completion result. 
+ pub fn summary(&self) -> String { + if let Some(result) = self + .result + .as_deref() + .map(str::trim) + .filter(|value| !value.is_empty()) + { + return result.to_string(); + } + if let Some(text) = self + .text + .as_deref() + .map(str::trim) + .filter(|value| !value.is_empty()) + { + return text.to_string(); + } + if let Some(result_file) = self + .result_file + .as_deref() + .map(str::trim) + .filter(|value| !value.is_empty()) + { + return format!("Result file: {result_file}"); + } + String::new() } } @@ -106,7 +151,9 @@ impl DoneTool { /// so the key is rarely consulted; it exists to satisfy [`Approvable`] uniformly. #[derive(serde::Serialize, Clone, Debug, Eq, PartialEq, Hash)] pub struct DoneApprovalKey { + result: Option, text: Option, + result_file: Option, } impl Approvable for DoneTool { @@ -114,7 +161,9 @@ impl Approvable for DoneTool { fn approval_keys(&self, req: &DoneRequest) -> Vec { vec![DoneApprovalKey { + result: req.result.clone(), text: req.text.clone(), + result_file: req.result_file.clone(), }] } diff --git a/crates/browser-use-agent/src/tools/handlers/done_tests.rs b/crates/browser-use-agent/src/tools/handlers/done_tests.rs index 430e0267..198e493d 100644 --- a/crates/browser-use-agent/src/tools/handlers/done_tests.rs +++ b/crates/browser-use-agent/src/tools/handlers/done_tests.rs @@ -87,28 +87,56 @@ async fn done_without_text_yields_empty_summary() { assert_eq!(out.stdout, DONE_STDOUT_PREFIX); } -// ---- (3) the wire args deserialize from the model's `{ "text": ... }` ---- +// ---- (3) the wire args deserialize from Browser Use-style and legacy payloads ---- #[test] fn done_wire_args_round_trip() { - // Full form. - let req: DoneRequest = serde_json::from_value(serde_json::json!({ "text": "done now" })) + // Browser Use-style final result form. 
+ let req: DoneRequest = serde_json::from_value(serde_json::json!({ "result": "done now" })) .expect("done deserialize"); - assert_eq!(req.text.as_deref(), Some("done now")); + assert_eq!(req.result.as_deref(), Some("done now")); assert_eq!(req.summary(), "done now"); - // Minimal: `text` omitted -> None (the model may declare done with no message). + // Legacy `text` remains accepted. + let legacy: DoneRequest = serde_json::from_value(serde_json::json!({ "text": "legacy done" })) + .expect("legacy done deserialize"); + assert_eq!(legacy.text.as_deref(), Some("legacy done")); + assert_eq!(legacy.summary(), "legacy done"); + + // `result` wins if both canonical and legacy fields are present. + let both: DoneRequest = + serde_json::from_value(serde_json::json!({ "result": "canonical", "text": "legacy" })) + .expect("combined done deserialize"); + assert_eq!(both.summary(), "canonical"); + + // File-only completion still produces a visible host summary. + let file_only: DoneRequest = + serde_json::from_value(serde_json::json!({ "result_file": "outputs/answer.json" })) + .expect("file done deserialize"); + assert_eq!(file_only.summary(), "Result file: outputs/answer.json"); + + // Minimal: fields omitted -> None (the model may declare done with no message). let empty: DoneRequest = serde_json::from_value(serde_json::json!({})).expect("empty done deserialize"); + assert_eq!(empty.result, None); assert_eq!(empty.text, None); + assert_eq!(empty.result_file, None); assert_eq!(empty.summary(), ""); - // `text` is skipped on serialize when None. + // Empty fields are skipped on serialize. 
let json = serde_json::to_value(&DoneRequest::default()).unwrap(); + assert!( + json.get("result").is_none(), + "None result is skipped on serialize" + ); assert!( json.get("text").is_none(), "None text is skipped on serialize" ); + assert!( + json.get("result_file").is_none(), + "None result_file is skipped on serialize" + ); } // ---- (4) drive one call through the orchestrator over the seam ---- diff --git a/crates/browser-use-agent/src/tools/registry.rs b/crates/browser-use-agent/src/tools/registry.rs index 79f9c7e0..7a978632 100644 --- a/crates/browser-use-agent/src/tools/registry.rs +++ b/crates/browser-use-agent/src/tools/registry.rs @@ -1109,22 +1109,29 @@ to the single frame that proves the task succeeded." } /// `done`: the completion tool the model calls to declare the task finished, - /// carrying its final summary. Parity: codex/legacy completion (`done`) tool - /// (`{ "text"?: string }`). The handler's - /// [`DoneRequest`](crate::tools::handlers::done::DoneRequest) accepts an - /// optional `text` summary. + /// carrying its final answer. The handler accepts Browser Use-style + /// `{ "result"?: string, "result_file"?: string }` and the legacy + /// `{ "text"?: string }` alias. pub fn done() -> ToolDefinition { ToolDefinition { name: "done".to_string(), description: - "Signal that the task is finished, with an optional final summary message." + "Signal that the task is finished, carrying the complete user-facing final answer." .to_string(), input_schema: json!({ "type": "object", "properties": { + "result": { + "type": "string", + "description": "The complete final answer to show the user or evaluator. Include all requested data here when the task asks for inline JSON, CSV, markdown, a table, links, or a schema-shaped response." + }, "text": { "type": "string", - "description": "The final summary message describing what was accomplished." + "description": "Legacy alias for result. Prefer result for new calls." 
+ }, + "result_file": { + "type": "string", + "description": "Optional path to a saved final-result artifact when a file pointer satisfies the task or supplements the inline result." } }, "additionalProperties": false diff --git a/crates/browser-use-agent/src/tools/registry_tests.rs b/crates/browser-use-agent/src/tools/registry_tests.rs index 8efc4089..c63b0c9f 100644 --- a/crates/browser-use-agent/src/tools/registry_tests.rs +++ b/crates/browser-use-agent/src/tools/registry_tests.rs @@ -1096,7 +1096,7 @@ async fn done_dispatches_through_the_registry() { let out = reg .dispatch( "done", - &serde_json::json!({ "text": "task finished" }), + &serde_json::json!({ "result": "task finished" }), &ctx("done"), &env(), AskForApproval::Never, @@ -1114,4 +1114,14 @@ async fn done_dispatches_through_the_registry() { ); // done is serial (terminal). assert_eq!(reg.parallel_safe("done"), Some(false)); + + let done_def = reg + .model_visible_definitions() + .into_iter() + .find(|definition| definition.name == "done") + .expect("done definition"); + let properties = &done_def.input_schema["properties"]; + assert!(properties.get("result").is_some()); + assert!(properties.get("text").is_some()); + assert!(properties.get("result_file").is_some()); } diff --git a/crates/browser-use-agent/src/turn/loop_driver.rs b/crates/browser-use-agent/src/turn/loop_driver.rs index a4966368..7c6dae6d 100644 --- a/crates/browser-use-agent/src/turn/loop_driver.rs +++ b/crates/browser-use-agent/src/turn/loop_driver.rs @@ -79,8 +79,12 @@ use super::{CompactionMode, SamplingDriver, TurnObserver, TurnState}; use crate::decision::{self, LoopStep}; use crate::events::TurnCtx; use crate::task::{TurnAbortReason, TurnLifecycleEvent}; +use browser_use_llm::schema::{ContentPart, Message, MessageRole}; use tokio_util::sync::CancellationToken; +const FINAL_MAX_TURNS_NUDGE: &str = "This is the final allowed step for this run. Stop exploring and call the done tool with the best complete answer you can provide now. 
Include unknown or unavailable items explicitly instead of continuing to search."; +const PROGRESS_MAX_TURNS_NUDGE: &str = "Progress checkpoint: If you have enough evidence, a saved artifact, or a complete-enough answer, stop further exploration and call the done tool now. Continue only for clearly missing required information that is likely to change the final answer."; + /// The async, unbounded turn-loop driver. Generic over the three frozen turn /// traits so production wires real impls (`ContextManager`+`Session`, /// `ModelSamplingDriver`, a `StoreSink`-backed observer) while tests inject @@ -118,6 +122,33 @@ impl TurnLoop { ctx: TurnCtx, turn_has_fresh_input: bool, cancel: CancellationToken, + ) -> Result, crate::AgentError> { + self.run_inner(ctx, turn_has_fresh_input, cancel, None) + .await + } + + /// Run the driver with an optional sampling-round limit. + /// + /// Browser Use's Python API exposes this as `Agent.run(max_steps=...)`. + /// The default [`run`](Self::run) remains unbounded for Codex parity, while + /// terminal/browser-use bridge callers can opt into the cap. + pub async fn run_with_max_turns( + &self, + ctx: TurnCtx, + turn_has_fresh_input: bool, + cancel: CancellationToken, + max_turns: usize, + ) -> Result, crate::AgentError> { + self.run_inner(ctx, turn_has_fresh_input, cancel, Some(max_turns.max(1))) + .await + } + + async fn run_inner( + &self, + ctx: TurnCtx, + turn_has_fresh_input: bool, + cancel: CancellationToken, + max_turns: Option, ) -> Result, crate::AgentError> { let turn_id = ctx.session_id.clone(); self.observer.on_lifecycle(TurnLifecycleEvent::TurnStarted { @@ -128,6 +159,7 @@ impl TurnLoop { // drained; with no fresh input we may drain immediately. let mut can_drain = decision::initial_can_drain(turn_has_fresh_input); let mut last_agent_message: Option = None; + let mut turns_run = 0usize; // Unbounded (`turn.rs:214`): NO max-turns counter. The only exits are // Complete, cancellation, or a hard error. 
@@ -147,6 +179,18 @@ impl TurnLoop { // `ContextManager` history; the loop simply threads it through. let mut request = self.state.clone_history_for_prompt().await; request.extend(input); + let next_turn = turns_run + 1; + if max_turns.is_some_and(|limit| next_turn == limit) { + request.push(Message::new( + MessageRole::Developer, + vec![ContentPart::text(FINAL_MAX_TURNS_NUDGE)], + )); + } else if max_turns.is_some_and(|limit| should_emit_progress_nudge(limit, next_turn)) { + request.push(Message::new( + MessageRole::Developer, + vec![ContentPart::text(PROGRESS_MAX_TURNS_NUDGE)], + )); + } // ---- 2. run one sampling round-trip ---- let outcome = match self @@ -166,6 +210,7 @@ impl TurnLoop { } Err(other) => return Err(other), }; + turns_run += 1; // Carry the latest assistant text forward (codex keeps the last // non-empty agent message as the turn result; `turn.rs:340`). @@ -183,6 +228,23 @@ impl TurnLoop { // ---- 4. act on the step (codex `turn.rs:250-355`) ---- match step { + LoopStep::Complete => { + // Terminal: no follow-up needed and no compaction. Record the + // final agent message and break (`turn.rs:340-355`). + self.observer + .on_lifecycle(TurnLifecycleEvent::TurnComplete { + turn_id, + last_agent_message: last_agent_message.clone(), + }); + return Ok(last_agent_message); + } + _ if max_turns.is_some_and(|limit| turns_run >= limit) => { + self.observer.on_lifecycle(TurnLifecycleEvent::TurnAborted { + turn_id, + reason: TurnAbortReason::Interrupted, + }); + return Ok(last_agent_message); + } LoopStep::CompactThenContinue { can_drain_next } => { // Compact, then continue. The compaction BODY is a stub hook // (real model-based compaction WP pending); the CONTROL FLOW @@ -197,17 +259,14 @@ impl TurnLoop { // gate is always open (`turn.rs:250-255`). can_drain = true; } - LoopStep::Complete => { - // Terminal: no follow-up needed and no compaction. Record the - // final agent message and break (`turn.rs:340-355`). 
- self.observer - .on_lifecycle(TurnLifecycleEvent::TurnComplete { - turn_id, - last_agent_message: last_agent_message.clone(), - }); - return Ok(last_agent_message); - } } } } } + +fn should_emit_progress_nudge(max_turns: usize, next_turn: usize) -> bool { + if max_turns < 40 || next_turn >= max_turns { + return false; + } + next_turn >= max_turns / 2 && next_turn % 10 == 0 +} diff --git a/crates/browser-use-agent/src/turn/loop_tests.rs b/crates/browser-use-agent/src/turn/loop_tests.rs index 5c698db2..895e091f 100644 --- a/crates/browser-use-agent/src/turn/loop_tests.rs +++ b/crates/browser-use-agent/src/turn/loop_tests.rs @@ -28,7 +28,7 @@ use tokio_util::sync::CancellationToken; use crate::decision::{SamplingOutcome, TokenStatus}; use crate::events::TurnCtx; -use crate::task::TurnLifecycleEvent; +use crate::task::{TurnAbortReason, TurnLifecycleEvent}; use crate::turn::{SamplingDriver, TurnLoop, TurnObserver, TurnState}; use crate::AgentError; @@ -712,7 +712,90 @@ async fn loop_is_unbounded_fifty_iterations_complete() { assert_eq!(observer.kinds(), vec!["started", "complete"]); } -// ---- (7) a hard (non-abort) sampling error propagates out of the loop ------ +// ---- (7) bounded run stops after max_turns ------------------------------- + +#[tokio::test] +async fn bounded_loop_aborts_after_max_turns() { + let sampler = ScriptedSamplingDriver::new(vec![ + SamplingScript::Ok(follow_up("step 0")), + SamplingScript::Ok(follow_up("step 1")), + SamplingScript::Ok(complete("should not run")), + ]); + let requests = sampler.requests_handle(); + let inputs = sampler.inputs_handle(); + let state = InMemoryTurnState::new(Vec::new(), token_status(false)); + let observer = RecordingObserver::new(); + + let turn = TurnLoop::new(state, sampler, observer.clone()); + let out = turn + .run_with_max_turns(ctx(), false, CancellationToken::new(), 2) + .await + .expect("bounded loop should stop gracefully"); + + assert_eq!(requests.load(Ordering::SeqCst), 2); + 
assert_eq!(out.as_deref(), Some("step 1")); + let recorded_inputs = inputs.lock().unwrap(); + let Some(Message { + role: MessageRole::Developer, + content, + .. + }) = recorded_inputs[1].last() + else { + panic!("last bounded request should include final-step developer nudge"); + }; + assert!( + matches!(content.first(), Some(ContentPart::Text { text }) if text.contains("final allowed step")), + "final nudge should tell the agent to finish" + ); + assert_eq!(observer.kinds(), vec!["started", "aborted"]); + let events = observer.events.lock().unwrap(); + assert!(matches!( + events.last(), + Some(TurnLifecycleEvent::TurnAborted { + reason: TurnAbortReason::Interrupted, + .. + }) + )); +} + +#[tokio::test] +async fn bounded_loop_adds_progress_nudge_for_long_runs() { + let mut scripts: Vec = (0..50) + .map(|i| SamplingScript::Ok(follow_up(&format!("step {i}")))) + .collect(); + scripts.push(SamplingScript::Ok(complete("done after checkpoint"))); + + let sampler = ScriptedSamplingDriver::new(scripts); + let requests = sampler.requests_handle(); + let inputs = sampler.inputs_handle(); + let state = InMemoryTurnState::new(Vec::new(), token_status(false)); + let observer = RecordingObserver::new(); + + let turn = TurnLoop::new(state, sampler, observer.clone()); + let out = turn + .run_with_max_turns(ctx(), false, CancellationToken::new(), 100) + .await + .expect("bounded long run should complete"); + + assert_eq!(requests.load(Ordering::SeqCst), 51); + assert_eq!(out.as_deref(), Some("done after checkpoint")); + let recorded_inputs = inputs.lock().unwrap(); + let Some(Message { + role: MessageRole::Developer, + content, + .. 
+ }) = recorded_inputs[49].last() + else { + panic!("turn 50 should include the progress developer nudge"); + }; + assert!( + matches!(content.first(), Some(ContentPart::Text { text }) if text.contains("Progress checkpoint")), + "progress nudge should tell the agent to finalize once enough evidence exists" + ); + assert_eq!(observer.kinds(), vec!["started", "complete"]); +} + +// ---- (8) a hard (non-abort) sampling error propagates out of the loop ------ #[tokio::test] async fn hard_sampling_error_propagates_and_does_not_complete() { diff --git a/crates/browser-use-agent/src/turn/sampling.rs b/crates/browser-use-agent/src/turn/sampling.rs index 734bcbf8..93c3a2b0 100644 --- a/crates/browser-use-agent/src/turn/sampling.rs +++ b/crates/browser-use-agent/src/turn/sampling.rs @@ -54,12 +54,13 @@ use std::collections::HashMap; use std::future::Future; use std::pin::Pin; +use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; use std::time::Instant; use browser_use_llm::route::{ModelClient, Route}; use browser_use_llm::schema::{ - ContentPart, FinishReason, LlmError, LlmErrorReason, LlmEvent, LlmRequest, Message, + CacheHint, ContentPart, FinishReason, LlmError, LlmErrorReason, LlmEvent, LlmRequest, Message, MessageRole, SystemPart, TextPhase, Usage, }; use futures_util::{Stream, StreamExt}; @@ -343,6 +344,7 @@ pub struct ModelSamplingDriver< transport: T, sink: Arc, ctx: TurnCtx, + next_turn_idx: AtomicUsize, /// Retry budget (codex `provider.stream_max_retries()`). max_retries: u32, /// Whether to apply I/O-layer jitter to the post-decision backoff sleep. 
@@ -378,6 +380,7 @@ impl ModelSamplingDriver { Self { transport, sink, + next_turn_idx: AtomicUsize::new(ctx.turn_idx), ctx, max_retries, jitter: true, @@ -406,6 +409,7 @@ impl ModelSamplingDriver { ModelSamplingDriver { transport: self.transport, sink: self.sink, + next_turn_idx: self.next_turn_idx, ctx: self.ctx, max_retries: self.max_retries, jitter: self.jitter, @@ -442,22 +446,36 @@ impl ModelSamplingDriver { } /// Map an [`LlmEvent`] to UI events and emit them through the sink. - fn emit_event(&self, ev: &LlmEvent) { - for pending in events::map_llm_event(&self.ctx, ev) { + fn ctx_for_turn(&self, turn_idx: usize) -> TurnCtx { + let mut ctx = self.ctx.clone(); + ctx.turn_idx = turn_idx; + ctx + } + + fn emit_event(&self, ev: &LlmEvent, turn_idx: usize) { + let ctx = self.ctx_for_turn(turn_idx); + for pending in events::map_llm_event(&ctx, ev) { self.sink.emit(pending); } } - fn emit_turn_request(&self, attempt: u32, composition: &Value) { + fn emit_turn_request( + &self, + turn_idx: usize, + attempt: u32, + composition: &Value, + llm_input: &Value, + ) { self.sink.emit(PendingEvent::new( self.ctx.session_id.clone(), names::MODEL_TURN_REQUEST, serde_json::json!({ "model": &self.ctx.model, "provider": &self.ctx.provider, - "turn_idx": self.ctx.turn_idx, + "turn_idx": turn_idx, "attempt": attempt, "composition": composition, + "llm_input": llm_input, }), )); } @@ -552,9 +570,10 @@ impl ModelSamplingDriver { acc: &mut TurnAccumulator, ev: LlmEvent, started_at: Instant, + turn_idx: usize, ) -> Result { // Emit UI events first (map is pure; emit is the only side effect). - self.emit_event(&ev); + self.emit_event(&ev, turn_idx); match ev { LlmEvent::TextDelta { id, delta } => { let has_content = !delta.trim().is_empty(); @@ -700,18 +719,31 @@ fn calls_done_tool(tool_calls: &[ContentPart]) -> bool { /// The final summary carried by the model's `done` call, if any. 
/// -/// Reads the `text` field from the first `done` tool call's JSON arguments -/// (matching the `done` handler's `DoneRequest { text }`). Returns `None` when -/// there is no `done` call or it carried no (non-empty) summary, so the caller -/// only overrides the turn result when there is a real message to surface. +/// Reads the `result` field from the first `done` tool call's JSON arguments, +/// falling back to the legacy `text` alias and then to a compact `result_file` +/// pointer. Returns `None` when there is no `done` call or it carried no +/// non-empty completion payload, so the caller only overrides the turn result +/// when there is a real message to surface. fn done_summary(tool_calls: &[ContentPart]) -> Option { tool_calls.iter().find_map(|p| match p { - ContentPart::ToolCall { name, input, .. } if name == DONE_TOOL_NAME => input - .get("text") - .and_then(|t| t.as_str()) - .map(str::trim) - .filter(|s| !s.is_empty()) - .map(str::to_string), + ContentPart::ToolCall { name, input, .. } if name == DONE_TOOL_NAME => { + for field in ["result", "text"] { + if let Some(value) = input + .get(field) + .and_then(|value| value.as_str()) + .map(str::trim) + .filter(|value| !value.is_empty()) + { + return Some(value.to_string()); + } + } + input + .get("result_file") + .and_then(|value| value.as_str()) + .map(str::trim) + .filter(|value| !value.is_empty()) + .map(|path| format!("Result file: {path}")) + } _ => None, }) } @@ -860,6 +892,7 @@ impl SamplingDriver // the populated conversation, not an empty body. let input = self.input_with_goal_context(input); let mut req = build_request(&self.ctx, input); + let turn_idx = self.next_turn_idx.fetch_add(1, Ordering::Relaxed); // Advertise the tool catalog. 
When a dispatcher is attached (the fused // path), it carries the registry's model-visible definitions; we copy them // verbatim (order-stable) into `req.tools` so the model can actually emit @@ -874,9 +907,10 @@ impl SamplingDriver // exist here (they are never persisted as message events). Uses the same // byte->token estimator the agent uses elsewhere, so it stays consistent. let composition = request_composition(&req); + let llm_input = request_observability_input(&req); let mut attempt: u32 = 0; loop { - self.emit_turn_request(attempt, &composition); + self.emit_turn_request(turn_idx, attempt, &composition, &llm_input); // ---- open the stream (codex: `client.stream(&prompt).await`) ---- let mut stream = match self.transport.open_stream(&req) { Ok(s) => s, @@ -908,7 +942,7 @@ impl SamplingDriver match maybe_event { Some(Ok(ev)) => { let check_mailbox_preemption = checks_mailbox_preemption_after_event(&ev); - match self.consume_event(&mut acc, ev, started_at)? { + match self.consume_event(&mut acc, ev, started_at, turn_idx)? { StreamProgress::Continue => { if check_mailbox_preemption && self.has_mailbox_preemption().await { preempted_for_mailbox = true; @@ -1032,9 +1066,11 @@ impl SamplingDriver /// unit-reachable while the fused driver still advertises the catalog. 
fn build_request(ctx: &TurnCtx, input: Vec) -> LlmRequest { let mut req = LlmRequest::new(ctx.model.clone(), ctx.provider.clone()); - req.system - .push(SystemPart::new(ctx.base_instructions.clone())); + let mut base_system = SystemPart::new(ctx.base_instructions.clone()); + base_system.cache = Some(CacheHint::Ephemeral); + req.system.push(base_system); req.messages = input; + mark_message_cache_breakpoints(&mut req.messages); if let Some(instruction) = ctx.browser_mode_instruction.as_deref() { req.messages.insert( 0, @@ -1047,6 +1083,47 @@ fn build_request(ctx: &TurnCtx, input: Vec) -> LlmRequest { req } +fn mark_message_cache_breakpoints(messages: &mut [Message]) { + const LOOKBACK_TARGET_BLOCKS: usize = 16; + const MAX_MESSAGE_BREAKPOINTS: usize = 2; + + for message in messages.iter_mut() { + message.cache = None; + } + + let eligible: Vec<(usize, usize)> = messages + .iter() + .enumerate() + .filter(|(_, message)| { + !matches!(message.role, MessageRole::System | MessageRole::Developer) + }) + .map(|(index, message)| (index, message.content.len().max(1))) + .collect(); + let Some((last_index, _)) = eligible.last().copied() else { + return; + }; + + let mut selected = vec![last_index]; + let mut blocks_since_last = 0usize; + for (index, block_count) in eligible.into_iter().rev().skip(1) { + blocks_since_last = blocks_since_last.saturating_add(block_count); + if blocks_since_last >= LOOKBACK_TARGET_BLOCKS { + selected.push(index); + break; + } + } + selected.sort_unstable(); + selected.dedup(); + if selected.len() > MAX_MESSAGE_BREAKPOINTS { + selected.drain(0..selected.len() - MAX_MESSAGE_BREAKPOINTS); + } + for index in selected { + if let Some(message) = messages.get_mut(index) { + message.cache = Some(CacheHint::Ephemeral); + } + } +} + /// Token attribution for the per-turn request, computed from the REAL assembled /// [`LlmRequest`]. 
The system prompt and tool schemas are not message events, so /// this is the only place the `/context` view can learn their size. Counts use @@ -1075,3 +1152,61 @@ fn request_composition(req: &LlmRequest) -> Value { "tools": tools, }) } + +fn request_observability_input(req: &LlmRequest) -> Value { + let message_count = req.messages.len(); + let messages: Vec = req.messages.iter().map(observability_json_value).collect(); + let system: Vec = req.system.iter().map(observability_json_value).collect(); + let tools: Vec = req.tools.iter().map(observability_json_value).collect(); + + serde_json::json!({ + "system": system, + "messages": messages, + "tools": tools, + "tools_count": tools.len(), + "message_count": message_count, + "omitted_earlier_messages": 0, + "truncated": false, + }) +} + +fn observability_json_value(value: &T) -> Value { + serde_json::to_value(value) + .map(sanitize_observability_value) + .unwrap_or(Value::Null) +} + +fn sanitize_observability_value(value: Value) -> Value { + match value { + Value::Object(map) => { + let mut out = serde_json::Map::with_capacity(map.len()); + for (key, value) in map { + if is_observability_secret_key(&key) { + out.insert(key, Value::String("[redacted]".to_string())); + } else { + out.insert(key, sanitize_observability_value(value)); + } + } + Value::Object(out) + } + Value::Array(values) => Value::Array( + values + .into_iter() + .map(sanitize_observability_value) + .collect(), + ), + other => other, + } +} + +fn is_observability_secret_key(key: &str) -> bool { + let key = key.to_ascii_lowercase(); + key.contains("api_key") + || key.contains("apikey") + || key.contains("authorization") + || key.contains("auth_token") + || key.contains("password") + || key.contains("secret") + || key.contains("token") + || key.contains("cookie") +} diff --git a/crates/browser-use-agent/src/turn/sampling_tests.rs b/crates/browser-use-agent/src/turn/sampling_tests.rs index 8af86924..af3398ec 100644 --- 
a/crates/browser-use-agent/src/turn/sampling_tests.rs +++ b/crates/browser-use-agent/src/turn/sampling_tests.rs @@ -12,7 +12,7 @@ use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::{Arc, Mutex}; use browser_use_llm::schema::{ - ContentPart, FinishReason, LlmError, LlmErrorReason, LlmEvent, LlmRequest, Message, + CacheHint, ContentPart, FinishReason, LlmError, LlmErrorReason, LlmEvent, LlmRequest, Message, MessageRole, TextPhase, Usage, }; use browser_use_protocol::EventRecord; @@ -219,6 +219,15 @@ fn tool_call(name: &str) -> Result { }) } +fn tool_call_with_input(name: &str, input: serde_json::Value) -> Result { + Ok(LlmEvent::ToolCall { + id: "call-1".to_string(), + name: name.to_string(), + namespace: None, + input, + }) +} + fn finish(reason: FinishReason) -> Result { Ok(LlmEvent::Finish { usage: Usage { @@ -319,6 +328,40 @@ async fn finish_accounts_usage_only_when_goal_is_active() { ); } +#[tokio::test] +async fn repeated_sampling_requests_emit_monotonic_turn_indices() { + let (transport, _opens) = ScriptedTransport::new(vec![ + OpenScript::Stream(vec![finish(FinishReason::Stop)]), + OpenScript::Stream(vec![finish(FinishReason::Stop)]), + ]); + let sink = Arc::new(RecordingSink::default()); + let d = driver(transport, sink.clone(), 5); + + let _ = d + .run_sampling_request(user_input(), CancellationToken::new()) + .await + .expect("first sampling request should succeed"); + let _ = d + .run_sampling_request(user_input(), CancellationToken::new()) + .await + .expect("second sampling request should succeed"); + + let events = sink.drain(); + let turn_request_indices: Vec = events + .iter() + .filter(|event| event.event_type == names::MODEL_TURN_REQUEST) + .map(|event| event.payload["turn_idx"].as_i64().expect("turn_idx")) + .collect(); + let token_count_indices: Vec = events + .iter() + .filter(|event| event.event_type == names::TOKEN_COUNT) + .map(|event| event.payload["turn_idx"].as_i64().expect("turn_idx")) + .collect(); + + 
assert_eq!(turn_request_indices, vec![0, 1]); + assert_eq!(token_count_indices, vec![0, 1]); +} + #[tokio::test] async fn active_goal_context_is_injected_with_codex_envelope() { let (transport, seen) = @@ -585,9 +628,11 @@ async fn driver_passes_populated_per_call_request_to_open_stream() { ); // And it must be EXACTLY the input the driver was asked to sample, with the // turn's model/provider identity from `ctx()`. + let mut expected_messages = input.clone(); + expected_messages.last_mut().unwrap().cache = Some(CacheHint::Ephemeral); assert_eq!( - req.messages, input, - "open_stream must receive the driver's per-call input messages verbatim" + req.messages, expected_messages, + "open_stream must receive the driver's per-call input messages with the current-state cache hint" ); // `req.model`/`req.provider` are the `ModelId`/`ProviderId` newtypes; compare // against the same `.into()` conversion `LlmRequest::new` applies to `ctx()`. @@ -601,6 +646,167 @@ async fn driver_passes_populated_per_call_request_to_open_stream() { ctx().provider.into(), "request carries the turn's provider" ); + assert_eq!( + req.system.first().and_then(|part| part.cache), + Some(CacheHint::Ephemeral), + "stable base system prompt should be cacheable for providers that support prompt caching" + ); + assert_eq!( + req.messages.last().and_then(|message| message.cache), + Some(CacheHint::Ephemeral), + "latest browser-state message should be cacheable like the Python Anthropic serializer" + ); +} + +#[tokio::test] +async fn open_stream_marks_an_earlier_cache_breakpoint_for_long_histories() { + let (transport, seen) = + RecordingTransport::new(vec![text_delta("ok"), finish(FinishReason::Stop)]); + let sink: Arc = Arc::new(RecordingSink::default()); + let d = ModelSamplingDriver::new(transport, sink, ctx(), 5).without_jitter(); + + let input: Vec = (0..25) + .map(|index| { + Message::new( + MessageRole::User, + vec![ContentPart::text(format!("browser state {index}"))], + ) + }) + .collect(); + 
let _ = d + .run_sampling_request(input, CancellationToken::new()) + .await + .expect("sampling should succeed"); + + let captured = seen.lock().unwrap(); + let req = &captured[0]; + let cache_indices: Vec = req + .messages + .iter() + .enumerate() + .filter_map(|(index, message)| { + (message.cache == Some(CacheHint::Ephemeral)).then_some(index) + }) + .collect(); + + assert_eq!( + cache_indices, + vec![8, 24], + "long browser histories should keep the latest message cacheable and add one earlier breakpoint inside Anthropic's lookback window" + ); +} + +#[tokio::test] +async fn turn_request_event_carries_full_llm_input_messages() { + let (transport, _opens) = + ScriptedTransport::new(vec![OpenScript::Stream(vec![finish(FinishReason::Stop)])]); + let sink = Arc::new(RecordingSink::default()); + let d = driver(transport, sink.clone(), 5); + + let input = vec![ + Message::new( + MessageRole::User, + vec![ + ContentPart::text("Find the account page."), + ContentPart::Media { + mime_type: "image/png".to_string(), + data: Some("iVBORw0KGgoAAAANSUhEUgAAAAEAAAAB".to_string()), + url: None, + detail: Some("low".to_string()), + }, + ], + ), + Message::new( + MessageRole::Assistant, + vec![ContentPart::ToolCall { + id: "call-1".to_string(), + name: "browser_script".to_string(), + input: serde_json::json!({ + "code": "goto_url('https://example.com')", + "api_key": "secret-value", + }), + provider_metadata: None, + }], + ), + ]; + + let _ = d + .run_sampling_request(input, CancellationToken::new()) + .await + .expect("sampling should succeed"); + + let events = sink.drain(); + let request = events + .iter() + .find(|event| event.event_type == names::MODEL_TURN_REQUEST) + .expect("turn request event emitted"); + let llm_input = &request.payload["llm_input"]; + assert_eq!(llm_input["message_count"], serde_json::json!(2)); + assert_eq!(llm_input["omitted_earlier_messages"], serde_json::json!(0)); + assert_eq!( + llm_input["messages"][0]["content"][0]["text"], + 
serde_json::json!("Find the account page.") + ); + assert_eq!( + llm_input["messages"][0]["content"][1]["data"], + serde_json::json!("iVBORw0KGgoAAAANSUhEUgAAAAEAAAAB") + ); + assert_eq!( + llm_input["messages"][1]["content"][0]["input"]["api_key"], + serde_json::json!("[redacted]") + ); + assert_eq!(llm_input["truncated"], serde_json::json!(false)); + assert!(!llm_input["system"][0]["text"] + .as_str() + .unwrap_or_default() + .is_empty()); +} + +#[tokio::test] +async fn turn_request_event_carries_all_observability_messages_without_text_budget() { + let (transport, _opens) = + ScriptedTransport::new(vec![OpenScript::Stream(vec![finish(FinishReason::Stop)])]); + let sink = Arc::new(RecordingSink::default()); + let d = driver(transport, sink.clone(), 5); + + let long_text = "observe-this-text".repeat(6_000); + let mut input: Vec = (0..85) + .map(|index| { + Message::new( + MessageRole::User, + vec![ContentPart::text(format!("msg-{index}"))], + ) + }) + .collect(); + input.push(Message::new( + MessageRole::User, + vec![ContentPart::text(long_text.clone())], + )); + + let _ = d + .run_sampling_request(input, CancellationToken::new()) + .await + .expect("sampling should succeed"); + + let events = sink.drain(); + let request = events + .iter() + .find(|event| event.event_type == names::MODEL_TURN_REQUEST) + .expect("turn request event emitted"); + let llm_input = &request.payload["llm_input"]; + let messages = llm_input["messages"].as_array().expect("messages array"); + assert_eq!(llm_input["message_count"], serde_json::json!(86)); + assert_eq!(llm_input["omitted_earlier_messages"], serde_json::json!(0)); + assert_eq!(messages.len(), 86); + assert_eq!( + messages[85]["content"][0]["text"], + serde_json::json!(long_text) + ); + assert_eq!(llm_input["truncated"], serde_json::json!(false)); + + let serialized = serde_json::to_string(llm_input).expect("llm_input serializes"); + assert!(!serialized.contains("request observability text budget exhausted")); + 
assert!(!serialized.contains("...[truncated]")); } #[tokio::test] @@ -751,7 +957,7 @@ impl crate::turn::sampling::FusionRecorder for NoopRecorder { fn tool_def(name: &str) -> browser_use_llm::schema::ToolDefinition { browser_use_llm::schema::ToolDefinition { name: name.to_string(), - description: String::new(), + description: format!("{name} model-visible tool description"), input_schema: serde_json::json!({"type": "object"}), output_schema: None, namespace: None, @@ -789,9 +995,10 @@ async fn fused_driver_advertises_dispatcher_tool_specs_on_request() { // about the request the driver built. let (transport, seen) = RecordingTransport::new(vec![text_delta("ok"), finish(FinishReason::Stop)]); - let sink: Arc = Arc::new(RecordingSink::default()); + let sink = Arc::new(RecordingSink::default()); + let sink_for_driver: Arc = sink.clone(); let recorder: Arc = Arc::new(NoopRecorder); - let d = ModelSamplingDriver::new(transport, sink, ctx(), 5) + let d = ModelSamplingDriver::new(transport, sink_for_driver, ctx(), 5) .without_jitter() .with_fusion(dispatcher, recorder); @@ -813,12 +1020,79 @@ async fn fused_driver_advertises_dispatcher_tool_specs_on_request() { !req.tools.is_empty(), "fused driver must advertise the dispatcher's tool specs — req.tools is EMPTY" ); - let names: Vec<&str> = req.tools.iter().map(|t| t.name.as_str()).collect(); + let tool_names: Vec<&str> = req.tools.iter().map(|t| t.name.as_str()).collect(); assert_eq!( - names, + tool_names, vec!["browser", "python", "shell"], "req.tools must carry the registered tool names, in the registry's order" ); + + let events = sink.drain(); + let request = events + .iter() + .find(|event| event.event_type == names::MODEL_TURN_REQUEST) + .expect("turn request event emitted"); + let llm_tools = request.payload["llm_input"]["tools"] + .as_array() + .expect("llm_input tools array"); + assert_eq!( + request.payload["llm_input"]["tools_count"], + serde_json::json!(3) + ); + assert_eq!(llm_tools[0]["name"], 
serde_json::json!("browser")); + assert_eq!( + llm_tools[0]["description"], + serde_json::json!("browser model-visible tool description") + ); + assert_eq!( + llm_tools[0]["input_schema"], + serde_json::json!({"type": "object"}) + ); +} + +#[tokio::test] +async fn fused_done_result_becomes_final_message_without_follow_up() { + use crate::turn::dispatch::ToolDispatcher; + use crate::turn::sampling::FusionRecorder; + + let specs = vec![tool_def("done")]; + let dispatcher = Arc::new(ToolDispatcher::with_runner_and_specs( + NoopRunner, /* model_supports */ true, specs, + )); + let (transport, _opens) = ScriptedTransport::new(vec![OpenScript::Stream(vec![ + tool_call_with_input( + "done", + serde_json::json!({ + "result": "full table answer", + "text": "legacy summary" + }), + ), + finish(FinishReason::ToolUse), + ])]); + let sink: Arc = Arc::new(RecordingSink::default()); + let recorder: Arc = Arc::new(NoopRecorder); + let d = ModelSamplingDriver::new(transport, sink, ctx(), 5) + .without_jitter() + .with_fusion(dispatcher, recorder); + + let out = d + .run_sampling_request(user_input(), CancellationToken::new()) + .await + .expect("sampling should succeed"); + + assert!( + !out.model_needs_follow_up, + "done must terminate the fused turn instead of requesting another sample" + ); + assert_eq!( + out.last_agent_message.as_deref(), + Some("full table answer"), + "canonical done.result must be surfaced over the legacy text alias" + ); + assert!( + out.defers_mailbox_delivery_to_next_turn, + "terminal done output is the final-answer boundary" + ); } #[tokio::test] diff --git a/crates/browser-use-browser/src/browser_script_helpers.py b/crates/browser-use-browser/src/browser_script_helpers.py index 979cf892..4e95ad0d 100644 --- a/crates/browser-use-browser/src/browser_script_helpers.py +++ b/crates/browser-use-browser/src/browser_script_helpers.py @@ -12,9 +12,11 @@ import os import pathlib import sys +import threading import time as _time import urllib.error import 
urllib.request +from concurrent.futures import ThreadPoolExecutor, as_completed from urllib.parse import urlparse @@ -23,13 +25,50 @@ __last_domain_skills = [] +_bridge_call_lock = threading.RLock() +_TRANSIENT_BRIDGE_ERRORS = ( + "browser is not connected or is busy", + "browser session is busy", + "browser bridge closed before response", + "cdp runtime.evaluate timed out", + "runtime.evaluate timed out", + "temporarily unavailable", +) + + +def _is_transient_bridge_error(exc): + message = str(exc).lower() + return any(part in message for part in _TRANSIENT_BRIDGE_ERRORS) + + +def _bridge_with_retry(payload, *, attempts=4): + delay = 0.25 + last_exc = None + for attempt in range(attempts): + try: + with _bridge_call_lock: + return _bridge(payload) + except (OSError, TimeoutError, RuntimeError) as exc: + last_exc = exc + if attempt + 1 >= attempts or not _is_transient_bridge_error(exc): + raise + print( + f"browser_script bridge retry {attempt + 2}/{attempts} after transient error: {exc}", + file=sys.stderr, + flush=True, + ) + _time.sleep(delay) + delay = min(delay * 2, 2.0) + raise last_exc + + def _send_meta(meta, **params): - return _bridge({"kind": "meta", "meta": meta, **params}) + return _bridge_with_retry({"kind": "meta", "meta": meta, **params}) def cdp(method, session_id=None, **params): """Raw CDP. 
Example: cdp("Page.navigate", url="https://example.com").""" - return _bridge({"kind": "cdp", "method": method, "session_id": session_id, "params": params}) + return _bridge_with_retry({"kind": "cdp", "method": method, "session_id": session_id, "params": params}) def cdp_batch(calls): @@ -386,6 +425,10 @@ def goto_url(url): def page_info(): """Return url, title, viewport, scroll position, page size, and target info.""" + try: + ensure_real_tab() + except Exception: + pass dialog = _send_meta("pending_dialog").get("dialog") if dialog: return {"dialog": dialog} @@ -563,10 +606,21 @@ def _timeout_seconds(timeout): def wait_for_load(timeout=3.0): timeout = _timeout_seconds(timeout) deadline = _time.time() + timeout + interactive_since = None while _time.time() < deadline: try: - if js("document.readyState") == "complete": + state = js("document.readyState") + if state == "complete": return True + if state == "interactive": + has_body = js("!!document.body && !!location.href && !location.href.startsWith('about:')") + if has_body: + if interactive_since is None: + interactive_since = _time.time() + if _time.time() - interactive_since >= 1.0: + return True + else: + interactive_since = None except Exception: pass _time.sleep(0.3) @@ -630,9 +684,53 @@ def _write_b64_artifact(label, data_b64, suffix=".png", mime_type="image/png"): return str(path) +def _positive_int_env(names, default=None): + for name in names: + raw = os.environ.get(name) + if raw is None: + continue + try: + value = int(str(raw).strip()) + except ValueError: + continue + if value > 0: + return value + if value == 0: + return None + return default + + +def _screenshot_max_dim(max_dim): + if max_dim is not None: + try: + value = int(max_dim) + except (TypeError, ValueError): + return None + return value if value > 0 else None + return _positive_int_env(("BU_BROWSER_SCREENSHOT_MAX_DIM", "BROWSER_USE_SCREENSHOT_MAX_DIM"), 7600) + + +def _downscale_image_artifact(path, max_dim): + if not max_dim: + return 
None + try: + from PIL import Image + + img = Image.open(path) + original_size = img.size + if max(original_size) > max_dim: + img.thumbnail((max_dim, max_dim)) + img.save(path) + return {"width": img.size[0], "height": img.size[1], "downscaled": True, "original_size": original_size} + return {"width": original_size[0], "height": original_size[1], "downscaled": False} + except Exception: + return None + + def capture_screenshot(label="screenshot", full=False, attach=True, max_dim=None, **kwargs): """Save a PNG of the current viewport and return its local artifact path.""" try: + ensure_real_tab() target_id = (current_tab() or {}).get("targetId") if target_id: cdp("Target.activateTarget", session_id=None, targetId=target_id) @@ -647,27 +745,33 @@ def capture_screenshot(label="screenshot", full=False, attach=True, max_dim=None if full: params["captureBeyondViewport"] = True params.update(kwargs) - result = cdp("Page.captureScreenshot", **params) + last_error = None + for attempt in range(3): + try: + result = cdp("Page.captureScreenshot", **params) + break + except Exception as exc: + last_error = exc + if attempt == 2: + raise + _time.sleep(0.35 * (attempt + 1)) + else: + raise last_error if not attach: return result path = _write_b64_artifact(label, result["data"], ".png", "image/png") - if max_dim: - try: - from PIL import Image - - img = Image.open(path) - if max(img.size) > max_dim: - img.thumbnail((max_dim, max_dim)) - img.save(path) - except Exception: - pass + image_info = _downscale_image_artifact(path, _screenshot_max_dim(max_dim)) + if image_info and __images: + __images[-1].update(image_info) + if image_info and __artifacts: + __artifacts[-1].update({key: image_info[key] for key in ("width", "height") if key in image_info}) return path def note(caption): """Mark the current moment as important for the recording, with a short human-readable caption (e.g. note("Delta $209 - cheapest fare details")). 
- Cheap: it just timestamps a caption; the 2fps session capture already has the + Cheap: it just timestamps a caption; when enabled, session capture already has the frame. Call it at each meaningful step so the end-of-run highlight GIF can be captioned. Returns the recorded note.""" record = {"ts_ms": int(_time.time() * 1000), "caption": str(caption)} @@ -992,3 +1096,240 @@ def http_get(url, headers=None, timeout=20.0, binary=None): raise RuntimeError( f"http_get failed for {url}: {exc}. Try a shorter timeout, browser js(fetch(...)), or a configured proxy if the site blocks direct HTTP." ) from exc + + +def http_get_many(urls, headers=None, timeout=20.0, binary=None, max_workers=8, return_errors=True): + """Fetch many independent URLs with http_get while preserving input order. + + By default one failed URL becomes {"ok": False, "url": ..., "error": ...} + instead of failing the whole batch. Set return_errors=False when every URL is + required and the caller should abort on the first failure. 
+ """ + items = list(urls) + if not items: + return [] + workers = max(1, min(int(max_workers or 1), len(items))) + results = [None] * len(items) + + def fetch_one(index, item): + if isinstance(item, dict): + request_url = item["url"] + request_headers = dict(headers or {}) + request_headers.update(item.get("headers") or {}) + request_timeout = item.get("timeout", timeout) + request_binary = item.get("binary", binary) + else: + request_url = str(item) + request_headers = headers + request_timeout = timeout + request_binary = binary + return index, request_url, http_get( + request_url, + headers=request_headers, + timeout=request_timeout, + binary=request_binary, + ) + + with ThreadPoolExecutor(max_workers=workers) as pool: + futures = [pool.submit(fetch_one, index, item) for index, item in enumerate(items)] + for future in as_completed(futures): + try: + index, _url, response = future.result() + results[index] = response + except Exception as exc: + index = futures.index(future) + item = items[index] + request_url = item.get("url") if isinstance(item, dict) else str(item) + if not return_errors: + raise + results[index] = {"ok": False, "url": request_url, "error": str(exc)} + return results + + +def _normalize_browser_fetch_request( + url, + method="GET", + headers=None, + body=None, + json_body=None, + timeout=20.0, + binary=None, +): + request_headers = dict(headers or {}) + request_body = body + if json_body is not None: + request_body = json.dumps(json_body) + if not any(k.lower() == "content-type" for k in request_headers): + request_headers["Content-Type"] = "application/json" + if isinstance(request_body, (dict, list)): + request_body = json.dumps(request_body) + if not any(k.lower() == "content-type" for k in request_headers): + request_headers["Content-Type"] = "application/json" + if isinstance(request_body, bytes): + request_body = request_body.decode("latin1") + return { + "url": str(url), + "method": str(method or "GET").upper(), + "headers": 
request_headers, + "body": request_body, + "timeout_ms": int(float(timeout) * 1000), + "binary": bool(binary), + } + + +def _browser_fetch_response(result, return_error=False): + if not isinstance(result, dict): + if return_error: + return {"ok": False, "url": None, "error": f"invalid browser_fetch result: {result!r}"} + raise RuntimeError(f"invalid browser_fetch result: {result!r}") + if not result.get("ok"): + if return_error: + return { + "ok": False, + "url": result.get("url"), + "error": result.get("error", "browser_fetch failed"), + } + raise RuntimeError(f"browser_fetch failed for {result.get('url')}: {result.get('error')}") + headers = result.get("headers") or {} + status = result.get("status") + url = result.get("url") + if result.get("binary"): + body = base64.b64decode(result.get("body_b64") or "") + return _HttpGetBytes(body, status, headers, url) + return _HttpGetText(result.get("body") or "", status, headers, url) + + +def browser_fetch( + url, + method="GET", + headers=None, + body=None, + json_body=None, + timeout=20.0, + binary=None, + return_error=True, +): + """Fetch from the current page context with browser cookies/session state. + + By default a failed page-context fetch returns + {"ok": False, "url": ..., "error": ...} instead of failing the entire + browser_script call. Pass return_error=False when the caller wants a hard + exception for required URLs. + """ + request = _normalize_browser_fetch_request( + url, + method=method, + headers=headers, + body=body, + json_body=json_body, + timeout=timeout, + binary=binary, + ) + return browser_fetch_many([request], timeout=timeout, return_errors=return_error)[0] + + +def browser_fetch_many(requests, timeout=20.0, max_concurrency=6, return_errors=True): + """Fetch many URLs from the current page context, preserving order. + + Each item may be a URL string or a dict with url/method/headers/body/json_body/ + timeout/binary. 
This is useful after the page reveals stable endpoints but + direct http_get lacks cookies, auth headers, or browser-only access. + """ + normalized = [] + for item in list(requests): + if isinstance(item, dict): + normalized.append( + _normalize_browser_fetch_request( + item["url"], + method=item.get("method", "GET"), + headers=item.get("headers"), + body=item.get("body"), + json_body=item.get("json_body"), + timeout=item.get("timeout", timeout), + binary=item.get("binary"), + ) + ) + else: + normalized.append(_normalize_browser_fetch_request(item, timeout=timeout)) + if not normalized: + return [] + + expression = f""" +(async () => {{ + const requests = {json.dumps(normalized)}; + const maxConcurrency = Math.max(1, Math.min({int(max_concurrency or 1)}, requests.length)); + function arrayBufferToBase64(buffer) {{ + const bytes = new Uint8Array(buffer); + let binary = ""; + const chunkSize = 0x8000; + for (let i = 0; i < bytes.length; i += chunkSize) {{ + const chunk = bytes.subarray(i, i + chunkSize); + binary += String.fromCharCode.apply(null, chunk); + }} + return btoa(binary); + }} + async function fetchOne(request) {{ + const controller = new AbortController(); + const timeoutMs = Math.max(1, Number(request.timeout_ms || 20000)); + const timer = setTimeout(() => controller.abort(), timeoutMs); + try {{ + const options = {{ + method: request.method || "GET", + headers: request.headers || {{}}, + credentials: "include", + signal: controller.signal + }}; + if (request.body !== null && request.body !== undefined) {{ + options.body = request.body; + }} + const response = await fetch(request.url, options); + const headers = {{}}; + response.headers.forEach((value, key) => {{ headers[key] = value; }}); + if (request.binary) {{ + const buffer = await response.arrayBuffer(); + return {{ + ok: true, + response_ok: response.ok, + status: response.status, + statusText: response.statusText, + url: response.url, + headers, + binary: true, + body_b64: 
arrayBufferToBase64(buffer) + }}; + }} + const body = await response.text(); + return {{ + ok: true, + response_ok: response.ok, + status: response.status, + statusText: response.statusText, + url: response.url, + headers, + binary: false, + body + }}; + }} catch (error) {{ + return {{ + ok: false, + url: request.url, + error: String(error && (error.message || error)) + }}; + }} finally {{ + clearTimeout(timer); + }} + }} + const results = new Array(requests.length); + let next = 0; + async function worker() {{ + while (next < requests.length) {{ + const index = next++; + results[index] = await fetchOne(requests[index]); + }} + }} + await Promise.all(Array.from({{length: maxConcurrency}}, worker)); + return results; +}})() +""" + raw_results = _runtime_evaluate(expression, await_promise=True, return_by_value=True) + return [_browser_fetch_response(result, return_error=return_errors) for result in raw_results] diff --git a/crates/browser-use-browser/src/lib.rs b/crates/browser-use-browser/src/lib.rs index fd54270d..8d8e12ac 100644 --- a/crates/browser-use-browser/src/lib.rs +++ b/crates/browser-use-browser/src/lib.rs @@ -28,7 +28,7 @@ use tungstenite::{connect, Message, WebSocket}; const BU_API: &str = "https://api.browser-use.com/api/v3"; const LOG_LIMIT: usize = 250; const SCRIPT_MAX_OUTPUT_CHARS: usize = 120_000; -const BROWSER_SCRIPT_INITIAL_WAIT_MS: u64 = 750; +const BROWSER_SCRIPT_DEFAULT_INITIAL_WAIT_MS: u64 = 7_000; const BROWSER_SCRIPT_DEFAULT_OBSERVE_MS: u64 = 1_000; const BROWSER_SCRIPT_HELPERS: &str = include_str!("browser_script_helpers.py"); const BROWSER_CONNECT_LOCAL_HANDSHAKE_TIMEOUT: Duration = Duration::from_secs(120); @@ -196,6 +196,7 @@ struct BrowserSession { last_target_id: Option, last_session_id: Option, last_emitted_browser_payload: Option, + browser_profile_runtime: BrowserProfileRuntimeState, preferred_target_marker: Option, preferred_profile_id: Option, active_local_profile_id: Option, @@ -203,6 +204,11 @@ struct BrowserSession { logs: 
VecDeque, } +#[derive(Default)] +struct BrowserProfileRuntimeState { + applied_setup_keys: HashSet, +} + impl Default for BrowserSession { fn default() -> Self { Self { @@ -224,6 +230,7 @@ impl Default for BrowserSession { last_target_id: None, last_session_id: None, last_emitted_browser_payload: None, + browser_profile_runtime: BrowserProfileRuntimeState::default(), preferred_target_marker: None, preferred_profile_id: None, active_local_profile_id: None, @@ -558,6 +565,97 @@ fn active_browser_script_next_step(active_scripts: &Value) -> Option { .map(ToOwned::to_owned) } +fn is_browser_recovery_command(argv: &[String]) -> bool { + argv.first().map(String::as_str) == Some("recover") +} + +fn busy_recovery_status_json( + session_id: &str, + argv: &[String], + mut status: Value, + script_registry: &BrowserScriptRunRegistry, +) -> Value { + let requested_command = format!("browser {}", argv.join(" ")); + let live_active_scripts = + active_browser_script_runs_json_with_registry(session_id, script_registry); + let active_scripts = if live_active_scripts + .as_array() + .is_some_and(|scripts| !scripts.is_empty()) + { + live_active_scripts + } else { + status + .get("active_scripts") + .cloned() + .unwrap_or(live_active_scripts) + }; + let next_step = busy_recovery_next_step(&active_scripts, &requested_command); + + if let Some(object) = status.as_object_mut() { + object.insert("status".to_string(), Value::String("busy".to_string())); + object.insert("busy".to_string(), Value::Bool(true)); + object.insert("recovery_deferred".to_string(), Value::Bool(true)); + object.insert( + "reason".to_string(), + Value::String( + "Browser recovery was requested while an active browser_script owned the browser session." 
+ .to_string(), + ), + ); + object.insert( + "requested_command".to_string(), + Value::String(requested_command.clone()), + ); + object.insert("active_scripts".to_string(), active_scripts); + object.insert("next_step".to_string(), Value::String(next_step.clone())); + object.insert( + "model_instruction".to_string(), + Value::String(format!( + "The browser session is busy, not failed. Follow next_step, then retry {requested_command}." + )), + ); + return status; + } + + json!({ + "status": "busy", + "busy": true, + "recovery_deferred": true, + "reason": "Browser recovery was requested while an active browser_script owned the browser session.", + "requested_command": requested_command, + "active_scripts": active_scripts, + "next_step": next_step, + "model_instruction": format!( + "The browser session is busy, not failed. Follow next_step, then retry {requested_command}." + ), + }) +} + +fn busy_recovery_next_step(active_scripts: &Value, requested_command: &str) -> String { + let Some(script) = active_scripts + .as_array() + .and_then(|scripts| scripts.first()) + else { + return format!( + "Wait for the in-flight browser_script to return, run browser status --json, then retry {requested_command}." + ); + }; + let Some(run_id) = script.get("run_id").and_then(Value::as_str) else { + return format!( + "Wait for the in-flight browser_script to return, run browser status --json, then retry {requested_command}." 
+ ); + }; + let status = script + .get("status") + .and_then(Value::as_str) + .unwrap_or("running"); + if matches!(status, "finished" | "timed_out") { + format!("browser_script action=observe run_id={run_id}; then retry {requested_command}.") + } else { + format!("browser_script action=observe run_id={run_id}; if it is still stuck without progress, browser_script action=cancel run_id={run_id}; then retry {requested_command}.") + } +} + pub fn run_browser_command( session_id: &str, cwd: impl AsRef, @@ -658,6 +756,12 @@ pub fn run_browser_command_with_options_and_registries( content, }); } + if is_browser_recovery_command(&argv) { + return Ok(BrowserCommandOutput { + events: Vec::new(), + content: busy_recovery_status_json(session_id, &argv, content, script_registry), + }); + } bail!( "browser session is busy with an active browser_script; observe or cancel that script before running browser {}", argv.join(" ") @@ -789,7 +893,7 @@ pub fn start_browser_script_with_registries( timeout_seconds, session_registry, )?; - let initial_deadline = Instant::now() + Duration::from_millis(BROWSER_SCRIPT_INITIAL_WAIT_MS); + let initial_deadline = Instant::now() + Duration::from_millis(browser_script_initial_wait_ms()); loop { if run.child.try_wait()?.is_some() { return finish_browser_script_run(run, false); @@ -838,6 +942,22 @@ pub fn start_browser_script_with_registries( } } +fn browser_script_initial_wait_ms() -> u64 { + [ + "BU_BROWSER_SCRIPT_INITIAL_WAIT_MS", + "BROWSER_SCRIPT_INITIAL_WAIT_MS", + ] + .iter() + .find_map(|name| { + std::env::var(name) + .ok() + .and_then(|value| value.trim().parse::().ok()) + .filter(|value| *value > 0) + }) + .map(|value| value.clamp(250, 30_000)) + .unwrap_or(BROWSER_SCRIPT_DEFAULT_INITIAL_WAIT_MS) +} + pub fn observe_browser_script( session_id: &str, run_id: &str, @@ -1019,6 +1139,16 @@ fn spawn_browser_script_with_session_registry( code, )?; let mut command = browser_script_python_command(); + if browser_script_session_outputs_enabled() { + 
let outputs_dir = artifact_dir.as_ref().join("outputs"); + fs::create_dir_all(&outputs_dir).with_context(|| { + format!( + "create browser_script outputs dir {}", + outputs_dir.display() + ) + })?; + command.env("BH_OUTPUTS_DIR", outputs_dir); + } let mut child = command .arg("-c") .arg(prelude) @@ -1459,6 +1589,10 @@ fn nonempty_os_var(name: &str) -> Option { std::env::var_os(name).filter(|value| !value.is_empty()) } +fn browser_script_session_outputs_enabled() -> bool { + env_bool("BU_BROWSER_SCRIPT_SESSION_OUTPUTS").unwrap_or(false) +} + fn venv_python_path(venv: &Path) -> PathBuf { #[cfg(windows)] { @@ -1954,7 +2088,9 @@ fn dispatch_connect(session: &mut BrowserSession, argv: &[String]) -> Result ManagedProfile::Temp, Some(path) => ManagedProfile::Path(PathBuf::from(path)), }; - let extra_args = option_values(argv, "--arg"); + let profile = managed_browser_profile_from_env(profile); + let mut extra_args = option_values(argv, "--arg"); + extra_args.extend(managed_browser_extra_args_from_env()); session.connect_managed(headless, profile, extra_args) } Some("remote-cdp") => { @@ -1971,6 +2107,358 @@ fn dispatch_connect(session: &mut BrowserSession, argv: &[String]) -> Result, + params: Value, +} + +fn env_trimmed(name: &str) -> Option { + std::env::var(name) + .ok() + .map(|value| value.trim().to_string()) + .filter(|value| !value.is_empty()) +} + +fn env_bool(name: &str) -> Option { + match env_trimmed(name)?.to_ascii_lowercase().as_str() { + "1" | "true" | "yes" | "on" => Some(true), + "0" | "false" | "no" | "off" => Some(false), + _ => None, + } +} + +fn env_json_string_list(name: &str) -> Vec { + let Some(raw) = env_trimmed(name) else { + return Vec::new(); + }; + let Ok(value) = serde_json::from_str::(&raw) else { + return Vec::new(); + }; + let Some(items) = value.as_array() else { + return Vec::new(); + }; + let mut seen = HashSet::new(); + let mut out = Vec::new(); + for item in items { + let Some(value) = item + .as_str() + .map(str::trim) + 
.filter(|value| !value.is_empty()) + else { + continue; + }; + if seen.insert(value.to_string()) { + out.push(value.to_string()); + } + } + out +} + +fn expand_browser_profile_path(value: &str) -> PathBuf { + if let Some(rest) = value.strip_prefix("~/") { + if let Some(home) = home_dir() { + return home.join(rest); + } + } + PathBuf::from(value) +} + +fn managed_browser_profile_from_env(fallback: ManagedProfile) -> ManagedProfile { + env_trimmed("BU_MANAGED_BROWSER_PROFILE") + .map(|path| ManagedProfile::Path(expand_browser_profile_path(&path))) + .unwrap_or(fallback) +} + +fn managed_browser_extra_args_from_env() -> Vec { + let Some(raw) = env_trimmed("BU_MANAGED_BROWSER_ARGS") else { + return Vec::new(); + }; + let Ok(value) = serde_json::from_str::(&raw) else { + return Vec::new(); + }; + value + .as_array() + .into_iter() + .flatten() + .filter_map(Value::as_str) + .filter(|arg| !arg.is_empty()) + .map(ToOwned::to_owned) + .collect() +} + +fn browser_viewport_launch_args() -> Vec { + if env_bool("BU_BROWSER_NO_VIEWPORT") == Some(true) { + return Vec::new(); + } + let Some(raw) = env_trimmed("BU_BROWSER_VIEWPORT") else { + return Vec::new(); + }; + let Ok(value) = serde_json::from_str::(&raw) else { + return Vec::new(); + }; + let Some(width) = value.get("width").and_then(Value::as_i64) else { + return Vec::new(); + }; + let Some(height) = value.get("height").and_then(Value::as_i64) else { + return Vec::new(); + }; + if width <= 0 || height <= 0 { + return Vec::new(); + } + let mut args = vec![format!("--window-size={width},{height}")]; + if let Some(scale) = value + .get("deviceScaleFactor") + .and_then(Value::as_f64) + .filter(|scale| *scale > 0.0) + { + args.push(format!("--force-device-scale-factor={scale}")); + } + args +} + +fn browser_download_behavior() -> Option<(String, Value)> { + if env_bool("BU_BROWSER_ACCEPT_DOWNLOADS") == Some(false) { + return Some(("downloads:false".to_string(), json!({ "behavior": "deny" }))); + } + let raw_path = 
env_trimmed("BU_BROWSER_DOWNLOADS_PATH")?; + let path = expand_browser_profile_path(&raw_path); + let _ = fs::create_dir_all(&path); + Some(( + format!("downloads:true:{}", path.display()), + json!({ + "behavior": "allow", + "downloadPath": path.display().to_string(), + "eventsEnabled": true, + }), + )) +} + +fn browser_storage_state_raw() -> Option { + env_trimmed("BU_BROWSER_STORAGE_STATE") +} + +fn browser_storage_state() -> Option { + serde_json::from_str::(&browser_storage_state_raw()?).ok() +} + +fn browser_storage_cookies(storage_state: &Value) -> Vec { + storage_state + .get("cookies") + .and_then(Value::as_array) + .into_iter() + .flatten() + .filter_map(cookie_to_cdp_param) + .collect() +} + +fn browser_storage_init_scripts(storage_state: &Value) -> Vec { + let Some(origins) = storage_state.get("origins").and_then(Value::as_array) else { + return Vec::new(); + }; + let mut scripts = Vec::new(); + for origin_state in origins { + let Some(origin_state) = origin_state.as_object() else { + continue; + }; + let origin = origin_state.get("origin").and_then(Value::as_str); + let mut statements = Vec::new(); + for storage_name in ["localStorage", "sessionStorage"] { + let Some(items) = origin_state.get(storage_name).and_then(Value::as_array) else { + continue; + }; + for item in items { + let Some(name) = item.get("name").and_then(Value::as_str) else { + continue; + }; + let Some(value) = item.get("value").and_then(Value::as_str) else { + continue; + }; + statements.push(format!( + "window.{storage_name}.setItem({}, {});", + serde_json::to_string(name).unwrap_or_else(|_| "\"\"".to_string()), + serde_json::to_string(value).unwrap_or_else(|_| "\"\"".to_string()) + )); + } + } + if statements.is_empty() { + continue; + } + let body = statements.join("\n "); + if let Some(origin) = origin.filter(|origin| !origin.trim().is_empty()) { + scripts.push(format!( + "try {{\n if (window.location.origin === {}) {{\n {body}\n }}\n}} catch (error) {{}}", + 
serde_json::to_string(origin).unwrap_or_else(|_| "\"\"".to_string()) + )); + } else { + scripts.push(format!("try {{\n {body}\n}} catch (error) {{}}")); + } + } + scripts +} + +fn browser_profile_setup_calls(session_id: Option<&str>) -> Vec { + let mut calls = Vec::new(); + + let permissions = env_json_string_list("BU_BROWSER_PERMISSIONS"); + if !permissions.is_empty() { + calls.push(BrowserProfileSetupCall { + key: format!("permissions:{}", permissions.join("\0")), + method: "Browser.grantPermissions", + session_id: None, + params: json!({ "permissions": permissions }), + }); + } + + if let Some((key, params)) = browser_download_behavior() { + calls.push(BrowserProfileSetupCall { + key, + method: "Browser.setDownloadBehavior", + session_id: None, + params, + }); + } + + if let Some(storage_state) = browser_storage_state() { + if let Some(raw) = browser_storage_state_raw() { + let cookies = browser_storage_cookies(&storage_state); + if !cookies.is_empty() { + calls.push(BrowserProfileSetupCall { + key: format!("storage-cookies:{raw}"), + method: "Storage.setCookies", + session_id: None, + params: json!({ "cookies": cookies }), + }); + } + if let Some(session_id) = session_id { + for (index, source) in browser_storage_init_scripts(&storage_state) + .into_iter() + .enumerate() + { + calls.push(BrowserProfileSetupCall { + key: format!("storage-script:{session_id}:{index}:{raw}"), + method: "Page.addScriptToEvaluateOnNewDocument", + session_id: Some(session_id.to_string()), + params: json!({ "source": source, "runImmediately": true }), + }); + } + } + } + } + + if let (Some(session_id), Some(user_agent)) = (session_id, env_trimmed("BU_BROWSER_USER_AGENT")) + { + calls.push(BrowserProfileSetupCall { + key: format!("user-agent:{session_id}:{user_agent}"), + method: "Network.setUserAgentOverride", + session_id: Some(session_id.to_string()), + params: json!({ "userAgent": user_agent }), + }); + } + + calls +} + +fn is_root_domain_pattern(pattern: &str) -> bool { + 
!pattern.contains('*') && !pattern.contains("://") && pattern.matches('.').count() == 1 +} + +fn wildcard_match(pattern: &str, value: &str) -> bool { + if !pattern.contains('*') { + return pattern == value; + } + let mut remainder = value; + let mut first = true; + for part in pattern.split('*') { + if part.is_empty() { + continue; + } + if first && !pattern.starts_with('*') { + let Some(stripped) = remainder.strip_prefix(part) else { + return false; + }; + remainder = stripped; + } else if let Some(index) = remainder.find(part) { + remainder = &remainder[index + part.len()..]; + } else { + return false; + } + first = false; + } + pattern.ends_with('*') || remainder.is_empty() +} + +fn browser_domain_pattern_matches(url: &str, host: &str, scheme: &str, pattern: &str) -> bool { + let pattern = pattern.trim(); + if pattern.is_empty() { + return false; + } + let host_lower = host.to_ascii_lowercase(); + let pattern_lower = pattern.to_ascii_lowercase(); + if let Some(domain) = pattern_lower.strip_prefix("*.") { + return matches!(scheme, "http" | "https") + && (host_lower == domain || host_lower.ends_with(&format!(".{domain}"))); + } + if pattern_lower.ends_with("/*") { + return url + .to_ascii_lowercase() + .starts_with(pattern_lower.trim_end_matches('*')); + } + if pattern_lower.contains('*') { + let value = if pattern_lower.contains("://") { + format!("{scheme}://{host_lower}") + } else { + host_lower.clone() + }; + return wildcard_match(&pattern_lower, &value); + } + if pattern_lower.contains("://") { + return url.to_ascii_lowercase().starts_with(&pattern_lower); + } + host_lower == pattern_lower + || (is_root_domain_pattern(&pattern_lower) && host_lower == format!("www.{pattern_lower}")) +} + +fn browser_profile_url_allowed(raw_url: &str) -> bool { + if matches!( + raw_url, + "about:blank" | "chrome://new-tab-page/" | "chrome://new-tab-page" | "chrome://newtab/" + ) { + return true; + } + let block_ip_addresses = env_bool("BU_BROWSER_BLOCK_IP_ADDRESSES") == 
Some(true); + let allowed_domains = env_json_string_list("BU_BROWSER_ALLOWED_DOMAINS"); + let prohibited_domains = env_json_string_list("BU_BROWSER_PROHIBITED_DOMAINS"); + let constraints_active = + block_ip_addresses || !allowed_domains.is_empty() || !prohibited_domains.is_empty(); + let Ok(url) = reqwest::Url::parse(raw_url) else { + return !constraints_active; + }; + if matches!(url.scheme(), "data" | "blob") { + return true; + } + let Some(host) = url.host_str() else { + return !constraints_active; + }; + if block_ip_addresses && host.parse::().is_ok() { + return false; + } + + if !allowed_domains.is_empty() { + return allowed_domains + .iter() + .any(|pattern| browser_domain_pattern_matches(raw_url, host, url.scheme(), pattern)); + } + if !prohibited_domains.is_empty() { + return !prohibited_domains + .iter() + .any(|pattern| browser_domain_pattern_matches(raw_url, host, url.scheme(), pattern)); + } + true +} + fn dispatch_local( session: &mut BrowserSession, argv: &[String], @@ -3211,6 +3699,7 @@ impl BrowserSession { "browser is not connected. Run `browser status --json` or `browser connect ...`." 
); } + self.prepare_browser_profile_runtime(method, session_id, ¶ms)?; browser_session_prepare_cdp_visuals(self, method, session_id, ¶ms); let Some(connection) = self.connection.as_mut() else { bail!( @@ -3347,6 +3836,47 @@ impl BrowserSession { } } + fn prepare_browser_profile_runtime( + &mut self, + method: &str, + session_id: Option<&str>, + params: &Value, + ) -> Result<()> { + if method == "Page.navigate" { + if let Some(url) = params.get("url").and_then(Value::as_str) { + if !browser_profile_url_allowed(url) { + bail!("BrowserProfile domain constraints blocked navigation to {url}"); + } + } + } + + let setup_calls = browser_profile_setup_calls(session_id); + if setup_calls.is_empty() { + return Ok(()); + } + let Some(connection) = self.connection.as_mut() else { + return Ok(()); + }; + for call in setup_calls { + if self + .browser_profile_runtime + .applied_setup_keys + .contains(&call.key) + { + continue; + } + if connection + .call(&call.method, call.session_id.as_deref(), call.params) + .is_ok() + { + self.browser_profile_runtime + .applied_setup_keys + .insert(call.key); + } + } + Ok(()) + } + fn attach_first_page(&mut self) -> Result<()> { let preferred_marker = self.preferred_target_marker.clone(); let mut attached_profile_marker = false; @@ -4870,15 +5400,19 @@ fn launch_managed_browser(launch: ManagedLaunch) -> Result<(ManagedBrowser, Stri "--no-first-run".to_string(), "--no-default-browser-check".to_string(), ]; + let viewport_args = browser_viewport_launch_args(); if launch.headless { args.push("--headless=new".to_string()); - args.push("--window-size=1280,720".to_string()); + if viewport_args.is_empty() && env_bool("BU_BROWSER_NO_VIEWPORT") != Some(true) { + args.push("--window-size=1280,720".to_string()); + } } else { - args.extend([ - "--new-window".to_string(), - "--window-size=1512,900".to_string(), - ]); + args.push("--new-window".to_string()); + if viewport_args.is_empty() { + args.push("--window-size=1512,900".to_string()); + } } + 
args.extend(viewport_args); args.extend(launch.extra_args.clone()); args.push("about:blank".to_string()); let mut child = Command::new(&launch.executable) @@ -6499,7 +7033,7 @@ ARTIFACT_DIR.mkdir(parents=True, exist_ok=True) STREAM_PATH.parent.mkdir(parents=True, exist_ok=True) FRAMES_DIR.mkdir(parents=True, exist_ok=True) FRAMES_MANIFEST = FRAMES_DIR / "frames.ndjson" -OUTPUTS_DIR = CWD +OUTPUTS_DIR = pathlib.Path(os.environ.get("BH_OUTPUTS_DIR") or {cwd:?}).expanduser().resolve() OUTPUTS_DIR.mkdir(parents=True, exist_ok=True) __USER_CODE = base64.b64decode({encoded_code:?}).decode() @@ -6508,7 +7042,7 @@ __USER_CODE = base64.b64decode({encoded_code:?}).decode() # are written as JPEGs plus a sidecar manifest, kept OUT of STREAM_PATH so the # event drain never sees partial/interleaved lines. try: - CAPTURE_FPS = float(os.environ.get("LLM_BROWSER_CAPTURE_FPS", "2") or "2") + CAPTURE_FPS = float(os.environ.get("LLM_BROWSER_CAPTURE_FPS", "0") or "0") except (TypeError, ValueError): CAPTURE_FPS = 2.0 try: @@ -7574,7 +8108,7 @@ fn session_capture_fps() -> f64 { std::env::var("LLM_BROWSER_CAPTURE_FPS") .ok() .and_then(|v| v.trim().parse::<f64>().ok()) - .unwrap_or(2.0) + .unwrap_or(0.0) } fn session_capture_quality() -> i64 { std::env::var("LLM_BROWSER_CAPTURE_QUALITY") @@ -8772,6 +9306,57 @@ mod tests { assert_eq!(registry.active_session_count(), 1); } + #[test] + fn browser_recovery_while_checked_out_returns_busy_guidance() { + let temp = tempfile::tempdir().unwrap(); + let registry = BrowserSessionRegistry::new(); + let script_registry = BrowserScriptRunRegistry::new(); + let session_id = "checked-out-recover"; + registry + .checked_out_statuses + .lock() + .expect("browser checked-out session registry poisoned") + .insert( + session_id.to_string(), + json!({ + "mode": "remote-cloud", + "connection": "connected", + "active_scripts": [{ + "run_id": "script-1", + "status": "running", + "next_step": "browser_script action=observe run_id=script-1" + }], + "page": { + 
"target_id": "target-1", + "session_id": "session-1" + } + }), + ); + + let output = run_browser_command_with_options_and_registries( + session_id, + temp.path(), + temp.path().join("artifacts"), + "browser recover reconnect-websocket", + BrowserCommandOptions::default(), + &script_registry, + &registry, + ) + .expect("busy recovery should return structured guidance"); + + assert_eq!(output.content["status"], "busy"); + assert_eq!(output.content["busy"], true); + assert_eq!(output.content["recovery_deferred"], true); + assert_eq!( + output.content["requested_command"], + "browser recover reconnect-websocket" + ); + assert_eq!(output.content["active_scripts"][0]["run_id"], "script-1"); + let next_step = output.content["next_step"].as_str().unwrap(); + assert!(next_step.contains("browser_script action=observe run_id=script-1")); + assert!(next_step.contains("retry browser recover reconnect-websocket")); + } + #[test] fn browser_help_is_cli_like() { let help = browser_help(); @@ -8875,6 +9460,48 @@ print(session_metadata()["outputs_dir"]) } } + #[test] + fn browser_script_session_outputs_dir_isolates_parallel_cwd_files() { + let _env = EnvRestore::set(&[("BU_BROWSER_SCRIPT_SESSION_OUTPUTS", "1")]); + let temp = tempfile::tempdir().unwrap(); + let artifacts = temp.path().join("artifacts"); + let output = run_browser_script( + "script-session-outputs", + temp.path(), + &artifacts, + r#" +shared = pathlib.Path.cwd() / 'parallel-task-leak.txt' +shared.write_text('from another parallel task', encoding='utf-8') +answer = pathlib.Path(outputs_dir()) / 'answer.json' +answer.write_text(json.dumps({'ok': True}), encoding='utf-8') +print(session_metadata()["outputs_dir"]) +"#, + 10, + ) + .unwrap(); + assert!(output.ok, "{:?}", output.error); + let artifact_paths = output + .artifacts + .iter() + .filter_map(|artifact| artifact["path"].as_str()) + .collect::<Vec<_>>(); + assert!( + artifact_paths + .iter() + .any(|path| path.ends_with("/outputs/answer.json")), + "expected outputs 
artifact, got {artifact_paths:?}" + ); + assert!( + artifact_paths + .iter() + .all(|path| !path.ends_with("parallel-task-leak.txt")), + "cwd file leaked into artifacts: {artifact_paths:?}" + ); + assert!(output + .text + .contains(artifacts.join("outputs").to_str().unwrap())); + } + #[test] fn browser_script_summary_comment_maps_output_to_display_summary() { let temp = tempfile::tempdir().unwrap(); @@ -9086,6 +9713,163 @@ print("navigation helpers do not auto wait") assert!(output.text.contains("navigation helpers do not auto wait")); } + #[test] + fn browser_profile_runtime_setup_calls_read_env() { + let temp = tempfile::tempdir().unwrap(); + let downloads = temp.path().join("downloads"); + let downloads_text = downloads.display().to_string(); + let storage_state = json!({ + "cookies": [{ + "name": "sid", + "value": "secret", + "domain": ".example.com", + "path": "/" + }], + "origins": [{ + "origin": "https://example.com", + "localStorage": [{"name": "theme", "value": "dark"}], + "sessionStorage": [{"name": "step", "value": "one"}] + }] + }) + .to_string(); + let _env = EnvRestore::set(&[ + ( + "BU_BROWSER_PERMISSIONS", + r#"["clipboardReadWrite","notifications","clipboardReadWrite",3]"#, + ), + ("BU_BROWSER_ACCEPT_DOWNLOADS", "true"), + ("BU_BROWSER_DOWNLOADS_PATH", &downloads_text), + ("BU_BROWSER_STORAGE_STATE", &storage_state), + ("BU_BROWSER_USER_AGENT", "BrowserUseRuntime/6.0"), + ]); + + let calls = browser_profile_setup_calls(Some("session-1")); + let methods = calls.iter().map(|call| call.method).collect::>(); + + assert_eq!( + methods, + vec![ + "Browser.grantPermissions", + "Browser.setDownloadBehavior", + "Storage.setCookies", + "Page.addScriptToEvaluateOnNewDocument", + "Network.setUserAgentOverride", + ] + ); + assert_eq!( + calls[0].params["permissions"], + json!(["clipboardReadWrite", "notifications"]) + ); + assert_eq!(calls[1].params["behavior"], "allow"); + assert_eq!(calls[1].params["downloadPath"], downloads_text); + 
assert!(downloads.exists()); + assert_eq!(calls[2].params["cookies"][0]["name"], "sid"); + assert!(calls[3] + .params + .get("source") + .and_then(Value::as_str) + .is_some_and(|source| source + .contains("window.localStorage.setItem(\"theme\", \"dark\");") + && source.contains("window.sessionStorage.setItem(\"step\", \"one\");"))); + assert_eq!(calls[4].session_id.as_deref(), Some("session-1")); + assert_eq!(calls[4].params["userAgent"], "BrowserUseRuntime/6.0"); + } + + #[test] + fn managed_browser_launch_reads_browser_profile_env() { + let temp = tempfile::tempdir().unwrap(); + let profile = temp.path().join("profile"); + let profile_text = profile.display().to_string(); + let _env = EnvRestore::set(&[ + ("BU_MANAGED_BROWSER_PROFILE", &profile_text), + ( + "BU_MANAGED_BROWSER_ARGS", + r#"["--proxy-server=http://proxy.example:8080","--user-agent=BrowserUseManaged/1.0",3,""]"#, + ), + ( + "BU_BROWSER_VIEWPORT", + r#"{"width":960,"height":720,"deviceScaleFactor":2}"#, + ), + ("BU_BROWSER_NO_VIEWPORT", "false"), + ]); + + let ManagedProfile::Path(resolved_profile) = + managed_browser_profile_from_env(ManagedProfile::Temp) + else { + panic!("expected managed profile path from env"); + }; + assert_eq!(resolved_profile, profile); + assert_eq!( + managed_browser_extra_args_from_env(), + vec![ + "--proxy-server=http://proxy.example:8080".to_string(), + "--user-agent=BrowserUseManaged/1.0".to_string(), + ] + ); + assert_eq!( + browser_viewport_launch_args(), + vec![ + "--window-size=960,720".to_string(), + "--force-device-scale-factor=2".to_string(), + ] + ); + } + + #[test] + fn browser_profile_runtime_domain_constraints_read_env() { + { + let _env = EnvRestore::set(&[ + ( + "BU_BROWSER_ALLOWED_DOMAINS", + r#"["example.com","*.browser-use.com"]"#, + ), + ("BU_BROWSER_PROHIBITED_DOMAINS", r#"["*.tracking.example"]"#), + ("BU_BROWSER_BLOCK_IP_ADDRESSES", "true"), + ]); + + assert!(browser_profile_url_allowed("https://www.example.com/path")); + 
assert!(browser_profile_url_allowed("https://docs.browser-use.com/")); + assert!(browser_profile_url_allowed("about:blank")); + assert!(!browser_profile_url_allowed("https://iana.org/")); + assert!(!browser_profile_url_allowed("http://127.0.0.1/")); + } + + { + let _env = EnvRestore::set(&[ + ("BU_BROWSER_ALLOWED_DOMAINS", "[]"), + ("BU_BROWSER_PROHIBITED_DOMAINS", r#"["*.tracking.example"]"#), + ("BU_BROWSER_BLOCK_IP_ADDRESSES", "false"), + ]); + + assert!(!browser_profile_url_allowed( + "https://ads.tracking.example/" + )); + assert!(browser_profile_url_allowed("https://example.com/")); + } + } + + #[test] + fn browser_profile_domain_constraints_are_passive_without_env() { + let _env = EnvRestore::unset(&[ + "BU_BROWSER_ALLOWED_DOMAINS", + "BU_BROWSER_PROHIBITED_DOMAINS", + "BU_BROWSER_BLOCK_IP_ADDRESSES", + ]); + + assert!(browser_profile_url_allowed("")); + assert!(browser_profile_url_allowed("/relative-path")); + + drop(_env); + let _env = EnvRestore::set(&[ + ("BU_BROWSER_ALLOWED_DOMAINS", r#"["example.com"]"#), + ("BU_BROWSER_PROHIBITED_DOMAINS", "[]"), + ("BU_BROWSER_BLOCK_IP_ADDRESSES", "false"), + ]); + + assert!(!browser_profile_url_allowed("")); + assert!(!browser_profile_url_allowed("/relative-path")); + } + #[test] fn browser_script_new_tab_preserves_current_browser_context() { let temp = tempfile::tempdir().unwrap(); @@ -9744,6 +10528,172 @@ print("http_get parity ok") assert!(output.text.contains("http_get parity ok")); } + #[test] + fn browser_script_http_get_many_preserves_order_and_errors() { + let temp = tempfile::tempdir().unwrap(); + let output = run_browser_script( + "script-http-get-many", + temp.path(), + temp.path().join("artifacts"), + r#" +import http.server +import socketserver +import threading + +class Handler(http.server.BaseHTTPRequestHandler): + def log_message(self, fmt, *args): + pass + + def do_GET(self): + if self.path in ("/one", "/two"): + assert self.headers.get("X-Shared") == "yes", dict(self.headers) + if self.path == 
"/one": + assert self.headers.get("X-Item") == "one", dict(self.headers) + body = self.path.strip("/").encode() + self.send_response(200) + self.send_header("Content-Type", "text/plain; charset=utf-8") + self.send_header("Content-Length", str(len(body))) + self.end_headers() + self.wfile.write(body) + return + self.send_response(404) + self.end_headers() + +server = socketserver.ThreadingTCPServer(("127.0.0.1", 0), Handler) +thread = threading.Thread(target=server.serve_forever, daemon=True) +thread.start() +base = f"http://127.0.0.1:{server.server_address[1]}" +try: + results = http_get_many( + [base + "/two", {"url": base + "/one", "headers": {"X-Item": "one"}}, base + "/missing"], + headers={"X-Shared": "yes"}, + max_workers=3, + ) + assert len(results) == 3, results + assert results[0] == "two", results + assert results[0].status_code == 200 + assert results[1] == "one", results + assert results[1].url.endswith("/one") + assert results[2]["ok"] is False, results[2] + assert results[2]["url"].endswith("/missing"), results[2] + try: + http_get_many([base + "/missing"], return_errors=False) + except RuntimeError: + pass + else: + raise AssertionError("return_errors=False should raise") +finally: + server.shutdown() + server.server_close() + +assert callable(browser_fetch) +assert callable(browser_fetch_many) +print("http_get_many parity ok") +"#, + 10, + ) + .unwrap(); + + assert!(output.ok, "{:?}\n{}", output.error, output.text); + assert!(output.text.contains("http_get_many parity ok")); + } + + #[test] + fn browser_script_browser_fetch_single_returns_structured_errors_by_default() { + let temp = tempfile::tempdir().unwrap(); + let output = run_browser_script( + "script-browser-fetch-single-error", + temp.path(), + temp.path().join("artifacts"), + r#" +def fake_runtime_evaluate(expression, await_promise=False, return_by_value=False): + return [{"ok": False, "url": "https://example.test/api", "error": "Failed to fetch"}] + +globals()["_runtime_evaluate"] = 
fake_runtime_evaluate + +result = browser_fetch("https://example.test/api") +assert result["ok"] is False, result +assert result["url"] == "https://example.test/api", result +assert "Failed to fetch" in result["error"], result + +try: + browser_fetch("https://example.test/api", return_error=False) +except RuntimeError as exc: + assert "browser_fetch failed" in str(exc), exc +else: + raise AssertionError("return_error=False should raise") + +print("browser_fetch single structured error ok") +"#, + 10, + ) + .unwrap(); + + assert!(output.ok, "{:?}\n{}", output.error, output.text); + assert!(output + .text + .contains("browser_fetch single structured error ok")); + } + + #[test] + fn browser_script_bridge_retries_transient_busy_errors() { + let temp = tempfile::tempdir().unwrap(); + let output = run_browser_script( + "script-bridge-retry-busy", + temp.path(), + temp.path().join("artifacts"), + r#" +attempts = {"n": 0} + +class FakeSock: + def __init__(self, payload): + self.payload = bytearray(payload) + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc, tb): + return False + + def sendall(self, data): + pass + + def recv(self, n): + if not self.payload: + return b"" + chunk = self.payload[:n] + del self.payload[:n] + return bytes(chunk) + +original_create_connection = socket.create_connection + +def fake_create_connection(*args, **kwargs): + attempts["n"] += 1 + if attempts["n"] < 3: + return FakeSock(b'{"ok":false,"error":"browser is not connected or is busy; run `browser status --json`"}\n') + return FakeSock(b'{"ok":true,"result":{"targetInfos":[]}}\n') + +socket.create_connection = fake_create_connection +try: + result = cdp("Target.getTargets") +finally: + socket.create_connection = original_create_connection + +assert result == {"targetInfos": []}, result +assert attempts["n"] == 3, attempts +print("bridge retry ok") +"#, + 10, + ) + .unwrap(); + + assert!(output.ok, "{:?}\n{}", output.error, output.text); + 
assert!(output.text.contains("bridge retry ok")); + assert!(output + .text + .contains("browser_script bridge retry 2/4 after transient error")); + } + #[test] fn browser_script_timeout_returns_tool_failure() { let temp = tempfile::tempdir().unwrap(); @@ -9782,10 +10732,46 @@ print("http_get parity ok") assert!(output.elapsed_ms.is_some()); } + #[test] + fn browser_script_initial_wait_defaults_to_seven_seconds_and_clamps_env() { + { + let _env = EnvRestore::unset(&[ + "BU_BROWSER_SCRIPT_INITIAL_WAIT_MS", + "BROWSER_SCRIPT_INITIAL_WAIT_MS", + ]); + assert_eq!(browser_script_initial_wait_ms(), 7_000); + } + { + let _env = EnvRestore::set(&[("BU_BROWSER_SCRIPT_INITIAL_WAIT_MS", "1500")]); + assert_eq!(browser_script_initial_wait_ms(), 1_500); + } + { + let _env = EnvRestore::set(&[("BU_BROWSER_SCRIPT_INITIAL_WAIT_MS", "50")]); + assert_eq!(browser_script_initial_wait_ms(), 250); + } + { + let _env = EnvRestore::set(&[("BU_BROWSER_SCRIPT_INITIAL_WAIT_MS", "45000")]); + assert_eq!(browser_script_initial_wait_ms(), 30_000); + } + } + + #[test] + fn session_capture_is_opt_in_for_eval_speed() { + { + let _env = EnvRestore::unset(&["LLM_BROWSER_CAPTURE_FPS"]); + assert_eq!(session_capture_fps(), 0.0); + } + { + let _env = EnvRestore::set(&[("LLM_BROWSER_CAPTURE_FPS", "2")]); + assert_eq!(session_capture_fps(), 2.0); + } + } + #[test] fn browser_script_start_observe_finishes_slow_scripts() { let temp = tempfile::tempdir().unwrap(); let session_id = "script-start-observe"; + let _env = EnvRestore::set(&[("BU_BROWSER_SCRIPT_INITIAL_WAIT_MS", "500")]); let started = start_browser_script( session_id, temp.path(), diff --git a/crates/browser-use-cli/src/main.rs b/crates/browser-use-cli/src/main.rs index ae269c6a..00f9dc97 100644 --- a/crates/browser-use-cli/src/main.rs +++ b/crates/browser-use-cli/src/main.rs @@ -16,11 +16,12 @@ use browser_use_agent::config_model::{ model_catalog_for_cwd_with_options, }; use browser_use_agent::config_overrides::{ - 
apply_child_request_runtime_config, load_mcp_servers_for_profile, parse_config_overrides, - resolve_agent_roles_for_profile, resolve_approval_policy_for_profile, - resolve_collab_for_profile, resolve_guardian_for_profile, resolve_multi_agent_v2_for_profile, - AgentRunOptions, ChildAgentRunCompletion, ChildAgentRunRequest, ChildAgentRunner, - ConfigOverrides, ProviderBackend, ProviderRunConfig, RunConfigValueSource, + apply_child_request_runtime_config, apply_runtime_config_overrides, + load_mcp_servers_for_profile, parse_config_overrides, resolve_agent_roles_for_profile, + resolve_approval_policy_for_profile, resolve_collab_for_profile, resolve_guardian_for_profile, + resolve_multi_agent_v2_for_profile, AgentRunOptions, ChildAgentRunCompletion, + ChildAgentRunRequest, ChildAgentRunner, ConfigOverrides, ProviderBackend, ProviderRunConfig, + RunConfigValueSource, }; use browser_use_agent::context::{ append_user_shell_command_context_event, typed_user_input_payload_from_items_for_cwd, @@ -202,6 +203,11 @@ enum Command { #[arg(long, default_value = "gpt-5.1-codex")] model: String, }, + RunCodexSession { + task_id: String, + #[arg(long, default_value = "gpt-5.1-codex")] + model: String, + }, RunOpenaiSession { task_id: String, #[arg(long)] @@ -750,6 +756,15 @@ fn main() -> Result<()> { collaboration_mode, &runtime_options, ), + Command::RunCodexSession { task_id, model } => run_codex_session( + &store, + &task_id, + model, + config_profile.as_deref(), + &config_overrides, + collaboration_mode, + &runtime_options, + ), Command::RunOpenaiSession { task_id, model } => run_openai_session( &store, &task_id, @@ -1066,6 +1081,7 @@ fn command_name(command: &Command) -> &'static str { Command::RunOpenrouter { .. } => "run_openrouter", Command::RunDeepseek { .. } => "run_deepseek", Command::RunCodex { .. } => "run_codex", + Command::RunCodexSession { .. } => "run_codex_session", Command::RunOpenaiSession { .. } => "run_openai_session", Command::RunAnthropicSession { .. 
} => "run_anthropic_session", Command::RunOpenrouterSession { .. } => "run_openrouter_session", @@ -1960,6 +1976,7 @@ fn cli_agent_options( options = options.with_mcp_servers(mcp_servers); } if !config_overrides.is_empty() { + apply_runtime_config_overrides(&mut options, &config_overrides)?; options = options.with_config_overrides(config_overrides); } Ok(options) @@ -2171,6 +2188,28 @@ fn run_codex( run_new_session_from_config(store, text, config) } +fn run_codex_session( + store: &Store, + task_id: &str, + model: String, + config_profile: Option<&str>, + raw_config_overrides: &[String], + collaboration_mode: CollaborationModeKind, + runtime_options: &CliRuntimeOptions, +) -> Result<()> { + ensure_task_exists(store, task_id)?; + let config = + ProviderRunConfig::new(ProviderBackend::Codex, model).with_options(cli_agent_options( + config_profile, + raw_config_overrides, + collaboration_mode, + runtime_options, + )?); + let session_id = run_existing_session_from_config_and_notify(store, task_id, config, None)?; + println!("{session_id}"); + Ok(()) +} + fn run_openai_session( store: &Store, task_id: &str, @@ -2803,15 +2842,12 @@ fn python(store: &Store, task_id: &str, code: String) -> Result<()> { fn browser_script(store: &Store, task_id: &str, code: String) -> Result<()> { let task = ensure_task_exists(store, task_id)?; let tool_call_id = format!("browser_script-cli-{task_id}"); - if let Some(cdp_url) = std::env::var("BU_CDP_URL") - .ok() - .filter(|url| !url.trim().is_empty()) - { + if let Some(connect_command) = remote_cdp_connect_command_from_env() { let connect = browser_use_browser::run_browser_command( task_id, &task.cwd, &task.artifact_root, - &format!("browser connect remote-cdp --url {}", cdp_url.trim()), + &connect_command, )?; if connect.content.get("status").and_then(Value::as_str) != Some("connected") { bail!("browser connect remote-cdp failed: {}", connect.content); @@ -2860,6 +2896,30 @@ fn browser_script(store: &Store, task_id: &str, code: String) -> 
Result<()> { ) } +fn remote_cdp_connect_command_from_env() -> Option { + std::env::var("BU_CDP_WS") + .ok() + .map(|value| value.trim().to_string()) + .filter(|value| !value.is_empty()) + .or_else(|| { + std::env::var("BU_CDP_URL") + .ok() + .map(|value| value.trim().to_string()) + .filter(|value| !value.is_empty()) + }) + .map(|endpoint| { + let flag = if endpoint.starts_with("ws://") || endpoint.starts_with("wss://") { + "--ws" + } else { + "--url" + }; + format!( + "browser connect remote-cdp {flag} {}", + shell_quote_arg(&endpoint) + ) + }) +} + #[derive(Clone, Debug)] struct SyncCookiesArgs { profile: Option, @@ -6875,6 +6935,27 @@ command = "test-mcp" Ok(()) } + #[test] + fn run_codex_session_command_accepts_task_id_and_model() -> Result<()> { + let parsed = Args::try_parse_from([ + "browser-use-terminal", + "run-codex-session", + "session-123", + "--model", + "gpt-test", + ])?; + + match &parsed.command { + Command::RunCodexSession { task_id, model } => { + assert_eq!(task_id, "session-123"); + assert_eq!(model, "gpt-test"); + assert_eq!(command_name(&parsed.command), "run_codex_session"); + } + other => panic!("expected run-codex-session command, got {other:?}"), + } + Ok(()) + } + #[test] fn sdk_json_rpc_ping_and_create_methods_use_runtime() -> Result<()> { let temp = unique_cli_test_dir("sdk-json-rpc")?; diff --git a/crates/browser-use-llm/src/protocols/anthropic_messages.rs b/crates/browser-use-llm/src/protocols/anthropic_messages.rs index a3ba582a..8148cbc9 100644 --- a/crates/browser-use-llm/src/protocols/anthropic_messages.rs +++ b/crates/browser-use-llm/src/protocols/anthropic_messages.rs @@ -67,7 +67,13 @@ impl Protocol for AnthropicMessagesProtocol { // Tool definitions. 
if !req.tools.is_empty() { - let tools: Vec = req.tools.iter().map(build_tool).collect(); + let mut tools: Vec = req.tools.iter().map(build_tool).collect(); + if let Some(Value::Object(last_tool)) = tools.last_mut() { + last_tool.insert( + "cache_control".to_string(), + cache_control(CacheHint::Ephemeral), + ); + } body.insert("tools".to_string(), Value::Array(tools)); } @@ -115,15 +121,36 @@ fn build_message(message: &Message) -> Result { } }; - let content: Result, LlmError> = - message.content.iter().map(build_content_block).collect(); + let mut content: Vec = message + .content + .iter() + .map(build_content_block) + .collect::, LlmError>>()?; + apply_cache_control_to_last_content_block(&mut content, message.cache); Ok(json!({ "role": role, - "content": content?, + "content": content, })) } +fn apply_cache_control_to_last_content_block(content: &mut [Value], cache: Option) { + let Some(cache) = cache else { + return; + }; + + for block in content.iter_mut().rev() { + let Some(obj) = block.as_object_mut() else { + continue; + }; + if obj.get("type").and_then(Value::as_str) == Some("image") { + continue; + } + obj.insert("cache_control".to_string(), cache_control(cache)); + break; + } +} + /// Translate a canonical [`ContentPart`] into an Anthropic content block. fn build_content_block(part: &ContentPart) -> Result { match part { @@ -159,15 +186,24 @@ fn build_content_block(part: &ContentPart) -> Result { content, is_error, } => { - let blocks: Result, LlmError> = - content.iter().map(build_content_block).collect(); + let blocks = if *is_error { + vec![json!({ + "type": "text", + "text": flatten_error_tool_result_content(content), + })] + } else { + content + .iter() + .map(build_content_block) + .collect::, LlmError>>()? 
+ }; let mut block = Map::new(); block.insert("type".to_string(), Value::String("tool_result".to_string())); block.insert( "tool_use_id".to_string(), Value::String(tool_call_id.clone()), ); - block.insert("content".to_string(), Value::Array(blocks?)); + block.insert("content".to_string(), Value::Array(blocks)); if *is_error { block.insert("is_error".to_string(), Value::Bool(true)); } @@ -192,6 +228,49 @@ fn build_content_block(part: &ContentPart) -> Result { } } +fn flatten_error_tool_result_content(content: &[ContentPart]) -> String { + let mut chunks = Vec::new(); + collect_error_tool_result_text(content, &mut chunks); + if chunks.is_empty() { + return "Tool call failed.".to_string(); + } + chunks.join("\n") +} + +fn collect_error_tool_result_text(content: &[ContentPart], chunks: &mut Vec) { + for part in content { + match part { + ContentPart::Text { text } | ContentPart::Reasoning { text, .. } => { + if !text.trim().is_empty() { + chunks.push(text.clone()); + } + } + ContentPart::Media { + mime_type, + data, + url, + .. + } => { + let pointer = url + .as_deref() + .map(|url| format!(" at {url}")) + .unwrap_or_else(|| { + data.as_ref() + .map(|_| " inline".to_string()) + .unwrap_or_default() + }); + chunks.push(format!("[{mime_type} media{pointer}]")); + } + ContentPart::ToolCall { name, .. } => { + chunks.push(format!("[nested tool call: {name}]")); + } + ContentPart::ToolResult { content, .. } => { + collect_error_tool_result_text(content, chunks); + } + } + } +} + /// Extract a thinking signature stored under `provider_metadata` if present. fn reasoning_signature_from_metadata(part: &ContentPart) -> Option { if let ContentPart::Reasoning { @@ -445,16 +524,35 @@ impl AnthropicMessagesStream { /// Merge any usage fields present in `usage` into the running total. 
fn apply_usage(&mut self, usage: &Value) { if let Some(v) = usage.get("input_tokens").and_then(Value::as_u64) { - self.usage.input_tokens = v; + self.set_uncached_input_tokens(v); } if let Some(v) = usage.get("output_tokens").and_then(Value::as_u64) { self.usage.output_tokens = v; } if let Some(v) = usage.get("cache_read_input_tokens").and_then(Value::as_u64) { - self.usage.cached_input_tokens = v; + self.set_cached_input_tokens(v); + } + if let Some(v) = usage + .get("cache_creation_input_tokens") + .and_then(Value::as_u64) + { + self.usage.cache_creation_input_tokens = v; } } + fn set_uncached_input_tokens(&mut self, raw_input_tokens: u64) { + self.usage.input_tokens = raw_input_tokens.saturating_add(self.usage.cached_input_tokens); + } + + fn set_cached_input_tokens(&mut self, cached_input_tokens: u64) { + let raw_input_tokens = self + .usage + .input_tokens + .saturating_sub(self.usage.cached_input_tokens); + self.usage.cached_input_tokens = cached_input_tokens; + self.usage.input_tokens = raw_input_tokens.saturating_add(cached_input_tokens); + } + /// Flush open blocks and emit `StepFinish` + `Finish` (idempotent). 
#[test]
fn build_body_marks_cache_control_breakpoints() {
    // The marker expected wherever a cache breakpoint is emitted.
    let ephemeral = json!({ "type": "ephemeral" });

    let mut req = LlmRequest::new("claude-sonnet-4-6", "anthropic");

    let mut system = SystemPart::new("Stable system prompt.");
    system.cache = Some(CacheHint::Ephemeral);
    req.system.push(system);

    let user = Message::user_text("Current browser state.").with_cache(CacheHint::Ephemeral);
    req.messages.push(user);

    let tool = |name: &str| ToolDefinition {
        name: name.into(),
        description: String::new(),
        input_schema: json!({ "type": "object" }),
        output_schema: None,
        namespace: None,
        namespace_description: None,
    };
    req.tools.extend(["first_tool", "last_tool"].map(tool));

    let body = AnthropicMessagesProtocol::new().build_body(&req).unwrap();

    assert_eq!(body["system"][0]["cache_control"], ephemeral);
    assert_eq!(body["messages"][0]["content"][0]["cache_control"], ephemeral);
    // Only the final tool definition carries the breakpoint.
    assert!(body["tools"][0].get("cache_control").is_none());
    assert_eq!(body["tools"][1]["cache_control"], ephemeral);
}
#[test]
fn decoder_normalizes_anthropic_cached_usage_to_inclusive_input() {
    // The raw input and cache-write counts arrive with `message_start`...
    let message_start = frame(
        "message_start",
        json!({
            "type": "message_start",
            "message": {
                "id": "msg_cache",
                "role": "assistant",
                "content": [],
                "usage": {
                    "input_tokens": 12,
                    "cache_creation_input_tokens": 44,
                    "output_tokens": 0
                }
            }
        }),
    );
    // ...while the cache-read count only shows up later, in `message_delta`.
    let message_delta = frame(
        "message_delta",
        json!({
            "type": "message_delta",
            "delta": { "stop_reason": "end_turn", "stop_sequence": null },
            "usage": {
                "output_tokens": 3088,
                "cache_read_input_tokens": 183250
            }
        }),
    );
    let message_stop = frame("message_stop", json!({ "type": "message_stop" }));
    let frames = vec![message_start, message_delta, message_stop];

    let events = drive(&frames);

    // 12 raw + 183250 cache-read tokens; the 44 cache-write tokens stay separate.
    let usage = Usage {
        input_tokens: 12 + 183_250,
        cached_input_tokens: 183_250,
        cache_creation_input_tokens: 44,
        output_tokens: 3088,
        ..Default::default()
    };
    let step_finish = LlmEvent::StepFinish {
        usage,
        finish_reason: Some(FinishReason::Stop),
    };
    let finish = LlmEvent::Finish {
        usage,
        finish_reason: Some(FinishReason::Stop),
    };

    assert!(events.contains(&step_finish));
    assert!(events.contains(&finish));
}
b/crates/browser-use-llm/src/protocols/openai_chat.rs @@ -701,6 +701,7 @@ fn parse_usage(usage: &Value) -> Usage { Usage { input_tokens: u("prompt_tokens"), cached_input_tokens: cached, + cache_creation_input_tokens: 0, output_tokens: u("completion_tokens"), reasoning_output_tokens: reasoning, total_tokens: u("total_tokens"), @@ -974,6 +975,7 @@ mod tests { let usage = Usage { input_tokens: 10, cached_input_tokens: 0, + cache_creation_input_tokens: 0, output_tokens: 5, reasoning_output_tokens: 0, total_tokens: 15, @@ -1048,6 +1050,7 @@ mod tests { let usage = Usage { input_tokens: 3, cached_input_tokens: 0, + cache_creation_input_tokens: 0, output_tokens: 1, reasoning_output_tokens: 0, total_tokens: 4, // computed: 3 + 1 diff --git a/crates/browser-use-llm/src/protocols/openai_responses.rs b/crates/browser-use-llm/src/protocols/openai_responses.rs index 76a0cf33..b71deb6a 100644 --- a/crates/browser-use-llm/src/protocols/openai_responses.rs +++ b/crates/browser-use-llm/src/protocols/openai_responses.rs @@ -712,6 +712,7 @@ fn parse_usage(usage: Option<&Value>) -> Option { Some(Usage { input_tokens: input, cached_input_tokens: cached, + cache_creation_input_tokens: 0, output_tokens: output, reasoning_output_tokens: reasoning, total_tokens: total, @@ -971,6 +972,7 @@ mod tests { usage: Usage { input_tokens: 11, cached_input_tokens: 4, + cache_creation_input_tokens: 0, output_tokens: 7, reasoning_output_tokens: 2, total_tokens: 18, @@ -981,6 +983,7 @@ mod tests { usage: Usage { input_tokens: 11, cached_input_tokens: 4, + cache_creation_input_tokens: 0, output_tokens: 7, reasoning_output_tokens: 2, total_tokens: 18, @@ -1082,6 +1085,7 @@ mod tests { usage: Usage { input_tokens: 1, cached_input_tokens: 0, + cache_creation_input_tokens: 0, output_tokens: 1, reasoning_output_tokens: 0, total_tokens: 2, @@ -1092,6 +1096,7 @@ mod tests { usage: Usage { input_tokens: 1, cached_input_tokens: 0, + cache_creation_input_tokens: 0, output_tokens: 1, reasoning_output_tokens: 0, 
total_tokens: 2, diff --git a/crates/browser-use-llm/src/route/client.rs b/crates/browser-use-llm/src/route/client.rs index 121bdabc..511ff2b7 100644 --- a/crates/browser-use-llm/src/route/client.rs +++ b/crates/browser-use-llm/src/route/client.rs @@ -848,6 +848,7 @@ mod tests { let usage = Usage { input_tokens: 11, cached_input_tokens: 0, + cache_creation_input_tokens: 0, output_tokens: 7, reasoning_output_tokens: 0, total_tokens: 18, diff --git a/crates/browser-use-llm/src/schema/event.rs b/crates/browser-use-llm/src/schema/event.rs index 037579f1..f4aeb228 100644 --- a/crates/browser-use-llm/src/schema/event.rs +++ b/crates/browser-use-llm/src/schema/event.rs @@ -10,15 +10,20 @@ use serde_json::Value; use super::ids::FinishReason; use super::messages::ContentPart; -/// Token usage with an explicitly **non-overlapping** breakdown, so consumers -/// never have to subtract. `total_tokens` is the inclusive total reported (or -/// computed) for the turn. +/// Token usage normalized for Browser Use cost accounting. #[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)] pub struct Usage { + /// Regular input plus cache-read input. Anthropic cache-write input is kept + /// separate in `cache_creation_input_tokens` so it can be billed at the + /// cache-write rate without also being charged as base input. #[serde(default)] pub input_tokens: u64, + /// Cache-read input tokens. These are included in `input_tokens`. #[serde(default)] pub cached_input_tokens: u64, + /// Cache-write input tokens. These are not included in `input_tokens`. + #[serde(default)] + pub cache_creation_input_tokens: u64, #[serde(default)] pub output_tokens: u64, #[serde(default)] @@ -29,10 +34,13 @@ pub struct Usage { impl Usage { /// Sum of the breakdown fields (use when a provider does not report an - /// inclusive total). `cached_input_tokens` is a subset of `input_tokens` - /// and is therefore not added again. + /// inclusive total). 
`cached_input_tokens` is included in `input_tokens`, + /// while cache-creation tokens are a separate Anthropic billing bucket. pub fn computed_total(&self) -> u64 { - self.input_tokens + self.output_tokens + self.reasoning_output_tokens + self.input_tokens + + self.cache_creation_input_tokens + + self.output_tokens + + self.reasoning_output_tokens } } diff --git a/crates/browser-use-llm/src/schema/messages.rs b/crates/browser-use-llm/src/schema/messages.rs index d38fe7dc..bb01f0a2 100644 --- a/crates/browser-use-llm/src/schema/messages.rs +++ b/crates/browser-use-llm/src/schema/messages.rs @@ -86,17 +86,29 @@ pub enum CacheHint { #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct Message { pub role: MessageRole, + /// Optional prompt-cache hint; only honored by protocols that support + /// inline cache markers (Anthropic / Bedrock). + #[serde(default, skip_serializing_if = "Option::is_none")] + pub cache: Option, #[serde(default)] pub content: Vec, } impl Message { pub fn new(role: MessageRole, content: Vec) -> Self { - Self { role, content } + Self { + role, + cache: None, + content, + } } pub fn user_text(s: impl Into) -> Self { Self::new(MessageRole::User, vec![ContentPart::text(s)]) } + pub fn with_cache(mut self, cache: CacheHint) -> Self { + self.cache = Some(cache); + self + } } /// A tool the model may call. The handler is never on the wire — only schema. 
diff --git a/crates/browser-use-llm/src/schema/mod.rs b/crates/browser-use-llm/src/schema/mod.rs index 50b0159d..e883f65c 100644 --- a/crates/browser-use-llm/src/schema/mod.rs +++ b/crates/browser-use-llm/src/schema/mod.rs @@ -107,6 +107,7 @@ mod tests { usage: Usage { input_tokens: 10, cached_input_tokens: 4, + cache_creation_input_tokens: 0, output_tokens: 6, reasoning_output_tokens: 2, total_tokens: 18, @@ -125,6 +126,7 @@ mod tests { let u = Usage { input_tokens: 100, cached_input_tokens: 40, + cache_creation_input_tokens: 0, output_tokens: 20, reasoning_output_tokens: 5, total_tokens: 0, diff --git a/crates/browser-use-providers/src/lib.rs b/crates/browser-use-providers/src/lib.rs index 427c9ec5..b86b6e71 100644 --- a/crates/browser-use-providers/src/lib.rs +++ b/crates/browser-use-providers/src/lib.rs @@ -2450,6 +2450,7 @@ impl AnthropicMessagesProvider { let mut body = json!({ "model": self.model, "max_tokens": 16000, + "cache_control": { "type": "ephemeral" }, "system": anthropic_system_blocks_with_developer_context(&instructions, &turn.messages, is_oauth), "messages": messages_to_anthropic_messages(&turn.messages, is_oauth)?, }); @@ -5306,7 +5307,7 @@ fn chat_tool_description(tool: &ToolSpec) -> String { } fn tool_specs_to_anthropic_tools(tools: &[ToolSpec], is_oauth: bool) -> Vec { - tools + let mut anthropic_tools: Vec = tools .iter() .map(|tool| { json!({ @@ -5315,7 +5316,11 @@ fn tool_specs_to_anthropic_tools(tools: &[ToolSpec], is_oauth: bool) -> Vec String { @@ -5923,9 +5928,21 @@ fn messages_to_anthropic_messages(messages: &[Value], is_oauth: bool) -> Result< })), } } + drop_trailing_anthropic_assistant_prefill(&mut out); Ok(out) } +fn drop_trailing_anthropic_assistant_prefill(messages: &mut Vec) { + while messages + .last() + .and_then(|message| message.get("role")) + .and_then(Value::as_str) + == Some("assistant") + { + messages.pop(); + } +} + fn should_skip_raw_response_item_for_fallback_provider(message: &Value) -> bool { let Some(item_type) = 
message.get("type").and_then(Value::as_str) else { return false; @@ -7051,10 +7068,33 @@ fn parse_usage(usage: Option<&Value>, model: &str) -> Option { .or_else(|| usage.get("total_cost")) .or_else(|| usage.get("cost_usd")) .and_then(value_f64); - let input_tokens = usage + let raw_input_tokens = usage .get("input_tokens") .or_else(|| usage.get("prompt_tokens")) .and_then(Value::as_i64); + let cached_input_tokens = usage + .get("input_tokens_details") + .and_then(|details| details.get("cached_tokens")) + .or_else(|| { + usage + .get("prompt_tokens_details") + .and_then(|details| details.get("cached_tokens")) + }) + .or_else(|| usage.get("cache_read_input_tokens")) + .and_then(Value::as_i64); + let cache_creation_tokens = usage + .get("cache_creation_input_tokens") + .or_else(|| usage.get("prompt_cache_creation_tokens")) + .and_then(Value::as_i64); + let input_tokens = raw_input_tokens.map(|tokens| { + if usage.get("cache_read_input_tokens").is_some() + || usage.get("cache_creation_input_tokens").is_some() + { + tokens + cached_input_tokens.unwrap_or(0) + } else { + tokens + } + }); let output_tokens = usage .get("output_tokens") .or_else(|| usage.get("completion_tokens")) @@ -7068,26 +7108,21 @@ fn parse_usage(usage: Option<&Value>, model: &str) -> Option { .and_then(|details| details.get("reasoning_tokens")) }) .and_then(Value::as_i64); - let total_tokens = usage - .get("total_tokens") - .and_then(Value::as_i64) - .or_else(|| Some(input_tokens? + output_tokens?)); + let has_anthropic_cache_fields = usage.get("cache_read_input_tokens").is_some() + || usage.get("cache_creation_input_tokens").is_some(); + let computed_total_tokens = input_tokens? 
#[test]
fn anthropic_messages_drop_trailing_assistant_prefill() -> Result<()> {
    // A history that ends on an assistant turn.
    let user_turn = json!({
        "role": "user",
        "content": "do the browser task"
    });
    let assistant_turn = json!({
        "role": "assistant",
        "content": "premature final answer"
    });
    let history = [user_turn, assistant_turn];

    let converted = messages_to_anthropic_messages(&history, false)?;

    // Only the user turn survives the conversion.
    assert_eq!(converted.len(), 1);
    let remaining = &converted[0];
    assert_eq!(remaining["role"], "user");
    assert_eq!(remaining["content"][0]["text"], "do the browser task");
    Ok(())
}
"browser".to_string(), + namespace: None, + namespace_description: None, + description: "Inspect a page".to_string(), + input_schema: json!({"type": "object"}), + output_schema: None, + freeform: None, + }, + ToolSpec { + name: "done".to_string(), + namespace: None, + namespace_description: None, + description: "Finish the task".to_string(), + input_schema: json!({"type": "object"}), + output_schema: None, + freeform: None, + }, + ], + ..ProviderTurn::default() + }, + false, + true, + )?; + + assert!(body["tools"][0].get("cache_control").is_none()); + assert_eq!(body["cache_control"], json!({"type": "ephemeral"})); + assert_eq!( + body["tools"][1]["cache_control"], + json!({"type": "ephemeral"}) + ); + assert_eq!( + body["system"][0]["cache_control"], + json!({"type": "ephemeral"}) + ); + Ok(()) + } + #[test] fn openai_compatible_chat_retries_5xx_inside_provider_like_codex_request_layer() -> Result<()> { let success = json!({ diff --git a/prompts/browser-agent-system.md b/prompts/browser-agent-system.md index 6699965f..7041246c 100644 --- a/prompts/browser-agent-system.md +++ b/prompts/browser-agent-system.md @@ -4,7 +4,7 @@ Raw CDP is the center of page interaction. Treat `cdp("Domain.method", ...)` ins The `browser` tool behaves like a CLI for browser runtime management. Use it for `browser status --json`, `browser connect local`, `browser local setup`, `browser connect managed`, `browser remote start`, `browser doctor`, explicit recovery, profile summaries, runtime logs, and ownership checks. It does not interact with pages. -The `browser_script` tool runs fresh Python in a browser-connected environment. Browser/CDP state persists in Rust; Python variables do not persist across calls. 
Important helpers include `cdp`, `new_tab`, `goto_url`, `page_info`, `js`, `capture_screenshot`, `screenshot`, `screenshot_clip`, `emit_image`, `click_at_xy`, `fill_input`, `type_text`, `press_key`, `scroll`, `wait_for_load`, `wait_for_element`, `wait_for_network_idle`, `current_tab`, `list_tabs`, `switch_tab`, `ensure_real_tab`, `upload_file`, `drain_events`, `http_get`, `copy_artifact`, `artifact_root`, `outputs_dir`, `session_metadata`, `audit_artifact`, `agent_workspace`, `load_agent_helpers`, `domain_skills_for_url`, and `last_domain_skills`. Use `js(function_source, *args)` when passing JSON-serializable Python values into JavaScript; use `target_id=` as a keyword for iframe targets. +The `browser_script` tool runs fresh Python in a browser-connected environment. Browser/CDP state persists in Rust; Python variables do not persist across calls. Important helpers include `cdp`, `new_tab`, `goto_url`, `page_info`, `js`, `capture_screenshot`, `screenshot`, `screenshot_clip`, `emit_image`, `click_at_xy`, `fill_input`, `type_text`, `press_key`, `scroll`, `wait_for_load`, `wait_for_element`, `wait_for_network_idle`, `current_tab`, `list_tabs`, `switch_tab`, `ensure_real_tab`, `upload_file`, `drain_events`, `http_get`, `http_get_many`, `browser_fetch`, `browser_fetch_many`, `copy_artifact`, `artifact_root`, `outputs_dir`, `session_metadata`, `audit_artifact`, `agent_workspace`, `load_agent_helpers`, `domain_skills_for_url`, and `last_domain_skills`. Use `js(function_source, *args)` when passing JSON-serializable Python values into JavaScript; use `target_id=` as a keyword for iframe targets. `browser_script` has a start/listen lifecycle. A fast call returns final output immediately. A longer call returns `status: running` plus `run_id`; observe it with `action="observe"` until final status. If observe returns no new output for its wait window, back off instead of polling constantly. 
Images/artifacts emitted by the running script are returned by observe as soon as they exist. Use `action="cancel"` with the `run_id` only when the running script is no longer useful. @@ -29,12 +29,12 @@ Browser-harness workflow: - First navigation should usually be `new_tab(url)`, not `goto_url(url)`, because `goto_url` mutates the active tab. `new_tab(url)` and `goto_url(url)` have zero implicit wait: they send the CDP navigation command and then return without waiting for readyState, network idle, selectors, paint, or sleeps. If you chain more work in the same script after navigation, explicitly wait or poll before reading/clicking. If navigation is the last action before yielding to the model, the LLM call itself may provide enough elapsed time; the next call must still inspect state before assuming the page loaded. - When a task is site-specific and a matching domain skill exists, read it before inventing selectors, private API routes, or flows. Use `domain_skills_for_url(url, include_content=True)` before or immediately after navigation; `goto_url(url)` also records matching skill metadata in the tool result. -- Use screenshots as labeled temporal checkpoints. Screenshots are often the fastest way to understand the page, spot blockers, read visible state, and verify what changed. Capture visual state before and after meaningful browser actions: initial load, clicks, scrolls, route changes, menus, dialogs, downloads, uploads, form submissions, and final verification. +- Use screenshots as labeled temporal checkpoints when visual state matters. For text-heavy research, document reading, search, pricing, tables, and list extraction, default to `page_info()`, `js(...)`, targeted DOM text, `http_get_many`, or `browser_fetch_many`; screenshots add latency and usually do not help. 
Capture visual state before and after meaningful browser actions only when layout, coordinates, blockers, menus, dialogs, downloads, uploads, form submissions, or final visual verification matter. - Prefer coordinate clicks for visible targets. Use `screenshot` or `capture_screenshot`, inspect the pixels, `click_at_xy(x, y)`, then screenshot again to verify. Chrome hit-testing handles iframes, shadow DOM, and cross-origin content better than selector abstractions. - For forms, behave like a browser user. Inspect visually with screenshots first; use read-only JS only when pixels are insufficient to identify labels or stable selectors. Click into visible text fields before typing. Use `type_text(...)`, `press_key(...)`, or `fill_input(...)` for text, and real coordinate clicks for checkboxes, radios, buttons, dropdowns, and custom controls. Never bulk-fill a live form by setting DOM values, setting checked state, dispatching synthetic form events, or running a restore loop; this can desynchronize framework state from the visible DOM. - Prefer capturing the action timeline inside one `browser_script` tool call when possible: `screenshot("before_click")`, perform the action, wait for the state change, then `screenshot("after_click")`. - Do not call `screenshot` repeatedly on an unchanged viewport. Once you have a screenshot, either take an action, inspect with CDP/JS, navigate, scroll, call `screenshot_clip(...)` for a different region, wait for an async transition, or finish. Every screenshot should have a purpose: observe current state, verify an action, inspect a changed region, or preserve final evidence. -- Use raw `cdp(...)`, `page_info()`, `wait_for_element(...)`, `wait_for_network_idle(...)`, and `js(...)` when coordinates are the wrong tool or you need structured data. +- Use raw `cdp(...)`, `page_info()`, `wait_for_element(...)`, `wait_for_network_idle(...)`, and `js(...)` when coordinates are the wrong tool or you need structured data. 
If you have three or more independent URLs, files, documents, or API endpoints to inspect, batch them in one `browser_script` call with `http_get_many` or `browser_fetch_many` instead of visiting them one at a time. - `js(...)` returns Python values. After `text = js("document.body.innerText")`, use Python slicing like `text[:1000]`; only use JavaScript methods such as `.slice(...)` inside the JavaScript expression itself. - After actions that trigger loads, SPA transitions, XHR/fetch, menus, dialogs, downloads, uploads, or other visible state changes, be patient by making several cheap observations, not one long blind wait. Prefer short waits, then inspect again with `page_info()` or a screenshot. A wait returning false is not a task failure; inspect the current page and continue from the best available state or decide whether it is stuck. - If redirected to an auth wall or credential prompt, stop and ask the user. Do not infer or type credentials from screenshots. @@ -51,6 +51,10 @@ Python namespace rule: `browser_script` variables do not persist across calls. S Durable helper rule: if you discover a reusable selector, site quirk, private API, or interaction helper, put the smallest useful helper in `.browser-use/agent-workspace/agent_helpers.py` and use it on later calls. The file is auto-loaded when it changes; call `load_agent_helpers()` if you need to force reload. Keep helpers task-focused, CDP-friendly, and free of secrets. Do not build manager layers, retry frameworks, page-object frameworks, or wrapper abstractions unless the task itself absolutely requires it. -Use the browser to discover and verify. Once the browser reveals stable data endpoints, static links, downloadable assets, XHR/fetch patterns, or predictable pagination URLs, switch to `requests`, `http_get`, `fetch` inside `js`, or `ThreadPoolExecutor` for bulk extraction. 
For long extraction loops, split work into bounded chunks, use explicit timeouts, checkpoint partial results to files, and resume from checkpoints instead of restarting. Use one global deadline plus per-item micro timeouts, and check the global deadline before every navigation, wait, and sleep. Any loop over multiple pages/items must emit short progress every item or every 2 seconds, whichever comes first. For list/profile extraction, filter candidates before navigating when possible, and poll for record readiness rather than nullable answer fields; if a loaded record has a missing optional field, record it as missing and continue. Extract only task-relevant fields; do not emit full profile text, full DOM text, cookies, localStorage, or entire app caches unless smaller field-level extraction failed. Use `outputs_dir()` for generated result files; files written there are collected as artifacts automatically. Use `copy_artifact(path)` only for files created elsewhere, and `emit_image(path)` for screenshots or visual artifacts. When a task expects a large JSON/CSV/list output, write the full file; if the final answer must be inline structured content, return that content with `done(result=...)` and optionally include `result_file=path`, otherwise finish with `done(result_file=path)`. +Multi-item collection rule: when the task asks for many products, countries, people, records, plans, prices, links, or fields, maintain a checklist of every required row and field. Spend work across the checklist, not indefinitely on one difficult item. For each item/source, use a small number of targeted attempts, then either record the best verified value, mark it unavailable/unknown with the source and reason, or move to a better source. Do not keep varying one search term while other required rows are untouched. Before `done`, audit the checklist: every requested row/field must be filled, explicitly unavailable/unknown, or clearly reported as partial with the remaining gaps. 
+ +Single-site collection rule: when the task asks for data from one website, one vendor, one domain, or "a single website", choose one viable domain early and complete the checklist on that domain. Candidate scouting should be brief: verify the domain has the right category, currency, locale, or authority, then commit. If the task permits unavailable/missing rows, a domain is viable as soon as it has the requested category/source type and at least one requested row or a searchable catalog in the requested currency/locale; do not keep searching for a perfect domain that has every row. Do not stitch rows from multiple domains, and do not keep vendor-hopping after a viable domain exists. Switch domains only when the current domain clearly cannot satisfy the requested category/currency/authority after a bounded check. If an item is missing on the committed domain, mark it unavailable for that domain and move to the next checklist row. + +Use the browser to discover and verify. Once the browser reveals stable data endpoints, static links, downloadable assets, XHR/fetch patterns, or predictable pagination URLs, switch to `http_get_many` for independent public URLs or `browser_fetch_many` when browser cookies/session state are needed. Use single `http_get`/`browser_fetch` calls for one-off checks. For long extraction loops, split work into bounded chunks, use explicit timeouts, checkpoint partial results to files, and resume from checkpoints instead of restarting. Use one global deadline plus per-item micro timeouts, and check the global deadline before every navigation, wait, and sleep. Any loop over multiple pages/items must emit short progress every item or every 2 seconds, whichever comes first. For list/profile extraction, filter candidates before navigating when possible, and poll for record readiness rather than nullable answer fields; if a loaded record has a missing optional field, record it as missing and continue. 
Extract only task-relevant fields; do not emit full profile text, full DOM text, cookies, localStorage, or entire app caches unless smaller field-level extraction failed. Use `outputs_dir()` for generated result files; files written there are collected as artifacts automatically. Use `copy_artifact(path)` only for files created elsewhere, and `emit_image(path)` for screenshots or visual artifacts. When a task expects a large JSON/CSV/list output, write the full file; if the final answer must be inline structured content, return that content with `done(result=...)` and optionally include `result_file=path`, otherwise finish with `done(result_file=path)`. Use helper agents only when the user explicitly asks for sub-agents, delegation, or parallel agent work. Requests for depth, thoroughness, research, investigation, or detailed codebase analysis do not by themselves authorize spawning a helper. When delegation is authorized, give each helper a narrow, self-contained task that materially advances the work, keep urgent blocking work local, avoid duplicate helper work, and continue useful non-overlapping local work while the helper runs. Use the `explorer` role for authorized read-only repository questions and `worker` for authorized implementation work with a bounded write scope. diff --git a/prompts/browser-script-tool-description.md b/prompts/browser-script-tool-description.md index d7f623bc..86c13298 100644 --- a/prompts/browser-script-tool-description.md +++ b/prompts/browser-script-tool-description.md @@ -51,6 +51,9 @@ ensure_real_tab() upload_file(...) drain_events() http_get(url, **kwargs) +http_get_many(urls, **kwargs) +browser_fetch(url, **kwargs) +browser_fetch_many(requests, **kwargs) copy_artifact(path, kind="file") emit_output(value, label=None) @@ -73,7 +76,7 @@ Usage guidance: - Do not combine `Input.dispatchKeyEvent` carrying printable `text` with a manual `char` event for the same character; that double-inserts text in Chrome. 
- If the task is site-specific, call `domain_skills_for_url(url, include_content=True)` before inventing selectors, private API routes, or flows. `goto_url(url)` also returns matching `domain_skills` metadata when a skill root is available. - Be patient with loading pages by making several cheap observations, not one long blind wait. Prefer short waits such as `wait_for_load(1)`, `wait_for_element(selector, timeout=2)`, or `wait_for_network_idle(2)`, then inspect again. If a wait returns false, that is not a task failure; inspect the current page and continue from the best available state or decide whether it is stuck. -- Use screenshots as labeled temporal checkpoints: initial load, before/after meaningful clicks, scrolls, route changes, dialogs, uploads, downloads, and final verification. +- Use screenshots as labeled temporal checkpoints when visual state matters: before/after meaningful clicks, scrolls, route changes, dialogs, uploads, downloads, and visual final verification. For text-heavy research, document reading, search, pricing, tables, and list extraction, prefer `page_info()`, `js(...)`, targeted DOM text, `http_get_many`, or `browser_fetch_many`; screenshots add latency and usually do not help. - The common screenshot call is `screenshot(label)`, for example `screenshot("before_submit")`. - Screenshot/image artifacts are sent as `input_image` content to the next model turn. The user does not see those pixels inline in the terminal; describe what you see or provide the saved artifact path when the user asks for the screenshot. - If a script emits screenshots/images and then fails, the next model turn still receives the images alongside the failure diagnosis. Use those pixels to decide the next smaller retry. @@ -109,7 +112,38 @@ emit_output(rows, label="employee_rows") - Use `js(...)` for DOM inspection and raw `cdp(...)` for lower-level browser actions. 
- Use `js(function_source, *args)` when passing JSON-serializable Python values into JavaScript; use `target_id=` as a keyword for iframe targets. - For real user forms, act like a browser user: screenshot, click the visible field/control, type with `type_text(...)`, `press_key(...)`, or `fill_input(...)`, then screenshot or otherwise verify. Use coordinate clicks for checkboxes, radios, buttons, dropdowns, and custom controls. Do not assign `element.value`, `element.checked`, `selectedIndex`, React private state, or MutationObserver restore loops on live forms. Do not synthesize `input`, `change`, `click`, or keyboard events in page JavaScript to make a form look filled. Those anti-patterns can desynchronize framework state from the visible DOM. -- Use `http_get(...)` for static pages and APIs after the browser reveals stable endpoints. It returns the response body as a string by default, or bytes with `binary=True`; the returned body also exposes `.status_code`, `.headers`, `.url`, `.text`, `.content`, and `.json()` for convenience. If direct HTTP hits bot or login protection, retry with site-specific headers/cookies, `js(fetch(...))` in the browser, or the configured Browser Use fetch proxy. +- Use `http_get(...)` for one static page/API URL after the browser reveals a stable endpoint, and `http_get_many(...)` for several independent public URLs. Use `browser_fetch(...)` or `browser_fetch_many(...)` when the page's cookies, auth headers, or browser session are needed. Returned bodies are strings by default, bytes with `binary=True`, and expose `.status_code`, `.headers`, `.url`, `.text`, `.content`, and `.json()` for convenience. `browser_fetch(...)` and the batch helpers return error records by default so one bad endpoint does not waste the whole extraction chunk; pass `return_error=False` or `return_errors=False` only when a hard failure is intended. 
If direct HTTP hits bot or login protection, retry with `browser_fetch(...)`, site-specific headers/cookies, or the configured Browser Use fetch proxy. +- Batch recipe after discovering stable links or endpoints: + +```python +# browser_summary: +# { +# "fetch_progress": { +# "kind": "extracted", +# "message": "Fetched ${$.ok_count}/${$.total} independent URLs" +# }, +# "records": { +# "kind": "extracted", +# "message": "Extracted ${$.length} records from fetched pages" +# } +# } + +urls = [...] +responses = http_get_many(urls, timeout=12, max_workers=8) +ok = [r for r in responses if not isinstance(r, dict) and getattr(r, "status_code", 0) < 400] +emit_output({"total": len(responses), "ok_count": len(ok)}, label="fetch_progress") + +records = [] +for url, response in zip(urls, responses): + if isinstance(response, dict) and response.get("error"): + records.append({"url": url, "status": "error", "error": response["error"]}) + continue + text = response.text + records.append({"url": url, "status": response.status_code, "title": text[:200]}) + +emit_output(records, label="records") +``` + - Extract only fields needed for the task. Do not emit full profile text, full DOM text, cookies, localStorage, or entire app caches unless you are debugging and the smaller field-level extraction failed. - Save complete generated result files under `outputs_dir()` or relative paths in the current working directory. Files written there are collected as artifacts automatically; `copy_artifact(...)` is for files created elsewhere. - For large structured results, write the full JSON/CSV/text to a file. If the task asks for an exact inline final format, return that content with `done(result=...)` and optionally include `result_file=path`; otherwise finish with `done(result_file=path)`. 
diff --git a/prompts/dataset-case-user.md b/prompts/dataset-case-user.md index c46f6bcd..d668d259 100644 --- a/prompts/dataset-case-user.md +++ b/prompts/dataset-case-user.md @@ -6,7 +6,7 @@ Task ID: {{task_id}} Task: {{task}} -Use `browser` for browser connection/status/recovery and `browser_script` for browser interaction. Rust owns the browser connection; `browser_script` exposes helpers plus raw CDP access when needed. Prefer robust CDP/DOM observations over guessing. Attach screenshots after meaningful visual transitions or whenever visible state matters. +Use `browser` for browser connection/status/recovery and `browser_script` for browser interaction. Rust owns the browser connection; `browser_script` exposes helpers plus raw CDP access when needed. Prefer robust CDP/DOM observations over guessing. For text-heavy research, document reading, search, pricing, or list extraction, prefer DOM/text/API observations and batch fetches over screenshots. Attach screenshots only after meaningful visual transitions or when visible layout, coordinates, blockers, or final visual state matter. Filesystem contract: if the task asks you to save files, write them in the current working directory using relative paths. For large JSON/CSV/list results, save the full result to `result.json` or `result.csv` so it is available as an artifact. If the requested final answer is not an exact inline format, return a compact final answer with the output path, record count, schema/columns, and one sample row instead of pasting a giant blob. @@ -18,6 +18,8 @@ Remote browser contract: browser automation may run on a different machine from Long extraction contract: if the task needs many pages, rows, files, or detail records, work in bounded chunks. Discover the endpoint or pagination pattern first, then fetch in batches with explicit timeouts, checkpoint partial results in the current working directory, and print compact progress counts. 
A timed-out all-in-one crawl with no saved artifact is not progress; resume from checkpoints when a chunk fails. +Timebox contract: dataset runs have a short wall-clock budget. For long research, document, or extraction tasks, set a soft deadline before starting broad collection, about 7 minutes from now, and a hard deadline about 8.5 minutes from now. Check the deadline before each new page, document, query, or file. After the soft deadline, stop broad research and fill remaining fields from the strongest verified evidence or mark them unknown/unavailable. Before the hard deadline, call `done(...)` with the completed or partial result. Never keep running until the external runner timeout with no saved result. + Completion contract: the final answer must contain the requested answer or a clear pointer to the artifact that contains it. For artifact-heavy results, include the artifact path, record count, schema/columns, and one sample row. A bare acknowledgement such as `Done.` is not useful unless the task explicitly asked for no visible answer. Before finalizing extraction results, briefly check that the returned items are the same kind of thing the task asked for and that hard filters were not softened to satisfy quantity. If an item is only adjacent, similar, or uncertain, exclude it or mark it uncertain rather than silently treating it as a match. 
diff --git a/python/llm_browser_worker/worker.py b/python/llm_browser_worker/worker.py index 9e7d23fd..2054cc58 100644 --- a/python/llm_browser_worker/worker.py +++ b/python/llm_browser_worker/worker.py @@ -3,7 +3,9 @@ import contextlib import atexit import base64 +import fnmatch import hashlib +import ipaddress import importlib import importlib.util import io @@ -19,6 +21,7 @@ import re import tempfile import time +import urllib.parse import urllib.request from pathlib import Path from typing import Any, Dict @@ -27,6 +30,7 @@ _namespaces: Dict[str, Dict[str, Any]] = {} _managed_chrome: subprocess.Popen[Any] | None = None _managed_chrome_profile: Path | None = None +_managed_chrome_profile_is_temporary = False _explicit_agent_workspace = os.environ.get("BH_AGENT_WORKSPACE") @@ -230,6 +234,275 @@ def _browser_mode() -> str: return os.environ.get("LLM_BROWSER_BROWSER_MODE", "").lower().replace("_", "-").replace(" ", "-") +def _browser_user_agent() -> str | None: + value = os.environ.get("BU_BROWSER_USER_AGENT") + if value is None: + return None + value = value.strip() + return value or None + + +def _apply_browser_user_agent_override(cdp: Any, session_id: Any = None) -> None: + user_agent = _browser_user_agent() + if not user_agent: + return + with contextlib.suppress(Exception): + cdp("Network.setUserAgentOverride", session_id=session_id, userAgent=user_agent) + + +def _browser_permissions() -> list[str]: + raw = os.environ.get("BU_BROWSER_PERMISSIONS") + if not raw: + return [] + try: + parsed = json.loads(raw) + except json.JSONDecodeError: + return [] + if not isinstance(parsed, list): + return [] + permissions: list[str] = [] + seen: set[str] = set() + for permission in parsed: + if not isinstance(permission, str): + continue + permission = permission.strip() + if not permission or permission in seen: + continue + permissions.append(permission) + seen.add(permission) + return permissions + + +def _apply_browser_permissions(cdp: Any) -> None: + permissions = 
_browser_permissions() + if not permissions: + return + with contextlib.suppress(Exception): + cdp("Browser.grantPermissions", permissions=permissions) + + +def _browser_download_behavior() -> dict[str, Any] | None: + accept_downloads = _env_bool("BU_BROWSER_ACCEPT_DOWNLOADS") + if accept_downloads is False: + return {"behavior": "deny"} + + raw_path = os.environ.get("BU_BROWSER_DOWNLOADS_PATH") + if not raw_path or not raw_path.strip(): + return None + download_path = Path(raw_path.strip()).expanduser().resolve() + with contextlib.suppress(OSError): + download_path.mkdir(parents=True, exist_ok=True) + return {"behavior": "allow", "downloadPath": str(download_path), "eventsEnabled": True} + + +def _apply_browser_download_behavior(cdp: Any) -> None: + behavior = _browser_download_behavior() + if not behavior: + return + with contextlib.suppress(Exception): + cdp("Browser.setDownloadBehavior", **behavior) + + +def _browser_storage_state_raw() -> str | None: + raw = os.environ.get("BU_BROWSER_STORAGE_STATE") + if not raw or not raw.strip(): + return None + return raw + + +def _browser_storage_state() -> dict[str, Any] | None: + raw = _browser_storage_state_raw() + if raw is None: + return None + try: + parsed = json.loads(raw) + except json.JSONDecodeError: + return None + return parsed if isinstance(parsed, dict) else None + + +def _browser_storage_cookies(storage_state: dict[str, Any]) -> list[dict[str, Any]]: + raw_cookies = storage_state.get("cookies") + if not isinstance(raw_cookies, list): + return [] + cookies: list[dict[str, Any]] = [] + for cookie in raw_cookies: + if not isinstance(cookie, dict): + continue + if not isinstance(cookie.get("name"), str) or not isinstance(cookie.get("value"), str): + continue + cookies.append(cookie) + return cookies + + +def _browser_storage_init_scripts(storage_state: dict[str, Any]) -> list[str]: + origins = storage_state.get("origins") + if not isinstance(origins, list): + return [] + scripts: list[str] = [] + for 
origin_state in origins: + if not isinstance(origin_state, dict): + continue + origin = origin_state.get("origin") + statements: list[str] = [] + for storage_name in ("localStorage", "sessionStorage"): + items = origin_state.get(storage_name) + if not isinstance(items, list): + continue + for item in items: + if not isinstance(item, dict): + continue + name = item.get("name") + value = item.get("value") + if not isinstance(name, str) or not isinstance(value, str): + continue + statements.append( + f"window.{storage_name}.setItem({json.dumps(name)}, {json.dumps(value)});" + ) + if not statements: + continue + body = "\n".join(statements) + if isinstance(origin, str) and origin: + scripts.append( + "try {\n" + f" if (window.location.origin === {json.dumps(origin)}) {{\n" + f" {body}\n" + " }\n" + "} catch (error) {}" + ) + else: + scripts.append(f"try {{\n {body}\n}} catch (error) {{}}") + return scripts + + +def _apply_browser_storage_state( + cdp: Any, + session_id: Any = None, + applied: set[tuple[Any, ...]] | None = None, +) -> None: + raw = _browser_storage_state_raw() + if raw is None: + return + storage_state = _browser_storage_state() + if not storage_state: + return + + signature = hashlib.sha256(raw.encode("utf-8")).hexdigest() + cookies = _browser_storage_cookies(storage_state) + cookie_key = ("storage_cookies", signature) + if cookies and (applied is None or cookie_key not in applied): + try: + cdp("Storage.setCookies", session_id=session_id, cookies=cookies) + except Exception: + pass + else: + if applied is not None: + applied.add(cookie_key) + + if session_id is None: + return + for index, script in enumerate(_browser_storage_init_scripts(storage_state)): + script_key = ("storage_script", str(session_id), signature, index) + if applied is not None and script_key in applied: + continue + try: + cdp( + "Page.addScriptToEvaluateOnNewDocument", + session_id=session_id, + source=script, + runImmediately=True, + ) + except Exception: + pass + else: + if 
applied is not None: + applied.add(script_key) + + +def _env_json_string_list(name: str) -> list[str]: + raw = os.environ.get(name) + if not raw: + return [] + try: + parsed = json.loads(raw) + except json.JSONDecodeError: + return [] + if not isinstance(parsed, list): + return [] + return [value.strip() for value in parsed if isinstance(value, str) and value.strip()] + + +def _is_root_domain(domain: str) -> bool: + if "*" in domain or "://" in domain: + return False + return domain.count(".") == 1 + + +def _is_ip_address(host: str) -> bool: + with contextlib.suppress(ValueError): + ipaddress.ip_address(host) + return True + return False + + +def _domain_pattern_matches(url: str, host: str, scheme: str, pattern: str) -> bool: + full_url_pattern = f"{scheme}://{host}" + pattern = pattern.strip() + if not pattern: + return False + if "*" in pattern: + if pattern.startswith("*."): + domain_part = pattern[2:].lower() + host_lower = host.lower() + return scheme in {"http", "https"} and ( + host_lower == domain_part or host_lower.endswith(f".{domain_part}") + ) + if pattern.endswith("/*"): + return url.startswith(pattern[:-1]) + return fnmatch.fnmatch(full_url_pattern if "://" in pattern else host, pattern) + if "://" in pattern: + return url.lower().startswith(pattern.lower()) + host_lower = host.lower() + pattern_lower = pattern.lower() + if host_lower == pattern_lower: + return True + return _is_root_domain(pattern_lower) and host_lower == f"www.{pattern_lower}" + + +def _browser_profile_url_allowed(url: str) -> bool: + if url in {"about:blank", "chrome://new-tab-page/", "chrome://new-tab-page", "chrome://newtab/"}: + return True + try: + parsed = urllib.parse.urlparse(url) + except Exception: + return False + if parsed.scheme in {"data", "blob"}: + return True + host = parsed.hostname + if not host: + return False + if _env_bool("BU_BROWSER_BLOCK_IP_ADDRESSES") is True and _is_ip_address(host): + return False + + allowed_domains = 
_env_json_string_list("BU_BROWSER_ALLOWED_DOMAINS") + prohibited_domains = _env_json_string_list("BU_BROWSER_PROHIBITED_DOMAINS") + if allowed_domains: + return any(_domain_pattern_matches(url, host, parsed.scheme, pattern) for pattern in allowed_domains) + if prohibited_domains: + return not any(_domain_pattern_matches(url, host, parsed.scheme, pattern) for pattern in prohibited_domains) + return True + + +def _enforce_browser_domain_constraints(method: str, params: dict[str, Any]) -> None: + if method != "Page.navigate": + return + url = params.get("url") + if not isinstance(url, str) or not url: + return + if not _browser_profile_url_allowed(url): + raise RuntimeError(f"BrowserProfile domain constraints blocked navigation to {url}") + + def _annotate_error(msg: str) -> str: for pattern, hint in _HINT_PATTERNS: if pattern.search(msg): @@ -333,6 +606,11 @@ def _free_port() -> int: def _managed_chrome_is_visible() -> bool: + mode = _browser_mode() + if mode in {"managed-headed", "headed", "headful"}: + return True + if mode in {"managed-headless", "headless", "headless-chromium"}: + return False return os.environ.get("LLM_BROWSER_MANAGED_CHROME_VISIBLE") == "1" @@ -340,7 +618,8 @@ def _should_start_managed_chrome() -> bool: if os.environ.get("BU_CDP_URL") or os.environ.get("BU_CDP_WS") or os.environ.get("BU_BROWSER_ID"): return False return ( - _browser_mode() in {"headless", "headless-chromium"} + _browser_mode() + in {"managed-headless", "managed-headed", "headless", "headless-chromium", "headed", "headful"} or os.environ.get("LLM_BROWSER_AUTO_CHROME") == "1" ) @@ -356,7 +635,84 @@ def _pick_managed_chrome_path(visible: bool) -> str: return _pick_chromium_path() +def _managed_chrome_extra_args() -> list[str]: + raw = os.environ.get("BU_MANAGED_BROWSER_ARGS") + if not raw: + return [] + try: + parsed = json.loads(raw) + except json.JSONDecodeError: + return [] + if not isinstance(parsed, list): + return [] + return [arg for arg in parsed if isinstance(arg, str) 
and arg] + + +def _managed_chrome_profile_dir() -> tuple[Path, bool]: + configured = os.environ.get("BU_MANAGED_BROWSER_PROFILE") + if configured and configured.strip(): + profile = Path(configured).expanduser() + profile.mkdir(parents=True, exist_ok=True) + return profile, False + return Path(tempfile.mkdtemp(prefix="but-managed-chrome.")), True + + +def _env_bool(name: str) -> bool | None: + raw = os.environ.get(name) + if raw is None: + return None + value = raw.strip().lower() + if value in {"1", "true", "yes", "on"}: + return True + if value in {"0", "false", "no", "off"}: + return False + return None + + +def _env_milliseconds_to_seconds(name: str) -> float: + raw = os.environ.get(name) + if raw is None: + return 0.0 + try: + milliseconds = float(raw.strip()) + except ValueError: + return 0.0 + if milliseconds <= 0: + return 0.0 + return milliseconds / 1000.0 + + +def _apply_browser_wait_between_actions() -> None: + wait_seconds = _env_milliseconds_to_seconds("BU_BROWSER_WAIT_BETWEEN_ACTIONS_MS") + if wait_seconds > 0: + time.sleep(wait_seconds) + + +def _managed_chrome_viewport_args() -> list[str]: + if _env_bool("BU_BROWSER_NO_VIEWPORT") is True: + return [] + raw = os.environ.get("BU_BROWSER_VIEWPORT") + if not raw: + return [] + try: + parsed = json.loads(raw) + except json.JSONDecodeError: + return [] + if not isinstance(parsed, dict): + return [] + width = parsed.get("width") + height = parsed.get("height") + if type(width) is not int or type(height) is not int or width <= 0 or height <= 0: + return [] + args = [f"--window-size={width},{height}"] + device_scale_factor = parsed.get("deviceScaleFactor") + if isinstance(device_scale_factor, (int, float)) and device_scale_factor > 0: + args.append(f"--force-device-scale-factor={device_scale_factor:g}") + return args + + def _managed_chrome_args(chrome: str, port: int, profile: Path, visible: bool) -> list[str]: + viewport_args = _managed_chrome_viewport_args() args = [ chrome, 
"--remote-debugging-address=127.0.0.1", @@ -366,9 +722,13 @@ def _managed_chrome_args(chrome: str, port: int, profile: Path, visible: bool) - "--no-default-browser-check", ] if visible: - args.extend(["--new-window", "--window-size=1512,900"]) + args.append("--new-window") + if not viewport_args: + args.append("--window-size=1512,900") else: args.append("--headless=new") + args.extend(viewport_args) + args.extend(_managed_chrome_extra_args()) args.append("about:blank") return args @@ -383,7 +743,7 @@ def _daemon_has_browser_connection(admin: Any) -> bool: def _cleanup_managed_chrome() -> None: - global _managed_chrome, _managed_chrome_profile + global _managed_chrome, _managed_chrome_profile, _managed_chrome_profile_is_temporary proc = _managed_chrome _managed_chrome = None if proc is not None and proc.poll() is None: @@ -393,13 +753,14 @@ def _cleanup_managed_chrome() -> None: except subprocess.TimeoutExpired: proc.kill() proc.wait(timeout=5) - if _managed_chrome_profile is not None: + if _managed_chrome_profile is not None and _managed_chrome_profile_is_temporary: shutil.rmtree(_managed_chrome_profile, ignore_errors=True) - _managed_chrome_profile = None + _managed_chrome_profile = None + _managed_chrome_profile_is_temporary = False def _ensure_managed_chrome(admin: Any | None = None) -> None: - global _managed_chrome, _managed_chrome_profile + global _managed_chrome, _managed_chrome_profile, _managed_chrome_profile_is_temporary if not _should_start_managed_chrome(): return if admin is not None and _daemon_has_browser_connection(admin): @@ -408,7 +769,7 @@ def _ensure_managed_chrome(admin: Any | None = None) -> None: return port = _free_port() - profile = Path(tempfile.mkdtemp(prefix="but-managed-chrome.")) + profile, profile_is_temporary = _managed_chrome_profile_dir() visible = _managed_chrome_is_visible() chrome = _pick_managed_chrome_path(visible) proc = subprocess.Popen( @@ -430,11 +791,13 @@ def _ensure_managed_chrome(admin: Any | None = None) -> None: 
time.sleep(0.25) else: proc.terminate() - shutil.rmtree(profile, ignore_errors=True) + if profile_is_temporary: + shutil.rmtree(profile, ignore_errors=True) raise RuntimeError(f"managed Chrome DevTools did not become available: {last_error}") _managed_chrome = proc _managed_chrome_profile = profile + _managed_chrome_profile_is_temporary = profile_is_temporary os.environ["BU_CDP_URL"] = f"http://127.0.0.1:{port}" if not visible: atexit.register(_cleanup_managed_chrome) @@ -553,12 +916,26 @@ def _patch_browser_harness_cdp(helpers: Any, admin: Any) -> None: if getattr(helpers, "__llm_browser_cdp_patched__", False): return original_cdp = helpers.cdp + applied_browser_profile_state: set[tuple[Any, ...]] = set() def cdp_with_daemon(method: str, session_id: Any = None, **params: Any) -> Any: if _browser_mode() == "cloud": _ensure_cloud_browser(admin) else: admin.ensure_daemon() + _enforce_browser_domain_constraints(method, params) + if method != "Browser.grantPermissions": + _apply_browser_permissions(original_cdp) + if method != "Browser.setDownloadBehavior": + _apply_browser_download_behavior(original_cdp) + if method not in {"Storage.setCookies", "Page.addScriptToEvaluateOnNewDocument"}: + _apply_browser_storage_state( + original_cdp, + session_id=session_id, + applied=applied_browser_profile_state, + ) + if method != "Network.setUserAgentOverride": + _apply_browser_user_agent_override(original_cdp, session_id=session_id) return original_cdp(method, session_id=session_id, **params) helpers.__llm_browser_original_cdp__ = original_cdp @@ -1653,6 +2030,7 @@ def _run(request: Dict[str, Any]) -> Dict[str, Any]: assert ns is not None with contextlib.redirect_stdout(stdout), contextlib.redirect_stderr(stdout): exec(compile(code, "", "exec"), ns) + _apply_browser_wait_between_actions() _auto_emit_browser_state(ns, request_id) _emit_browser_identity_events(ns, request_id) return { diff --git a/python/tests/test_worker_package.py b/python/tests/test_worker_package.py index 
cdcabca0..c45d1725 100644 --- a/python/tests/test_worker_package.py +++ b/python/tests/test_worker_package.py @@ -1,3 +1,4 @@ +import json from pathlib import Path from llm_browser_worker import worker @@ -30,6 +31,28 @@ def test_worker_run_executes_in_persistent_session_namespace(tmp_path: Path) -> assert second["data"] == 2 +def test_worker_run_applies_browser_wait_between_actions_env( + tmp_path: Path, monkeypatch +) -> None: + sleeps = [] + monkeypatch.setenv("BU_BROWSER_WAIT_BETWEEN_ACTIONS_MS", "125") + monkeypatch.setattr(worker.time, "sleep", lambda seconds: sleeps.append(seconds)) + + response = worker._run( + { + "id": "wait-between", + "session_id": "task-wait-between", + "cwd": str(tmp_path), + "artifact_dir": str(tmp_path / "artifacts"), + "code": "result = 'ok'", + } + ) + + assert response["ok"] is True + assert response["data"] == "ok" + assert sleeps == [0.125] + + def test_worker_records_artifacts_and_images(tmp_path: Path) -> None: source = tmp_path / "source.png" source.write_bytes(b"png") @@ -239,6 +262,279 @@ def fake_cdp(method, session_id=None, **kwargs): assert Path(response["images"][0]["path"]).exists() +def test_worker_cdp_applies_browser_user_agent_env(monkeypatch) -> None: + calls = [] + monkeypatch.setenv("BU_BROWSER_USER_AGENT", " BrowserUseRuntime/3.0 ") + monkeypatch.setenv("LLM_BROWSER_BROWSER_MODE", "remote-cdp") + + class Helpers: + __all__ = ["cdp"] + + def cdp(self, method, session_id=None, **params): + calls.append((method, session_id, params)) + return {"method": method} + + class Admin: + def __init__(self) -> None: + self.ensure_calls = 0 + + def ensure_daemon(self): + self.ensure_calls += 1 + + helpers = Helpers() + admin = Admin() + worker._patch_browser_harness_cdp(helpers, admin) + + result = helpers.cdp("Runtime.evaluate", session_id="target-1", expression="navigator.userAgent") + + assert result == {"method": "Runtime.evaluate"} + assert admin.ensure_calls == 1 + assert calls == [ + ("Network.setUserAgentOverride", 
"target-1", {"userAgent": "BrowserUseRuntime/3.0"}), + ("Runtime.evaluate", "target-1", {"expression": "navigator.userAgent"}), + ] + + calls.clear() + result = helpers.cdp("Network.setUserAgentOverride", userAgent="Manual/1.0") + + assert result == {"method": "Network.setUserAgentOverride"} + assert admin.ensure_calls == 2 + assert calls == [("Network.setUserAgentOverride", None, {"userAgent": "Manual/1.0"})] + + +def test_worker_cdp_grants_browser_permissions_env(monkeypatch) -> None: + calls = [] + monkeypatch.setenv( + "BU_BROWSER_PERMISSIONS", + '["clipboardReadWrite","notifications","clipboardReadWrite",3,""]', + ) + monkeypatch.setenv("LLM_BROWSER_BROWSER_MODE", "remote-cdp") + + class Helpers: + __all__ = ["cdp"] + + def cdp(self, method, session_id=None, **params): + calls.append((method, session_id, params)) + return {"method": method} + + class Admin: + def __init__(self) -> None: + self.ensure_calls = 0 + + def ensure_daemon(self): + self.ensure_calls += 1 + + helpers = Helpers() + admin = Admin() + worker._patch_browser_harness_cdp(helpers, admin) + + result = helpers.cdp("Page.navigate", session_id="target-1", url="https://example.com") + + assert result == {"method": "Page.navigate"} + assert admin.ensure_calls == 1 + assert calls == [ + ( + "Browser.grantPermissions", + None, + {"permissions": ["clipboardReadWrite", "notifications"]}, + ), + ("Page.navigate", "target-1", {"url": "https://example.com"}), + ] + + calls.clear() + result = helpers.cdp("Browser.grantPermissions", permissions=["geolocation"]) + + assert result == {"method": "Browser.grantPermissions"} + assert admin.ensure_calls == 2 + assert calls == [("Browser.grantPermissions", None, {"permissions": ["geolocation"]})] + + +def test_worker_cdp_applies_browser_download_behavior_env(tmp_path: Path, monkeypatch) -> None: + calls = [] + downloads_path = tmp_path / "downloads" + monkeypatch.setenv("BU_BROWSER_ACCEPT_DOWNLOADS", "true") + monkeypatch.setenv("BU_BROWSER_DOWNLOADS_PATH", 
str(downloads_path)) + monkeypatch.setenv("LLM_BROWSER_BROWSER_MODE", "remote-cdp") + + class Helpers: + __all__ = ["cdp"] + + def cdp(self, method, session_id=None, **params): + calls.append((method, session_id, params)) + return {"method": method} + + class Admin: + def __init__(self) -> None: + self.ensure_calls = 0 + + def ensure_daemon(self): + self.ensure_calls += 1 + + helpers = Helpers() + admin = Admin() + worker._patch_browser_harness_cdp(helpers, admin) + + result = helpers.cdp("Page.navigate", session_id="target-1", url="https://example.com") + + assert result == {"method": "Page.navigate"} + assert admin.ensure_calls == 1 + assert calls == [ + ( + "Browser.setDownloadBehavior", + None, + { + "behavior": "allow", + "downloadPath": str(downloads_path.resolve()), + "eventsEnabled": True, + }, + ), + ("Page.navigate", "target-1", {"url": "https://example.com"}), + ] + assert downloads_path.exists() + + calls.clear() + monkeypatch.setenv("BU_BROWSER_ACCEPT_DOWNLOADS", "false") + result = helpers.cdp("Runtime.evaluate", expression="1") + + assert result == {"method": "Runtime.evaluate"} + assert admin.ensure_calls == 2 + assert calls == [ + ("Browser.setDownloadBehavior", None, {"behavior": "deny"}), + ("Runtime.evaluate", None, {"expression": "1"}), + ] + + calls.clear() + result = helpers.cdp("Browser.setDownloadBehavior", behavior="allowAndName") + + assert result == {"method": "Browser.setDownloadBehavior"} + assert admin.ensure_calls == 3 + assert calls == [("Browser.setDownloadBehavior", None, {"behavior": "allowAndName"})] + + +def test_worker_cdp_applies_browser_storage_state_env(monkeypatch) -> None: + calls = [] + cookie = {"name": "sid", "value": "secret", "domain": ".example.com", "path": "/"} + monkeypatch.setenv( + "BU_BROWSER_STORAGE_STATE", + json.dumps( + { + "cookies": [cookie, {"name": "bad"}], + "origins": [ + { + "origin": "https://example.com", + "localStorage": [{"name": "theme", "value": "dark"}, {"name": 3}], + "sessionStorage": 
[{"name": "step", "value": "one"}], + } + ], + } + ), + ) + monkeypatch.setenv("LLM_BROWSER_BROWSER_MODE", "remote-cdp") + + class Helpers: + __all__ = ["cdp"] + + def cdp(self, method, session_id=None, **params): + calls.append((method, session_id, params)) + return {"method": method} + + class Admin: + def __init__(self) -> None: + self.ensure_calls = 0 + + def ensure_daemon(self): + self.ensure_calls += 1 + + helpers = Helpers() + admin = Admin() + worker._patch_browser_harness_cdp(helpers, admin) + + result = helpers.cdp("Page.navigate", session_id="target-1", url="https://example.com") + + assert result == {"method": "Page.navigate"} + assert admin.ensure_calls == 1 + assert calls[0] == ("Storage.setCookies", "target-1", {"cookies": [cookie]}) + assert calls[1][0] == "Page.addScriptToEvaluateOnNewDocument" + assert calls[1][1] == "target-1" + assert calls[1][2]["runImmediately"] is True + script = calls[1][2]["source"] + assert 'window.location.origin === "https://example.com"' in script + assert 'window.localStorage.setItem("theme", "dark");' in script + assert 'window.sessionStorage.setItem("step", "one");' in script + assert calls[2] == ("Page.navigate", "target-1", {"url": "https://example.com"}) + + calls.clear() + result = helpers.cdp("Runtime.evaluate", session_id="target-1", expression="1") + + assert result == {"method": "Runtime.evaluate"} + assert admin.ensure_calls == 2 + assert calls == [("Runtime.evaluate", "target-1", {"expression": "1"})] + + calls.clear() + result = helpers.cdp("Storage.setCookies", session_id="target-1", cookies=[]) + + assert result == {"method": "Storage.setCookies"} + assert admin.ensure_calls == 3 + assert calls == [("Storage.setCookies", "target-1", {"cookies": []})] + + +def test_worker_cdp_enforces_browser_domain_constraints_env(monkeypatch) -> None: + calls = [] + monkeypatch.setenv("LLM_BROWSER_BROWSER_MODE", "remote-cdp") + monkeypatch.setenv("BU_BROWSER_ALLOWED_DOMAINS", '["example.com","*.browser-use.com"]') + + 
class Helpers: + __all__ = ["cdp"] + + def cdp(self, method, session_id=None, **params): + calls.append((method, session_id, params)) + return {"method": method} + + class Admin: + def __init__(self) -> None: + self.ensure_calls = 0 + + def ensure_daemon(self): + self.ensure_calls += 1 + + def expect_blocked(url: str) -> None: + try: + helpers.cdp("Page.navigate", session_id="target-1", url=url) + except RuntimeError as exc: + assert "BrowserProfile domain constraints blocked navigation" in str(exc) + assert url in str(exc) + else: + raise AssertionError(f"navigation should be blocked: {url}") + + helpers = Helpers() + admin = Admin() + worker._patch_browser_harness_cdp(helpers, admin) + + result = helpers.cdp("Page.navigate", session_id="target-1", url="https://www.example.com/path") + + assert result == {"method": "Page.navigate"} + assert calls == [("Page.navigate", "target-1", {"url": "https://www.example.com/path"})] + + expect_blocked("https://iana.org/") + + assert len(calls) == 1 + + monkeypatch.delenv("BU_BROWSER_ALLOWED_DOMAINS", raising=False) + monkeypatch.setenv("BU_BROWSER_PROHIBITED_DOMAINS", '["*.tracking.example"]') + + expect_blocked("https://ads.tracking.example/") + + assert len(calls) == 1 + + monkeypatch.delenv("BU_BROWSER_PROHIBITED_DOMAINS", raising=False) + monkeypatch.setenv("BU_BROWSER_BLOCK_IP_ADDRESSES", "true") + + expect_blocked("http://127.0.0.1/") + + assert len(calls) == 1 + assert admin.ensure_calls == 4 + + def test_worker_page_info_fallback_reads_target_url_and_title( tmp_path: Path, monkeypatch ) -> None: @@ -536,6 +832,96 @@ def __str__(self) -> str: ) +def test_managed_browser_profile_env_controls_worker_launch( + tmp_path: Path, monkeypatch +) -> None: + monkeypatch.delenv("BU_CDP_URL", raising=False) + monkeypatch.delenv("BU_CDP_WS", raising=False) + monkeypatch.delenv("BU_BROWSER_ID", raising=False) + monkeypatch.setenv("LLM_BROWSER_BROWSER_MODE", "managed_headed") + monkeypatch.setenv( + "BU_MANAGED_BROWSER_ARGS", + 
'["--proxy-server=http://proxy.example:8080","--user-agent=BrowserUseTest/1.0",3,""]', + ) + + assert worker._should_start_managed_chrome() is True + assert worker._managed_chrome_is_visible() is True + + headed = worker._managed_chrome_args("/chrome", 9335, tmp_path / "profile", True) + assert "--new-window" in headed + assert "--proxy-server=http://proxy.example:8080" in headed + assert "--user-agent=BrowserUseTest/1.0" in headed + assert headed.index("--proxy-server=http://proxy.example:8080") < headed.index("about:blank") + assert "3" not in headed + assert "" not in headed + + monkeypatch.setenv("LLM_BROWSER_BROWSER_MODE", "managed-headless") + + assert worker._should_start_managed_chrome() is True + assert worker._managed_chrome_is_visible() is False + + headless = worker._managed_chrome_args("/chrome", 9336, tmp_path / "profile", False) + assert "--headless=new" in headless + assert "--new-window" not in headless + assert "--proxy-server=http://proxy.example:8080" in headless + + +def test_managed_browser_profile_env_uses_configured_user_data_dir( + tmp_path: Path, monkeypatch +) -> None: + configured_profile = tmp_path / "configured-profile" + monkeypatch.setenv("BU_MANAGED_BROWSER_PROFILE", str(configured_profile)) + + profile, is_temporary = worker._managed_chrome_profile_dir() + + assert profile == configured_profile + assert is_temporary is False + assert configured_profile.exists() + + args = worker._managed_chrome_args("/chrome", 9337, profile, False) + assert f"--user-data-dir={configured_profile}" in args + + monkeypatch.setattr(worker, "_managed_chrome", None) + monkeypatch.setattr(worker, "_managed_chrome_profile", configured_profile) + monkeypatch.setattr(worker, "_managed_chrome_profile_is_temporary", False) + + worker._cleanup_managed_chrome() + + assert configured_profile.exists() + assert worker._managed_chrome_profile is None + assert worker._managed_chrome_profile_is_temporary is False + + +def 
test_managed_browser_viewport_env_controls_worker_launch( + tmp_path: Path, monkeypatch +) -> None: + monkeypatch.setenv( + "BU_BROWSER_VIEWPORT", + '{"width":1024,"height":768,"deviceScaleFactor":2,"screenWidth":1440,"screenHeight":900}', + ) + monkeypatch.setenv("BU_BROWSER_NO_VIEWPORT", "false") + + headless = worker._managed_chrome_args("/chrome", 9338, tmp_path / "profile", False) + + assert "--headless=new" in headless + assert "--window-size=1024,768" in headless + assert "--force-device-scale-factor=2" in headless + + headed = worker._managed_chrome_args("/chrome", 9339, tmp_path / "profile", True) + + assert "--new-window" in headed + assert "--window-size=1024,768" in headed + assert "--window-size=1512,900" not in headed + + monkeypatch.setenv("BU_BROWSER_NO_VIEWPORT", "true") + + no_viewport = worker._managed_chrome_args("/chrome", 9340, tmp_path / "profile", True) + + assert "--window-size=1024,768" not in no_viewport + assert "--force-device-scale-factor=2" not in no_viewport + assert "--window-size=1512,900" in no_viewport + + def test_managed_chrome_args_visible_vs_headless(tmp_path: Path) -> None: visible = worker._managed_chrome_args("/chrome", 9333, tmp_path / "profile", True) headless = worker._managed_chrome_args("/chrome", 9334, tmp_path / "profile", False)