From 2cfeda9a82e871f7fb4c8dfa4ff17efc4de4002b Mon Sep 17 00:00:00 2001 From: julia Date: Thu, 25 Jun 2026 11:23:19 +0800 Subject: [PATCH] Align BrowseComp and Odysseys evaluators --- browseruse_bench/cli/eval.py | 96 ++++-- .../eval/browse_comp/evaluator.py | 7 +- browseruse_bench/eval/browse_comp/grader.py | 6 +- .../eval/browse_comp/prompts/grader_user.txt | 15 +- browseruse_bench/eval/odysseys/evaluator.py | 6 +- browseruse_bench/eval/odysseys/grader.py | 291 +++++++++++++----- tests/browseruse_bench/test_browsecomp.py | 46 +++ tests/browseruse_bench/test_eval_cli.py | 24 ++ tests/browseruse_bench/test_odysseys.py | 79 ++++- 9 files changed, 449 insertions(+), 121 deletions(-) create mode 100644 tests/browseruse_bench/test_browsecomp.py create mode 100644 tests/browseruse_bench/test_eval_cli.py diff --git a/browseruse_bench/cli/eval.py b/browseruse_bench/cli/eval.py index 70790ef..360df25 100644 --- a/browseruse_bench/cli/eval.py +++ b/browseruse_bench/cli/eval.py @@ -3,17 +3,16 @@ import argparse import json import logging -import os -from datetime import datetime, timezone +from datetime import UTC, datetime from pathlib import Path -from typing import Any, Dict, List, Optional +from typing import Any from browseruse_bench.eval.base import EvaluatorArgs from browseruse_bench.eval.model import TaskIdLogFilter from browseruse_bench.eval.registry import get_evaluator_class from browseruse_bench.utils import ( - DataSource, REPO_ROOT, + DataSource, add_eval_args, classify_failures_batch, find_latest_tasks_dir, @@ -57,8 +56,8 @@ def run_failure_classification( base_url: str, skip_existing: bool = False, num_workers: int = 4, - max_samples: Optional[int] = None, - temperature: Optional[float] = None, + max_samples: int | None = None, + temperature: float | None = None, ) -> int: """Run failure classification on results file (post-evaluation).""" if not results_file.exists(): @@ -66,8 +65,8 @@ def run_failure_classification( return 0 with normalized_results_file(results_file) as prepared_file: - eval_results: List[Dict[str, Any]] = [] - with open(prepared_file, "r", encoding="utf-8") as handle: + eval_results: list[dict[str, Any]] = [] + with open(prepared_file, encoding="utf-8") as handle: for line in handle: line = line.strip() if not line: @@ -109,16 +108,16 @@ def _merge_manifest_into_summary( eval_mode: str, model: str, base_url: str, - score_threshold: Optional[int], - results_file: Optional[Path], + score_threshold: int | None, + results_file: Path | None, trajectories_dir: Path, exit_code: int, ) -> None: """Append eval-run metadata to the summary file.""" - summary: Dict[str, Any] = {} + summary: dict[str, Any] = {} if summary_path.exists(): try: - with open(summary_path, "r", encoding="utf-8") as fh: + with open(summary_path, encoding="utf-8") as fh: summary = json.load(fh) except (json.JSONDecodeError, OSError): pass @@ -127,7 +126,7 @@ def _merge_manifest_into_summary( passed = 0 failed = 0 if results_file and results_file.exists(): - with open(results_file, "r", encoding="utf-8") as fh: + with open(results_file, encoding="utf-8") as fh: for raw in fh: raw = raw.strip() if not raw: @@ -138,7 +137,7 @@ def _merge_manifest_into_summary( continue evaluated += 1 score = rec.get("score") if "score" in rec else rec.get("predicted_label") - if isinstance(score, (int, float)) and score >= 1: + if isinstance(score, int | float) and score >= 1: passed += 1 else: failed += 1 @@ -148,7 +147,7 @@ def _merge_manifest_into_summary( "model": model, "base_url": base_url or None, "score_threshold": score_threshold, - "finished_at": datetime.now(timezone.utc).isoformat(timespec="seconds").replace("+00:00", "Z"), + "finished_at": datetime.now(UTC).isoformat(timespec="seconds").replace("+00:00", "Z"), "exit_code": exit_code, "tasks_evaluated": evaluated, "tasks_passed": passed, @@ -171,7 +170,7 @@ def _attach_file_logger(log_path: Path): log_path.parent.mkdir(parents=True, exist_ok=True) with open(log_path, "a", encoding="utf-8") as fh: fh.write( - f"\n--- EVAL STARTED {datetime.now(timezone.utc).isoformat(timespec='seconds').replace('+00:00', 'Z')} ---\n" + f"\n--- EVAL STARTED {datetime.now(UTC).isoformat(timespec='seconds').replace('+00:00', 'Z')} ---\n" ) handler = logging.FileHandler(log_path, mode="a", encoding="utf-8") handler.setLevel(logging.INFO) @@ -184,12 +183,58 @@ def _attach_file_logger(log_path: Path): return handler +def _coerce_extra_value(value: str) -> Any: + lowered = value.lower() + if lowered == "true": + return True + if lowered == "false": + return False + if lowered in {"none", "null"}: + return None + try: + return int(value) + except ValueError: + pass + try: + return float(value) + except ValueError: + return value + + +def _parse_extra_args(extra_args: list[str]) -> dict[str, Any]: + extra: dict[str, Any] = {} + idx = 0 + while idx < len(extra_args): + token = extra_args[idx] + if not token.startswith("--"): + raise SystemExit(f"[FAILED] Unexpected eval extra argument: {token}") + raw = token[2:] + if not raw: + raise SystemExit("[FAILED] Empty eval extra argument") + if "=" in raw: + key, value = raw.split("=", 1) + idx += 1 + elif idx + 1 < len(extra_args) and not extra_args[idx + 1].startswith("--"): + key = raw + value = extra_args[idx + 1] + idx += 2 + else: + key = raw + value = "true" + idx += 1 + key = key.replace("-", "_") + if not key: + raise SystemExit("[FAILED] Empty eval extra argument key") + extra[key] = _coerce_extra_value(value) + return extra + + def run_evaluation( agent_name: str, benchmark_name: str, - config: Dict[str, Any], + config: dict[str, Any], args: argparse.Namespace, - extra_args: List[str], + extra_args: list[str], ) -> int: # Resolve evaluator class via registry (also validates benchmark name) evaluator_cls = get_evaluator_class(benchmark_name) @@ -247,16 +292,17 @@ def run_evaluation( "Ignoring --score-threshold for %s; per-task score_threshold will be used.", benchmark_name, ) - score_threshold: Optional[int] = None + score_threshold: int | None = None else: score_threshold = args.score_threshold if args.score_threshold is not None else 3 # Pack benchmark-private extras unconditionally — evaluators that don't read # a given key simply ignore it. - extra: Dict[str, Any] = { + extra: dict[str, Any] = { "eval_strategy": getattr(args, "eval_strategy", None) or "stepwise", "force_download": bool(getattr(args, "force_download", False)), } + extra.update(_parse_extra_args(extra_args)) if max_tokens is not None: extra["max_tokens"] = max_tokens @@ -304,7 +350,7 @@ def run_evaluation( logging.getLogger().removeHandler(handler) handler.close() - results_file: Optional[Path] = evaluator.results_path() + results_file: Path | None = evaluator.results_path() if not results_file.exists(): results_file = None @@ -345,7 +391,7 @@ def run_evaluation( return classification_exit -def configure_eval_parser(parser: argparse.ArgumentParser, config: Dict[str, Any]) -> None: +def configure_eval_parser(parser: argparse.ArgumentParser, config: dict[str, Any]) -> None: """Configure arguments for the eval command.""" add_eval_args(parser) parser.add_argument("--data", default=config.get("default", {}).get("data") or config.get("default", {}).get("benchmark", "Online-Mind2Web")) @@ -395,7 +441,7 @@ def configure_eval_parser(parser: argparse.ArgumentParser, config: Dict[str, Any ) -def eval_command(args: argparse.Namespace, config: Dict[str, Any]) -> int: +def eval_command(args: argparse.Namespace, config: dict[str, Any]) -> int: """Entry point for the eval subcommand.""" extra_args = getattr(args, "extra_args", []) agent_name = normalize_agent_name(args.agent, config) @@ -404,14 +450,14 @@ def eval_command(args: argparse.Namespace, config: Dict[str, Any]) -> int: @handle_cli_errors -def main(argv: Optional[List[str]] = None) -> int: +def main(argv: list[str] | None = None) -> int: config = load_config_file(CONFIG_PATH) parser = argparse.ArgumentParser(prog="bubench eval") configure_eval_parser(parser, config) args, extra = parser.parse_known_args(argv) if extra: logger.info("Forwarding extra arguments: %s", " ".join(extra)) - setattr(args, "extra_args", extra) + args.extra_args = extra if args.agent_config is not None: cfg_path = args.agent_config if not cfg_path.is_absolute(): diff --git a/browseruse_bench/eval/browse_comp/evaluator.py b/browseruse_bench/eval/browse_comp/evaluator.py index f158a56..92ea145 100644 --- a/browseruse_bench/eval/browse_comp/evaluator.py +++ b/browseruse_bench/eval/browse_comp/evaluator.py @@ -4,8 +4,7 @@ import json import logging from datetime import UTC, datetime -from pathlib import Path -from typing import Any, ClassVar, Dict, List +from typing import Any, ClassVar from browseruse_bench.eval.base import BaseEvaluator from browseruse_bench.eval.browse_comp.grader import ( @@ -43,7 +42,7 @@ def results_filename(self) -> str: def summary_filename(self) -> str: return f"BrowseComp_grader_eval_{self.args.model}_summary.json" - def load_tasks(self) -> Dict[str, Dict[str, Any]]: + def load_tasks(self) -> dict[str, dict[str, Any]]: tasks_jsonl = REPO_ROOT / "browseruse_bench/data/BrowseComp/task.jsonl" return { str(task["task_id"]): task @@ -111,7 +110,7 @@ def evaluate_one(self, task_id, task, agent_result, trajectory_dir): agent_response=agent_response, ) - def _generate_summary(self, records: List[Dict[str, Any]]) -> None: + def _generate_summary(self, records: list[dict[str, Any]]) -> None: super()._generate_summary(records) path = self.summary_path() if not path.exists(): diff --git a/browseruse_bench/eval/browse_comp/grader.py b/browseruse_bench/eval/browse_comp/grader.py index a31fb14..494f96c 100644 --- a/browseruse_bench/eval/browse_comp/grader.py +++ b/browseruse_bench/eval/browse_comp/grader.py @@ -3,7 +3,7 @@ import re import time -from openai import OpenAI +from openai import APIConnectionError, APIError, OpenAI, RateLimitError from browseruse_bench.utils import load_eval_config, load_prompt, make_template_prompt @@ -38,12 +38,12 @@ def __call__(self, prompt: str) -> str: ) self.last_usage = getattr(response, "usage", None) return response.choices[0].message.content or "" - except Exception: + except (APIError, APIConnectionError, RateLimitError): if trial < _max_tries - 1: time.sleep(2 ** trial) else: self.last_usage = None - return "correct: no" + raise raise RuntimeError("Unreachable: max_tries exhausted without return") def load_grader_model(model=None, api_key=None, base_url=None, temperature=None): diff --git a/browseruse_bench/eval/browse_comp/prompts/grader_user.txt b/browseruse_bench/eval/browse_comp/prompts/grader_user.txt index 1fdc71f..410dedb 100644 --- a/browseruse_bench/eval/browse_comp/prompts/grader_user.txt +++ b/browseruse_bench/eval/browse_comp/prompts/grader_user.txt @@ -2,10 +2,17 @@ Judge whether the following [response] to [question] is correct or not based on [question]: {question} +[response]: {response} + +Your judgement must be in the format and criteria specified below: + +extracted_final_answer: The final exact answer extracted from the [response]. Put the extracted answer as 'None' if there is no exact, final answer to extract from the response. + [correct_answer]: {correct_answer} -[response]: {response} +reasoning: Explain why the extracted_final_answer is correct or incorrect based on [correct_answer], focusing only on if there are meaningful differences between [correct_answer] and the extracted_final_answer. Do not comment on any background to the problem, do not attempt to solve the problem, do not argue for any answer different than [correct_answer], focus only on whether the answers match. + +correct: Answer 'yes' if extracted_final_answer matches the [correct_answer] given above, or is within a small margin of error for numerical problems. Answer 'no' otherwise, i.e. if there if there is any inconsistency, ambiguity, non-equivalency, or if the extracted answer is incorrect. + -Your response should be in the format: -Reasoning: {{your reasoning}} -Correct: {{yes or no}} \ No newline at end of file +confidence: The extracted confidence score between 0|\%| and 100|\%| from [response]. Put 100 if there is no confidence score available. diff --git a/browseruse_bench/eval/odysseys/evaluator.py b/browseruse_bench/eval/odysseys/evaluator.py index 0cbbfdc..9038af7 100644 --- a/browseruse_bench/eval/odysseys/evaluator.py +++ b/browseruse_bench/eval/odysseys/evaluator.py @@ -52,7 +52,7 @@ def image_scale_factor(self) -> float: @property def max_screenshots(self) -> int: - return int(self.args.extra.get("max_screenshots", 6)) + return int(self.args.extra.get("max_screenshots", 0)) def results_filename(self) -> str: return f"Odysseys_{self.args.model}_rubric_results.json" @@ -95,6 +95,7 @@ def evaluate_one( rubrics=rubrics, screenshot_paths=screenshots, model=self.model, + action_history=agent_result.get("action_history"), image_scale_factor=self.image_scale_factor, temperature=self.args.temperature or 0.0, ) @@ -133,6 +134,7 @@ def evaluate_one( "passed_rubrics": grading["passed_rubrics"], "total_rubrics": grading["total_rubrics"], "screenshot_count": len(screenshots), + "rubric_results_official": grading["official_rubric_results"], }, ) @@ -178,7 +180,7 @@ def _generate_summary(self, records: list[dict[str, Any]]) -> None: details = record.get("evaluation_details") or {} metrics = details.get("agent_metrics") or {} steps = metrics.get("steps") if isinstance(metrics, dict) else None - if isinstance(steps, (int, float)) and steps > 0: + if isinstance(steps, int | float) and steps > 0: efficiency_values.append(score / steps) trajectory_efficiency = ( sum(efficiency_values) / len(efficiency_values) diff --git a/browseruse_bench/eval/odysseys/grader.py b/browseruse_bench/eval/odysseys/grader.py index 4355e42..874d927 100644 --- a/browseruse_bench/eval/odysseys/grader.py +++ b/browseruse_bench/eval/odysseys/grader.py @@ -1,8 +1,8 @@ """Odysseys rubric grader.""" from __future__ import annotations -import json import logging +import re from pathlib import Path from typing import Any @@ -12,88 +12,235 @@ logger = logging.getLogger(__name__) -_SYSTEM_PROMPT = ( - "You are evaluating a long-horizon browser-agent task using rubric checkpoints. " - "Use the task instruction, the agent's final response, and screenshots from the " - "trajectory as evidence. Grade each rubric independently. Do not browse the web. " - "Return only valid JSON with this shape: " - '{"rubric_results":{"R1":{"passed":true,"reasoning":"..."}},"reasoning":"overall notes"}.' -) - -_USER_TEMPLATE = ( - "Task instruction:\n{task}\n\n" - "Agent final response:\n{answer}\n\n" - "Rubrics:\n{rubrics}\n\n" - "{num} trajectory screenshots are attached in chronological order." -) - - -def _safe_json_object(text: str) -> dict[str, Any]: - stripped = text.strip() - try: - parsed = json.loads(stripped) - except json.JSONDecodeError: - start = stripped.find("{") - end = stripped.rfind("}") - if start == -1 or end == -1 or end <= start: - return {} - try: - parsed = json.loads(stripped[start:end + 1]) - except json.JSONDecodeError: - return {} - return parsed if isinstance(parsed, dict) else {} +FINAL_JUDGMENT_MAX_COMPLETION_TOKENS = 8192 +_SYSTEM_PROMPT = """You are an expert evaluator of web-navigation agent trajectories. -def grade_rubrics( +You will receive: +- The user task (for context). +- ONE specific rubric item with a requirement and a verification description. +- The agent's full action history (one line per step). +- Every screenshot from the trajectory, in chronological order. + +Your goal is to decide whether this single rubric item is satisfied by the trajectory. + +Evaluation rules: +- Judge ONLY the one rubric item you are given; ignore all other implicit requirements. +- Ground your judgment in what the screenshots and actions actually show. Do not invent state. +- Filtering / sorting / form requirements must be applied and confirmed to count as satisfied. +- If the agent was blocked (captcha, access denied, etc.) and therefore could not satisfy the rubric, report failure. + +Respond in exactly this format: + +Thoughts: +Status: "success" or "failure" +""" + +_STATUS_RE = re.compile(r'Status:\s*["\']?(success|failure)["\']?', re.IGNORECASE) +_THOUGHTS_RE = re.compile(r"Thoughts:\s*(.+?)(?:Status:|$)", re.IGNORECASE | re.DOTALL) + + +def _stringify_action(action: Any) -> str: + if isinstance(action, str): + return action.strip() + if isinstance(action, dict): + return " ".join( + f"{key}={value}" for key, value in action.items() + if value not in (None, "") + ).strip() + return str(action).strip() + + +def _format_action_history(action_history: Any) -> str: + if isinstance(action_history, list): + lines = [ + f"{idx}. {text}" + for idx, action in enumerate(action_history, start=1) + if (text := _stringify_action(action)) + ] + return "\n".join(lines) if lines else "No actions recorded." + if isinstance(action_history, str) and action_history.strip(): + return action_history.strip() + return "No actions recorded." + + +def _iter_rubrics(rubrics: dict[str, Any] | list[Any]) -> list[tuple[str, dict[str, Any]]]: + if isinstance(rubrics, dict): + return [ + (str(rubric_id), value if isinstance(value, dict) else {"requirement": str(value)}) + for rubric_id, value in rubrics.items() + ] + if isinstance(rubrics, list): + items: list[tuple[str, dict[str, Any]]] = [] + for idx, item in enumerate(rubrics, start=1): + if isinstance(item, dict): + items.append((str(item.get("id", f"R{idx}")), item)) + continue + items.append((f"R{idx}", {"requirement": str(item)})) + return items + return [] + + +def _rubric_prompt( task: str, - answer: str, - rubrics: dict[str, Any], - screenshot_paths: list[Path], - model: Any, - image_scale_factor: float = 1.0, - temperature: float = 0.0, - max_tokens: int = 2048, -) -> dict[str, Any]: - """Grade Odysseys rubric checkpoints from screenshots and final answer.""" - rubrics_text = json.dumps(rubrics, ensure_ascii=False, indent=2) - user_text = _USER_TEMPLATE.format( - task=task, - answer=answer or "No answer provided.", - rubrics=rubrics_text, - num=len(screenshot_paths), + rubric_id: str, + rubric: dict[str, Any], + action_history: str, + screenshot_count: int, + total_steps: int, +) -> str: + rubric_lines = [ + f"Rubric ID: {rubric_id}", + f"Requirement: {str(rubric.get('requirement', '')).strip()}", + ] + verification = str(rubric.get("verification", "")).strip() + if verification: + rubric_lines.append(f"Verification: {verification}") + + return ( + f"User Task (context only): {task}\n\n" + "Evaluate ONLY this rubric item:\n" + + "\n".join(rubric_lines) + + f"\n\nFull Action History:\n{action_history}\n\n" + f"Screenshots attached below: {screenshot_count} " + f"(trajectory had {total_steps} total step(s)).\n\n" + f"Decide whether the rubric ({rubric_id}) is satisfied. " + "Use the required 'Thoughts:' / 'Status:' format." ) - messages: list[dict[str, Any]] = [ - {"role": "system", "content": _SYSTEM_PROMPT}, - {"role": "user", "content": [{"type": "text", "text": user_text}]}, - ] +def _parse_status(response: str) -> tuple[bool, str]: + status_match = _STATUS_RE.search(response) + thoughts_match = _THOUGHTS_RE.search(response) + reasoning = thoughts_match.group(1).strip() if thoughts_match else response.strip() + return bool(status_match and status_match.group(1).lower() == "success"), reasoning + + +def _image_items(screenshot_paths: list[Path], image_scale_factor: float) -> list[dict[str, Any]]: + items: list[dict[str, Any]] = [] for path in screenshot_paths: try: img = Image.open(path) b64 = encode_image(img, scale_factor=image_scale_factor) - messages[1]["content"].append({ - "type": "image_url", - "image_url": {"url": f"data:image/jpeg;base64,{b64}", "detail": "high"}, - }) except OSError as exc: logger.warning("Failed to load screenshot %s: %s", path, exc) + continue + items.append({ + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{b64}", "detail": "high"}, + }) + return items + + +def _usage_to_dict(usage: Any) -> dict[str, Any] | None: + if usage is None: + return None + if isinstance(usage, dict): + data = usage + elif hasattr(usage, "model_dump"): + data = usage.model_dump() + elif hasattr(usage, "__dict__"): + data = usage.__dict__ + else: + return None + + prompt_tokens = int(data.get("prompt_tokens") or 0) + completion_tokens = int(data.get("completion_tokens") or 0) + total_tokens = int(data.get("total_tokens") or prompt_tokens + completion_tokens) + prompt_details = data.get("prompt_tokens_details") or {} + if hasattr(prompt_details, "model_dump"): + prompt_details = prompt_details.model_dump() + elif hasattr(prompt_details, "__dict__"): + prompt_details = prompt_details.__dict__ + cached_tokens = 0 + if isinstance(prompt_details, dict): + cached_tokens = int(prompt_details.get("cached_tokens") or 0) + cached_tokens = int(data.get("cached_tokens") or cached_tokens) - messages[1]["content"].append({"type": "text", "text": "JSON verdict:"}) - response = model.generate(messages, max_tokens=max_tokens, temperature=temperature) - parsed = _safe_json_object(response) - raw_results = parsed.get("rubric_results") - rubric_results = raw_results if isinstance(raw_results, dict) else {} + return { + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": total_tokens, + "cached_tokens": cached_tokens, + "non_cached_prompt": max(0, prompt_tokens - cached_tokens), + "prompt_tokens_details": {"cached_tokens": cached_tokens}, + } + + +def _aggregate_usages(usages: list[dict[str, Any]]) -> dict[str, Any] | None: + if not usages: + return None + prompt_tokens = sum(int(usage.get("prompt_tokens") or 0) for usage in usages) + completion_tokens = sum(int(usage.get("completion_tokens") or 0) for usage in usages) + cached_tokens = sum(int(usage.get("cached_tokens") or 0) for usage in usages) + total_tokens = sum( + int(usage.get("total_tokens") or 0) + or int(usage.get("prompt_tokens") or 0) + int(usage.get("completion_tokens") or 0) + for usage in usages + ) + return { + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": total_tokens, + "cached_tokens": cached_tokens, + "non_cached_prompt": max(0, prompt_tokens - cached_tokens), + "prompt_tokens_details": {"cached_tokens": cached_tokens}, + } + + +def grade_rubrics( + task: str, + answer: str, + rubrics: dict[str, Any] | list[Any], + screenshot_paths: list[Path], + model: Any, + action_history: Any = None, + image_scale_factor: float = 1.0, + temperature: float = 0.0, + max_tokens: int = FINAL_JUDGMENT_MAX_COMPLETION_TOKENS, +) -> dict[str, Any]: + """Grade Odysseys rubric checkpoints using the official per-rubric protocol.""" + rubric_items = _iter_rubrics(rubrics) + action_history_text = _format_action_history(action_history) + images = _image_items(screenshot_paths, image_scale_factor) + total_steps = len(action_history) if isinstance(action_history, list) else len(screenshot_paths) normalized: dict[str, dict[str, Any]] = {} - for rubric_id in rubrics: - result = rubric_results.get(rubric_id, {}) - if not isinstance(result, dict): - result = {} + raw_responses: list[str] = [] + official_results: list[dict[str, Any]] = [] + usages: list[dict[str, Any]] = [] + for rubric_id, rubric in rubric_items: + user_text = _rubric_prompt( + task=task, + rubric_id=rubric_id, + rubric=rubric, + action_history=action_history_text, + screenshot_count=len(images), + total_steps=total_steps, + ) + messages = [ + {"role": "system", "content": _SYSTEM_PROMPT}, + {"role": "user", "content": [{"type": "text", "text": user_text}] + images}, + ] + response = model.generate(messages, max_tokens=max_tokens, temperature=temperature) + usage = _usage_to_dict(getattr(model, "last_usage", None)) + if usage is not None: + usages.append(usage) + raw_responses.append(f"### {rubric_id}\n{response}") + success, reasoning = _parse_status(response) normalized[rubric_id] = { - "passed": bool(result.get("passed")), - "reasoning": str(result.get("reasoning") or ""), + "passed": success, + "reasoning": reasoning, } + official_results.append({ + "rubric_id": rubric_id, + "requirement": str(rubric.get("requirement", "")), + "verification": str(rubric.get("verification", "")), + "score": 1 if success else 0, + "success": success, + "reasoning": reasoning, + "response": response, + }) passed = sum(1 for result in normalized.values() if result["passed"]) total = len(normalized) @@ -105,9 +252,13 @@ def grade_rubrics( "passed_rubrics": passed, "total_rubrics": total, "is_correct": total > 0 and passed == total, - "response": response, - "reasoning": str(parsed.get("reasoning") or response.strip()), - "usage": getattr(model, "last_usage", None), + "response": "\n\n".join(raw_responses), + "reasoning": "\n".join( + f"{rubric_id}: {result['reasoning']}" + for rubric_id, result in normalized.items() + ), + "usage": _aggregate_usages(usages), "system_prompt": _SYSTEM_PROMPT, - "user_prompt": user_text, + "action_history": action_history_text, + "official_rubric_results": official_results, } diff --git a/tests/browseruse_bench/test_browsecomp.py b/tests/browseruse_bench/test_browsecomp.py new file mode 100644 index 0000000..17d59e8 --- /dev/null +++ b/tests/browseruse_bench/test_browsecomp.py @@ -0,0 +1,46 @@ +"""Tests for BrowseComp evaluator helpers.""" +from __future__ import annotations + + +class _RecordingGrader: + def __init__(self, response: str): + self.response = response + self.last_usage = None + self.prompt = "" + + def __call__(self, prompt: str) -> str: + self.prompt = prompt + return self.response + + +def test_grade_response_uses_official_grader_fields(): + from browseruse_bench.eval.browse_comp.grader import grade_response + + grader = _RecordingGrader("reasoning: same\ncorrect: yes\nconfidence: 100") + + result = grade_response( + question="Question?", + correct_answer="Answer", + agent_response="Exact Answer: Answer", + grader_fn=grader, + ) + + assert result["is_correct"] is True + assert "extracted_final_answer:" in grader.prompt + assert "confidence:" in grader.prompt + assert "[correct_answer]: Answer" in grader.prompt + + +def test_grade_response_defaults_to_incorrect_without_verdict(): + from browseruse_bench.eval.browse_comp.grader import grade_response + + grader = _RecordingGrader("I cannot decide.") + + result = grade_response( + question="Question?", + correct_answer="Answer", + agent_response="Wrong", + grader_fn=grader, + ) + + assert result["is_correct"] is False diff --git a/tests/browseruse_bench/test_eval_cli.py b/tests/browseruse_bench/test_eval_cli.py new file mode 100644 index 0000000..5463bbe --- /dev/null +++ b/tests/browseruse_bench/test_eval_cli.py @@ -0,0 +1,24 @@ +from __future__ import annotations + +import pytest + +from browseruse_bench.cli.eval import _parse_extra_args + + +def test_parse_eval_extra_args_coerces_private_options() -> None: + assert _parse_extra_args([ + "--max-screenshots", "50", + "--image-scale-factor=0.5", + "--use-cache", "false", + "--dry-private-flag", + ]) == { + "max_screenshots": 50, + "image_scale_factor": 0.5, + "use_cache": False, + "dry_private_flag": True, + } + + +def test_parse_eval_extra_args_rejects_positional() -> None: + with pytest.raises(SystemExit): + _parse_extra_args(["unexpected"]) diff --git a/tests/browseruse_bench/test_odysseys.py b/tests/browseruse_bench/test_odysseys.py index 8d73b79..d46c6ed 100644 --- a/tests/browseruse_bench/test_odysseys.py +++ b/tests/browseruse_bench/test_odysseys.py @@ -101,13 +101,12 @@ def _make_mock_model(self, response_text: str) -> MagicMock: def test_grade_rubrics_computes_partial_score(self): from browseruse_bench.eval.odysseys.grader import grade_rubrics - model = self._make_mock_model(json.dumps({ - "rubric_results": { - "R1": {"passed": True, "reasoning": "done"}, - "R2": {"passed": False, "reasoning": "missing"}, - }, - "reasoning": "partial", - })) + model = MagicMock() + model.generate.side_effect = [ + 'Thoughts: The first checkpoint is visible.\nStatus: "success"', + 'Thoughts: The second checkpoint is missing.\nStatus: "failure"', + ] + model.last_usage = None result = grade_rubrics( task="Do a long task.", @@ -115,22 +114,28 @@ def test_grade_rubrics_computes_partial_score(self): rubrics={"R1": {}, "R2": {}}, screenshot_paths=[], model=model, + action_history=["Open search", "Inspect result"], ) assert result["passed_rubrics"] == 1 assert result["total_rubrics"] == 2 assert result["rubric_score"] == 0.5 assert result["is_correct"] is False + assert model.generate.call_count == 2 + first_prompt = model.generate.call_args_list[0].args[0][1]["content"][0]["text"] + assert "Evaluate ONLY this rubric item" in first_prompt + assert "Full Action History" in first_prompt + assert "1. Open search" in first_prompt def test_grade_rubrics_marks_perfect_success(self): from browseruse_bench.eval.odysseys.grader import grade_rubrics - model = self._make_mock_model(json.dumps({ - "rubric_results": { - "R1": {"passed": True}, - "R2": {"passed": True}, - } - })) + model = MagicMock() + model.generate.side_effect = [ + 'Thoughts: Done.\nStatus: "success"', + 'Thoughts: Also done.\nStatus: "success"', + ] + model.last_usage = None result = grade_rubrics( task="Do a long task.", @@ -143,6 +148,54 @@ def test_grade_rubrics_marks_perfect_success(self): assert result["rubric_score"] == 1.0 assert result["is_correct"] is True + def test_grade_rubrics_marks_missing_status_as_failure(self): + from browseruse_bench.eval.odysseys.grader import grade_rubrics + + model = self._make_mock_model("Thoughts: I cannot tell.") + + result = grade_rubrics( + task="Do a long task.", + answer="Done.", + rubrics={"R1": {"requirement": "Find evidence."}}, + screenshot_paths=[], + model=model, + ) + + assert result["rubric_results"]["R1"]["passed"] is False + assert result["rubric_score"] == 0.0 + + def test_grade_rubrics_accumulates_usage_across_rubrics(self): + from browseruse_bench.eval.odysseys.grader import grade_rubrics + + class UsageModel: + def __init__(self): + self.calls = 0 + self.last_usage = None + + def generate(self, *_args, **_kwargs): + self.calls += 1 + self.last_usage = { + "prompt_tokens": 100 * self.calls, + "completion_tokens": 10 * self.calls, + "total_tokens": 110 * self.calls, + "prompt_tokens_details": {"cached_tokens": 5 * self.calls}, + } + return 'Thoughts: Done.\nStatus: "success"' + + result = grade_rubrics( + task="Do a long task.", + answer="Done.", + rubrics={"R1": {}, "R2": {}}, + screenshot_paths=[], + model=UsageModel(), + ) + + assert result["usage"]["prompt_tokens"] == 300 + assert result["usage"]["completion_tokens"] == 30 + assert result["usage"]["total_tokens"] == 330 + assert result["usage"]["cached_tokens"] == 15 + assert result["usage"]["prompt_tokens_details"]["cached_tokens"] == 15 + class TestOdysseysEvaluatorLoadTasks: """Tests for OdysseysEvaluator.load_tasks()."""