Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 71 additions & 25 deletions browseruse_bench/cli/eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,16 @@
import argparse
import json
import logging
import os
from datetime import datetime, timezone
from datetime import UTC, datetime
from pathlib import Path
from typing import Any, Dict, List, Optional
from typing import Any

from browseruse_bench.eval.base import EvaluatorArgs
from browseruse_bench.eval.model import TaskIdLogFilter
from browseruse_bench.eval.registry import get_evaluator_class
from browseruse_bench.utils import (
DataSource,
REPO_ROOT,
DataSource,
add_eval_args,
classify_failures_batch,
find_latest_tasks_dir,
Expand Down Expand Up @@ -57,17 +56,17 @@ def run_failure_classification(
base_url: str,
skip_existing: bool = False,
num_workers: int = 4,
max_samples: Optional[int] = None,
temperature: Optional[float] = None,
max_samples: int | None = None,
temperature: float | None = None,
) -> int:
"""Run failure classification on results file (post-evaluation)."""
if not results_file.exists():
logger.warning("Results file not found, skipping failure classification: %s", results_file)
return 0

with normalized_results_file(results_file) as prepared_file:
eval_results: List[Dict[str, Any]] = []
with open(prepared_file, "r", encoding="utf-8") as handle:
eval_results: list[dict[str, Any]] = []
with open(prepared_file, encoding="utf-8") as handle:
for line in handle:
line = line.strip()
if not line:
Expand Down Expand Up @@ -109,16 +108,16 @@ def _merge_manifest_into_summary(
eval_mode: str,
model: str,
base_url: str,
score_threshold: Optional[int],
results_file: Optional[Path],
score_threshold: int | None,
results_file: Path | None,
trajectories_dir: Path,
exit_code: int,
) -> None:
"""Append eval-run metadata to the summary file."""
summary: Dict[str, Any] = {}
summary: dict[str, Any] = {}
if summary_path.exists():
try:
with open(summary_path, "r", encoding="utf-8") as fh:
with open(summary_path, encoding="utf-8") as fh:
summary = json.load(fh)
except (json.JSONDecodeError, OSError):
pass
Expand All @@ -127,7 +126,7 @@ def _merge_manifest_into_summary(
passed = 0
failed = 0
if results_file and results_file.exists():
with open(results_file, "r", encoding="utf-8") as fh:
with open(results_file, encoding="utf-8") as fh:
for raw in fh:
raw = raw.strip()
if not raw:
Expand All @@ -138,7 +137,7 @@ def _merge_manifest_into_summary(
continue
evaluated += 1
score = rec.get("score") if "score" in rec else rec.get("predicted_label")
if isinstance(score, (int, float)) and score >= 1:
if isinstance(score, int | float) and score >= 1:
passed += 1
else:
failed += 1
Expand All @@ -148,7 +147,7 @@ def _merge_manifest_into_summary(
"model": model,
"base_url": base_url or None,
"score_threshold": score_threshold,
"finished_at": datetime.now(timezone.utc).isoformat(timespec="seconds").replace("+00:00", "Z"),
"finished_at": datetime.now(UTC).isoformat(timespec="seconds").replace("+00:00", "Z"),
"exit_code": exit_code,
"tasks_evaluated": evaluated,
"tasks_passed": passed,
Expand All @@ -171,7 +170,7 @@ def _attach_file_logger(log_path: Path):
log_path.parent.mkdir(parents=True, exist_ok=True)
with open(log_path, "a", encoding="utf-8") as fh:
fh.write(
f"\n--- EVAL STARTED {datetime.now(timezone.utc).isoformat(timespec='seconds').replace('+00:00', 'Z')} ---\n"
f"\n--- EVAL STARTED {datetime.now(UTC).isoformat(timespec='seconds').replace('+00:00', 'Z')} ---\n"
)
handler = logging.FileHandler(log_path, mode="a", encoding="utf-8")
handler.setLevel(logging.INFO)
Expand All @@ -184,12 +183,58 @@ def _attach_file_logger(log_path: Path):
return handler


def _coerce_extra_value(value: str) -> Any:
lowered = value.lower()
if lowered == "true":
return True
if lowered == "false":
return False
if lowered in {"none", "null"}:
return None
try:
return int(value)
except ValueError:
pass
try:
return float(value)
except ValueError:
return value


def _parse_extra_args(extra_args: list[str]) -> dict[str, Any]:
extra: dict[str, Any] = {}
idx = 0
while idx < len(extra_args):
token = extra_args[idx]
if not token.startswith("--"):
raise SystemExit(f"[FAILED] Unexpected eval extra argument: {token}")
raw = token[2:]
if not raw:
raise SystemExit("[FAILED] Empty eval extra argument")
if "=" in raw:
key, value = raw.split("=", 1)
idx += 1
elif idx + 1 < len(extra_args) and not extra_args[idx + 1].startswith("--"):
key = raw
value = extra_args[idx + 1]
idx += 2
else:
key = raw
value = "true"
idx += 1
key = key.replace("-", "_")
if not key:
raise SystemExit("[FAILED] Empty eval extra argument key")
extra[key] = _coerce_extra_value(value)
return extra


def run_evaluation(
agent_name: str,
benchmark_name: str,
config: Dict[str, Any],
config: dict[str, Any],
args: argparse.Namespace,
extra_args: List[str],
extra_args: list[str],
) -> int:
# Resolve evaluator class via registry (also validates benchmark name)
evaluator_cls = get_evaluator_class(benchmark_name)
Expand Down Expand Up @@ -247,16 +292,17 @@ def run_evaluation(
"Ignoring --score-threshold for %s; per-task score_threshold will be used.",
benchmark_name,
)
score_threshold: Optional[int] = None
score_threshold: int | None = None
else:
score_threshold = args.score_threshold if args.score_threshold is not None else 3

# Pack benchmark-private extras unconditionally — evaluators that don't read
# a given key simply ignore it.
extra: Dict[str, Any] = {
extra: dict[str, Any] = {
"eval_strategy": getattr(args, "eval_strategy", None) or "stepwise",
"force_download": bool(getattr(args, "force_download", False)),
}
extra.update(_parse_extra_args(extra_args))
if max_tokens is not None:
extra["max_tokens"] = max_tokens

Expand Down Expand Up @@ -304,7 +350,7 @@ def run_evaluation(
logging.getLogger().removeHandler(handler)
handler.close()

results_file: Optional[Path] = evaluator.results_path()
results_file: Path | None = evaluator.results_path()
if not results_file.exists():
results_file = None

Expand Down Expand Up @@ -345,7 +391,7 @@ def run_evaluation(
return classification_exit


def configure_eval_parser(parser: argparse.ArgumentParser, config: Dict[str, Any]) -> None:
def configure_eval_parser(parser: argparse.ArgumentParser, config: dict[str, Any]) -> None:
"""Configure arguments for the eval command."""
add_eval_args(parser)
parser.add_argument("--data", default=config.get("default", {}).get("data") or config.get("default", {}).get("benchmark", "Online-Mind2Web"))
Expand Down Expand Up @@ -395,7 +441,7 @@ def configure_eval_parser(parser: argparse.ArgumentParser, config: Dict[str, Any
)


def eval_command(args: argparse.Namespace, config: Dict[str, Any]) -> int:
def eval_command(args: argparse.Namespace, config: dict[str, Any]) -> int:
"""Entry point for the eval subcommand."""
extra_args = getattr(args, "extra_args", [])
agent_name = normalize_agent_name(args.agent, config)
Expand All @@ -404,14 +450,14 @@ def eval_command(args: argparse.Namespace, config: Dict[str, Any]) -> int:


@handle_cli_errors
def main(argv: Optional[List[str]] = None) -> int:
def main(argv: list[str] | None = None) -> int:
config = load_config_file(CONFIG_PATH)
parser = argparse.ArgumentParser(prog="bubench eval")
configure_eval_parser(parser, config)
args, extra = parser.parse_known_args(argv)
if extra:
logger.info("Forwarding extra arguments: %s", " ".join(extra))
setattr(args, "extra_args", extra)
args.extra_args = extra
if args.agent_config is not None:
cfg_path = args.agent_config
if not cfg_path.is_absolute():
Expand Down
7 changes: 3 additions & 4 deletions browseruse_bench/eval/browse_comp/evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,7 @@
import json
import logging
from datetime import UTC, datetime
from pathlib import Path
from typing import Any, ClassVar, Dict, List
from typing import Any, ClassVar

from browseruse_bench.eval.base import BaseEvaluator
from browseruse_bench.eval.browse_comp.grader import (
Expand Down Expand Up @@ -43,7 +42,7 @@ def results_filename(self) -> str:
def summary_filename(self) -> str:
return f"BrowseComp_grader_eval_{self.args.model}_summary.json"

def load_tasks(self) -> Dict[str, Dict[str, Any]]:
def load_tasks(self) -> dict[str, dict[str, Any]]:
tasks_jsonl = REPO_ROOT / "browseruse_bench/data/BrowseComp/task.jsonl"
return {
str(task["task_id"]): task
Expand Down Expand Up @@ -111,7 +110,7 @@ def evaluate_one(self, task_id, task, agent_result, trajectory_dir):
agent_response=agent_response,
)

def _generate_summary(self, records: List[Dict[str, Any]]) -> None:
def _generate_summary(self, records: list[dict[str, Any]]) -> None:
super()._generate_summary(records)
path = self.summary_path()
if not path.exists():
Expand Down
6 changes: 3 additions & 3 deletions browseruse_bench/eval/browse_comp/grader.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import re
import time

from openai import OpenAI
from openai import APIConnectionError, APIError, OpenAI, RateLimitError

from browseruse_bench.utils import load_eval_config, load_prompt, make_template_prompt

Expand Down Expand Up @@ -38,12 +38,12 @@ def __call__(self, prompt: str) -> str:
)
self.last_usage = getattr(response, "usage", None)
return response.choices[0].message.content or ""
except Exception:
except (APIError, APIConnectionError, RateLimitError):
if trial < _max_tries - 1:
time.sleep(2 ** trial)
else:
self.last_usage = None
return "correct: no"
raise

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Avoid dropping BrowseComp tasks after grader API errors

When a BrowseComp grading request exhausts retries, this re-raises an OpenAI error; BaseEvaluator._run_iteration catches openai.OpenAIError and continues without appending an EvalResult. In runs with a transient, credential, or rate-limit failure for a task, that task disappears from the results and summary denominator instead of being recorded as failed, which can make BrowseComp accuracy look better than the attempted evaluation.

Useful? React with 👍 / 👎.

raise RuntimeError("Unreachable: max_tries exhausted without return")

def load_grader_model(model=None, api_key=None, base_url=None, temperature=None):
Expand Down
15 changes: 11 additions & 4 deletions browseruse_bench/eval/browse_comp/prompts/grader_user.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,17 @@ Judge whether the following [response] to [question] is correct or not based on

[question]: {question}

[response]: {response}

Your judgement must be in the format and criteria specified below:

extracted_final_answer: The final exact answer extracted from the [response]. Put the extracted answer as 'None' if there is no exact, final answer to extract from the response.

[correct_answer]: {correct_answer}

[response]: {response}
reasoning: Explain why the extracted_final_answer is correct or incorrect based on [correct_answer], focusing only on if there are meaningful differences between [correct_answer] and the extracted_final_answer. Do not comment on any background to the problem, do not attempt to solve the problem, do not argue for any answer different than [correct_answer], focus only on whether the answers match.

correct: Answer 'yes' if extracted_final_answer matches the [correct_answer] given above, or is within a small margin of error for numerical problems. Answer 'no' otherwise, i.e. if there if there is any inconsistency, ambiguity, non-equivalency, or if the extracted answer is incorrect.


Your response should be in the format:
Reasoning: {{your reasoning}}
Correct: {{yes or no}}
confidence: The extracted confidence score between 0|\%| and 100|\%| from [response]. Put 100 if there is no confidence score available.
6 changes: 4 additions & 2 deletions browseruse_bench/eval/odysseys/evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def image_scale_factor(self) -> float:

@property
def max_screenshots(self) -> int:
return int(self.args.extra.get("max_screenshots", 6))
return int(self.args.extra.get("max_screenshots", 0))

def results_filename(self) -> str:
return f"Odysseys_{self.args.model}_rubric_results.json"
Expand Down Expand Up @@ -95,6 +95,7 @@ def evaluate_one(
rubrics=rubrics,
screenshot_paths=screenshots,
model=self.model,
action_history=agent_result.get("action_history"),
image_scale_factor=self.image_scale_factor,
temperature=self.args.temperature or 0.0,
)
Expand Down Expand Up @@ -133,6 +134,7 @@ def evaluate_one(
"passed_rubrics": grading["passed_rubrics"],
"total_rubrics": grading["total_rubrics"],
"screenshot_count": len(screenshots),
"rubric_results_official": grading["official_rubric_results"],
},
)

Expand Down Expand Up @@ -178,7 +180,7 @@ def _generate_summary(self, records: list[dict[str, Any]]) -> None:
details = record.get("evaluation_details") or {}
metrics = details.get("agent_metrics") or {}
steps = metrics.get("steps") if isinstance(metrics, dict) else None
if isinstance(steps, (int, float)) and steps > 0:
if isinstance(steps, int | float) and steps > 0:
efficiency_values.append(score / steps)
trajectory_efficiency = (
sum(efficiency_values) / len(efficiency_values)
Expand Down
Loading
Loading