From 2cfeda9a82e871f7fb4c8dfa4ff17efc4de4002b Mon Sep 17 00:00:00 2001
From: julia <julia.zhu@lexmount.com>
Date: Thu, 25 Jun 2026 11:23:19 +0800
Subject: [PATCH] Align BrowseComp and Odysseys evaluators

---
 browseruse_bench/cli/eval.py                  |  96 ++++--
 .../eval/browse_comp/evaluator.py             |   7 +-
 browseruse_bench/eval/browse_comp/grader.py   |   6 +-
 .../eval/browse_comp/prompts/grader_user.txt  |  15 +-
 browseruse_bench/eval/odysseys/evaluator.py   |   6 +-
 browseruse_bench/eval/odysseys/grader.py      | 291 +++++++++++++-----
 tests/browseruse_bench/test_browsecomp.py     |  46 +++
 tests/browseruse_bench/test_eval_cli.py       |  24 ++
 tests/browseruse_bench/test_odysseys.py       |  79 ++++-
 9 files changed, 449 insertions(+), 121 deletions(-)
 create mode 100644 tests/browseruse_bench/test_browsecomp.py
 create mode 100644 tests/browseruse_bench/test_eval_cli.py

diff --git a/browseruse_bench/cli/eval.py b/browseruse_bench/cli/eval.py
index 70790ef..360df25 100644
--- a/browseruse_bench/cli/eval.py
+++ b/browseruse_bench/cli/eval.py
@@ -3,17 +3,16 @@
 import argparse
 import json
 import logging
-import os
-from datetime import datetime, timezone
+from datetime import UTC, datetime
 from pathlib import Path
-from typing import Any, Dict, List, Optional
+from typing import Any
 
 from browseruse_bench.eval.base import EvaluatorArgs
 from browseruse_bench.eval.model import TaskIdLogFilter
 from browseruse_bench.eval.registry import get_evaluator_class
 from browseruse_bench.utils import (
-    DataSource,
     REPO_ROOT,
+    DataSource,
     add_eval_args,
     classify_failures_batch,
     find_latest_tasks_dir,
@@ -57,8 +56,8 @@ def run_failure_classification(
     base_url: str,
     skip_existing: bool = False,
     num_workers: int = 4,
-    max_samples: Optional[int] = None,
-    temperature: Optional[float] = None,
+    max_samples: int | None = None,
+    temperature: float | None = None,
 ) -> int:
     """Run failure classification on results file (post-evaluation)."""
     if not results_file.exists():
@@ -66,8 +65,8 @@ def run_failure_classification(
         return 0
 
     with normalized_results_file(results_file) as prepared_file:
-        eval_results: List[Dict[str, Any]] = []
-        with open(prepared_file, "r", encoding="utf-8") as handle:
+        eval_results: list[dict[str, Any]] = []
+        with open(prepared_file, encoding="utf-8") as handle:
             for line in handle:
                 line = line.strip()
                 if not line:
@@ -109,16 +108,16 @@ def _merge_manifest_into_summary(
     eval_mode: str,
     model: str,
     base_url: str,
-    score_threshold: Optional[int],
-    results_file: Optional[Path],
+    score_threshold: int | None,
+    results_file: Path | None,
     trajectories_dir: Path,
     exit_code: int,
 ) -> None:
     """Append eval-run metadata to the summary file."""
-    summary: Dict[str, Any] = {}
+    summary: dict[str, Any] = {}
     if summary_path.exists():
         try:
-            with open(summary_path, "r", encoding="utf-8") as fh:
+            with open(summary_path, encoding="utf-8") as fh:
                 summary = json.load(fh)
         except (json.JSONDecodeError, OSError):
             pass
@@ -127,7 +126,7 @@ def _merge_manifest_into_summary(
     passed = 0
     failed = 0
     if results_file and results_file.exists():
-        with open(results_file, "r", encoding="utf-8") as fh:
+        with open(results_file, encoding="utf-8") as fh:
             for raw in fh:
                 raw = raw.strip()
                 if not raw:
@@ -138,7 +137,7 @@ def _merge_manifest_into_summary(
                     continue
                 evaluated += 1
                 score = rec.get("score") if "score" in rec else rec.get("predicted_label")
-                if isinstance(score, (int, float)) and score >= 1:
+                if isinstance(score, int | float) and score >= 1:
                     passed += 1
                 else:
                     failed += 1
@@ -148,7 +147,7 @@ def _merge_manifest_into_summary(
         "model": model,
         "base_url": base_url or None,
         "score_threshold": score_threshold,
-        "finished_at": datetime.now(timezone.utc).isoformat(timespec="seconds").replace("+00:00", "Z"),
+        "finished_at": datetime.now(UTC).isoformat(timespec="seconds").replace("+00:00", "Z"),
         "exit_code": exit_code,
         "tasks_evaluated": evaluated,
         "tasks_passed": passed,
@@ -171,7 +170,7 @@ def _attach_file_logger(log_path: Path):
     log_path.parent.mkdir(parents=True, exist_ok=True)
     with open(log_path, "a", encoding="utf-8") as fh:
         fh.write(
-            f"\n--- EVAL STARTED {datetime.now(timezone.utc).isoformat(timespec='seconds').replace('+00:00', 'Z')} ---\n"
+            f"\n--- EVAL STARTED {datetime.now(UTC).isoformat(timespec='seconds').replace('+00:00', 'Z')} ---\n"
         )
     handler = logging.FileHandler(log_path, mode="a", encoding="utf-8")
     handler.setLevel(logging.INFO)
@@ -184,12 +183,58 @@ def _attach_file_logger(log_path: Path):
     return handler
 
 
+def _coerce_extra_value(value: str) -> Any:
+    """Coerce a raw CLI string into bool, None, int, or a finite float.
+
+    Any other text, including ``nan``/``inf`` spellings, is returned unchanged.
+    """
+    keywords = {"true": True, "false": False, "none": None, "null": None}
+    lowered = value.lower()
+    if lowered in keywords:
+        return keywords[lowered]
+    for cast in (int, float):
+        try:
+            number = cast(value)
+        except ValueError:
+            continue
+        return number if abs(number) < float("inf") else value
+    return value
+
+
+def _parse_extra_args(extra_args: list[str]) -> dict[str, Any]:
+    """Parse leftover ``--key value``, ``--key=value`` and ``--flag`` tokens into a dict.
+
+    Dashes in keys become underscores and values are coerced; malformed tokens exit.
+    """
+    parsed: dict[str, Any] = {}
+    pos = 0
+    while pos < len(extra_args):
+        token = extra_args[pos]
+        if not token.startswith("--"):
+            raise SystemExit(f"[FAILED] Unexpected eval extra argument: {token}")
+        body = token[2:]
+        if not body:
+            raise SystemExit("[FAILED] Empty eval extra argument")
+        name, inline, text = body.partition("=")
+        pos += 1
+        if not inline:
+            text = "true"
+            if pos < len(extra_args) and not extra_args[pos].startswith("--"):
+                text = extra_args[pos]
+                pos += 1
+        name = name.replace("-", "_")
+        if not name:
+            raise SystemExit("[FAILED] Empty eval extra argument key")
+        parsed[name] = _coerce_extra_value(text)
+    return parsed
+
+
 def run_evaluation(
     agent_name: str,
     benchmark_name: str,
-    config: Dict[str, Any],
+    config: dict[str, Any],
     args: argparse.Namespace,
-    extra_args: List[str],
+    extra_args: list[str],
 ) -> int:
     # Resolve evaluator class via registry (also validates benchmark name)
     evaluator_cls = get_evaluator_class(benchmark_name)
@@ -247,16 +292,17 @@ def run_evaluation(
                 "Ignoring --score-threshold for %s; per-task score_threshold will be used.",
                 benchmark_name,
             )
-        score_threshold: Optional[int] = None
+        score_threshold: int | None = None
     else:
         score_threshold = args.score_threshold if args.score_threshold is not None else 3
 
     # Pack benchmark-private extras unconditionally — evaluators that don't read
     # a given key simply ignore it.
-    extra: Dict[str, Any] = {
+    extra: dict[str, Any] = {
         "eval_strategy": getattr(args, "eval_strategy", None) or "stepwise",
         "force_download": bool(getattr(args, "force_download", False)),
     }
+    extra.update(_parse_extra_args(extra_args))
     if max_tokens is not None:
         extra["max_tokens"] = max_tokens
 
@@ -304,7 +350,7 @@ def run_evaluation(
         logging.getLogger().removeHandler(handler)
         handler.close()
 
-    results_file: Optional[Path] = evaluator.results_path()
+    results_file: Path | None = evaluator.results_path()
     if not results_file.exists():
         results_file = None
 
@@ -345,7 +391,7 @@ def run_evaluation(
     return classification_exit
 
 
-def configure_eval_parser(parser: argparse.ArgumentParser, config: Dict[str, Any]) -> None:
+def configure_eval_parser(parser: argparse.ArgumentParser, config: dict[str, Any]) -> None:
     """Configure arguments for the eval command."""
     add_eval_args(parser)
     parser.add_argument("--data", default=config.get("default", {}).get("data") or config.get("default", {}).get("benchmark", "Online-Mind2Web"))
@@ -395,7 +441,7 @@ def configure_eval_parser(parser: argparse.ArgumentParser, config: Dict[str, Any
     )
 
 
-def eval_command(args: argparse.Namespace, config: Dict[str, Any]) -> int:
+def eval_command(args: argparse.Namespace, config: dict[str, Any]) -> int:
     """Entry point for the eval subcommand."""
     extra_args = getattr(args, "extra_args", [])
     agent_name = normalize_agent_name(args.agent, config)
@@ -404,14 +450,14 @@ def eval_command(args: argparse.Namespace, config: Dict[str, Any]) -> int:
 
 
 @handle_cli_errors
-def main(argv: Optional[List[str]] = None) -> int:
+def main(argv: list[str] | None = None) -> int:
     config = load_config_file(CONFIG_PATH)
     parser = argparse.ArgumentParser(prog="bubench eval")
     configure_eval_parser(parser, config)
     args, extra = parser.parse_known_args(argv)
     if extra:
         logger.info("Forwarding extra arguments: %s", " ".join(extra))
-    setattr(args, "extra_args", extra)
+    args.extra_args = extra
     if args.agent_config is not None:
         cfg_path = args.agent_config
         if not cfg_path.is_absolute():
diff --git a/browseruse_bench/eval/browse_comp/evaluator.py b/browseruse_bench/eval/browse_comp/evaluator.py
index f158a56..92ea145 100644
--- a/browseruse_bench/eval/browse_comp/evaluator.py
+++ b/browseruse_bench/eval/browse_comp/evaluator.py
@@ -4,8 +4,7 @@
 import json
 import logging
 from datetime import UTC, datetime
-from pathlib import Path
-from typing import Any, ClassVar, Dict, List
+from typing import Any, ClassVar
 
 from browseruse_bench.eval.base import BaseEvaluator
 from browseruse_bench.eval.browse_comp.grader import (
@@ -43,7 +42,7 @@ def results_filename(self) -> str:
     def summary_filename(self) -> str:
         return f"BrowseComp_grader_eval_{self.args.model}_summary.json"
 
-    def load_tasks(self) -> Dict[str, Dict[str, Any]]:
+    def load_tasks(self) -> dict[str, dict[str, Any]]:
         tasks_jsonl = REPO_ROOT / "browseruse_bench/data/BrowseComp/task.jsonl"
         return {
             str(task["task_id"]): task
@@ -111,7 +110,7 @@ def evaluate_one(self, task_id, task, agent_result, trajectory_dir):
             agent_response=agent_response,
         )
 
-    def _generate_summary(self, records: List[Dict[str, Any]]) -> None:
+    def _generate_summary(self, records: list[dict[str, Any]]) -> None:
         super()._generate_summary(records)
         path = self.summary_path()
         if not path.exists():
diff --git a/browseruse_bench/eval/browse_comp/grader.py b/browseruse_bench/eval/browse_comp/grader.py
index a31fb14..494f96c 100644
--- a/browseruse_bench/eval/browse_comp/grader.py
+++ b/browseruse_bench/eval/browse_comp/grader.py
@@ -3,7 +3,7 @@
 import re
 import time
 
-from openai import OpenAI
+from openai import APIConnectionError, APIError, OpenAI, RateLimitError
 
 from browseruse_bench.utils import load_eval_config, load_prompt, make_template_prompt
 
@@ -38,12 +38,12 @@ def __call__(self, prompt: str) -> str:
                 )
                 self.last_usage = getattr(response, "usage", None)
                 return response.choices[0].message.content or ""
-            except Exception:
+            except (APIError, APIConnectionError, RateLimitError):
                 if trial < _max_tries - 1:
                     time.sleep(2 ** trial)
                 else:
                     self.last_usage = None
-                    return "correct: no"
+                    raise
         raise RuntimeError("Unreachable: max_tries exhausted without return")
 
 def load_grader_model(model=None, api_key=None, base_url=None, temperature=None):
diff --git a/browseruse_bench/eval/browse_comp/prompts/grader_user.txt b/browseruse_bench/eval/browse_comp/prompts/grader_user.txt
index 1fdc71f..410dedb 100644
--- a/browseruse_bench/eval/browse_comp/prompts/grader_user.txt
+++ b/browseruse_bench/eval/browse_comp/prompts/grader_user.txt
@@ -2,10 +2,17 @@ Judge whether the following [response] to [question] is correct or not based on
 
 [question]: {question}
 
+[response]: {response}
+
+Your judgement must be in the format and criteria specified below:
+
+extracted_final_answer: The final exact answer extracted from the [response]. Put the extracted answer as 'None' if there is no exact, final answer to extract from the response.
+
 [correct_answer]: {correct_answer}
 
-[response]: {response}
+reasoning: Explain why the extracted_final_answer is correct or incorrect based on [correct_answer], focusing only on if there are meaningful differences between [correct_answer] and the extracted_final_answer. Do not comment on any background to the problem, do not attempt to solve the problem, do not argue for any answer different than [correct_answer], focus only on whether the answers match.
+
+correct: Answer 'yes' if extracted_final_answer matches the [correct_answer] given above, or is within a small margin of error for numerical problems. Answer 'no' otherwise, i.e. if there is any inconsistency, ambiguity, non-equivalency, or if the extracted answer is incorrect.
+
 
-Your response should be in the format:
-Reasoning: {{your reasoning}}
-Correct: {{yes or no}}
\ No newline at end of file
+confidence: The extracted confidence score between 0% and 100% from [response]. Put 100 if there is no confidence score available.
diff --git a/browseruse_bench/eval/odysseys/evaluator.py b/browseruse_bench/eval/odysseys/evaluator.py
index 0cbbfdc..9038af7 100644
--- a/browseruse_bench/eval/odysseys/evaluator.py
+++ b/browseruse_bench/eval/odysseys/evaluator.py
@@ -52,7 +52,7 @@ def image_scale_factor(self) -> float:
 
     @property
     def max_screenshots(self) -> int:
-        return int(self.args.extra.get("max_screenshots", 6))
+        return int(self.args.extra.get("max_screenshots", 0))
 
     def results_filename(self) -> str:
         return f"Odysseys_{self.args.model}_rubric_results.json"
@@ -95,6 +95,7 @@ def evaluate_one(
             rubrics=rubrics,
             screenshot_paths=screenshots,
             model=self.model,
+            action_history=agent_result.get("action_history"),
             image_scale_factor=self.image_scale_factor,
             temperature=self.args.temperature or 0.0,
         )
@@ -133,6 +134,7 @@ def evaluate_one(
                 "passed_rubrics": grading["passed_rubrics"],
                 "total_rubrics": grading["total_rubrics"],
                 "screenshot_count": len(screenshots),
+                "rubric_results_official": grading["official_rubric_results"],
             },
         )
 
@@ -178,7 +180,7 @@ def _generate_summary(self, records: list[dict[str, Any]]) -> None:
             details = record.get("evaluation_details") or {}
             metrics = details.get("agent_metrics") or {}
             steps = metrics.get("steps") if isinstance(metrics, dict) else None
-            if isinstance(steps, (int, float)) and steps > 0:
+            if isinstance(steps, int | float) and steps > 0:
                 efficiency_values.append(score / steps)
         trajectory_efficiency = (
             sum(efficiency_values) / len(efficiency_values)
diff --git a/browseruse_bench/eval/odysseys/grader.py b/browseruse_bench/eval/odysseys/grader.py
index 4355e42..874d927 100644
--- a/browseruse_bench/eval/odysseys/grader.py
+++ b/browseruse_bench/eval/odysseys/grader.py
@@ -1,8 +1,8 @@
 """Odysseys rubric grader."""
 from __future__ import annotations
 
-import json
 import logging
+import re
 from pathlib import Path
 from typing import Any
 
@@ -12,88 +12,235 @@
 
 logger = logging.getLogger(__name__)
 
-_SYSTEM_PROMPT = (
-    "You are evaluating a long-horizon browser-agent task using rubric checkpoints. "
-    "Use the task instruction, the agent's final response, and screenshots from the "
-    "trajectory as evidence. Grade each rubric independently. Do not browse the web. "
-    "Return only valid JSON with this shape: "
-    '{"rubric_results":{"R1":{"passed":true,"reasoning":"..."}},"reasoning":"overall notes"}.'
-)
-
-_USER_TEMPLATE = (
-    "Task instruction:\n{task}\n\n"
-    "Agent final response:\n{answer}\n\n"
-    "Rubrics:\n{rubrics}\n\n"
-    "{num} trajectory screenshots are attached in chronological order."
-)
-
-
-def _safe_json_object(text: str) -> dict[str, Any]:
-    stripped = text.strip()
-    try:
-        parsed = json.loads(stripped)
-    except json.JSONDecodeError:
-        start = stripped.find("{")
-        end = stripped.rfind("}")
-        if start == -1 or end == -1 or end <= start:
-            return {}
-        try:
-            parsed = json.loads(stripped[start:end + 1])
-        except json.JSONDecodeError:
-            return {}
-    return parsed if isinstance(parsed, dict) else {}
+FINAL_JUDGMENT_MAX_COMPLETION_TOKENS = 8192
 
+_SYSTEM_PROMPT = """You are an expert evaluator of web-navigation agent trajectories.
 
-def grade_rubrics(
+You will receive:
+- The user task (for context).
+- ONE specific rubric item with a requirement and a verification description.
+- The agent's full action history (one line per step).
+- Every screenshot from the trajectory, in chronological order.
+
+Your goal is to decide whether this single rubric item is satisfied by the trajectory.
+
+Evaluation rules:
+- Judge ONLY the one rubric item you are given; ignore all other implicit requirements.
+- Ground your judgment in what the screenshots and actions actually show. Do not invent state.
+- Filtering / sorting / form requirements must be applied and confirmed to count as satisfied.
+- If the agent was blocked (captcha, access denied, etc.) and therefore could not satisfy the rubric, report failure.
+
+Respond in exactly this format:
+
+Thoughts: <your reasoning, citing specific steps/screenshots>
+Status: "success" or "failure"
+"""
+
+_STATUS_RE = re.compile(r'Status:\s*["\']?(success|failure)["\']?', re.IGNORECASE)
+_THOUGHTS_RE = re.compile(r"Thoughts:\s*(.+?)(?:Status:|$)", re.IGNORECASE | re.DOTALL)
+
+
+def _stringify_action(action: Any) -> str:
+    """Render one action as a stripped line; dicts become ``key=value`` pairs."""
+    if isinstance(action, dict):
+        # Fields that are None or "" are omitted from the rendered line.
+        pairs = [f"{k}={v}" for k, v in action.items() if v not in (None, "")]
+        text = " ".join(pairs)
+    else:
+        text = action if isinstance(action, str) else str(action)
+    return text.strip()
+
+
+def _format_action_history(action_history: Any) -> str:
+    """Render the action history as numbered lines, or a placeholder when empty."""
+    rendered = ""
+    if isinstance(action_history, str):
+        rendered = action_history.strip()
+    elif isinstance(action_history, list):
+        # Numbers follow the original step positions, so blank steps leave gaps.
+        numbered = enumerate(map(_stringify_action, action_history), start=1)
+        rendered = "\n".join(f"{pos}. {text}" for pos, text in numbered if text)
+    # An empty rendering (no usable steps) falls back to the placeholder.
+    return rendered or "No actions recorded."
+
+
+def _iter_rubrics(rubrics: dict[str, Any] | list[Any]) -> list[tuple[str, dict[str, Any]]]:
+    """Normalise rubrics into ``(rubric_id, rubric_dict)`` pairs.
+
+    List items take their ``id`` field or ``R<position>``; non-dicts become requirements.
+    """
+    if isinstance(rubrics, dict):
+        pairs = list(rubrics.items())
+    elif isinstance(rubrics, list):
+        pairs = [
+            (entry.get("id", f"R{pos}") if isinstance(entry, dict) else f"R{pos}", entry)
+            for pos, entry in enumerate(rubrics, start=1)
+        ]
+    else:
+        pairs = []
+    return [(str(key), val if isinstance(val, dict) else {"requirement": str(val)}) for key, val in pairs]
+
+
+def _rubric_prompt(
     task: str,
-    answer: str,
-    rubrics: dict[str, Any],
-    screenshot_paths: list[Path],
-    model: Any,
-    image_scale_factor: float = 1.0,
-    temperature: float = 0.0,
-    max_tokens: int = 2048,
-) -> dict[str, Any]:
-    """Grade Odysseys rubric checkpoints from screenshots and final answer."""
-    rubrics_text = json.dumps(rubrics, ensure_ascii=False, indent=2)
-    user_text = _USER_TEMPLATE.format(
-        task=task,
-        answer=answer or "No answer provided.",
-        rubrics=rubrics_text,
-        num=len(screenshot_paths),
+    rubric_id: str,
+    rubric: dict[str, Any],
+    action_history: str,
+    screenshot_count: int,
+    total_steps: int,
+) -> str:
+    """Build the user-message text asking the judge about one rubric item.
+
+    The ``Verification:`` line is included only when the rubric provides one.
+    """
+    requirement = str(rubric.get("requirement", "")).strip()
+    verification = str(rubric.get("verification", "")).strip()
+    rubric_block = f"Rubric ID: {rubric_id}\nRequirement: {requirement}"
+    if verification:
+        rubric_block += f"\nVerification: {verification}"
+    return "\n\n".join(
+        [
+            f"User Task (context only): {task}",
+            f"Evaluate ONLY this rubric item:\n{rubric_block}",
+            f"Full Action History:\n{action_history}",
+            f"Screenshots attached below: {screenshot_count} (trajectory had {total_steps} total step(s)).",
+            f"Decide whether the rubric ({rubric_id}) is satisfied. Use the required 'Thoughts:' / 'Status:' format.",
+        ]
     )
 
-    messages: list[dict[str, Any]] = [
-        {"role": "system", "content": _SYSTEM_PROMPT},
-        {"role": "user", "content": [{"type": "text", "text": user_text}]},
-    ]
 
+def _parse_status(response: str) -> tuple[bool, str]:
+    """Return ``(passed, reasoning)`` parsed from a 'Thoughts:' / 'Status:' reply."""
+    verdict, thoughts = _STATUS_RE.search(response), _THOUGHTS_RE.search(response)
+    passed = verdict is not None and verdict.group(1).lower() == "success"
+    return passed, (thoughts.group(1) if thoughts else response).strip()
+
+
+def _image_items(screenshot_paths: list[Path], image_scale_factor: float) -> list[dict[str, Any]]:
+    """Encode each readable screenshot as an ``image_url`` content item; skip unreadable ones."""
+    items: list[dict[str, Any]] = []
     for path in screenshot_paths:
         try:
-            img = Image.open(path)
-            b64 = encode_image(img, scale_factor=image_scale_factor)
-            messages[1]["content"].append({
-                "type": "image_url",
-                "image_url": {"url": f"data:image/jpeg;base64,{b64}", "detail": "high"},
-            })
+            # The context manager releases the file handle once the image is encoded.
+            with Image.open(path) as img:
+                b64 = encode_image(img, scale_factor=image_scale_factor)
         except OSError as exc:
             logger.warning("Failed to load screenshot %s: %s", path, exc)
+            continue
+        url = f"data:image/jpeg;base64,{b64}"
+        items.append({"type": "image_url", "image_url": {"url": url, "detail": "high"}})
+    return items
+
+
+def _usage_to_dict(usage: Any) -> dict[str, Any] | None:
+    """Flatten a usage payload (mapping or attribute object) into token counters.
+
+    Returns ``None`` if nothing readable is given; top-level ``cached_tokens`` wins.
+    """
+    if usage is None:
+        return None
+    if isinstance(usage, dict):
+        fields = usage
+    elif hasattr(usage, "model_dump"):
+        fields = usage.model_dump()
+    elif hasattr(usage, "__dict__"):
+        fields = usage.__dict__
+    else:
+        return None
+    prompt = int(fields.get("prompt_tokens") or 0)
+    completion = int(fields.get("completion_tokens") or 0)
+    total = int(fields.get("total_tokens") or prompt + completion)
+    # The nested details may be a mapping or an attribute object; unwrap the latter.
+    details = fields.get("prompt_tokens_details") or {}
+    details = (details.model_dump() if hasattr(details, "model_dump")
+               else getattr(details, "__dict__", details))
+    nested = int(details.get("cached_tokens") or 0) if isinstance(details, dict) else 0
+    cached = int(fields.get("cached_tokens") or nested)
 
-    messages[1]["content"].append({"type": "text", "text": "JSON verdict:"})
-    response = model.generate(messages, max_tokens=max_tokens, temperature=temperature)
-    parsed = _safe_json_object(response)
-    raw_results = parsed.get("rubric_results")
-    rubric_results = raw_results if isinstance(raw_results, dict) else {}
+    return {
+        "prompt_tokens": prompt,
+        "completion_tokens": completion,
+        "total_tokens": total,
+        "cached_tokens": cached,
+        "non_cached_prompt": max(0, prompt - cached),
+        "prompt_tokens_details": {"cached_tokens": cached},
+    }
+
+
+def _aggregate_usages(usages: list[dict[str, Any]]) -> dict[str, Any] | None:
+    """Sum per-call usage dicts; ``None`` when empty. Missing totals use prompt + completion."""
+    if not usages:
+        return None
+    prompt = completion = cached = total = 0
+    for entry in usages:
+        step_prompt = int(entry.get("prompt_tokens") or 0)
+        step_completion = int(entry.get("completion_tokens") or 0)
+        prompt, completion = prompt + step_prompt, completion + step_completion
+        cached += int(entry.get("cached_tokens") or 0)
+        total += int(entry.get("total_tokens") or 0) or step_prompt + step_completion
+    return {
+        "prompt_tokens": prompt,
+        "completion_tokens": completion,
+        "total_tokens": total,
+        "cached_tokens": cached,
+        "non_cached_prompt": max(0, prompt - cached),
+        "prompt_tokens_details": {"cached_tokens": cached},
+    }
+
+
+def grade_rubrics(
+    task: str,
+    answer: str,
+    rubrics: dict[str, Any] | list[Any],
+    screenshot_paths: list[Path],
+    model: Any,
+    action_history: Any = None,
+    image_scale_factor: float = 1.0,
+    temperature: float = 0.0,
+    max_tokens: int = FINAL_JUDGMENT_MAX_COMPLETION_TOKENS,
+) -> dict[str, Any]:
+    """Grade Odysseys rubric checkpoints using the official per-rubric protocol."""
+    rubric_items = _iter_rubrics(rubrics)
+    action_history_text = _format_action_history(action_history)
+    images = _image_items(screenshot_paths, image_scale_factor)
+    total_steps = len(action_history) if isinstance(action_history, list) else len(screenshot_paths)
 
     normalized: dict[str, dict[str, Any]] = {}
-    for rubric_id in rubrics:
-        result = rubric_results.get(rubric_id, {})
-        if not isinstance(result, dict):
-            result = {}
+    raw_responses: list[str] = []
+    official_results: list[dict[str, Any]] = []
+    usages: list[dict[str, Any]] = []
+    for rubric_id, rubric in rubric_items:
+        user_text = _rubric_prompt(
+            task=task,
+            rubric_id=rubric_id,
+            rubric=rubric,
+            action_history=action_history_text,
+            screenshot_count=len(images),
+            total_steps=total_steps,
+        )
+        messages = [
+            {"role": "system", "content": _SYSTEM_PROMPT},
+            {"role": "user", "content": [{"type": "text", "text": user_text}] + images},
+        ]
+        response = model.generate(messages, max_tokens=max_tokens, temperature=temperature)
+        usage = _usage_to_dict(getattr(model, "last_usage", None))
+        if usage is not None:
+            usages.append(usage)
+        raw_responses.append(f"### {rubric_id}\n{response}")
+        success, reasoning = _parse_status(response)
         normalized[rubric_id] = {
-            "passed": bool(result.get("passed")),
-            "reasoning": str(result.get("reasoning") or ""),
+            "passed": success,
+            "reasoning": reasoning,
         }
+        official_results.append({
+            "rubric_id": rubric_id,
+            "requirement": str(rubric.get("requirement", "")),
+            "verification": str(rubric.get("verification", "")),
+            "score": 1 if success else 0,
+            "success": success,
+            "reasoning": reasoning,
+            "response": response,
+        })
 
     passed = sum(1 for result in normalized.values() if result["passed"])
     total = len(normalized)
@@ -105,9 +252,13 @@ def grade_rubrics(
         "passed_rubrics": passed,
         "total_rubrics": total,
         "is_correct": total > 0 and passed == total,
-        "response": response,
-        "reasoning": str(parsed.get("reasoning") or response.strip()),
-        "usage": getattr(model, "last_usage", None),
+        "response": "\n\n".join(raw_responses),
+        "reasoning": "\n".join(
+            f"{rubric_id}: {result['reasoning']}"
+            for rubric_id, result in normalized.items()
+        ),
+        "usage": _aggregate_usages(usages),
         "system_prompt": _SYSTEM_PROMPT,
-        "user_prompt": user_text,
+        "action_history": action_history_text,
+        "official_rubric_results": official_results,
     }
diff --git a/tests/browseruse_bench/test_browsecomp.py b/tests/browseruse_bench/test_browsecomp.py
new file mode 100644
index 0000000..17d59e8
--- /dev/null
+++ b/tests/browseruse_bench/test_browsecomp.py
@@ -0,0 +1,46 @@
+"""Tests for BrowseComp evaluator helpers."""
+from __future__ import annotations
+
+
+class _RecordingGrader:
+    """Grader stub: replays a canned reply and remembers the last prompt it saw."""
+
+    def __init__(self, response: str):
+        self.response, self.last_usage, self.prompt = response, None, ""
+
+    def __call__(self, prompt: str) -> str:
+        self.prompt = prompt
+        return self.response
+
+
+def test_grade_response_uses_official_grader_fields():
+    """The rendered prompt carries the official fields and a 'yes' verdict passes."""
+    from browseruse_bench.eval.browse_comp.grader import grade_response
+
+    grader = _RecordingGrader("reasoning: same\ncorrect: yes\nconfidence: 100")
+    outcome = grade_response(
+        question="Question?",
+        correct_answer="Answer",
+        agent_response="Exact Answer: Answer",
+        grader_fn=grader,
+    )
+
+    assert outcome["is_correct"] is True
+    expected_fragments = ("extracted_final_answer:", "confidence:", "[correct_answer]: Answer")
+    for fragment in expected_fragments:
+        assert fragment in grader.prompt
+
+
+def test_grade_response_defaults_to_incorrect_without_verdict():
+    """A grader reply with no 'correct:' verdict is scored as incorrect."""
+    from browseruse_bench.eval.browse_comp.grader import grade_response
+
+    undecided = _RecordingGrader("I cannot decide.")
+    outcome = grade_response(
+        question="Question?",
+        correct_answer="Answer",
+        agent_response="Wrong",
+        grader_fn=undecided,
+    )
+
+    assert outcome["is_correct"] is False
diff --git a/tests/browseruse_bench/test_eval_cli.py b/tests/browseruse_bench/test_eval_cli.py
new file mode 100644
index 0000000..5463bbe
--- /dev/null
+++ b/tests/browseruse_bench/test_eval_cli.py
@@ -0,0 +1,24 @@
+from __future__ import annotations
+
+import pytest
+
+from browseruse_bench.cli.eval import _parse_extra_args
+
+
+def test_parse_eval_extra_args_coerces_private_options() -> None:
+    """Space-separated, ``=``-joined and bare-flag options are parsed and coerced."""
+    argv = "--max-screenshots 50 --image-scale-factor=0.5 --use-cache false --dry-private-flag".split()
+    expected = {
+        "max_screenshots": 50,
+        "image_scale_factor": 0.5,
+        "use_cache": False,
+        "dry_private_flag": True,
+    }
+
+    assert _parse_extra_args(argv) == expected
+
+
+def test_parse_eval_extra_args_rejects_positional() -> None:
+    """A token that does not start with ``--`` aborts with ``SystemExit``."""
+    with pytest.raises(SystemExit):
+        _parse_extra_args(["unexpected"])
diff --git a/tests/browseruse_bench/test_odysseys.py b/tests/browseruse_bench/test_odysseys.py
index 8d73b79..d46c6ed 100644
--- a/tests/browseruse_bench/test_odysseys.py
+++ b/tests/browseruse_bench/test_odysseys.py
@@ -101,13 +101,12 @@ def _make_mock_model(self, response_text: str) -> MagicMock:
     def test_grade_rubrics_computes_partial_score(self):
         from browseruse_bench.eval.odysseys.grader import grade_rubrics
 
-        model = self._make_mock_model(json.dumps({
-            "rubric_results": {
-                "R1": {"passed": True, "reasoning": "done"},
-                "R2": {"passed": False, "reasoning": "missing"},
-            },
-            "reasoning": "partial",
-        }))
+        model = MagicMock()
+        model.generate.side_effect = [
+            'Thoughts: The first checkpoint is visible.\nStatus: "success"',
+            'Thoughts: The second checkpoint is missing.\nStatus: "failure"',
+        ]
+        model.last_usage = None
 
         result = grade_rubrics(
             task="Do a long task.",
@@ -115,22 +114,28 @@ def test_grade_rubrics_computes_partial_score(self):
             rubrics={"R1": {}, "R2": {}},
             screenshot_paths=[],
             model=model,
+            action_history=["Open search", "Inspect result"],
         )
 
         assert result["passed_rubrics"] == 1
         assert result["total_rubrics"] == 2
         assert result["rubric_score"] == 0.5
         assert result["is_correct"] is False
+        assert model.generate.call_count == 2
+        first_prompt = model.generate.call_args_list[0].args[0][1]["content"][0]["text"]
+        assert "Evaluate ONLY this rubric item" in first_prompt
+        assert "Full Action History" in first_prompt
+        assert "1. Open search" in first_prompt
 
     def test_grade_rubrics_marks_perfect_success(self):
         from browseruse_bench.eval.odysseys.grader import grade_rubrics
 
-        model = self._make_mock_model(json.dumps({
-            "rubric_results": {
-                "R1": {"passed": True},
-                "R2": {"passed": True},
-            }
-        }))
+        model = MagicMock()
+        model.generate.side_effect = [
+            'Thoughts: Done.\nStatus: "success"',
+            'Thoughts: Also done.\nStatus: "success"',
+        ]
+        model.last_usage = None
 
         result = grade_rubrics(
             task="Do a long task.",
@@ -143,6 +148,54 @@ def test_grade_rubrics_marks_perfect_success(self):
         assert result["rubric_score"] == 1.0
         assert result["is_correct"] is True
 
+    def test_grade_rubrics_marks_missing_status_as_failure(self):
+        """A reply without a ``Status:`` line counts as a failed rubric."""
+        from browseruse_bench.eval.odysseys.grader import grade_rubrics
+
+        outcome = grade_rubrics(
+            task="Do a long task.",
+            answer="Done.",
+            rubrics={"R1": {"requirement": "Find evidence."}},
+            screenshot_paths=[],
+            model=self._make_mock_model("Thoughts: I cannot tell."),
+        )
+
+        r1_passed = outcome["rubric_results"]["R1"]["passed"]
+        assert r1_passed is False
+        assert outcome["rubric_score"] == 0.0
+
+    def test_grade_rubrics_accumulates_usage_across_rubrics(self):
+        """Usage reported by each per-rubric call is summed in the result."""
+        from browseruse_bench.eval.odysseys.grader import grade_rubrics
+
+        class UsageModel:
+            """Model stub whose reported usage grows with every call."""
+            calls = 0
+            last_usage = None
+
+            def generate(self, *_args, **_kwargs):
+                self.calls += 1
+                self.last_usage = {
+                    "prompt_tokens": 100 * self.calls,
+                    "completion_tokens": 10 * self.calls,
+                    "total_tokens": 110 * self.calls,
+                    "prompt_tokens_details": {"cached_tokens": 5 * self.calls},
+                }
+                return 'Thoughts: Done.\nStatus: "success"'
+
+        usage = grade_rubrics(
+            task="Do a long task.",
+            answer="Done.",
+            rubrics={"R1": {}, "R2": {}},
+            screenshot_paths=[],
+            model=UsageModel(),
+        )["usage"]
+
+        expected = {"prompt_tokens": 300, "completion_tokens": 30, "total_tokens": 330}
+        assert {key: usage[key] for key in expected} == expected
+        assert usage["cached_tokens"] == 15
+        assert usage["prompt_tokens_details"]["cached_tokens"] == 15
+
 
 class TestOdysseysEvaluatorLoadTasks:
     """Tests for OdysseysEvaluator.load_tasks()."""