From b2f1c2a1143fc8780f143ee30f161a6d079f40c8 Mon Sep 17 00:00:00 2001 From: XuweiDing Date: Thu, 25 Jun 2026 12:15:29 +0800 Subject: [PATCH 1/2] Add LexBench post-attribution rerun workflow --- README.md | 31 + README_ZH.md | 31 + .../prompts/failure_taxonomy_system.txt | 77 ++ docs/lexbench-automated-evaluation-system.md | 356 +++++++ docs/rerun-rule-validation-12-models.md | 143 +++ docs/result-rerun-check-rules.md | 424 ++++++++ scripts/audit_m3_3_api_log_failures.py | 429 ++++++++ scripts/build_generation_comparison_docs.py | 568 +++++++++++ scripts/collect_lexbench_rerun_candidates.py | 618 +++++++++++ scripts/judge_lexbench_failure_taxonomy.py | 957 ++++++++++++++++++ scripts/plot_failure_taxonomy_figure.py | 325 ++++++ scripts/plot_generation_failure_comparison.py | 194 ++++ 12 files changed, 4153 insertions(+) create mode 100644 browseruse_bench/eval/lexbench_browser/prompts/failure_taxonomy_system.txt create mode 100644 docs/lexbench-automated-evaluation-system.md create mode 100644 docs/rerun-rule-validation-12-models.md create mode 100644 docs/result-rerun-check-rules.md create mode 100644 scripts/audit_m3_3_api_log_failures.py create mode 100644 scripts/build_generation_comparison_docs.py create mode 100644 scripts/collect_lexbench_rerun_candidates.py create mode 100644 scripts/judge_lexbench_failure_taxonomy.py create mode 100644 scripts/plot_failure_taxonomy_figure.py create mode 100644 scripts/plot_generation_failure_comparison.py diff --git a/README.md b/README.md index 4672f64..5c29ced 100644 --- a/README.md +++ b/README.md @@ -201,6 +201,37 @@ bubench eval --agent browser-use --data LexBench-Browser --model-id gpt-4.1 > `--split` is optional — the benchmark's `default_split` (from `data_info.json`) is used automatically. Pass `--split ` only to override the default. > For the full parameter reference, see the [Quickstart docs](https://docs.bubench.lexmount.io/en/quickstart). 
+**Post-attribution rerun workflow** + +For LexBench-Browser result analysis, use the automated post-run workflow: + +```text +run benchmark -> eval -> failure attribution -> post-attribution rerun check +-> rerun selected tasks -> re-eval -> final attribution / visualization +``` + +The final rerun candidate set is: + +```text +result_json_hard +∪ latest_agent_run_log_hard +∪ taxonomy_primary_M3.2_or_M3.3 +``` + +Generate rerun task ids after evaluation and failure attribution: + +```bash +PYTHONPATH=. python scripts/collect_lexbench_rerun_candidates.py \ + --model MODEL_DIR_NAME \ + --timestamp TIMESTAMP \ + --artifact-mode hard \ + --include-taxonomy-web-constraints +``` + +See [LexBench automated evaluation system](docs/lexbench-automated-evaluation-system.md), +[rerun check rules](docs/result-rerun-check-rules.md), and +[12-model rerun rule validation](docs/rerun-rule-validation-12-models.md). + ## Data Loading Use `--data-source` to control where benchmark data is loaded from: diff --git a/README_ZH.md b/README_ZH.md index d653035..354ad10 100644 --- a/README_ZH.md +++ b/README_ZH.md @@ -200,6 +200,37 @@ bubench eval --agent browser-use --data LexBench-Browser --model-id gpt-4.1 > 全量参数说明见[快速开始文档](https://docs.bubench.lexmount.io/zh/quickstart)。 +**Post-attribution 重测流程** + +LexBench-Browser 结果分析推荐使用这套自动化 post-run 流程: + +```text +run benchmark -> eval -> failure attribution -> post-attribution rerun check +-> rerun selected tasks -> re-eval -> final attribution / visualization +``` + +最终 rerun candidate 集合是: + +```text +result_json_hard +∪ latest_agent_run_log_hard +∪ taxonomy_primary_M3.2_or_M3.3 +``` + +在 eval 和 failure attribution 之后生成 rerun task ids: + +```bash +PYTHONPATH=. 
python scripts/collect_lexbench_rerun_candidates.py \ + --model MODEL_DIR_NAME \ + --timestamp TIMESTAMP \ + --artifact-mode hard \ + --include-taxonomy-web-constraints +``` + +详见 [LexBench 自动化评测体系](docs/lexbench-automated-evaluation-system.md)、 +[rerun check rules](docs/result-rerun-check-rules.md) 和 +[12-model rerun rule validation](docs/rerun-rule-validation-12-models.md)。 + ## 数据加载 通过 `--data-source` 控制数据来源: diff --git a/browseruse_bench/eval/lexbench_browser/prompts/failure_taxonomy_system.txt b/browseruse_bench/eval/lexbench_browser/prompts/failure_taxonomy_system.txt new file mode 100644 index 0000000..58586f9 --- /dev/null +++ b/browseruse_bench/eval/lexbench_browser/prompts/failure_taxonomy_system.txt @@ -0,0 +1,77 @@ +You are an expert browser-agent benchmark analyst. A browser agent failed a LexBench-Browser task. Classify the failure into the taxonomy below. + +Use the supplied task spec, scoring rubric, evaluator feedback, agent final answer, compact action trace, runtime result, and screenshots. Prefer the evidence from the trajectory and evaluator feedback over assumptions. Do not rely on the old A1/B1/C1 failure category except as weak auxiliary context. + +## Taxonomy + +### M1 Task Reasoning +Failures in task understanding, decision making, selection, evidence use, or safety judgment. + +M1.1 Requirement Following +The agent misses explicit task requirements, required websites, required fields, required output format, required number of items, or the required safety/legal response. Use this for incomplete fulfillment of the user's objective even when the browser interactions were technically possible. + +M1.2 Target Selection +The agent applies the wrong scope, entity, date, city, item, channel, season, product, ranking criterion, filter, sort order, or comparison logic. 
Use this when it reaches usable pages but chooses the wrong target or fails to enforce "latest", "highest", "most viewed", "top N", date windows, or cross-platform comparison criteria. + +M1.3 Evidence Grounding +The agent fails to extract information that is available, extracts the wrong fields, mixes fields from different items, fabricates or hallucinates values, reports unverifiable data, or answers without enough evidence. Use this when the central problem is grounding and information fidelity. + +### M2 Action Execution +Failures in controlling the browser-agent loop, UI operations, recovery behavior, or tool/output protocol. These are agent capability failures, not external website failures, unless the page is blocked or unavailable. + +M2.1 UI Misoperation +The agent cannot operate normal UI elements: search boxes, buttons, date pickers, dropdowns, filters, tabs, popups, modals, pagination, detail-page links, window/tab switching, or page scrolling. Use this when the site is accessible but the agent cannot drive the interface to the needed state. + +M2.2 Infinite Loop +The agent repeats ineffective actions, gets stuck, fails to recover from a bad page state, runs out of steps, times out, or completes only a small part of a long multi-item task due to poor workflow control. Use this for loops, dead ends, and poor long-horizon task management. + +M2.3 Format Breakdown +The agent fails because of malformed JSON action output, invalid tool-call structure, parser failures, missing final response, model service no-response, failed file saving, corrupted artifacts, or required output files not being produced. Use this only when protocol or artifact generation is a direct cause of failure. + +### M3 Web Constraints +Failures mainly caused by external web environment constraints. These may still expose agent limits, but the primary obstacle is the website or access environment. 
+ +M3.1 Bot Defense +The target site blocks automation with CAPTCHA, Cloudflare, PerimeterX, slider verification, "robot or human", 403 caused by automation, rate limits, "Too Many Requests", security control, abnormal traffic, or similar bot-detection defenses. + +M3.2 Access Barrier +The needed content or action is blocked by login, session expiry, SMS/QR authentication, membership, VIP, paywall, permissions, account-only views, paid downloads, copyright restrictions, or regional access restrictions. + +M3.3 Site Limitation +The site is down, unreachable, returns 404/server errors, has empty DOM or SPA rendering failure, does not expose the requested content, lacks the requested filter/data, or the target content genuinely does not exist on the specified site. Use this when the environment itself makes the task impossible or under-specified. + +## OTHER + +Use OTHER only when none of the nine categories captures the core failure. If OTHER is used, provide a short phrase in other_phrase. Do not use OTHER for common combinations of the above categories. Prefer assigning one or more existing categories whenever possible. + +## Multi-label rules + +- Assign every category that substantially contributed to the failed outcome. +- A trajectory may have one or multiple codes. +- Err on the side of inclusion for real contributing failures, but do not add categories that are only mentioned in the task text. +- Choose primary_code as the most direct cause that explains why the run failed. +- If the agent is blocked by CAPTCHA or rate limiting, include M3.1 even if it also fails later. +- If the page is accessible but the agent misses filters, sorting, or target selection, use M1.2, not M3.3. +- If the page is accessible and the answer is unsupported, use M1.3. +- If the agent cannot click or manipulate a normal accessible interface, use M2.1. +- If repeated ineffective attempts, timeout, or step exhaustion prevent completion, use M2.2. 
+- If the run stops because the model produced malformed JSON, tool-call parsing failed, or no final response was produced, use M2.3. + +## Output + +Return only a JSON object matching this schema: + +{ + "primary_code": "M1.1", + "codes": ["M1.1", "M2.2"], + "other_phrase": null, + "confidence": "high", + "reasoning": "Short evidence-based explanation.", + "evidence": [ + "Concrete evidence from evaluator feedback or trajectory.", + "Concrete evidence from agent answer or screenshot." + ] +} + +Allowed codes are M1.1, M1.2, M1.3, M2.1, M2.2, M2.3, M3.1, M3.2, M3.3, OTHER. +confidence must be high, medium, or low. diff --git a/docs/lexbench-automated-evaluation-system.md b/docs/lexbench-automated-evaluation-system.md new file mode 100644 index 0000000..cbac5a3 --- /dev/null +++ b/docs/lexbench-automated-evaluation-system.md @@ -0,0 +1,356 @@ +# LexBench Automated Evaluation System + +This branch packages the LexBench-Browser post-run workflow into one ordered +automation pipeline: + +```text +run benchmark +→ eval +→ failure attribution +→ post-attribution rerun check +→ rerun selected tasks +→ re-eval +→ final failure attribution / visualization +``` + +The final rerun check is intentionally **post-attribution**. Artifact-only +signals are still useful as hard infrastructure checks, but they are not enough +to get high M3.2/M3.3 recall with bounded false positives. 
+ +## File Map + +Core rerun rules: + +```text +docs/result-rerun-check-rules.md +docs/rerun-rule-validation-12-models.md +scripts/collect_lexbench_rerun_candidates.py +scripts/audit_m3_3_api_log_failures.py +``` + +Failure attribution prompt and runner: + +```text +browseruse_bench/eval/lexbench_browser/prompts/failure_taxonomy_system.txt +scripts/judge_lexbench_failure_taxonomy.py +``` + +Failure-attribution visualizations and reports: + +```text +scripts/plot_failure_taxonomy_figure.py +scripts/plot_generation_failure_comparison.py +scripts/build_generation_comparison_docs.py +``` + +Standard benchmark/eval entrypoints: + +```text +scripts/run.py +scripts/eval.py +``` + +## Stage 1: Run Benchmark + +Run the benchmark normally. This produces task workspaces and agent execution +logs: + +```text +experiments/LexBench-Browser/All/browser-use/MODEL/TIMESTAMP/tasks/*/result.json +experiments/LexBench-Browser/All/browser-use/MODEL/TIMESTAMP/tasks/*/api_logs/ +output/logs/run/*.log +``` + +## Stage 2: Evaluate Run Results + +Run the normal LexBench-Browser evaluator: + +```zsh +PYTHONPATH=. ./.venvs/browser_use/bin/python scripts/eval.py \ + --data LexBench-Browser \ + --split All \ + --agent browser-use \ + --model MODEL_CONFIG_KEY \ + --timestamp TIMESTAMP +``` + +The expected eval output is: + +```text +experiments/LexBench-Browser/All/browser-use/MODEL/TIMESTAMP/tasks_eval_result/ + task_gpt-4.1_per_task_threshold_stepwise_eval_results.json +``` + +## Stage 3: Failure Attribution + +Run failure attribution after evaluation. This classifies evaluator-failed tasks +into the M1/M2/M3 taxonomy. + +Prompt: + +```text +browseruse_bench/eval/lexbench_browser/prompts/failure_taxonomy_system.txt +``` + +Runner: + +```zsh +PYTHONPATH=. 
python scripts/judge_lexbench_failure_taxonomy.py \ + --experiments-root /Users/abc/Desktop/lexmount/browseruse-agent-bench/experiments/LexBench-Browser/All/browser-use \ + --models MODEL_DIR_NAME \ + --eval-filename task_gpt-4.1_per_task_threshold_stepwise_eval_results.json \ + --model gpt-5.5-judge \ + --include-judge-in-output \ + --num-workers 4 +``` + +Default output: + +```text +tasks_eval_result/ + task_gpt-4.1_per_task_threshold_stepwise_failure_taxonomy_gpt-5.5-judge.jsonl + task_gpt-4.1_per_task_threshold_stepwise_failure_taxonomy_gpt-5.5-judge_summary.json +``` + +## Stage 4: Post-Attribution Rerun Check + +This is the recommended final rerun/review pool for reducing M3.2/M3.3: + +```zsh +PYTHONPATH=. python scripts/collect_lexbench_rerun_candidates.py \ + --model MODEL_DIR_NAME \ + --timestamp TIMESTAMP \ + --artifact-mode hard \ + --include-taxonomy-web-constraints +``` + +The final post-attribution set is: + +```text +result_json_hard +∪ latest_agent_run_log_hard +∪ taxonomy_primary_M3.2_or_M3.3 +``` + +Where: + +- `result_json_hard` catches missing/invalid results, `agent_done == error`, `env_status == failed`, early `max_steps`, and suspicious early `timeout`. +- `latest_agent_run_log_hard` catches `Stopping due to 5 consecutive failures`, `Result failed 6/6 times: LLM call timed out`, and `ERR_TUNNEL_CONNECTION_FAILED` from the latest matching agent execution log. +- `taxonomy_primary_M3.2_or_M3.3` catches attribution primary-code `M3.2 Access Barrier` or `M3.3 Site Limitation`. 
+ +Outputs are written to: + +```text +experiments/LexBench-Browser/All/browser-use/MODEL/TIMESTAMP/rerun_candidates/ + rerun_candidates.json + rerun_candidates.csv + rerun_candidates_summary.md + rerun_task_ids.txt +``` + +On the 12 current model runs, this reached: + +```text +M3.2/M3.3 target: 171 +hit: 171 +recall: 100.0% +total candidates: 219 +false positives vs primary M3.2/M3.3: 48 +``` + +There is also a provisional artifact-only mode for debugging before attribution +exists: + +```zsh +PYTHONPATH=. python scripts/collect_lexbench_rerun_candidates.py \ + --model MODEL_DIR_NAME \ + --timestamp TIMESTAMP +``` + +This provisional mode reads only `result.json`, `api_logs`, and +`output/logs/run`. It is not the final high-recall rule. + +Detailed rule definitions live in: + +```text +docs/result-rerun-check-rules.md +``` + +By default, the provisional artifact-only scanner does not include repeated +parse/LLM-timeout-only api-log evidence unless there is also +access/render/session evidence. To include those protocol-only candidates as an +even broader debugging pool, add: + +```zsh +--include-protocol-only +``` + +## Stage 5: Rerun Candidates + +Read task ids from: + +```zsh +IDS="$(cat experiments/LexBench-Browser/All/browser-use/MODEL/TIMESTAMP/rerun_candidates/rerun_task_ids.txt)" +``` + +Then rerun those tasks in the same timestamp: + +```zsh +PYTHONPATH=. ./.venvs/browser_use/bin/python scripts/run.py \ + --agent browser-use \ + --data LexBench-Browser \ + --split All \ + --model MODEL_CONFIG_KEY \ + --timestamp TIMESTAMP \ + --mode specific \ + --task-ids $IDS \ + --concurrency 3 \ + --timeout 1800 \ + --no-group-by-site +``` + +Do not use `--skip-completed` for this rerun. These tasks are intentionally being overwritten/retested. + +## Stage 6: Re-Evaluate Rerun Results + +After rerun, run the evaluator again: + +```zsh +PYTHONPATH=. 
./.venvs/browser_use/bin/python scripts/eval.py \ + --data LexBench-Browser \ + --split All \ + --agent browser-use \ + --model MODEL_CONFIG_KEY \ + --timestamp TIMESTAMP +``` + +The expected eval output is still under: + +```text +experiments/LexBench-Browser/All/browser-use/MODEL/TIMESTAMP/tasks_eval_result/ + task_gpt-4.1_per_task_threshold_stepwise_eval_results.json +``` + +## Stage 7: Re-Run Failure Attribution + +After rerun and re-eval, run failure attribution again so final analysis uses the +latest task outcomes. + +Prompt: + +```text +browseruse_bench/eval/lexbench_browser/prompts/failure_taxonomy_system.txt +``` + +Runner: + +```zsh +PYTHONPATH=. python scripts/judge_lexbench_failure_taxonomy.py \ + --experiments-root /Users/abc/Desktop/lexmount/browseruse-agent-bench/experiments/LexBench-Browser/All/browser-use \ + --models MODEL_DIR_NAME \ + --eval-filename task_gpt-4.1_per_task_threshold_stepwise_eval_results.json \ + --model gpt-5.5-judge \ + --include-judge-in-output \ + --num-workers 4 +``` + +Default output: + +```text +tasks_eval_result/ + task_gpt-4.1_per_task_threshold_stepwise_failure_taxonomy_gpt-5.5-judge.jsonl + task_gpt-4.1_per_task_threshold_stepwise_failure_taxonomy_gpt-5.5-judge_summary.json +``` + +The taxonomy output is both part of the post-attribution rerun check and the +input to model capability analysis. + +## Stage 8: Validate Rerun Rule Recall + +Use taxonomy output to measure whether the post-attribution rerun rule covers +M3.2/M3.3 while keeping false positives bounded. The current validation record is: + +```text +docs/rerun-rule-validation-12-models.md +``` + +Current 12-model result: + +```text +M3.2/M3.3 target: 171 +hit: 171 +recall: 100.0% +total candidates: 219 +false positives vs primary M3.2/M3.3: 48 +``` + +For auxiliary M3.3-specific api-log audits, use: + +```zsh +PYTHONPATH=. 
python scripts/audit_m3_3_api_log_failures.py \ + --root /Users/abc/Desktop/lexmount/browseruse-agent-bench/experiments/LexBench-Browser/All/browser-use +``` + +This writes: + +```text +experiments/LexBench-Browser/All/browser-use/failure_taxonomy_review/ + m3_3_api_log_failure_scan.json + m3_3_api_log_failure_scan.csv + m3_3_api_log_failure_scan_summary.md +``` + +This audit is for diagnosis and rule validation. It is not the final rerun +selection rule. + +## Stage 9: Visualize Failure Attribution + +Main failure taxonomy figure: + +```zsh +PYTHONPATH=. python scripts/plot_failure_taxonomy_figure.py +``` + +Generation comparison figure: + +```zsh +PYTHONPATH=. python scripts/plot_generation_failure_comparison.py +``` + +Generation comparison document: + +```zsh +PYTHONPATH=. python scripts/build_generation_comparison_docs.py +``` + +Outputs are written under: + +```text +reports/ +reports/assets/ +``` + +`plot_failure_taxonomy_figure.py` also writes paper figures to the configured paper figure directory: + +```text +/Users/abc/Desktop/lexmount/lexbench_arxiv_paper/lexmount_tech_report/fig +``` + +## Recommended End-to-End Order + +```text +1. Run benchmark tasks. +2. Evaluate task results. +3. Run failure attribution on evaluator-failed tasks. +4. Run post-attribution rerun check. +5. Rerun selected candidates. +6. Re-evaluate rerun results. +7. Re-run failure attribution on final failures. +8. Generate taxonomy figures/reports. +9. Optionally cross-check rerun-rule recall against M3.2/M3.3. +``` + +Keep these two concepts separate: + +- **Post-attribution rerun check** answers: "Which tasks should be rerun to reduce M3.2/M3.3 and hard run-artifact failures?" +- **Failure attribution** answers: "For the evaluated failed trajectory, what capability or web-constraint category best explains the failure?" 
diff --git a/docs/rerun-rule-validation-12-models.md b/docs/rerun-rule-validation-12-models.md new file mode 100644 index 0000000..2e7d211 --- /dev/null +++ b/docs/rerun-rule-validation-12-models.md @@ -0,0 +1,143 @@ +# Rerun Rule Validation on 12 Model Runs + +Validation date: 2026-06-25 + +Scope: + +```text +experiments/LexBench-Browser/All/browser-use/*/*/ +``` + +Runs with failure-taxonomy output: 12 + +Validation target: + +```text +primary_code in {M3.2, M3.3} +``` + +M3.1 is excluded from the target because bot defense is usually an inherent +website/automation constraint rather than a rerun-fixable environment failure. + +## Key Finding + +Artifact-only rules cannot simultaneously achieve high M3.2/M3.3 recall and low +false positives. + +Reason: + +- Stable DOM/LLM failures are usually visible through `max_steps` or + `Stopping due to 5 consecutive failures`. +- But many M3.3 rows are semantic site/content limitations. Their logs can look + similar to M1/M2 failures or transient loading states. +- Broad `api_logs` render/session scans catch more M3.3, but also catch many + M1/M2/M3.1 and evaluator-passed tasks. + +## Rule Iterations + +### Artifact-Only Strict Rule + +Definition: + +```text +result_json_hard +∪ latest_agent_run_log_hard +∪ constrained result/api DOM/access evidence +``` + +Constraints: + +- Do not apply api-log soft rules to evaluator-passed tasks. +- Skip api-log soft rules when bot-defense signals are present. +- Do not use repeated parse/LLM-timeout-only api-log evidence by default. +- Use higher empty-DOM thresholds to avoid transient loading false positives. + +Result: + +```text +M3.2/M3.3 target: 171 +hit: 92 +recall: 53.8% +total candidates: 191 +false positives vs M3.2/M3.3: 99 +``` + +This is acceptable as a pre-attribution artifact scan, but not enough for the +final high-recall rerun pool. 
+ +### Final Post-Attribution Rule + +Definition: + +```text +result_json_hard +∪ latest_agent_run_log_hard +∪ taxonomy_primary_M3.2_or_M3.3 +``` + +Command: + +```zsh +PYTHONPATH=. python scripts/collect_lexbench_rerun_candidates.py \ + --model MODEL \ + --timestamp TIMESTAMP \ + --artifact-mode hard \ + --include-taxonomy-web-constraints +``` + +Result: + +```text +M3.2/M3.3 target: 171 +hit: 171 +recall: 100.0% +total candidates: 219 +false positives vs M3.2/M3.3: 48 +``` + +False-positive breakdown: + +```text +PASS/none: 29 +M2.3: 7 +M2.2: 5 +M3.1: 3 +M1.1: 3 +M1.3: 1 +``` + +These 48 are not from broad api-log expansion; they are hard artifact signals +such as consecutive failures, early max-steps, tunnel errors, or LLM timeout 6/6. + +## 12-Model Result Table + +| Model | Candidates | M3.2/M3.3 Target | Hit | Recall | Non-M3.2/M3.3 | +|---|---:|---:|---:|---:|---:| +| MiniMax-M3 | 22 | 20 | 20 | 100.0% | 2 | +| bu-2-0 | 15 | 15 | 15 | 100.0% | 0 | +| dmx-claude-opus-4-8-thinking | 27 | 9 | 9 | 100.0% | 18 | +| doubao-seed-2-0-pro | 22 | 21 | 21 | 100.0% | 1 | +| doubao-seed-2-1-pro-260628 | 32 | 20 | 20 | 100.0% | 12 | +| gemini-3.1-pro-preview | 15 | 10 | 10 | 100.0% | 5 | +| gemini-3.5-flash | 6 | 5 | 5 | 100.0% | 1 | +| glm-5.1 | 22 | 20 | 20 | 100.0% | 2 | +| glm-5.2 | 14 | 12 | 12 | 100.0% | 2 | +| gpt-5.5 | 7 | 6 | 6 | 100.0% | 1 | +| kimi-k2.6 | 20 | 18 | 18 | 100.0% | 2 | +| qwen3.7-max | 17 | 15 | 15 | 100.0% | 2 | +| **Total** | **219** | **171** | **171** | **100.0%** | **48** | + +## Final Recommendation + +Use two phases: + +1. **Before failure attribution**: run artifact-only scanner as a provisional + rerun detector. It catches deterministic infrastructure failures without + needing judge outputs. +2. **After failure attribution**: use the final post-attribution rule above. + This is the rule that satisfies high M3.2/M3.3 recall while keeping false + positives bounded. 
+ +Do not use broad api-log render/session evidence as a default hard rerun rule. +It should remain constrained or optional because transient empty DOM/loading +states can recover inside a successful trajectory. diff --git a/docs/result-rerun-check-rules.md b/docs/result-rerun-check-rules.md new file mode 100644 index 0000000..7dd98f1 --- /dev/null +++ b/docs/result-rerun-check-rules.md @@ -0,0 +1,424 @@ +# Result Rerun Check Rules + +This note is for deciding whether a benchmark task result should be rerun. +Do not judge by the final run summary alone. Inspect each task's `result.json`. + +## Key Fields + +Check these fields first: + +```json +{ + "env_status": "...", + "agent_done": "...", + "agent_success": null, + "error": null, + "metrics": { + "steps": 0, + "usage": { + "total_tokens": 0 + } + }, + "action_history": [], + "config": { + "timeout_seconds": 1800, + "max_steps": 40 + }, + "wall_clock_seconds": 0 +} +``` + +## Definitely Rerun + +These are run/environment failures, not valid model outcomes. + +1. Missing, empty, or invalid `result.json`. +2. `agent_done == "error"`. +3. `env_status == "failed"`. +4. `agent_done == "max_steps"` but `metrics.steps < config.max_steps`. + +Rule 4 matters because browser-use can stop early after internal consecutive failures, while the bench wrapper records it as `max_steps`. Example signs: + +```text +agent_done=max_steps +config.max_steps=40 +metrics.steps=6 +answer="Waited for 7 seconds" +``` + +This is not a real max-steps run. It usually means browser-use stopped after internal failures such as: + +```text +Stopping due to 5 consecutive failures +CDP request ... timed out +ScreenshotWatchdog ... timed out +Expected at least one handler to return a non-None result +``` + +5. `agent_done == "timeout"` but `wall_clock_seconds < config.timeout_seconds * 0.5`. + +This is the key signal: a real timeout must run for (almost) the full budget. 
If the +run is labeled `timeout` but actually stopped at less than half the budget, the process +died early (browser startup / CDP crash / internal error) and was mislabeled as a task +timeout. This holds **regardless of `steps`, `total_tokens`, or `action_history`**. + +There are two sub-cases, both covered by the single rule above: + +```text +# 5a) zero-progress startup death +steps == 0, total_tokens == 0, action_history == 0 +wall_clock_seconds = 66, timeout_seconds = 1800 # ratio 0.04 + +# 5b) mid-run crash with partial progress (do NOT skip these) +steps = 19, total_tokens = 58380, action_history = 9 +wall_clock_seconds = 83, timeout_seconds = 600 # ratio 0.14 +# or even wall_clock_seconds = 0 with real steps/tokens (process crashed) +``` + +Sub-case 5b is easy to miss if you only check `steps == 0`. Any timeout whose +`wall_clock_seconds` is far below the budget must be rerun even if it made progress. + +## Definitely Rerun From Agent Run Logs + +The `result.json` hard rules above are not enough by themselves. Some browser-use +internal failures are written back as `agent_done == "done"` or otherwise look +valid in `result.json`. For high recall, also inspect the corresponding **agent +execution run log** under: + +```text +/Users/abc/Desktop/lexmount/browseruse-agent-bench/output/logs/run +``` + +Use the log that matches the target run's output directory, e.g. a log containing: + +```text +Running browser-use on LexBench-Browser +Output: .../experiments/LexBench-Browser/All/browser-use/MODEL/TIMESTAMP +[RUNNING] Executing task with browser-use agent +[Agent] ... +[BrowserSession] ... +``` + +Do not use eval/judge-only logs for these checks. Eval logs may be newer, but they +do not contain the browser-use execution evidence. + +Add any task with these log signals to the hard rerun set: + +1. `Stopping due to 5 consecutive failures`. +2. `Result failed 6/6 times: LLM call timed out`. +3. `ERR_TUNNEL_CONNECTION_FAILED`. 
+ +Rationale: + +```text +Stopping due to 5 consecutive failures + => browser-use hit its internal failure threshold; not a normal task outcome + +Result failed 6/6 times: LLM call timed out + => the model service failed repeatedly until browser-use stopped/recovered badly + +ERR_TUNNEL_CONNECTION_FAILED + => network/tunnel failure, not a model capability failure +``` + +For high recall, treat these as hard rerun rules even if `result.json` says +`agent_done == "done"` or `agent_success == true`. + +## Not Automatically Rerun + +These may be valid failed model outcomes. + +1. `agent_done == "timeout"` that actually used at least half of the budget: + +```text +wall_clock_seconds >= config.timeout_seconds * 0.5 +(usually with steps > 0, total_tokens > 0, action_history non-empty) +``` + +This means the task genuinely ran out of time. The deciding factor is +`wall_clock_seconds`, not `steps`: if it is below half the budget, treat it as +Definitely Rerun rule 5 instead. + +2. `agent_done == "max_steps"` with `metrics.steps >= config.max_steps`. + +This means the task used its step budget. It can be model failure or task difficulty, not necessarily infrastructure failure. + +3. `agent_done == "done"` and `agent_success == false`. + +The agent explicitly ended and marked failure. This is usually a model/task outcome, not a broken run. + +## Provisional `api_logs` Signals + +The hard rules above only inspect `result.json` and the agent run log. They do **not** catch every +site/browser failure. Some runs end with `agent_done == "done"` and +`agent_success == false`, but the per-step `api_logs/step_*.json` still show +real access or rendering failures. + +These signals are useful before failure attribution exists and for validating +M3.3 coverage, but broad `api_logs` render/session matching is **not** the final +default rerun rule. It creates too many false positives because transient empty +DOM/loading states can recover inside a successful trajectory. 
+ +Useful access/rendering/model-service signals: + +```text +Navigation failed - site unavailable +ERR_TUNNEL_CONNECTION_FAILED +ERR_TIMED_OUT / net::ERR_TIMED_OUT +ERR_SOCKET_NOT_CONNECTED +ERR_CONNECTION_RESET / ERR_CONNECTION_CLOSED / ERR_CONNECTION_REFUSED +This site can’t be reached +Current tab/URL is about:blank repeatedly +0 links, 0 interactive / 0 total elements / Empty DOM / empty content +No valid agent focus available - target may have detached +Target closed / Cannot find context / SessionManager not initialized +Event handler ... timed out after ... / CDP request ... timed out +LLM call timed out / model service no-response +Failed to parse structured output / Invalid JSON / malformed JSON +``` + +Pre-attribution interpretation: + +```text +unsuccessful task + repeated hard access errors + => provisional rerun_candidate + +unsuccessful task + repeated empty DOM/about:blank/detached focus + => provisional rerun_candidate + +otherwise successful task + transient empty DOM/loading evidence + => do not rerun by api_logs alone + +bot-defense evidence such as CAPTCHA/403/Cloudflare + => do not use api_logs alone; this is usually M3.1 +``` + +Older M3.3 audit reports may contain `manual_review`. For validation-only +M3.3 recall studies, merge those rows into `rerun_candidate`. For the final +rerun set, use the post-attribution rule below instead. + +Use the independent rerun scanner to collect provisional rerun ids for a target run: + +```zsh +PYTHONPATH=. python scripts/collect_lexbench_rerun_candidates.py \ + --root /Users/abc/Desktop/lexmount/browseruse-agent-bench/experiments/LexBench-Browser/All/browser-use \ + --model MODEL \ + --timestamp TIMESTAMP +``` + +This pre-attribution mode does not require failure-attribution results. 
Outputs +are written to: + +```text +experiments/LexBench-Browser/All/browser-use/MODEL/TIMESTAMP/rerun_candidates/ + rerun_candidates.json + rerun_candidates.csv + rerun_candidates_summary.md + rerun_task_ids.txt +``` + +By default, this pre-attribution scanner includes result hard rules, latest +run-log hard rules, and constrained api-log access/render/session evidence. +Repeated parse or LLM-timeout-only api-log evidence can be added to a broader +debugging pool with: + +```zsh +--include-protocol-only +``` + +The `api_logs` part is applied only to unsuccessful task results by default. +This avoids rerunning tasks that recovered from transient loading/empty-DOM +states and finished successfully. `result.json` hard rules and latest run-log +hard rules still take precedence even if `agent_success == true`. + +After failure attribution is available, use the final high-recall mode: + +```zsh +PYTHONPATH=. python scripts/collect_lexbench_rerun_candidates.py \ + --root /Users/abc/Desktop/lexmount/browseruse-agent-bench/experiments/LexBench-Browser/All/browser-use \ + --model MODEL \ + --timestamp TIMESTAMP \ + --artifact-mode hard \ + --include-taxonomy-web-constraints +``` + +This mode uses: + +```text +result_json_hard +∪ latest_agent_run_log_hard +∪ taxonomy_primary_M3.2_or_M3.3 +``` + +On the 12 current model runs, this rule covered `171/171` primary M3.2/M3.3 +failures with `219` total candidates and `48` non-M3.2/M3.3 candidates. This is +the recommended rule when the goal is to reduce M3.2/M3.3 while keeping false +positives bounded. + +Use the M3.3 taxonomy audit script only to validate rule recall against failure +attribution: + +```zsh +PYTHONPATH=. python scripts/audit_m3_3_api_log_failures.py \ + --root /Users/abc/Desktop/lexmount/browseruse-agent-bench/experiments/LexBench-Browser/All/browser-use +``` + +Both scanners are string-evidence detectors, not semantic oracles. 
They can +prove that logs contain these errors, but they cannot guarantee that rerunning +will fix the task or that the original attribution is wrong. This rule +intentionally prioritizes recall over precision. + +## Final High-Recall Rerun Set + +For each `MODEL/TIMESTAMP`, the final rerun candidate set is: + +```text +result_json_hard +∪ latest_agent_run_log_hard +∪ taxonomy_primary_M3.2_or_M3.3 +``` + +When failure attribution is not available yet, use the pre-attribution scanner +output as a provisional artifact-only rerun set. When attribution is available, +prefer the final set above. + +If asking Codex or another agent to collect ids, give this instruction: + +```text +For the target MODEL/TIMESTAMP under +/Users/abc/Desktop/lexmount/browseruse-agent-bench/experiments/LexBench-Browser/All/browser-use, +run scripts/collect_lexbench_rerun_candidates.py with +`--artifact-mode hard --include-taxonomy-web-constraints` and return the union of: +1. result.json hard rerun ids, +2. hard ids from the latest matching agent execution log under output/logs/run, +3. failure-taxonomy ids whose primary_code is M3.2 or M3.3. + +Return sorted unique task ids and the reason for each id. +``` + +## Quick Result Check Command + +Set `TASKS_DIR` to a run's `tasks` directory: + +```zsh +TASKS_DIR=/Users/abc/Desktop/lexmount/browseruse-agent-bench/experiments/LexBench-Browser/All/browser-use/MODEL/TIMESTAMP/tasks + +PYTHONPATH=. 
./.venvs/browser_use/bin/python - <<'PY' +import json +import os +from pathlib import Path +from collections import Counter + +root = Path(os.environ["TASKS_DIR"]) +hard = [] +timeout_suspicious = [] +load_bad = [] +counts = Counter() + +for d in sorted(root.iterdir(), key=lambda p: int(p.name) if p.name.isdigit() else 999999): + if not d.is_dir(): + continue + result_path = d / "result.json" + if not result_path.exists() or result_path.stat().st_size == 0: + load_bad.append((d.name, "missing_or_empty_result_json")) + continue + try: + result = json.loads(result_path.read_text()) + except Exception as exc: + load_bad.append((d.name, f"invalid_json:{exc}")) + continue + + done = result.get("agent_done") + counts[done] += 1 + metrics = result.get("metrics") or {} + usage = metrics.get("usage") or {} + config = result.get("config") or {} + + steps = metrics.get("steps") or 0 + total_tokens = usage.get("total_tokens") or 0 + actions = len(result.get("action_history") or []) + max_steps = config.get("max_steps") or 40 + timeout_seconds = config.get("timeout_seconds") or 0 + wall_clock_seconds = result.get("wall_clock_seconds") or 0 + + reasons = [] + if result.get("env_status") == "failed": + reasons.append("env_status=failed") + if done == "error": + reasons.append("agent_done=error") + if done == "max_steps" and steps < max_steps: + reasons.append(f"early max_steps: steps={steps} < max_steps={max_steps}") + if done == "timeout" and timeout_seconds and wall_clock_seconds < timeout_seconds * 0.5: + reasons.append( + f"suspicious timeout: wall={wall_clock_seconds} < 0.5*timeout={timeout_seconds} " + f"(steps={steps} tokens={total_tokens} actions={actions})" + ) + + if reasons: + hard.append((d.name, reasons)) + +print("agent_done_counts:", dict(counts)) +print("load_bad:", load_bad) +print("rerun_count:", len(hard)) +print("rerun_ids:", " ".join(tid for tid, _ in hard)) +print() +for tid, reasons in hard: + print(tid, "|", "; ".join(reasons)) +PY +``` + +## Quick Agent 
Log Check Command + +Set `RUN_LOG` to the matching agent execution log for the same `MODEL/TIMESTAMP`: + +```zsh +RUN_LOG=/Users/abc/Desktop/lexmount/browseruse-agent-bench/output/logs/run/RUN_LOG_FILE.log + +PYTHONPATH=. ./.venvs/browser_use/bin/python - <<'PY' +import os +import re +from collections import defaultdict +from pathlib import Path + +log_path = Path(os.environ["RUN_LOG"]) +line_re = re.compile(r"\[run\] \[(\d+)\] (.*)") + +reasons_by_task = defaultdict(set) + +for line in log_path.read_text(errors="replace").splitlines(): + match = line_re.search(line) + if not match: + continue + task_id, message = match.groups() + if "Stopping due to 5 consecutive failures" in message: + reasons_by_task[task_id].add("stopping_due_to_5_consecutive_failures") + if "Result failed 6/6 times" in message and "LLM call timed out" in message: + reasons_by_task[task_id].add("llm_timeout_6_of_6") + if "ERR_TUNNEL_CONNECTION_FAILED" in message: + reasons_by_task[task_id].add("err_tunnel_connection_failed") + +print("log_rerun_count:", len(reasons_by_task)) +print("log_rerun_ids:", " ".join(sorted(reasons_by_task, key=lambda x: int(x) if x.isdigit() else x))) +print() +for task_id in sorted(reasons_by_task, key=lambda x: int(x) if x.isdigit() else x): + print(task_id, "|", "; ".join(sorted(reasons_by_task[task_id]))) +PY +``` + +## Rerun Command Pattern + +Do not use `--skip-completed` when rerunning failed tasks in the same timestamp. + +```zsh +PYTHONPATH=. 
./.venvs/browser_use/bin/python scripts/run.py \ + --agent browser-use \ + --data LexBench-Browser \ + --split All \ + --model MODEL_CONFIG_KEY \ + --timestamp TIMESTAMP \ + --mode specific \ + --task-ids IDS_HERE \ + --concurrency 3 \ + --timeout 1800 \ + --no-group-by-site +``` diff --git a/scripts/audit_m3_3_api_log_failures.py b/scripts/audit_m3_3_api_log_failures.py new file mode 100644 index 0000000..4d032db --- /dev/null +++ b/scripts/audit_m3_3_api_log_failures.py @@ -0,0 +1,429 @@ +#!/usr/bin/env python3 +"""Audit M3.3 failures for explicit browser/network evidence in api_logs. + +This is a high-recall log scanner for rerun candidates. It proves that +specific failure strings occurred in the trajectory; it does not prove the task +is semantically impossible or that rerunning will fix the result. +""" + +from __future__ import annotations + +import argparse +import csv +import json +import re +from collections import Counter, defaultdict +from dataclasses import asdict, dataclass, field +from pathlib import Path +from typing import Any + + +TAXONOMY_NAME = ( + "task_gpt-4.1_per_task_threshold_stepwise_failure_taxonomy_gpt-5.5-judge.jsonl" +) +EVAL_NAME = "task_gpt-4.1_per_task_threshold_stepwise_eval_results.json" + + +ACCESS_ERROR_RE = re.compile( + r"Navigation failed(?: - site unavailable)?|site unavailable|This site can.?t be reached|" + r"ERR_TUNNEL_CONNECTION_FAILED|ERR_TIMED_OUT|net::ERR_TIMED_OUT|ERR_SOCKET_NOT_CONNECTED|" + r"ERR_NAME_NOT_RESOLVED|ERR_CONNECTION_(?:RESET|CLOSED|REFUSED)|ERR_HTTP2_PROTOCOL_ERROR", + re.I, +) +NAVIGATION_FAILED_RE = re.compile(r"Navigation failed|site unavailable", re.I) +ERR_TUNNEL_RE = re.compile(r"ERR_TUNNEL_CONNECTION_FAILED|ERR_SOCKET_NOT_CONNECTED", re.I) +ERR_TIMED_OUT_RE = re.compile(r"ERR_TIMED_OUT|net::ERR_TIMED_OUT", re.I) + +CURRENT_TAB_ABOUT_BLANK_RE = re.compile( + r"(?:^|\n)(?:Tab\s+[^:\n]+:\s+about:blank|Current URL:\s+about:blank|URL:\s+about:blank)", + re.I, +) +ACTION_ABOUT_BLANK_RE = re.compile( + 
# Action results that opened or navigated to a blank page.
ACTION_ABOUT_BLANK_RE = re.compile(
    r"(?:Opened new tab with url|Navigated to)\s+about:blank", re.I
)
# Page state summaries that describe an empty or unrendered DOM.
EMPTY_DOM_RE = re.compile(
    r"0 links,\s*0 interactive|0 total elements|Empty DOM|empty DOM|empty content|"
    r"no DOM elements|Page loaded but returned empty content",
    re.I,
)
# Browser session / target lifecycle failures.
DETACHED_FOCUS_RE = re.compile(
    r"No valid agent focus|target may have detached|Target closed|Cannot find context|"
    r"SessionManager not initialized|browser is in an unstable state|detached target",
    re.I,
)
# Browser-side timeouts. The gaps are matched lazily: with re.S a greedy ".*"
# would run from the first "Event handler" to the last "timed out after" in the
# whole log and collapse every occurrence into a single match, which makes the
# per-occurrence counts (and any ">= 2" threshold on them) meaningless.
BROWSER_EVENT_TIMEOUT_RE = re.compile(
    r"Event handler .*? timed out after|CDP request .*? timed out|ScreenshotWatchdog .*? timed out|"
    r"Navigation failed: .*?timed out after",
    re.I | re.S,
)
# Model-service timeouts.
LLM_TIMEOUT_RE = re.compile(r"LLM call timed out|model service no-response", re.I)
# Structured-output / JSON parsing failures.
PARSE_ERROR_RE = re.compile(
    r"validation error for AgentOutput|Failed to parse structured output|Invalid JSON|"
    r"malformed JSON|parser failure",
    re.I,
)
# Bot-defense hints. NOTE(review): bare "403"/"429"/"robot" also match inside
# unrelated numbers and words, so treat this count as a weak signal only.
BOT_RE = re.compile(r"captcha|cloudflare|robot|human verification|403|429|blocked", re.I)
# Phrases (English and Chinese) saying the site does not offer the requested content.
CONTENT_LIMIT_RE = re.compile(
    r"not available|not exposed|does not expose|does not provide|no active|not exist|"
    r"not found|missing requested|无法找到|没有提供|不提供|不存在|未公开|未暴露",
    re.I,
)


@dataclass
class SignalCounts:
    """Number of matches found for each evidence pattern in one task."""

    navigation_failed: int = 0
    err_tunnel: int = 0
    err_timed_out: int = 0
    current_tab_about_blank: int = 0
    action_about_blank: int = 0
    empty_dom: int = 0
    detached_focus: int = 0
    browser_event_timeout: int = 0
    llm_timeout: int = 0
    parse_error: int = 0
    bot_signal: int = 0
    content_limitation_text: int = 0


@dataclass
class AuditRow:
    """One audited M3.3 failure: run identity, outcome, verdict and evidence."""

    # Run identity.
    agent: str
    timestamp: str
    task_id: str
    task_type: str | None
    # Evaluation and agent outcome fields, copied through untyped.
    score: Any
    predicted_label: Any
    agent_done: Any
    agent_success: Any
    wall_clock_seconds: Any
    steps: Any
    # Taxonomy code and the scanner's verdict.
    primary_code: str
    recommendation: str
    category: str
    signal_counts: SignalCounts
    # Supporting material for the report.
    evidence: list[str] = field(default_factory=list)
    taxonomy_reasoning: str = ""
    final_answer_excerpt: str = ""
def read_jsonl(path: Path) -> list[dict[str, Any]]:
    """Load a JSON Lines file, keeping only lines that decode to JSON objects.

    Returns an empty list when the file is missing or empty. Blank lines,
    lines that are not valid JSON, and lines whose value is not an object are
    skipped so that one truncated or corrupt record does not abort the audit.
    """
    if not path.exists() or path.stat().st_size == 0:
        return []
    rows: list[dict[str, Any]] = []
    for line in path.read_text(encoding="utf-8", errors="replace").splitlines():
        if not line.strip():
            continue
        try:
            row = json.loads(line)
        except ValueError:
            continue
        if isinstance(row, dict):
            rows.append(row)
    return rows


def load_json(path: Path) -> dict[str, Any]:
    """Best-effort load of a JSON object from ``path``.

    Returns ``{}`` when the file cannot be read, is not valid JSON, or holds a
    non-object value, so callers can always use ``.get`` on the result.
    """
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        return {}
    return data if isinstance(data, dict) else {}


def stringify(value: Any, max_len: int = 5000) -> str:
    """Render ``value`` as text truncated to ``max_len`` characters.

    ``None`` becomes ``""``, strings pass through, and anything else is JSON
    encoded (falling back to ``str`` when it is not JSON serializable).
    """
    if value is None:
        return ""
    if isinstance(value, str):
        return value[:max_len]
    try:
        return json.dumps(value, ensure_ascii=False)[:max_len]
    except Exception:
        return str(value)[:max_len]


def count_re(pattern: re.Pattern[str], text: str) -> int:
    """Return the number of non-overlapping matches of ``pattern`` in ``text``."""
    return len(pattern.findall(text or ""))


def _step_sort_key(step_path: Path) -> tuple[float, str]:
    """Order step files by their trailing step number, then by file name."""
    match = re.search(r"(\d+)$", step_path.stem)
    return (int(match.group(1)) if match else float("inf"), step_path.name)


def collect_log_text_and_evidence(api_dir: Path) -> tuple[str, list[str]]:
    """Gather searchable text and evidence lines from ``api_dir/step_*.json``.

    Returns ``(text, evidence)``. ``text`` joins each step's URL, state
    message, output, and every action result's error and extracted content.
    ``evidence`` holds up to 12 unique ``"<step>: <message>"`` lines for action
    results that match one of the failure patterns.

    Steps are visited in numeric order (``step_2`` before ``step_10``) so the
    evidence cap keeps the earliest failures. Malformed step files, null
    ``metadata``/``input`` values and non-object action results are tolerated.
    """
    chunks: list[str] = []
    evidence: list[str] = []
    if not api_dir.exists():
        return "", evidence

    for step_path in sorted(api_dir.glob("step_*.json"), key=_step_sort_key):
        step = load_json(step_path)
        metadata = step.get("metadata")
        recorded_no = metadata.get("step_number") if isinstance(metadata, dict) else None
        step_no = recorded_no or step_path.stem
        input_obj = step.get("input")
        if not isinstance(input_obj, dict):
            input_obj = {}
        state_message = stringify(input_obj.get("state_message"), max_len=10000)
        url = stringify(input_obj.get("url"), max_len=500)
        output = stringify(step.get("output"), max_len=3000)
        chunks.extend([url, state_message, output])

        action_results = step.get("action_results")
        if not isinstance(action_results, list):
            action_results = []
        for action_result in action_results:
            if not isinstance(action_result, dict):
                continue
            error = stringify(action_result.get("error"), max_len=1000)
            content = stringify(action_result.get("extracted_content"), max_len=1000)
            chunks.extend([error, content])

            # Prefer the error text; fall back to the extracted content.
            line = error or content
            if not line:
                continue
            if (
                ACCESS_ERROR_RE.search(line)
                or EMPTY_DOM_RE.search(line)
                or DETACHED_FOCUS_RE.search(line)
                or BROWSER_EVENT_TIMEOUT_RE.search(line)
                or LLM_TIMEOUT_RE.search(line)
                or PARSE_ERROR_RE.search(line)
                or ACTION_ABOUT_BLANK_RE.search(line)
            ):
                evidence.append(f"{step_no}: {line[:300]}")

    # Preserve order while deduplicating.
    unique_evidence = list(dict.fromkeys(evidence))
    return "\n".join(chunks), unique_evidence[:12]
def scan_signals(text: str, taxonomy_reasoning: str, final_answer: str) -> SignalCounts:
    """Count evidence-pattern matches for one task.

    Most patterns are counted over the api-log text plus the taxonomy
    reasoning and final answer. The current-tab ``about:blank`` pattern is
    counted over the api-log text only, and the content-limitation pattern
    over the reasoning and final answer only.
    """
    judged_text = "\n".join([taxonomy_reasoning, final_answer])
    everything = "\n".join([text, taxonomy_reasoning, final_answer])
    patterns_over_everything = {
        "navigation_failed": NAVIGATION_FAILED_RE,
        "err_tunnel": ERR_TUNNEL_RE,
        "err_timed_out": ERR_TIMED_OUT_RE,
        "action_about_blank": ACTION_ABOUT_BLANK_RE,
        "empty_dom": EMPTY_DOM_RE,
        "detached_focus": DETACHED_FOCUS_RE,
        "browser_event_timeout": BROWSER_EVENT_TIMEOUT_RE,
        "llm_timeout": LLM_TIMEOUT_RE,
        "parse_error": PARSE_ERROR_RE,
        "bot_signal": BOT_RE,
    }
    tallies = {
        name: count_re(pattern, everything)
        for name, pattern in patterns_over_everything.items()
    }
    tallies["current_tab_about_blank"] = count_re(CURRENT_TAB_ABOUT_BLANK_RE, text)
    tallies["content_limitation_text"] = count_re(CONTENT_LIMIT_RE, judged_text)
    return SignalCounts(**tallies)


def classify(counts: SignalCounts) -> tuple[str, str]:
    """Map signal counts to a ``(recommendation, category)`` pair.

    Hard access evidence is checked first, then render/session evidence; both
    get a ``_mixed`` category when model-protocol or content-limitation
    signals are also present. Without either, pure content limitation yields
    ``keep_m3_3``; everything else stays a rerun candidate.
    """
    protocol_noise = counts.llm_timeout > 0 or counts.parse_error > 0
    site_limited = counts.content_limitation_text > 0
    is_mixed = protocol_noise or site_limited

    if (
        counts.err_tunnel > 0
        or counts.err_timed_out > 0
        or counts.navigation_failed >= 2
        or counts.browser_event_timeout >= 2
    ):
        category = "hard_access_error_mixed" if is_mixed else "hard_access_error"
        return "rerun_candidate", category

    if (
        counts.detached_focus > 0
        or counts.current_tab_about_blank >= 2
        or counts.action_about_blank >= 2
        or counts.empty_dom >= 2
        or counts.browser_event_timeout > 0
    ):
        category = "render_or_session_error_mixed" if is_mixed else "render_or_session_error"
        return "rerun_candidate", category

    if protocol_noise:
        return "rerun_candidate", "model_protocol_mixed"
    if site_limited:
        return "keep_m3_3", "content_or_site_capability_missing"
    return "rerun_candidate", "unclear_m3_3"


def discover_runs(root: Path) -> list[tuple[str, str, Path]]:
    """Find every run under ``root`` that has a failure-taxonomy file.

    Returns ``(agent, timestamp, run_dir)`` tuples ordered by taxonomy file
    path, where ``run_dir`` is the directory two levels above the taxonomy
    file and ``agent`` is the name of its parent directory.
    """
    taxonomy_files = sorted(root.glob(f"*/*/tasks_eval_result/{TAXONOMY_NAME}"))
    run_dirs = [taxonomy_file.parents[1] for taxonomy_file in taxonomy_files]
    return [(run_dir.parent.name, run_dir.name, run_dir) for run_dir in run_dirs]


def audit_run(agent: str, timestamp: str, run_dir: Path) -> list[AuditRow]:
    """Build one ``AuditRow`` per taxonomy row whose primary code is M3.3."""
    tasks_root = run_dir / "tasks"
    audited: list[AuditRow] = []

    for entry in read_jsonl(run_dir / "tasks_eval_result" / TAXONOMY_NAME):
        verdict = entry.get("taxonomy") or {}
        if verdict.get("primary_code") != "M3.3":
            continue

        task_id = str(entry.get("task_id"))
        task_dir = tasks_root / task_id
        result = load_json(task_dir / "result.json")
        log_text, evidence = collect_log_text_and_evidence(task_dir / "api_logs")
        reasoning = stringify(verdict.get("reasoning"), max_len=2000)
        # Fall back to the taxonomy row's recorded response when result.json has no answer.
        final_answer = stringify(
            result.get("answer") or entry.get("agent_response"), max_len=2000
        )
        counts = scan_signals(log_text, reasoning, final_answer)
        recommendation, category = classify(counts)
        metrics = result.get("metrics") or {}

        audited.append(
            AuditRow(
                agent=agent,
                timestamp=timestamp,
                task_id=task_id,
                task_type=entry.get("task_type"),
                score=entry.get("score"),
                predicted_label=entry.get("predicted_label"),
                agent_done=result.get("agent_done"),
                agent_success=result.get("agent_success"),
                wall_clock_seconds=result.get("wall_clock_seconds"),
                steps=metrics.get("steps"),
                primary_code="M3.3",
                recommendation=recommendation,
                category=category,
                signal_counts=counts,
                evidence=evidence,
                taxonomy_reasoning=reasoning,
                final_answer_excerpt=final_answer.replace("\n", " ")[:500],
            )
        )
    return audited
def write_outputs(rows: list[AuditRow], out_dir: Path) -> None:
    """Write the scan as JSON, CSV and Markdown under ``out_dir``.

    Creates ``out_dir`` if needed, writes ``m3_3_api_log_failure_scan.json``,
    ``.csv`` and ``_summary.md``, then prints the three paths as JSON.

    All files are written as UTF-8. The JSON is dumped with
    ``ensure_ascii=False`` and the evidence may contain non-ASCII text, so
    relying on the platform default encoding can raise ``UnicodeEncodeError``
    on non-UTF-8 locales.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    json_path = out_dir / "m3_3_api_log_failure_scan.json"
    csv_path = out_dir / "m3_3_api_log_failure_scan.csv"
    md_path = out_dir / "m3_3_api_log_failure_scan_summary.md"

    payload = {
        "rule_notes": {
            "guarantee": "High-recall string evidence scan only. It guarantees captured patterns appeared in api_logs/result text; it does not guarantee semantic rerun correctness.",
            "rerun_candidate": "Includes explicit access/render/session errors and the former manual_review cases with mixed or unclear M3.3 evidence.",
            "manual_review": "Deprecated for rerun selection. Former manual_review cases are now emitted as rerun_candidate for higher recall.",
            "keep_m3_3": "No hard browser/access evidence and content/site capability limitation is present.",
        },
        "summary": summarize(rows),
        "rows": [serialize_row(row) for row in rows],
    }
    json_path.write_text(
        json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8"
    )

    # CSV columns: row identity/outcome, one column per signal count, then evidence.
    base_fields = [
        "agent",
        "timestamp",
        "task_id",
        "score",
        "agent_done",
        "wall_clock_seconds",
        "steps",
        "recommendation",
        "category",
    ]
    signal_fields = [
        "navigation_failed",
        "err_tunnel",
        "err_timed_out",
        "current_tab_about_blank",
        "action_about_blank",
        "empty_dom",
        "detached_focus",
        "browser_event_timeout",
        "llm_timeout",
        "parse_error",
        "bot_signal",
        "content_limitation_text",
    ]
    with csv_path.open("w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=[*base_fields, *signal_fields, "evidence"])
        writer.writeheader()
        for row in rows:
            data = serialize_row(row)
            flat = {
                **{k: data[k] for k in base_fields},
                **data["signal_counts"],
                # Keep the CSV cell short: only the first four evidence lines.
                "evidence": " | ".join(data["evidence"][:4]),
            }
            writer.writerow(flat)

    md_path.write_text(render_markdown(rows, payload["summary"]), encoding="utf-8")
    print(json.dumps({"json": str(json_path), "csv": str(csv_path), "md": str(md_path)}, ensure_ascii=False, indent=2))
def serialize_row(row: AuditRow) -> dict[str, Any]:
    """Convert an audit row, including its nested signal counts, to plain dicts."""
    return {**asdict(row), "signal_counts": asdict(row.signal_counts)}


def summarize(rows: list[AuditRow]) -> dict[str, Any]:
    """Aggregate recommendation/category counts overall and per agent.

    Agents appear in sorted order; each agent entry also lists its task ids
    grouped by recommendation, in input order.
    """
    rows_by_agent: dict[str, list[AuditRow]] = defaultdict(list)
    for row in rows:
        rows_by_agent[row.agent].append(row)

    def task_ids(agent_rows: list[AuditRow], recommendation: str) -> list[str]:
        return [row.task_id for row in agent_rows if row.recommendation == recommendation]

    by_agent: dict[str, Any] = {}
    for agent in sorted(rows_by_agent):
        agent_rows = rows_by_agent[agent]
        by_agent[agent] = {
            "total_m3_3": len(agent_rows),
            "recommendation_counts": dict(Counter(row.recommendation for row in agent_rows)),
            "category_counts": dict(Counter(row.category for row in agent_rows)),
            "rerun_candidate_task_ids": task_ids(agent_rows, "rerun_candidate"),
            "manual_review_task_ids": task_ids(agent_rows, "manual_review"),
            "keep_m3_3_task_ids": task_ids(agent_rows, "keep_m3_3"),
        }

    return {
        "total_m3_3": len(rows),
        "overall_recommendation_counts": dict(Counter(row.recommendation for row in rows)),
        "overall_category_counts": dict(Counter(row.category for row in rows)),
        "by_agent": by_agent,
    }


def render_markdown(rows: list[AuditRow], summary: dict[str, Any]) -> str:
    """Render the Markdown report: overall counts, per-agent table, rerun list.

    ``summary`` is expected to have the shape produced by ``summarize``. Each
    rerun candidate is listed with its first two evidence lines.
    """
    report = [
        "# M3.3 API Log Failure Scan",
        "",
        "This scan uses fixed string/threshold rules over `api_logs/step_*.json` and `result.json`.",
        "It is high recall for rerun candidates, but it is not a 100% semantic classifier.",
        "",
        "## Overall",
        "",
        f"- Total M3.3 rows scanned: {summary['total_m3_3']}",
        f"- Recommendation counts: `{json.dumps(summary['overall_recommendation_counts'], ensure_ascii=False)}`",
        f"- Category counts: `{json.dumps(summary['overall_category_counts'], ensure_ascii=False)}`",
        "",
        "## By Agent",
        "",
        "| Agent | M3.3 | Rerun | Manual | Keep | Rerun task ids |",
        "|---|---:|---:|---:|---:|---|",
    ]

    for agent, info in summary["by_agent"].items():
        tally = info["recommendation_counts"]
        cells = [
            agent,
            info["total_m3_3"],
            tally.get("rerun_candidate", 0),
            tally.get("manual_review", 0),
            tally.get("keep_m3_3", 0),
            ", ".join(info["rerun_candidate_task_ids"]),
        ]
        report.append("| " + " | ".join(str(cell) for cell in cells) + " |")

    report += ["", "## Rerun Candidates", ""]
    report += [
        f"- `{row.agent}` task `{row.task_id}`: {row.category}. Evidence: {' / '.join(row.evidence[:2])}"
        for row in rows
        if row.recommendation == "rerun_candidate"
    ]
    return "\n".join(report) + "\n"
|".format( + agent=agent, + total=info["total_m3_3"], + rerun=rec.get("rerun_candidate", 0), + manual=rec.get("manual_review", 0), + keep=rec.get("keep_m3_3", 0), + ids=", ".join(info["rerun_candidate_task_ids"]), + ) + ) + lines.extend(["", "## Rerun Candidates", ""]) + for row in rows: + if row.recommendation != "rerun_candidate": + continue + evidence = " / ".join(row.evidence[:2]) + lines.append(f"- `{row.agent}` task `{row.task_id}`: {row.category}. Evidence: {evidence}") + return "\n".join(lines) + "\n" + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument( + "--root", + type=Path, + default=Path( + "/Users/abc/Desktop/lexmount/browseruse-agent-bench/experiments/LexBench-Browser/All/browser-use" + ), + ) + parser.add_argument("--out-dir", type=Path, default=None) + args = parser.parse_args() + + out_dir = args.out_dir or args.root / "failure_taxonomy_review" + all_rows: list[AuditRow] = [] + for agent, timestamp, run_dir in discover_runs(args.root): + all_rows.extend(audit_run(agent, timestamp, run_dir)) + write_outputs(all_rows, out_dir) + + +if __name__ == "__main__": + main() diff --git a/scripts/build_generation_comparison_docs.py b/scripts/build_generation_comparison_docs.py new file mode 100644 index 0000000..52d32aa --- /dev/null +++ b/scripts/build_generation_comparison_docs.py @@ -0,0 +1,568 @@ +#!/usr/bin/env python3 +"""Build embedded DOCX and self-contained HTML report for generation comparison.""" + +from __future__ import annotations + +import base64 +from pathlib import Path + +from docx import Document +from docx.enum.section import WD_SECTION +from docx.enum.table import WD_CELL_VERTICAL_ALIGNMENT, WD_TABLE_ALIGNMENT +from docx.enum.text import WD_ALIGN_PARAGRAPH +from docx.oxml import OxmlElement +from docx.oxml.ns import qn +from docx.shared import Inches, Pt, RGBColor + + +REPO_ROOT = Path(__file__).resolve().parents[1] +REPORT_DIR = REPO_ROOT / "reports" +ASSET_DIR = REPORT_DIR / "assets" +FIGURE = ASSET_DIR 
/ "generation_failure_comparison.png" +DOCX_OUT = REPORT_DIR / "doubao_glm_generation_comparison.docx" +HTML_OUT = REPORT_DIR / "doubao_glm_generation_comparison.html" + +BLUE = "355F9F" +BLUE_LIGHT = "E8EEF7" +AMBER = "C6922E" +TEAL = "3E8582" +INK = "1C2B33" +MUTED = "5A6472" +BORDER = "D7DBE2" +LATIN_FONT = "Arial Unicode MS" +EAST_ASIA_FONT = "Arial Unicode MS" + +OVERALL = [ + ["Doubao Seed 2.0 Pro", "130 / 210", "80", "61.90%", "80"], + ["Doubao Seed 2.1 Pro", "99 / 210", "111", "47.14%", "111"], + ["GLM-5.1", "143 / 210", "67", "68.10%", "67"], + ["GLM-5.2", "139 / 210", "71", "66.19%", "70"], +] + +DOUBAO_LAYER = [ + ["Task Reasoning", "26", "32.5%", "21", "18.9%", "-5"], + ["Action Execution", "18", "22.5%", "57", "51.4%", "+39"], + ["Web Constraints", "36", "45.0%", "33", "29.7%", "-3"], +] +DOUBAO_TYPES = [ + ["M1.1 Requirement Following", "18", "21", "+3"], + ["M1.2 Target Selection", "4", "0", "-4"], + ["M1.3 Evidence Grounding", "4", "0", "-4"], + ["M2.1 UI Misoperation", "6", "4", "-2"], + ["M2.2 Infinite Loop", "10", "40", "+30"], + ["M2.3 Format Breakdown", "2", "13", "+11"], + ["M3.1 Bot Defense", "15", "13", "-2"], + ["M3.2 Access Barrier", "3", "2", "-1"], + ["M3.3 Site Limitation", "18", "18", "0"], +] + +GLM_LAYER = [ + ["Task Reasoning", "15", "22.4%", "27", "38.6%", "+12"], + ["Action Execution", "17", "25.4%", "20", "28.6%", "+3"], + ["Web Constraints", "35", "52.2%", "23", "32.9%", "-12"], +] +GLM_TYPES = [ + ["M1.1 Requirement Following", "10", "15", "+5"], + ["M1.2 Target Selection", "3", "8", "+5"], + ["M1.3 Evidence Grounding", "2", "4", "+2"], + ["M2.1 UI Misoperation", "5", "4", "-1"], + ["M2.2 Infinite Loop", "10", "12", "+2"], + ["M2.3 Format Breakdown", "2", "4", "+2"], + ["M3.1 Bot Defense", "15", "11", "-4"], + ["M3.2 Access Barrier", "1", "0", "-1"], + ["M3.3 Site Limitation", "19", "12", "-7"], +] + + +def set_cell_shading(cell, fill: str) -> None: + tc_pr = cell._tc.get_or_add_tcPr() + shd = tc_pr.find(qn("w:shd")) + if shd 
def set_cell_shading(cell, fill: str) -> None:
    """Set the ``w:fill`` colour of the cell's ``w:shd`` element, creating it if absent."""
    properties = cell._tc.get_or_add_tcPr()
    shading = properties.find(qn("w:shd"))
    if shading is None:
        shading = OxmlElement("w:shd")
        properties.append(shading)
    shading.set(qn("w:fill"), fill)


def set_cell_border(cell, color: str = BORDER, size: str = "4") -> None:
    """Give every edge of the cell a single-line border of ``color`` and ``size``."""
    properties = cell._tc.get_or_add_tcPr()
    border_group = properties.find(qn("w:tcBorders"))
    if border_group is None:
        border_group = OxmlElement("w:tcBorders")
        properties.append(border_group)

    attributes = {"val": "single", "sz": size, "space": "0", "color": color}
    for edge in ("top", "left", "bottom", "right", "insideH", "insideV"):
        edge_tag = f"w:{edge}"
        edge_node = border_group.find(qn(edge_tag))
        if edge_node is None:
            edge_node = OxmlElement(edge_tag)
            border_group.append(edge_node)
        for attribute, setting in attributes.items():
            edge_node.set(qn(f"w:{attribute}"), setting)


def set_cell_margins(cell, top: int = 80, start: int = 90, bottom: int = 80, end: int = 90) -> None:
    """Set the cell's four ``w:tcMar`` margins; each is written with type ``dxa``."""
    properties = cell._tc.get_or_add_tcPr()
    margin_group = properties.first_child_found_in("w:tcMar")
    if margin_group is None:
        margin_group = OxmlElement("w:tcMar")
        properties.append(margin_group)

    for side, amount in (("top", top), ("start", start), ("bottom", bottom), ("end", end)):
        side_tag = f"w:{side}"
        side_node = margin_group.find(qn(side_tag))
        if side_node is None:
            side_node = OxmlElement(side_tag)
            margin_group.append(side_node)
        side_node.set(qn("w:w"), str(amount))
        side_node.set(qn("w:type"), "dxa")


def format_paragraph(paragraph, size: float = 10.5, bold: bool = False, color: str = INK) -> None:
    """Apply the report font, size, weight and colour to every run in ``paragraph``."""
    for run in paragraph.runs:
        font = run.font
        font.name = LATIN_FONT
        font.size = Pt(size)
        font.bold = bold
        font.color.rgb = RGBColor.from_string(color)
        # Set the East Asian font explicitly, alongside the Latin name above.
        run._element.rPr.rFonts.set(qn("w:eastAsia"), EAST_ASIA_FONT)


def add_para(doc: Document, text: str, size: float = 10.0, after: int = 5, bold: bool = False) -> None:
    """Append a body paragraph containing ``text`` in the report's default colour."""
    paragraph = doc.add_paragraph()
    layout = paragraph.paragraph_format
    layout.space_after = Pt(after)
    layout.line_spacing = 1.08
    paragraph.add_run(text)
    # The paragraph holds exactly the run added above, so this styles only that run.
    format_paragraph(paragraph, size=size, bold=bold, color=INK)
def add_heading(doc: Document, text: str, level: int = 1) -> None:
    """Append a bold heading; level 1 is larger and blue, other levels smaller and ink."""
    top_level = level == 1
    heading = doc.add_paragraph()
    layout = heading.paragraph_format
    layout.space_before = Pt(12 if top_level else 8)
    layout.space_after = Pt(5)

    run = heading.add_run(text)
    font = run.font
    font.name = LATIN_FONT
    font.bold = True
    font.size = Pt(15 if top_level else 12.5)
    font.color.rgb = RGBColor.from_string(BLUE if top_level else INK)
    run._element.rPr.rFonts.set(qn("w:eastAsia"), EAST_ASIA_FONT)


def add_bullet(doc: Document, text: str) -> None:
    """Append a ``List Bullet`` paragraph containing ``text``."""
    bullet = doc.add_paragraph(style="List Bullet")
    layout = bullet.paragraph_format
    layout.space_after = Pt(1)
    layout.left_indent = Inches(0.25)
    layout.first_line_indent = Inches(-0.1)

    run = bullet.add_run(text)
    font = run.font
    font.name = LATIN_FONT
    font.size = Pt(9.8)
    font.color.rgb = RGBColor.from_string(INK)
    run._element.rPr.rFonts.set(qn("w:eastAsia"), EAST_ASIA_FONT)


def add_table(doc: Document, headers: list[str], rows: list[list[str]], widths: list[float]) -> None:
    """Append a centred, fixed-width table with a shaded header row.

    ``widths`` gives each column's width in inches. The first column is
    left-aligned and bold in body rows; values starting with ``+`` are shown
    in amber and values starting with ``-`` in teal. A short spacer paragraph
    follows the table.
    """

    def prepare(cell, column: int, shaded: bool = False):
        # Shared cell set-up; returns the cell's first paragraph.
        cell.width = Inches(widths[column])
        cell.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
        if shaded:
            set_cell_shading(cell, BLUE_LIGHT)
        set_cell_border(cell)
        set_cell_margins(cell)
        return cell.paragraphs[0]

    table = doc.add_table(rows=1, cols=len(headers))
    table.alignment = WD_TABLE_ALIGNMENT.CENTER
    table.autofit = False
    table.allow_autofit = False

    for column, header in enumerate(headers):
        paragraph = prepare(table.rows[0].cells[column], column, shaded=True)
        paragraph.alignment = WD_ALIGN_PARAGRAPH.LEFT if column == 0 else WD_ALIGN_PARAGRAPH.CENTER
        paragraph.add_run(header)
        format_paragraph(paragraph, size=9.2, bold=True, color=INK)

    sign_colors = {"+": AMBER, "-": TEAL}
    for row in rows:
        cells = table.add_row().cells
        for column, value in enumerate(row):
            paragraph = prepare(cells[column], column)
            paragraph.alignment = WD_ALIGN_PARAGRAPH.LEFT if column == 0 else WD_ALIGN_PARAGRAPH.CENTER
            paragraph.add_run(value)
            # Colour signed deltas by their leading character.
            tone = sign_colors.get(value[:1], INK)
            format_paragraph(paragraph, size=8.8, bold=column == 0, color=tone)

    doc.add_paragraph().paragraph_format.space_after = Pt(2)
color = AMBER + elif value.startswith("-"): + color = TEAL + format_paragraph(p, size=8.8, bold=i == 0, color=color) + + doc.add_paragraph().paragraph_format.space_after = Pt(2) + + +def build_docx() -> None: + doc = Document() + section = doc.sections[0] + section.top_margin = Inches(0.72) + section.bottom_margin = Inches(0.72) + section.left_margin = Inches(0.65) + section.right_margin = Inches(0.65) + + normal = doc.styles["Normal"] + normal.font.name = LATIN_FONT + normal.font.size = Pt(10.0) + normal.font.color.rgb = RGBColor.from_string(INK) + + title = doc.add_paragraph() + title.alignment = WD_ALIGN_PARAGRAPH.CENTER + title.paragraph_format.space_after = Pt(8) + r = title.add_run("Doubao 与 GLM 在 LexBench-Browser 上的代际对比") + r.font.name = LATIN_FONT + r.font.size = Pt(20) + r.font.bold = True + r.font.color.rgb = RGBColor.from_string(BLUE) + r._element.rPr.rFonts.set(qn("w:eastAsia"), EAST_ASIA_FONT) + + add_para( + doc, + "这份报告对比了两组模型代际更新在相同 BrowserUse scaffold 下的表现,评测集合为 LexBench-Browser All split。", + size=10.4, + ) + add_bullet(doc, "Doubao Seed 2.0 Pro vs. Doubao Seed 2.1 Pro") + add_bullet(doc, "GLM-5.1 vs. GLM-5.2") + add_para( + doc, + "任务完成率由 LexJudge 使用 gpt-4.1 评估;failure attribution 使用 LexBench failure taxonomy,并由 gpt-5.5 作为 judge。GLM-5.2 中有 1 条失败来自对危险请求的正确安全拒答,这类样本不属于 agent capability failure,因此已从错误归因统计中排除。", + size=10.0, + ) + + fig = doc.add_paragraph() + fig.alignment = WD_ALIGN_PARAGRAPH.CENTER + fig.paragraph_format.space_before = Pt(4) + fig.paragraph_format.space_after = Pt(3) + fig.add_run().add_picture(str(FIGURE), width=Inches(7.0)) + cap = doc.add_paragraph() + cap.alignment = WD_ALIGN_PARAGRAPH.CENTER + cap.paragraph_format.space_after = Pt(8) + rr = cap.add_run("Figure 1. 
Generation-level failure attribution comparison.") + rr.font.name = LATIN_FONT + rr.font.size = Pt(9) + rr.font.italic = True + rr.font.color.rgb = RGBColor.from_string(MUTED) + + add_heading(doc, "Overall Performance") + add_table( + doc, + ["Model", "Successful Tasks", "Raw Failed", "Success Rate", "Attributed Failures"], + OVERALL, + [2.25, 1.35, 1.05, 1.1, 1.35], + ) + add_para( + doc, + "Doubao Seed 2.1 Pro 相比 2.0 Pro 出现明显退化:少完成 31 个任务,Success Rate 下降 14.76 个百分点。退化并不主要来自更困难的网站环境,而是集中在 action-level failures,尤其是 Infinite Loop 和 Format Breakdown。", + ) + add_para( + doc, + "GLM-5.2 与 GLM-5.1 的差距较小,但 raw task completion 仍略低:少完成 4 个任务,Success Rate 低 1.91 个百分点。如果把安全拒答样本从失败侧去掉,GLM-5.2 的非错误结果为 140/210,即 66.67%。", + ) + + doc.add_page_break() + add_heading(doc, "Doubao 2.0 Pro vs. Doubao 2.1 Pro") + add_table( + doc, + ["Error Layer", "2.0 Count", "2.0 Ratio", "2.1 Count", "2.1 Ratio", "Change"], + DOUBAO_LAYER, + [1.75, 1.05, 1.05, 1.05, 1.05, 0.85], + ) + add_table( + doc, + ["Error Type", "Doubao 2.0", "Doubao 2.1", "Change"], + DOUBAO_TYPES, + [3.85, 1.05, 1.05, 0.9], + ) + add_para( + doc, + "Doubao Seed 2.1 Pro 的核心问题是 operational stability。M2.2 Infinite Loop 从 10 条上升到 40 条,M2.3 Format Breakdown 从 2 条上升到 13 条。相比之下,Web Constraints 的绝对数量没有增加,M3.3 Site Limitation 仍然是 18 条。", + ) + add_para( + doc, + "这说明 2.1 的退化主要是 model-side,而不是 environment-side。它不是更频繁地被网站阻断,而是更容易无法维持有效的交互循环、无法从页面状态中恢复,或者无法保持可解析的 action format。", + ) + + doc.add_page_break() + add_heading(doc, "GLM-5.1 vs. 
GLM-5.2") + add_table( + doc, + ["Error Layer", "5.1 Count", "5.1 Ratio", "5.2 Count", "5.2 Ratio", "Change"], + GLM_LAYER, + [1.75, 1.05, 1.05, 1.05, 1.05, 0.85], + ) + add_table( + doc, + ["Error Type", "GLM-5.1", "GLM-5.2", "Change"], + GLM_TYPES, + [3.85, 1.05, 1.05, 0.9], + ) + add_para( + doc, + "GLM-5.2 在 website-side constraints 上更干净:Web Constraints 从 35 条下降到 23 条,其中 Bot Defense 和 Site Limitation 都减少了。", + ) + add_para( + doc, + "但是,Web Constraints 的减少没有转化为更高的整体任务完成率。错误更多地转移到 model-side categories:Task Reasoning 从 15 条上升到 27 条,主要集中在 Requirement Following 和 Target Selection。", + ) + + add_heading(doc, "Takeaways") + add_bullet(doc, "Doubao 2.1 Pro 变差的主要原因是 Action Execution 不稳定。") + add_bullet(doc, "GLM-5.2 虽然更少受到 Web Constraints 影响,但 model-side reasoning errors 增加,因此整体略低于 GLM-5.1。") + add_bullet(doc, "这套 taxonomy 能把 model capability failures 和 website-side constraints 分开,从而更清楚地定位代际变化的来源。") + + doc.save(DOCX_OUT) + + +def html_table(headers: list[str], rows: list[list[str]]) -> str: + head = "".join(f"{h}" for h in headers) + body_rows = [] + for row in rows: + body_rows.append("" + "".join(f"{v}" for v in row) + "") + return f"{head}{''.join(body_rows)}
" + + +def build_html() -> None: + image_b64 = base64.b64encode(FIGURE.read_bytes()).decode("ascii") + html = f""" + + + + + Doubao 与 GLM 在 LexBench-Browser 上的代际对比 + + + +
+

Doubao 与 GLM 在 LexBench-Browser 上的代际对比

+

这份报告对比了两组模型代际更新在相同 BrowserUse scaffold 下的表现,评测集合为 LexBench-Browser All split。

+
    +
  • Doubao Seed 2.0 Pro vs. Doubao Seed 2.1 Pro
  • +
  • GLM-5.1 vs. GLM-5.2
  • +
+

任务完成率由 LexJudge 使用 gpt-4.1 评估;failure attribution 使用 LexBench failure taxonomy,并由 gpt-5.5 作为 judge。GLM-5.2 中有 1 条失败来自对危险请求的正确安全拒答,这类样本不属于 agent capability failure,因此已从错误归因统计中排除。

+ +

Failure Taxonomy

+
+
+ M1 · Task Reasoning + 模型没有正确理解、保持或验证任务要求,例如 Requirement Following、Target Selection、Evidence Grounding。 +
+
M1.1
Requirement Following:遗漏或偏离任务约束。
+
M1.2
Target Selection:选错页面、对象、商品或结果。
+
M1.3
Evidence Grounding:答案缺少足够页面证据支撑。
+
+
+
+ M2 · Action Execution + 模型在浏览器交互或工具调用层面失稳,例如 UI Misoperation、Infinite Loop、Format Breakdown。 +
+
M2.1
UI Misoperation:点击、输入、导航等浏览器操作错误。
+
M2.2
Infinite Loop:重复无效动作,无法推进任务。
+
M2.3
Format Breakdown:输出或 action format 无法被框架解析。
+
+
+
+ M3 · Web Constraints + 失败主要来自网站或浏览环境限制,例如 Bot Defense、Access Barrier、Site Limitation。 +
+
M3.1
Bot Defense:验证码、风控或反自动化机制阻断。
+
M3.2
Access Barrier:登录、权限、地区或账号状态限制。
+
M3.3
Site Limitation:网站功能、内容或页面状态本身不可用。
+
+
+
+ +
+ Generation-level failure attribution comparison +
Figure 1. Generation-level failure attribution comparison.
+
+ +

Overall Performance

+ {html_table(["Model", "Successful Tasks", "Raw Failed Tasks", "Success Rate", "Attributed Failures"], OVERALL)} +

Doubao Seed 2.1 Pro 相比 2.0 Pro 出现明显退化:少完成 31 个任务,Success Rate 下降 14.76 个百分点。退化并不主要来自更困难的网站环境,而是集中在 action-level failures,尤其是 Infinite Loop 和 Format Breakdown。

+

GLM-5.2 与 GLM-5.1 的差距较小,但 raw task completion 仍略低:少完成 4 个任务,Success Rate 低 1.91 个百分点。如果把安全拒答样本从失败侧去掉,GLM-5.2 的非错误结果为 140/210,即 66.67%。

+ +

Doubao 2.0 Pro vs. Doubao 2.1 Pro

+ {html_table(["Error Layer", "Doubao 2.0 Count", "Doubao 2.0 Ratio", "Doubao 2.1 Count", "Doubao 2.1 Ratio", "Change"], DOUBAO_LAYER)} + {html_table(["Error Type", "Doubao 2.0", "Doubao 2.1", "Change"], DOUBAO_TYPES)} +

Doubao Seed 2.1 Pro 的核心问题是 operational stability。M2.2 Infinite Loop 从 10 条上升到 40 条,M2.3 Format Breakdown 从 2 条上升到 13 条。相比之下,Web Constraints 的绝对数量没有增加,M3.3 Site Limitation 仍然是 18 条。

+

这说明 2.1 的退化主要是 model-side,而不是 environment-side。它不是更频繁地被网站阻断,而是更容易无法维持有效的交互循环、无法从页面状态中恢复,或者无法保持可解析的 action format。

+ +

GLM-5.1 vs. GLM-5.2

+ {html_table(["Error Layer", "GLM-5.1 Count", "GLM-5.1 Ratio", "GLM-5.2 Count", "GLM-5.2 Ratio", "Change"], GLM_LAYER)} + {html_table(["Error Type", "GLM-5.1", "GLM-5.2", "Change"], GLM_TYPES)} +

GLM-5.2 在 website-side constraints 上更干净:Web Constraints 从 35 条下降到 23 条,其中 Bot Defense 和 Site Limitation 都减少了。

+

但是,Web Constraints 的减少没有转化为更高的整体任务完成率。错误更多地转移到 model-side categories:Task Reasoning 从 15 条上升到 27 条,主要集中在 Requirement Following 和 Target Selection。

+ +

Takeaways

+
+
    +
  • Doubao 2.1 Pro 变差的主要原因是 Action Execution 不稳定。
  • +
  • GLM-5.2 虽然更少受到 Web Constraints 影响,但 model-side reasoning errors 增加,因此整体略低于 GLM-5.1。
  • +
  • 这套 taxonomy 能把 model capability failures 和 website-side constraints 分开,从而更清楚地定位代际变化的来源。
  • +
+
+
+ + +""" + HTML_OUT.write_text(html, encoding="utf-8") + + +def main() -> None: + build_docx() + build_html() + print(DOCX_OUT) + print(HTML_OUT) + + +if __name__ == "__main__": + main() diff --git a/scripts/collect_lexbench_rerun_candidates.py b/scripts/collect_lexbench_rerun_candidates.py new file mode 100644 index 0000000..a2c9140 --- /dev/null +++ b/scripts/collect_lexbench_rerun_candidates.py @@ -0,0 +1,618 @@ +#!/usr/bin/env python3 +"""Collect high-recall LexBench-Browser rerun task candidates. + +This scanner is intentionally independent of failure-taxonomy attribution. It +uses only run artifacts: + +- tasks//result.json +- tasks//api_logs/step_*.json +- output/logs/run/*.log matching the target MODEL/TIMESTAMP output directory +""" + +from __future__ import annotations + +import argparse +import csv +import json +import re +from collections import Counter, defaultdict +from dataclasses import asdict, dataclass, field +from pathlib import Path +from typing import Any + + +REPO_ROOT = Path(__file__).resolve().parents[1] +DEFAULT_EXPERIMENT_ROOT = REPO_ROOT / "experiments" / "LexBench-Browser" / "All" / "browser-use" +DEFAULT_RUN_LOG_DIR = REPO_ROOT / "output" / "logs" / "run" + +RUN_LOG_LINE_RE = re.compile(r"\[run\]\s+\[(?P[^\]]+)\]\s+(?P.*)") + +ACCESS_ERROR_RE = re.compile( + r"Navigation failed(?: - site unavailable)?|site unavailable|This site can.?t be reached|" + r"ERR_TUNNEL_CONNECTION_FAILED|ERR_TIMED_OUT|net::ERR_TIMED_OUT|ERR_SOCKET_NOT_CONNECTED|" + r"ERR_NAME_NOT_RESOLVED|ERR_CONNECTION_(?:RESET|CLOSED|REFUSED)|ERR_HTTP2_PROTOCOL_ERROR", + re.I, +) +NAVIGATION_FAILED_RE = re.compile(r"Navigation failed|site unavailable", re.I) +ERR_TUNNEL_RE = re.compile(r"ERR_TUNNEL_CONNECTION_FAILED|ERR_SOCKET_NOT_CONNECTED", re.I) +ERR_TIMED_OUT_RE = re.compile(r"ERR_TIMED_OUT|net::ERR_TIMED_OUT", re.I) +CURRENT_TAB_ABOUT_BLANK_RE = re.compile( + r"(?:^|\n)(?:Tab\s+[^:\n]+:\s+about:blank|Current URL:\s+about:blank|URL:\s+about:blank)", + re.I, +) 
+ACTION_ABOUT_BLANK_RE = re.compile(r"(?:Opened new tab with url|Navigated to)\s+about:blank", re.I) +EMPTY_DOM_RE = re.compile( + r"0 links,\s*0 interactive|0 total elements|Empty DOM|empty DOM|empty content|" + r"no DOM elements|Page loaded but returned empty content", + re.I, +) +DETACHED_FOCUS_RE = re.compile( + r"No valid agent focus|target may have detached|Target closed|Cannot find context|" + r"SessionManager not initialized|browser is in an unstable state|detached target", + re.I, +) +BROWSER_EVENT_TIMEOUT_RE = re.compile( + r"Event handler .* timed out after|CDP request .* timed out|ScreenshotWatchdog .* timed out|" + r"Navigation failed: .*timed out after", + re.I | re.S, +) +LLM_TIMEOUT_RE = re.compile(r"LLM call timed out|model service no-response", re.I) +PARSE_ERROR_RE = re.compile( + r"validation error for AgentOutput|Failed to parse structured output|Invalid JSON|" + r"malformed JSON|parser failure", + re.I, +) +BOT_RE = re.compile( + r"captcha|cloudflare|robot|human verification|403|429|blocked|Too Many Requests|" + r"abnormal traffic|security check|验证码|人机|验证", + re.I, +) +RESULT_DOM_ACCESS_RE = re.compile( + r"0 links,\s*0 interactive|0 total elements|Empty DOM|empty DOM|empty content|" + r"Page appears empty|No valid agent focus|target may have detached|" + r"browser is in an unstable state|Cannot find context|Target closed|" + r"SessionManager not initialized|Navigation failed|site unavailable|" + r"ERR_TUNNEL_CONNECTION_FAILED|ERR_TIMED_OUT|This site can.?t be reached", + re.I, +) + + +@dataclass +class Reason: + source: str + rule: str + detail: str + + +@dataclass +class Candidate: + model: str + timestamp: str + task_id: str + reasons: list[Reason] = field(default_factory=list) + + +@dataclass +class ApiSignalCounts: + navigation_failed: int = 0 + err_tunnel: int = 0 + err_timed_out: int = 0 + current_tab_about_blank: int = 0 + action_about_blank: int = 0 + empty_dom: int = 0 + detached_focus: int = 0 + browser_event_timeout: int = 0 + 
llm_timeout: int = 0 + parse_error: int = 0 + + +def natural_key(value: str) -> tuple[int, str]: + return (0, f"{int(value):012d}") if value.isdigit() else (1, value) + + +def read_json(path: Path) -> dict[str, Any]: + try: + return json.loads(path.read_text(encoding="utf-8")) + except Exception: + return {} + + +def count_re(pattern: re.Pattern[str], text: str) -> int: + return len(pattern.findall(text or "")) + + +def stringify(value: Any, max_len: int = 10000) -> str: + if value is None: + return "" + if isinstance(value, str): + return value[:max_len] + try: + return json.dumps(value, ensure_ascii=False)[:max_len] + except Exception: + return str(value)[:max_len] + + +def extract_xmlish_block(text: str, tag: str) -> str: + match = re.search(fr"<{tag}>(.*?)", text or "", re.S) + return match.group(1) if match else "" + + +def result_json_reasons(task_dir: Path) -> list[Reason]: + result_path = task_dir / "result.json" + if not result_path.exists() or result_path.stat().st_size == 0: + return [Reason("result_json", "missing_or_empty_result_json", str(result_path))] + + try: + result = json.loads(result_path.read_text(encoding="utf-8")) + except Exception as exc: + return [Reason("result_json", "invalid_result_json", str(exc))] + + reasons: list[Reason] = [] + done = result.get("agent_done") + metrics = result.get("metrics") or {} + config = result.get("config") or {} + steps = metrics.get("steps") or 0 + max_steps = config.get("max_steps") or 40 + timeout_seconds = config.get("timeout_seconds") or 0 + wall_clock_seconds = result.get("wall_clock_seconds") or 0 + + if result.get("env_status") == "failed": + reasons.append(Reason("result_json", "env_status_failed", "env_status == failed")) + if done == "error": + reasons.append(Reason("result_json", "agent_done_error", "agent_done == error")) + if done == "max_steps" and steps < max_steps: + reasons.append( + Reason( + "result_json", + "early_max_steps", + f"agent_done=max_steps but steps={steps} < 
max_steps={max_steps}", + ) + ) + if done == "timeout" and timeout_seconds and wall_clock_seconds < timeout_seconds * 0.5: + reasons.append( + Reason( + "result_json", + "suspicious_early_timeout", + f"wall_clock_seconds={wall_clock_seconds} < 0.5*timeout_seconds={timeout_seconds}", + ) + ) + return reasons + + +def result_was_unsuccessful(task_dir: Path) -> bool: + result = read_json(task_dir / "result.json") + if not result: + return True + if result.get("agent_success") is False: + return True + if result.get("agent_done") in {"error", "timeout", "max_steps"}: + return True + if result.get("env_status") == "failed": + return True + return False + + +def result_text(result: dict[str, Any]) -> str: + chunks: list[str] = [] + for key in ("answer", "error", "exception", "traceback"): + value = result.get(key) + if value: + chunks.append(stringify(value, 5000)) + for action in result.get("action_history") or []: + chunks.append(stringify(action, 2000)) + return "\n".join(chunks) + + +def result_done_false_dom_reason(task_dir: Path) -> list[Reason]: + result = read_json(task_dir / "result.json") + if result.get("agent_success") is not False: + return [] + text = result_text(result) + if not RESULT_DOM_ACCESS_RE.search(text): + return [] + return [ + Reason( + "result_json", + "done_false_with_dom_or_access_evidence", + "agent_success=false and final result/action history contains DOM/access failure evidence", + ) + ] + + +def load_eval_pass_task_ids(run_dir: Path) -> set[str]: + eval_dir = run_dir / "tasks_eval_result" + candidates = sorted(eval_dir.glob("*_eval_results.json")) + pass_ids: set[str] = set() + for path in candidates: + text = path.read_text(encoding="utf-8", errors="replace").strip() + if not text: + continue + try: + if text.startswith("["): + rows = json.loads(text) + else: + rows = [json.loads(line) for line in text.splitlines() if line.strip()] + except Exception: + continue + for row in rows: + if isinstance(row, dict) and 
row.get("predicted_label") == 1 and row.get("task_id") is not None: + pass_ids.add(str(row["task_id"])) + return pass_ids + + +def load_taxonomy_web_constraint_ids(run_dir: Path) -> set[str]: + eval_dir = run_dir / "tasks_eval_result" + ids: set[str] = set() + for path in sorted(eval_dir.glob("*_failure_taxonomy*.jsonl")): + for line in path.read_text(encoding="utf-8", errors="replace").splitlines(): + if not line.strip(): + continue + try: + row = json.loads(line) + except Exception: + continue + taxonomy = row.get("taxonomy") or {} + if taxonomy.get("primary_code") in {"M3.2", "M3.3"} and row.get("task_id") is not None: + ids.add(str(row["task_id"])) + return ids + + +def log_matches_run(text: str, run_dir: Path) -> bool: + normalized_text = text.replace("\\", "/") + normalized_run = str(run_dir).replace("\\", "/") + if normalized_run in normalized_text: + return True + try: + model = run_dir.parent.name + timestamp = run_dir.name + except Exception: + return False + relative = f"experiments/LexBench-Browser/All/browser-use/{model}/{timestamp}" + return relative in normalized_text + + +def run_log_reasons_for_message(message: str) -> list[Reason]: + reasons: list[Reason] = [] + if "Stopping due to 5 consecutive failures" in message: + reasons.append( + Reason( + "latest_agent_run_log", + "stopping_due_to_5_consecutive_failures", + "Stopping due to 5 consecutive failures", + ) + ) + if re.search(r"Result failed\s+6/6\s+times?:.*LLM call timed out", message, re.I): + reasons.append( + Reason( + "latest_agent_run_log", + "llm_timeout_6_of_6", + "Result failed 6/6 times: LLM call timed out", + ) + ) + if "ERR_TUNNEL_CONNECTION_FAILED" in message: + reasons.append( + Reason( + "latest_agent_run_log", + "err_tunnel_connection_failed", + "ERR_TUNNEL_CONNECTION_FAILED", + ) + ) + return reasons + + +def latest_run_log_reasons(run_dir: Path, run_log_dir: Path) -> tuple[dict[str, list[Reason]], list[str]]: + matched_logs: list[Path] = [] + latest_task_reasons: dict[str, 
list[Reason]] = {} + + for log_path in sorted(run_log_dir.glob("*.log"), key=lambda p: (p.stat().st_mtime, p.name)): + try: + text = log_path.read_text(encoding="utf-8", errors="replace") + except OSError: + continue + if not log_matches_run(text, run_dir): + continue + matched_logs.append(log_path) + reasons_by_task: dict[str, list[Reason]] = defaultdict(list) + seen_task_ids: set[str] = set() + for line in text.splitlines(): + match = RUN_LOG_LINE_RE.search(line) + if not match: + continue + task_id = match.group("task_id") + message = match.group("message") + seen_task_ids.add(task_id) + reasons_by_task[task_id].extend(run_log_reasons_for_message(message)) + + # A later execution log for the same task supersedes older task-level + # run-log evidence, even if the later log has no hard signal. + for task_id in seen_task_ids: + latest_task_reasons[task_id] = list({r.rule: r for r in reasons_by_task.get(task_id, [])}.values()) + + return latest_task_reasons, [str(path) for path in matched_logs] + + +def collect_api_log_text(api_dir: Path) -> str: + chunks: list[str] = [] + if not api_dir.exists(): + return "" + for step_path in sorted(api_dir.glob("step_*.json")): + step = read_json(step_path) + input_obj = step.get("input") or {} + state_message = stringify(input_obj.get("state_message"), 12000) + browser_state = extract_xmlish_block(state_message, "browser_state") + chunks.extend( + [ + stringify(input_obj.get("url"), 1000), + browser_state, + stringify(step.get("output"), 5000), + ] + ) + for action_result in step.get("action_results") or []: + chunks.extend( + [ + stringify(action_result.get("error"), 2000), + stringify(action_result.get("extracted_content"), 2000), + ] + ) + return "\n".join(chunks) + + +def scan_api_signals(text: str) -> ApiSignalCounts: + return ApiSignalCounts( + navigation_failed=count_re(NAVIGATION_FAILED_RE, text), + err_tunnel=count_re(ERR_TUNNEL_RE, text), + err_timed_out=count_re(ERR_TIMED_OUT_RE, text), + 
current_tab_about_blank=count_re(CURRENT_TAB_ABOUT_BLANK_RE, text), + action_about_blank=count_re(ACTION_ABOUT_BLANK_RE, text), + empty_dom=count_re(EMPTY_DOM_RE, text), + detached_focus=count_re(DETACHED_FOCUS_RE, text), + browser_event_timeout=count_re(BROWSER_EVENT_TIMEOUT_RE, text), + llm_timeout=count_re(LLM_TIMEOUT_RE, text), + parse_error=count_re(PARSE_ERROR_RE, text), + ) + + +def api_log_reasons(task_dir: Path, include_protocol_only: bool, skip_bot: bool) -> list[Reason]: + text = collect_api_log_text(task_dir / "api_logs") + if not text: + return [] + if skip_bot and BOT_RE.search(text): + return [] + counts = scan_api_signals(text) + reasons: list[Reason] = [] + + hard_access = ( + counts.err_tunnel > 0 + or counts.err_timed_out > 0 + or counts.navigation_failed >= 2 + or counts.browser_event_timeout >= 2 + ) + render_or_session = ( + counts.detached_focus > 0 + or counts.current_tab_about_blank >= 2 + or counts.action_about_blank >= 2 + or counts.empty_dom >= 10 + ) + repeated_protocol = counts.llm_timeout >= 5 or counts.parse_error >= 3 + + if hard_access: + reasons.append( + Reason( + "api_logs", + "api_hard_access_error", + json.dumps(asdict(counts), ensure_ascii=False, sort_keys=True), + ) + ) + if render_or_session: + reasons.append( + Reason( + "api_logs", + "api_render_or_session_error", + json.dumps(asdict(counts), ensure_ascii=False, sort_keys=True), + ) + ) + if include_protocol_only and repeated_protocol: + reasons.append( + Reason( + "api_logs", + "api_repeated_model_protocol_error", + json.dumps(asdict(counts), ensure_ascii=False, sort_keys=True), + ) + ) + return reasons + + +def discover_runs(root: Path) -> list[Path]: + runs: list[Path] = [] + for tasks_dir in sorted(root.glob("*/*/tasks")): + run_dir = tasks_dir.parent + if (run_dir / "tasks").is_dir(): + runs.append(run_dir) + return runs + + +def collect_run( + run_dir: Path, + run_log_dir: Path, + include_protocol_only: bool, + artifact_mode: str = "strict", + 
include_taxonomy_web_constraints: bool = False, +) -> tuple[list[Candidate], dict[str, Any]]: + tasks_dir = run_dir / "tasks" + model = run_dir.parent.name + timestamp = run_dir.name + candidates: dict[str, Candidate] = {} + + def add(task_id: str, reasons: list[Reason]) -> None: + if not reasons: + return + candidate = candidates.setdefault(task_id, Candidate(model=model, timestamp=timestamp, task_id=task_id)) + existing = {(reason.source, reason.rule, reason.detail) for reason in candidate.reasons} + for reason in reasons: + key = (reason.source, reason.rule, reason.detail) + if key not in existing: + candidate.reasons.append(reason) + existing.add(key) + + latest_log_reasons, matched_logs = latest_run_log_reasons(run_dir, run_log_dir) + eval_pass_task_ids = load_eval_pass_task_ids(run_dir) + taxonomy_web_ids = load_taxonomy_web_constraint_ids(run_dir) if include_taxonomy_web_constraints else set() + + task_dirs = [path for path in tasks_dir.iterdir() if path.is_dir()] if tasks_dir.exists() else [] + for task_dir in sorted(task_dirs, key=lambda p: natural_key(p.name)): + task_id = task_dir.name + add(task_id, result_json_reasons(task_dir)) + add(task_id, latest_log_reasons.get(task_id, [])) + if task_id in taxonomy_web_ids: + add( + task_id, + [ + Reason( + "failure_taxonomy", + "primary_m3_2_or_m3_3", + "failure taxonomy primary_code is M3.2 or M3.3", + ) + ], + ) + if artifact_mode == "strict": + if task_id not in eval_pass_task_ids: + add(task_id, result_done_false_dom_reason(task_dir)) + if result_was_unsuccessful(task_dir) and task_id not in eval_pass_task_ids: + add(task_id, api_log_reasons(task_dir, include_protocol_only=include_protocol_only, skip_bot=True)) + + for task_id, reasons in latest_log_reasons.items(): + add(task_id, reasons) + + rows = sorted(candidates.values(), key=lambda row: natural_key(row.task_id)) + metadata = { + "model": model, + "timestamp": timestamp, + "run_dir": str(run_dir), + "matched_run_logs": matched_logs, + 
"eval_pass_filter_count": len(eval_pass_task_ids), + "taxonomy_web_constraint_count": len(taxonomy_web_ids), + "artifact_mode": artifact_mode, + "include_taxonomy_web_constraints": include_taxonomy_web_constraints, + "task_count": len(task_dirs), + "rerun_count": len(rows), + "source_counts": dict(Counter(reason.source for row in rows for reason in row.reasons)), + "rule_counts": dict(Counter(reason.rule for row in rows for reason in row.reasons)), + } + return rows, metadata + + +def write_outputs(rows: list[Candidate], metadata: dict[str, Any], out_dir: Path) -> None: + out_dir.mkdir(parents=True, exist_ok=True) + payload = { + "schema_version": "lexbench_rerun_candidates_v1", + "metadata": metadata, + "rerun_ids": [row.task_id for row in rows], + "rows": [ + { + "model": row.model, + "timestamp": row.timestamp, + "task_id": row.task_id, + "reasons": [asdict(reason) for reason in row.reasons], + } + for row in rows + ], + } + + json_path = out_dir / "rerun_candidates.json" + csv_path = out_dir / "rerun_candidates.csv" + ids_path = out_dir / "rerun_task_ids.txt" + md_path = out_dir / "rerun_candidates_summary.md" + + json_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2) + "\n", encoding="utf-8") + ids_path.write_text(" ".join(payload["rerun_ids"]) + "\n", encoding="utf-8") + + with csv_path.open("w", encoding="utf-8", newline="") as handle: + writer = csv.DictWriter(handle, fieldnames=["model", "timestamp", "task_id", "rules", "sources", "details"]) + writer.writeheader() + for row in rows: + writer.writerow( + { + "model": row.model, + "timestamp": row.timestamp, + "task_id": row.task_id, + "rules": ";".join(reason.rule for reason in row.reasons), + "sources": ";".join(sorted({reason.source for reason in row.reasons})), + "details": " | ".join(f"{reason.rule}: {reason.detail}" for reason in row.reasons), + } + ) + + lines = [ + "# LexBench Rerun Candidates", + "", + f"- Run: `{metadata['model']}/{metadata['timestamp']}`", + f"- Tasks scanned: 
{metadata['task_count']}", + f"- Rerun candidates: {metadata['rerun_count']}", + f"- Source counts: `{json.dumps(metadata['source_counts'], ensure_ascii=False)}`", + f"- Rule counts: `{json.dumps(metadata['rule_counts'], ensure_ascii=False)}`", + "", + "## Task IDs", + "", + "```text", + " ".join(payload["rerun_ids"]), + "```", + "", + "## Reasons", + "", + ] + for row in rows: + rules = ", ".join(reason.rule for reason in row.reasons) + lines.append(f"- `{row.task_id}`: {rules}") + md_path.write_text("\n".join(lines) + "\n", encoding="utf-8") + + print(json.dumps({"json": str(json_path), "csv": str(csv_path), "ids": str(ids_path), "md": str(md_path)}, indent=2)) + + +def resolve_run_dir(args: argparse.Namespace) -> Path: + if args.run_dir: + return Path(args.run_dir) + if not args.model or not args.timestamp: + raise SystemExit("Provide --run-dir or both --model and --timestamp.") + return Path(args.root) / args.model / args.timestamp + + +def main() -> int: + parser = argparse.ArgumentParser(description="Collect LexBench-Browser rerun task candidates from run artifacts.") + parser.add_argument("--root", type=Path, default=DEFAULT_EXPERIMENT_ROOT) + parser.add_argument("--run-dir", type=Path, default=None, help="Specific MODEL/TIMESTAMP run directory.") + parser.add_argument("--model", default=None, help="Model directory name under --root.") + parser.add_argument("--timestamp", default=None, help="Timestamp directory under --root/MODEL.") + parser.add_argument("--run-log-dir", type=Path, default=DEFAULT_RUN_LOG_DIR) + parser.add_argument("--out-dir", type=Path, default=None) + parser.add_argument( + "--artifact-mode", + choices=["hard", "strict"], + default="strict", + help="hard = result/log hard rules only; strict = hard plus constrained result/api evidence.", + ) + parser.add_argument( + "--include-taxonomy-web-constraints", + action="store_true", + help="Also include tasks whose failure taxonomy primary_code is M3.2 or M3.3.", + ) + parser.add_argument( + 
"--include-protocol-only", + action="store_true", + help="Also include repeated api_logs parse/LLM-timeout signals without access/render evidence.", + ) + args = parser.parse_args() + + run_dir = resolve_run_dir(args) + out_dir = args.out_dir or run_dir / "rerun_candidates" + rows, metadata = collect_run( + run_dir, + args.run_log_dir, + include_protocol_only=args.include_protocol_only, + artifact_mode=args.artifact_mode, + include_taxonomy_web_constraints=args.include_taxonomy_web_constraints, + ) + write_outputs(rows, metadata, out_dir) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/judge_lexbench_failure_taxonomy.py b/scripts/judge_lexbench_failure_taxonomy.py new file mode 100644 index 0000000..e49f6a5 --- /dev/null +++ b/scripts/judge_lexbench_failure_taxonomy.py @@ -0,0 +1,957 @@ +#!/usr/bin/env python3 +"""Classify LexBench-Browser failed trajectories with a compact taxonomy.""" + +from __future__ import annotations + +import argparse +import json +import logging +import os +import re +import socket +import ssl +import sys +import time +import urllib.error +import urllib.request +from collections import Counter, defaultdict +from concurrent.futures import ThreadPoolExecutor, as_completed +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + +REPO_ROOT = Path(__file__).resolve().parents[1] +if str(REPO_ROOT) not in sys.path: + sys.path.insert(0, str(REPO_ROOT)) + +from browseruse_bench.eval.model import encode_image, load_evaluation_model + +try: + from PIL import Image +except ImportError: # pragma: no cover - validated at runtime when screenshots are used + Image = None + +try: + from tqdm import tqdm +except ImportError: # pragma: no cover - optional progress display + tqdm = None + + +LOGGER = logging.getLogger("lexbench-failure-taxonomy") + +PROMPT_PATH = ( + REPO_ROOT + / "browseruse_bench" + / "eval" + / "lexbench_browser" + / "prompts" + / "failure_taxonomy_system.txt" 
+) + +TAXONOMY: dict[str, tuple[str, str]] = { + "M1.1": ("Task Reasoning", "Requirement Following"), + "M1.2": ("Task Reasoning", "Target Selection"), + "M1.3": ("Task Reasoning", "Evidence Grounding"), + "M2.1": ("Action Execution", "UI Misoperation"), + "M2.2": ("Action Execution", "Infinite Loop"), + "M2.3": ("Action Execution", "Format Breakdown"), + "M3.1": ("Web Constraints", "Bot Defense"), + "M3.2": ("Web Constraints", "Access Barrier"), + "M3.3": ("Web Constraints", "Site Limitation"), + "OTHER": ("Other", "Other"), +} + +ALLOWED_CODES = set(TAXONOMY) + +RESPONSE_FORMAT = { + "type": "json_schema", + "json_schema": { + "name": "lexbench_failure_taxonomy", + "schema": { + "type": "object", + "properties": { + "primary_code": {"type": "string", "enum": sorted(ALLOWED_CODES)}, + "codes": { + "type": "array", + "items": {"type": "string", "enum": sorted(ALLOWED_CODES)}, + "minItems": 1, + "uniqueItems": True, + }, + "other_phrase": {"type": ["string", "null"]}, + "confidence": {"type": "string", "enum": ["high", "medium", "low"]}, + "reasoning": {"type": "string"}, + "evidence": { + "type": "array", + "items": {"type": "string"}, + "minItems": 1, + "maxItems": 6, + }, + }, + "required": [ + "primary_code", + "codes", + "other_phrase", + "confidence", + "reasoning", + "evidence", + ], + "additionalProperties": False, + }, + }, +} + + +class SimpleChatModel: + """Small OpenAI-compatible chat client used when the OpenAI SDK is unavailable.""" + + def __init__( + self, + model: str, + api_key: str | None, + base_url: str | None, + insecure: bool = False, + claude_thinking: bool = False, + reasoning_effort: str = "medium", + ): + self.model = model + self.api_key = api_key or os.getenv("EVAL_MODEL_API_KEY") or os.getenv("OPENAI_API_KEY") + self.base_url = (base_url or os.getenv("OPENAI_BASE_URL") or "https://api.openai.com/v1").rstrip("/") + self.ssl_context = ssl._create_unverified_context() if insecure else None + self.claude_thinking = claude_thinking + 
self.reasoning_effort = reasoning_effort + self.last_response: dict[str, Any] | None = None + self.last_usage: dict[str, Any] | None = None + if not self.api_key: + raise ValueError("API key required: set EVAL_MODEL_API_KEY or OPENAI_API_KEY") + + def _chat_url(self) -> str: + if self.base_url.endswith("/chat/completions"): + return self.base_url + return self.base_url + "/chat/completions" + + def generate( + self, + messages: list[dict[str, Any]], + max_tokens: int = 2048, + temperature: float | None = None, + **kwargs: Any, + ) -> str: + payload: dict[str, Any] = { + "model": self.model, + "messages": messages, + "max_tokens": max_tokens, + "temperature": 0.0 if temperature is None else temperature, + } + for key in ("response_format",): + if key in kwargs and kwargs[key] is not None: + payload[key] = kwargs[key] + extra_body = kwargs.get("extra_body") + if isinstance(extra_body, dict): + payload.update(extra_body) + if self.claude_thinking: + payload.setdefault("reasoning_effort", self.reasoning_effort) + allowed = list(payload.get("allowed_openai_params") or []) + if "reasoning_effort" not in allowed: + allowed.append("reasoning_effort") + payload["allowed_openai_params"] = allowed + + request = urllib.request.Request( + self._chat_url(), + data=json.dumps(payload).encode("utf-8"), + headers={ + "Authorization": f"Bearer {self.api_key}", + "Content-Type": "application/json", + }, + method="POST", + ) + body = "" + for attempt in range(1, 4): + try: + with urllib.request.urlopen(request, timeout=300, context=self.ssl_context) as response: + body = response.read().decode("utf-8") + break + except urllib.error.HTTPError as exc: + detail = exc.read().decode("utf-8", errors="replace") + if (exc.code >= 500 or exc.code in {401, 429}) and attempt < 3: + LOGGER.warning("LLM HTTP %s on attempt %s/3; retrying", exc.code, attempt) + time.sleep(4 * attempt) + continue + raise RuntimeError(f"LLM HTTP error {exc.code}: {detail}") from exc + except (socket.timeout, 
TimeoutError, urllib.error.URLError) as exc: + if attempt < 3: + LOGGER.warning("LLM connection timeout/error on attempt %s/3; retrying: %s", attempt, exc) + time.sleep(4 * attempt) + continue + raise RuntimeError(f"LLM connection error: {exc}") from exc + + parsed = json.loads(body) + self.last_response = parsed + self.last_usage = parsed.get("usage") if isinstance(parsed.get("usage"), dict) else None + return parsed["choices"][0]["message"].get("content") or "" + + +def _load_env_file(path: Path) -> None: + if not path.exists(): + return + try: + lines = path.read_text(encoding="utf-8").splitlines() + except OSError: + return + for raw in lines: + line = raw.strip() + if not line or line.startswith("#") or "=" not in line: + continue + key, value = line.split("=", 1) + key = key.strip() + value = value.strip().strip('"').strip("'") + if key and key not in os.environ: + os.environ[key] = value + + +def _utc_now() -> str: + return datetime.now(timezone.utc).isoformat(timespec="seconds").replace("+00:00", "Z") + + +def _compact_text(value: Any, limit: int) -> str: + text = str(value or "") + text = text.replace("\r\n", "\n").replace("\r", "\n").strip() + if limit <= 0 or len(text) <= limit: + return text + head = max(1, limit // 2) + tail = max(1, limit - head) + return text[:head] + f"\n...[truncated {len(text) - limit} chars]...\n" + text[-tail:] + + +def _one_line(value: Any, limit: int = 240) -> str: + text = re.sub(r"\s+", " ", str(value or "")).strip() + if len(text) <= limit: + return text + return text[: limit - 3] + "..." 
+ + +def _load_json(path: Path) -> dict[str, Any]: + try: + return json.loads(path.read_text(encoding="utf-8")) + except (OSError, json.JSONDecodeError): + return {} + + +def _read_jsonl(path: Path) -> list[dict[str, Any]]: + rows: list[dict[str, Any]] = [] + with path.open("r", encoding="utf-8") as handle: + for line_no, raw in enumerate(handle, 1): + line = raw.strip() + if not line: + continue + try: + row = json.loads(line) + except json.JSONDecodeError as exc: + LOGGER.warning("Skipping malformed line %s in %s: %s", line_no, path, exc) + continue + if isinstance(row, dict): + rows.append(row) + return rows + + +def _write_jsonl(path: Path, rows: list[dict[str, Any]]) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("w", encoding="utf-8") as handle: + for row in rows: + handle.write(json.dumps(row, ensure_ascii=False) + "\n") + + +def _sort_screenshot_key(path: Path) -> tuple[int, str]: + match = re.search(r"(\d+)", path.name) + if match: + return int(match.group(1)), path.name + return 0, path.name + + +def _find_screenshots(task_dir: Path, max_screenshots: int) -> list[Path]: + trajectory_dir = task_dir / "trajectory" + if max_screenshots <= 0 or not trajectory_dir.exists(): + return [] + screenshots = [ + path + for path in trajectory_dir.iterdir() + if path.is_file() and path.suffix.lower() in {".png", ".jpg", ".jpeg", ".webp"} + ] + screenshots.sort(key=_sort_screenshot_key) + return screenshots[-max_screenshots:] + + +def _compact_api_summary(path: Path, limit: int) -> str: + if limit <= 0 or not path.exists(): + return "" + try: + text = path.read_text(encoding="utf-8", errors="replace") + except OSError: + return "" + + keep: list[str] = [] + for raw in text.splitlines(): + line = raw.rstrip() + stripped = line.strip() + if not stripped: + continue + if stripped.startswith(("## Task Info", "- **Task ID**", "- **Model**", "- **Total Steps**")): + keep.append(stripped) + continue + if stripped.startswith("## Step "): + 
keep.append(stripped) + continue + if stripped.startswith(("**URL**", "**Memory**", "**Actions**", "### Action Results")): + keep.append(stripped) + continue + if re.match(r"^\d+\.\s+`", stripped): + keep.append(stripped) + continue + if stripped.startswith("- ") and re.search( + r"(Error|Clicked|Typed|Navigated|Waited|Searched|Switched|Found|Downloaded|Wrote|Read|Saved|Scrolled|Pressed)", + stripped, + re.I, + ): + keep.append(stripped) + + compact = "\n".join(keep) + if not compact: + compact = text + return _compact_text(compact, limit) + + +def _extract_eval_details(record: dict[str, Any]) -> dict[str, Any]: + details = record.get("evaluation_details") + return details if isinstance(details, dict) else {} + + +def _extract_prompt_params(record: dict[str, Any]) -> dict[str, Any]: + details = _extract_eval_details(record) + user_prompt = details.get("user_prompt") + if not isinstance(user_prompt, dict): + return {} + params = user_prompt.get("params") + return params if isinstance(params, dict) else {} + + +def _result_dir(record: dict[str, Any], eval_file: Path) -> Path: + ref = record.get("agent_result_ref") + if isinstance(ref, dict) and ref.get("result_dir"): + return Path(str(ref["result_dir"])) + return eval_file.parent.parent / "tasks" / str(record.get("task_id", "")) + + +def _build_user_text( + record: dict[str, Any], + eval_file: Path, + *, + trace_char_budget: int, + feedback_char_budget: int, +) -> str: + details = _extract_eval_details(record) + params = _extract_prompt_params(record) + task_dir = _result_dir(record, eval_file) + result = _load_json(task_dir / "result.json") + api_trace = _compact_api_summary(task_dir / "api_logs" / "summary.md", trace_char_budget) + + old_classification = details.get("failure_classification") or record.get("failure_classification") + old_reasoning = "" + if isinstance(old_classification, dict): + old_reasoning = old_classification.get("reasoning", "") or "" + + action_history = result.get("action_history") or [] + 
def _image_content(path: Path, scale_factor: float) -> dict[str, Any] | None:
    """Encode one screenshot as an ``image_url`` chat-message content part.

    Args:
        path: Screenshot file to embed.
        scale_factor: Forwarded to ``encode_image`` as ``scale_factor``.

    Returns:
        A content part carrying a base64 ``data:image/jpeg`` URL, or ``None``
        when the screenshot cannot be opened or encoded (a warning is logged
        and the caller simply omits the image).

    Raises:
        ImportError: If the module-level ``Image`` is ``None``, i.e. Pillow
            could not be imported.
    """
    if Image is None:
        raise ImportError("Pillow is required when --max-screenshots is greater than 0")
    try:
        # Use the image as a context manager so the underlying file handle is
        # released as soon as encoding finishes; without it every screenshot
        # of every task keeps a descriptor open until garbage collection.
        with Image.open(path) as image:
            encoded = encode_image(image, scale_factor=scale_factor)
    except (OSError, RuntimeError, TypeError, ValueError) as exc:
        LOGGER.warning("Failed to encode screenshot %s: %s", path, exc)
        return None
    return {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{encoded}", "detail": "high"},
    }
codes = ["OTHER"] + primary_code = "OTHER" + + confidence = str(parsed.get("confidence") or "medium").lower() + if confidence not in {"high", "medium", "low"}: + confidence = "medium" + + other_phrase = parsed.get("other_phrase") + if other_phrase is not None: + other_phrase = _one_line(other_phrase, 80) + if "OTHER" not in codes: + other_phrase = None + elif not other_phrase: + other_phrase = "uncategorized" + + evidence = parsed.get("evidence") + if isinstance(evidence, str): + evidence_items = [evidence] + elif isinstance(evidence, list): + evidence_items = [_one_line(item, 300) for item in evidence if str(item).strip()] + else: + evidence_items = [] + + return { + "primary_code": primary_code, + "codes": codes, + "other_phrase": other_phrase, + "confidence": confidence, + "reasoning": str(parsed.get("reasoning") or "").strip(), + "evidence": evidence_items, + "raw_response": raw, + } + + +def _claude_extra_body(args: argparse.Namespace) -> dict[str, Any] | None: + if not args.claude_thinking: + return None + return { + "reasoning_effort": args.claude_reasoning_effort, + "allowed_openai_params": ["reasoning_effort"], + } + + +def _taxonomy_payload(parsed: dict[str, Any]) -> dict[str, Any]: + labels = [ + { + "code": code, + "group": TAXONOMY[code][0], + "label": TAXONOMY[code][1], + } + for code in parsed["codes"] + ] + return { + "primary_code": parsed["primary_code"], + "primary_group": TAXONOMY[parsed["primary_code"]][0], + "codes": parsed["codes"], + "groups": sorted({TAXONOMY[code][0] for code in parsed["codes"]}), + "labels": labels, + "other_phrase": parsed["other_phrase"], + "confidence": parsed["confidence"], + "reasoning": parsed["reasoning"], + "evidence": parsed["evidence"], + "raw_response": parsed["raw_response"], + } + + +def _classify_one( + record: dict[str, Any], + eval_file: Path, + system_prompt: str, + model: Any, + args: argparse.Namespace, +) -> dict[str, Any]: + messages = _build_messages(record, eval_file, system_prompt, args) + 
def _default_output_path(eval_file: Path) -> Path:
    """Derive the default taxonomy JSONL path that sits beside ``eval_file``.

    A trailing ``_eval_results`` on the file stem is dropped before
    ``_failure_taxonomy.jsonl`` is appended; the directory is unchanged.

    Args:
        eval_file: Evaluation results file being classified.

    Returns:
        Sibling path named ``<stem>_failure_taxonomy.jsonl``.
    """
    base = eval_file.stem.removesuffix("_eval_results")
    return eval_file.with_name(f"{base}_failure_taxonomy.jsonl")
def _process_eval_file(eval_file: Path, system_prompt: str, model: Any, args: argparse.Namespace) -> int:
    """Classify the judge-failed rows of one eval file and persist the results.

    Rows with ``predicted_label == 0`` are selected, optionally narrowed by
    ``args.task_ids`` and ``args.max_samples``. Rows already present in the
    output file are reused unless ``args.overwrite`` is set. The remaining
    rows are classified concurrently, then the JSONL output and its summary
    are rewritten.

    Args:
        eval_file: Evaluation JSONL file to read failures from.
        system_prompt: Taxonomy system prompt passed to every classification.
        model: Chat model handed to ``_classify_one``; unused for dry runs.
        args: Parsed CLI options.

    Returns:
        Number of tasks whose classification raised. Always ``0`` for a dry
        run, which only logs counts and a prompt preview and writes nothing.
    """

    def _task_order(row: dict[str, Any]) -> tuple[int, int, str]:
        # Numeric ids sort numerically and ahead of non-numeric ids. Returning
        # a uniform tuple avoids the int-vs-str comparison that raises
        # TypeError when both kinds of id occur in one file. ``isdecimal`` is
        # used because ``isdigit`` also accepts characters (e.g. superscripts)
        # that ``int`` rejects.
        task_id = str(row["task_id"])
        if task_id.isdecimal():
            return (0, int(task_id), "")
        return (1, 0, task_id)

    all_failure_records = [row for row in _read_jsonl(eval_file) if row.get("predicted_label") == 0]
    records = all_failure_records
    if args.task_ids:
        keep_task_ids = {str(task_id) for task_id in args.task_ids}
        records = [row for row in records if str(row.get("task_id", "")) in keep_task_ids]
    if args.max_samples is not None:
        records = records[: args.max_samples]

    output_path = _output_path_for_eval(eval_file, args)
    # NOTE(review): with --overwrite the previous rows are not loaded, so a run
    # narrowed by --task-ids/--max-samples rewrites the output file with only
    # the selected rows. Confirm that this is the intended behavior.
    existing = {} if args.overwrite else _load_existing(output_path)

    pending = [row for row in records if str(row.get("task_id", "")) not in existing]
    LOGGER.info(
        "%s: failures=%s selected=%s existing=%s pending=%s output=%s",
        eval_file,
        len(all_failure_records),
        len(records),
        len(existing),
        len(pending),
        output_path,
    )

    if args.dry_run:
        if pending:
            # Cap both budgets so the logged preview stays readable.
            preview = _build_user_text(
                pending[0],
                eval_file,
                trace_char_budget=min(args.trace_char_budget, 4000),
                feedback_char_budget=min(args.feedback_char_budget, 4000),
            )
            LOGGER.info("Dry-run prompt preview for task %s:\n%s", pending[0].get("task_id"), preview[:5000])
        return 0

    results: list[dict[str, Any]] = list(existing.values())
    failed_count = 0
    if pending:
        with ThreadPoolExecutor(max_workers=max(1, args.num_workers)) as executor:
            future_map = {
                executor.submit(_classify_one, row, eval_file, system_prompt, model, args): row
                for row in pending
            }
            completed = as_completed(future_map)
            if tqdm is not None:
                # The progress label is the directory three levels above the
                # eval file; fall back to the file stem for shallow paths
                # instead of raising IndexError.
                ancestors = eval_file.parents
                run_label = ancestors[2].name if len(ancestors) > 2 else eval_file.stem
                completed = tqdm(
                    completed,
                    total=len(future_map),
                    desc=f"{run_label}:{args.model}",
                    unit="traj",
                )
            for future in completed:
                row = future_map[future]
                try:
                    classified = future.result()
                except Exception as exc:  # noqa: BLE001 - keep batch jobs moving
                    LOGGER.exception("Failed to classify task %s: %s", row.get("task_id"), exc)
                    failed_count += 1
                    continue
                if args.verbose:
                    LOGGER.info(
                        "classified task=%s primary=%s codes=%s",
                        classified["task_id"],
                        classified["taxonomy"]["primary_code"],
                        ",".join(classified["taxonomy"]["codes"]),
                    )
                results.append(classified)

    results.sort(key=_task_order)
    _write_jsonl(output_path, results)
    _write_summary(_summary_path(output_path), _summarize(results, len(all_failure_records), len(records)))
    return failed_count
Overrides --experiments-root discovery.", + ) + parser.add_argument("--models", nargs="*", default=[], help="Optional model directory names to include.") + parser.add_argument("--output", default=None, help="Output JSONL path. Only valid with exactly one eval file.") + parser.add_argument("--model", default="gpt-4.1", help="LLM judge model.") + parser.add_argument( + "--include-judge-in-output", + action="store_true", + help="Append the judge model name to the default output filename.", + ) + parser.add_argument("--judge-suffix", default=None, help="Custom suffix appended to the default output filename.") + parser.add_argument("--api-key", default=None, help="API key. Defaults to EVAL_MODEL_API_KEY or OPENAI_API_KEY.") + parser.add_argument("--base-url", default=None, help="OpenAI-compatible base URL.") + parser.add_argument("--temperature", type=float, default=0.0) + parser.add_argument("--max-tokens", type=int, default=1200) + parser.add_argument("--num-workers", type=int, default=4) + parser.add_argument( + "--no-response-format", + action="store_true", + help="Do not send the JSON schema response_format; rely on prompt-only JSON output.", + ) + parser.add_argument("--claude-thinking", action="store_true", help="Enable Claude extended thinking through LiteLLM.") + parser.add_argument( + "--claude-reasoning-effort", + default="medium", + choices=["minimal", "low", "medium", "high"], + help="Claude reasoning_effort value when --claude-thinking is set.", + ) + parser.add_argument( + "--smoke-thinking", + action="store_true", + help="Make one small API call and verify usage contains reasoning/thinking tokens.", + ) + parser.add_argument("--max-samples", type=int, default=None, help="Max failed samples per eval file.") + parser.add_argument("--task-ids", nargs="*", default=[], help="Only classify these task IDs.") + parser.add_argument("--max-screenshots", type=int, default=3, help="Number of final screenshots to include.") + 
def _find_reasoning_tokens(value: Any) -> list[tuple[str, int]]:
    """Collect numeric fields whose key mentions reasoning or thinking.

    Nested dicts and lists are walked depth-first. A field is reported when
    its key contains ``reason`` or ``think`` (case-insensitive) and its value
    is a real number.

    Args:
        value: Arbitrary nested structure, e.g. a usage or response payload.

    Returns:
        ``(path, count)`` pairs in traversal order. ``path`` joins dict keys
        with ``.`` and marks list positions as ``[idx]``; ``count`` is the
        value truncated to ``int``.
    """
    found: list[tuple[str, int]] = []

    def walk(node: Any, path: str) -> None:
        if isinstance(node, dict):
            for key, child in node.items():
                key_path = f"{path}.{key}" if path else str(key)
                lowered = str(key).lower()
                is_number = isinstance(child, (int, float)) and not isinstance(child, bool)
                # bool is a subclass of int: a flag such as "thinking": true
                # must not be reported as one reasoning token.
                if is_number and ("reason" in lowered or "think" in lowered):
                    try:
                        found.append((key_path, int(child)))
                    except (OverflowError, ValueError):
                        # NaN / infinity cannot be converted to a count.
                        pass
                walk(child, key_path)
        elif isinstance(node, list):
            for idx, child in enumerate(node):
                walk(child, f"{path}[{idx}]")

    walk(value, "")
    return found
def main() -> int:
    """Run the failure-taxonomy judge CLI.

    Configures logging, loads ``.env`` from the repository root, resolves the
    eval files, builds the judge model (unless this is a dry run) and either
    performs the thinking smoke test or classifies every eval file.

    Returns:
        Process exit code: ``0`` on success; ``1`` when the options are
        inconsistent, no eval file is found, the smoke test fails, or at
        least one task could not be classified.
    """
    logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
    _load_env_file(REPO_ROOT / ".env")
    args = parse_args()

    # A dry run never builds a model, while the smoke test needs one to make
    # its API call. Reject the combination up front instead of failing later
    # with an AttributeError on ``None``.
    if args.smoke_thinking and args.dry_run:
        LOGGER.error("--smoke-thinking makes a real API call and cannot be combined with --dry-run.")
        return 1

    eval_files = _find_eval_files(args)
    # Store the resolved list back on args: output-path selection checks
    # ``len(args.eval_files)``.
    args.eval_files = [str(path) for path in eval_files]
    if not eval_files:
        LOGGER.error("No eval files found.")
        return 1
    if args.output and len(eval_files) != 1:
        LOGGER.error("--output is only valid when classifying exactly one eval file.")
        return 1

    system_prompt = PROMPT_PATH.read_text(encoding="utf-8")
    LOGGER.info("Taxonomy prompt: %s", PROMPT_PATH)
    LOGGER.info("Eval files: %s", len(eval_files))

    model = None
    if not args.dry_run:
        try:
            model = load_evaluation_model(args.model, args.api_key, args.base_url, temperature=args.temperature)
        except ImportError as exc:
            LOGGER.warning("OpenAI SDK wrapper unavailable (%s); using stdlib HTTP fallback.", exc)
            model = SimpleChatModel(
                args.model,
                args.api_key,
                args.base_url,
                insecure=args.insecure,
                claude_thinking=args.claude_thinking,
                reasoning_effort=args.claude_reasoning_effort,
            )

    if args.smoke_thinking:
        return _smoke_thinking(model, args)

    failed_total = 0
    for eval_file in eval_files:
        failed_total += _process_eval_file(eval_file, system_prompt, model, args)

    return 1 if failed_total else 0
+GROUP_COLORS = { + "Task Reasoning": "#355F9F", + "Action Execution": "#C6922E", + "Web Constraints": "#3E8582", +} +SUB_COLORS = { + "M1.1": "#254B8D", + "M1.2": "#6F8FC7", + "M1.3": "#B8C7E5", + "M2.1": "#B57E19", + "M2.2": "#D4A246", + "M2.3": "#E9C981", + "M3.1": "#2F7774", + "M3.2": "#72AAA7", + "M3.3": "#B5D6D3", +} + +DONUT_MODELS = { + "bu-2-0", + "MiniMax-M3", + "dmx-claude-opus-4-8-thinking", + "qwen3.7-max", + "gemini-3.1-pro-preview", + "gemini-3.5-flash", + "kimi-k2.6", + "glm-5.1", + "doubao-seed-2-0-pro", + "gpt-5.5", +} +COMPARE_MODELS = [ + ("gpt-5.5", "GPT-5.5", 1.0, None, "#777777"), + ("doubao-seed-2-0-pro", "Doubao 2.0 Pro", 0.62, "////", "#aaaaaa"), + ("doubao-seed-2-1-pro-260628", "Doubao 2.1 Pro", 0.38, "\\\\\\\\", "#c8c8c8"), +] + + +def load_primary_counts() -> tuple[dict[str, Counter], Counter]: + by_model: dict[str, Counter] = {} + total = Counter() + for path in sorted(EXPERIMENT_ROOT.glob(f"*/*/tasks_eval_result/{TAXONOMY_FILE}")): + model = path.relative_to(EXPERIMENT_ROOT).parts[0] + counts = Counter() + with path.open(encoding="utf-8") as f: + for line in f: + if not line.strip(): + continue + row = json.loads(line) + code = row["taxonomy"]["primary_code"] + if code == "OTHER": + continue + counts[code] += 1 + if model in DONUT_MODELS: + total[code] += 1 + by_model[model] = counts + return by_model, total + + +def autopct_values(values: list[int]) -> list[str]: + total = sum(values) + return [f"{value / total * 100:.1f}%" if value else "" for value in values] + + +def add_outer_labels(ax: plt.Axes, wedges, values: list[int], total: int) -> None: + for wedge, code, value in zip(wedges, CODES, values): + theta = math.radians((wedge.theta1 + wedge.theta2) / 2.0) + x, y = math.cos(theta), math.sin(theta) + pct = value / total * 100 + label = f"{code} {LABELS[code]}\n{pct:.1f}%" + ha = "left" if x >= 0 else "right" + xy = (0.98 * x, 0.98 * y) + xytext = (1.35 * x, 1.35 * y) + ax.annotate( + label, + xy=xy, + xytext=xytext, + ha=ha, + 
def plot_donut(ax: plt.Axes, total_counts: Counter) -> None:
    """Draw the two-ring failure taxonomy donut on ``ax``.

    The outer ring has one wedge per code in ``CODES``; the inner ring has one
    wedge per group in ``GROUPS``. Both rings start at 90 degrees and run
    clockwise, so each group wedge sits under its own sub-codes.

    Args:
        ax: Target axes.
        total_counts: Primary-code counts, indexed by code.

    NOTE(review): the label maths below divides by the overall total; an
    all-zero ``total_counts`` is not guarded against in this function.
    """
    sub_values = [total_counts[code] for code in CODES]
    group_names = list(GROUPS)
    group_values = [sum(total_counts[code] for code in GROUPS[group]) for group in group_names]
    total = sum(sub_values)

    # Outer ring: per-code wedges. The wedge artists are kept so the outer
    # labels can be anchored to them.
    wedges, _ = ax.pie(
        sub_values,
        radius=1.0,
        startangle=90,
        counterclock=False,
        colors=[SUB_COLORS[code] for code in CODES],
        wedgeprops=dict(width=0.27, edgecolor="white", linewidth=1.0),
    )
    # Inner ring: per-group wedges drawn with the same start angle/direction.
    ax.pie(
        group_values,
        radius=0.72,
        startangle=90,
        counterclock=False,
        colors=[GROUP_COLORS[group] for group in group_names],
        wedgeprops=dict(width=0.34, edgecolor="white", linewidth=1.0),
    )
    # White disc in the middle of the donut.
    ax.add_artist(plt.Circle((0, 0), 0.34, color="white", zorder=10))

    # Inner labels: place each group's name and share at the angular midpoint
    # of its wedge, measured clockwise from the 90-degree start angle.
    cumulative = 0
    for group, value in zip(group_names, group_values):
        angle = 90 - (cumulative + value / 2) / total * 360
        cumulative += value
        theta = math.radians(angle)
        ax.text(
            0.58 * math.cos(theta),
            0.58 * math.sin(theta),
            f"{group}\n{value / total * 100:.1f}%",
            ha="center",
            va="center",
            fontsize=8.8,
            color="white",
            fontweight="bold",
            linespacing=0.95,
        )

    add_outer_labels(ax, wedges, sub_values, total)
    ax.set(aspect="equal")
    ax.set_title(
        "All Models",
        fontsize=14,
        fontweight="bold",
        color="#1C2B33",
        pad=18,
    )
    # Caption below the donut with the number of counted trajectories.
    ax.text(
        0,
        -1.28,
        f"{total} failed trajectories",
        ha="center",
        va="center",
        fontsize=9.5,
        color="#555555",
    )
+ y = np.arange(len(CODES)) + height = 0.22 + offsets = np.linspace(-height, height, len(series)) + colors = [SUB_COLORS[code] for code in CODES] + + for offset, (label, total, values, alpha, hatch, _) in zip(offsets, series): + ax.barh( + y + offset, + values, + height=height, + color=colors, + alpha=alpha, + edgecolor="white", + linewidth=0.9, + hatch=hatch, + label=f"{label} ({total}/210 failed)", + ) + + for offset, (_, _, values, alpha, _, _) in zip(offsets, series): + text_color = "#222222" if alpha > 0.9 else "#555555" + for i, value in enumerate(values): + if value: + ax.text(value + 0.45, i + offset, f"{int(value)}", va="center", ha="left", fontsize=8.1, color=text_color) + + ax.set_yticks(y) + ax.set_yticklabels([f"{code}" for code in CODES], fontsize=10, fontweight="bold") + ax.invert_yaxis() + ax.set_xlim(0, 43) + ax.set_xlabel("Primary attribution count", fontsize=10.5, color="#333333") + ax.set_title("GPT-5.5 vs. Doubao 2.0/2.1", fontsize=14, fontweight="bold", color="#1C2B33", pad=18) + ax.grid(axis="x", linestyle="--", linewidth=0.7, color="#d7dbe2", alpha=0.9) + ax.set_axisbelow(True) + + for spine in ["top", "right", "left"]: + ax.spines[spine].set_visible(False) + ax.spines["bottom"].set_color("#d0d5dd") + ax.tick_params(axis="x", colors="#333333", labelsize=9) + ax.tick_params(axis="y", length=0) + + # Group labels on the left, similar to CocoaBench. 
+ group_spans = { + "Task\nReasoning": (0, 2), + "Action\nExecution": (3, 5), + "Web\nConstraints": (6, 8), + } + for text, (lo, hi) in group_spans.items(): + mid = (lo + hi) / 2 + group_name = text.replace("\n", " ") + if "Task" in group_name: + color = GROUP_COLORS["Task Reasoning"] + elif "Action" in group_name: + color = GROUP_COLORS["Action Execution"] + else: + color = GROUP_COLORS["Web Constraints"] + ax.text( + -0.205, + mid, + text, + transform=ax.get_yaxis_transform(), + ha="right", + va="center", + rotation=90, + fontsize=12, + fontweight="bold", + color=color, + linespacing=0.9, + clip_on=False, + ) + + legend_handles = [ + Patch( + facecolor=legend_color, + edgecolor="white", + hatch=hatch, + label=f"{label} ({total}/210 failed)", + ) + for label, total, _, _, hatch, legend_color in series + ] + ax.legend( + handles=legend_handles, + loc="lower right", + frameon=True, + framealpha=0.95, + facecolor="white", + edgecolor="#dddddd", + fontsize=9.0, + ) + + +def main() -> None: + mpl.rcParams.update( + { + "font.family": "DejaVu Serif", + "axes.titlesize": 14, + "pdf.fonttype": 42, + "ps.fonttype": 42, + } + ) + + by_model, total_counts = load_primary_counts() + PAPER_FIG_DIR.mkdir(parents=True, exist_ok=True) + + fig = plt.figure(figsize=(14.5, 6.0), dpi=220) + gs = fig.add_gridspec(1, 2, width_ratios=[1.08, 1.28], wspace=0.40) + ax0 = fig.add_subplot(gs[0, 0]) + ax1 = fig.add_subplot(gs[0, 1]) + + plot_donut(ax0, total_counts) + plot_model_bars(ax1, by_model) + + fig.subplots_adjust(left=0.035, right=0.99, top=0.92, bottom=0.10) + + pdf_path = PAPER_FIG_DIR / "failure_taxonomy_doubao_comparison.pdf" + png_path = PAPER_FIG_DIR / "failure_taxonomy_doubao_comparison.png" + fig.savefig(pdf_path, bbox_inches="tight") + fig.savefig(png_path, bbox_inches="tight") + print(pdf_path) + print(png_path) + + +if __name__ == "__main__": + main() diff --git a/scripts/plot_generation_failure_comparison.py b/scripts/plot_generation_failure_comparison.py new file mode 
def load_counts(key: str) -> tuple[Counter[str], int]:
    """Read the primary-code counts from one run's taxonomy summary.

    Args:
        key: Entry of ``MODEL_DIRS`` naming the ``model/timestamp`` directory.

    Returns:
        ``(counts, total)`` where ``counts`` is the summary's
        ``primary_code_counts`` without the ``OTHER`` bucket and ``total`` is
        the sum of the remaining counts.
    """
    summary_file = EXPERIMENT_ROOT / MODEL_DIRS[key] / "tasks_eval_result" / SUMMARY_FILE
    payload = json.loads(summary_file.read_text(encoding="utf-8"))
    tally = Counter(payload["primary_code_counts"])
    # Uncategorised failures are left out of the per-code comparison.
    if "OTHER" in tally:
        del tally["OTHER"]
    attributed = sum(tally.values())
    return tally, attributed
np.linspace(-height / 1.7, height / 1.7, len(series)) + colors = [SUB_COLORS[code] for code in CODES] + + for offset, (label, counts, total, hatch, alpha) in zip(offsets, series): + values = np.array([counts.get(code, 0) for code in CODES]) + ax.barh( + y + offset, + values, + height=height, + color=colors, + alpha=alpha, + edgecolor="white", + linewidth=0.9, + hatch=hatch, + label=f"{label} ({total}/210 attributed)", + ) + for i, value in enumerate(values): + if value: + ax.text( + value + 0.45, + i + offset, + f"{int(value)}", + va="center", + ha="left", + fontsize=8.8, + color="#222222" if alpha > 0.85 else "#555555", + ) + + ax.set_yticks(y) + ax.set_yticklabels(CODES, fontsize=10.5, fontweight="bold") + ax.invert_yaxis() + ax.set_xlim(0, 43) + ax.set_xlabel("Primary attribution count", fontsize=10.5, color="#333333") + ax.set_title(title, fontsize=14, fontweight="bold", color="#1C2B33", pad=14) + ax.grid(axis="x", linestyle="--", linewidth=0.7, color="#d7dbe2", alpha=0.9) + ax.set_axisbelow(True) + + for spine in ["top", "right", "left"]: + ax.spines[spine].set_visible(False) + ax.spines["bottom"].set_color("#d0d5dd") + ax.tick_params(axis="x", colors="#333333", labelsize=9) + ax.tick_params(axis="y", length=0) + + group_spans = { + "Task\nReasoning": (0, 2), + "Action\nExecution": (3, 5), + "Web\nConstraints": (6, 8), + } + for text, (lo, hi) in group_spans.items(): + mid = (lo + hi) / 2 + if "Task" in text: + color = GROUP_COLORS["Task Reasoning"] + elif "Action" in text: + color = GROUP_COLORS["Action Execution"] + else: + color = GROUP_COLORS["Web Constraints"] + ax.text( + -0.14, + mid, + text, + transform=ax.get_yaxis_transform(), + ha="right", + va="center", + rotation=90, + fontsize=11.5, + fontweight="bold", + color=color, + linespacing=0.9, + clip_on=False, + ) + + handles = [ + Patch(facecolor="#777777", edgecolor="white", hatch=hatch, label=f"{label} ({total}/210 attributed)") + for label, _, total, hatch, _ in series + ] + ax.legend( + 
handles=handles, + loc="lower right", + frameon=True, + framealpha=0.95, + facecolor="white", + edgecolor="#dddddd", + fontsize=9.0, + ) + + +def main() -> None: + mpl.rcParams.update( + { + "font.family": "DejaVu Serif", + "axes.titlesize": 14, + "pdf.fonttype": 42, + "ps.fonttype": 42, + } + ) + + doubao20, doubao20_total = load_counts("doubao20") + doubao21, doubao21_total = load_counts("doubao21") + glm51, glm51_total = load_counts("glm51") + glm52, glm52_total = load_counts("glm52") + + REPORT_DIR.mkdir(parents=True, exist_ok=True) + fig, axes = plt.subplots(1, 2, figsize=(14.0, 5.9), dpi=220, sharey=False) + plot_panel( + axes[0], + "Doubao 2.0 Pro vs. Doubao 2.1 Pro", + [ + ("Doubao 2.0 Pro", doubao20, doubao20_total, None, 0.96), + ("Doubao 2.1 Pro", doubao21, doubao21_total, "////", 0.62), + ], + ) + plot_panel( + axes[1], + "GLM-5.1 vs. GLM-5.2", + [ + ("GLM-5.1", glm51, glm51_total, None, 0.96), + ("GLM-5.2", glm52, glm52_total, "////", 0.62), + ], + ) + fig.subplots_adjust(left=0.075, right=0.99, top=0.90, bottom=0.12, wspace=0.18) + + pdf_path = REPORT_DIR / "generation_failure_comparison.pdf" + png_path = REPORT_DIR / "generation_failure_comparison.png" + fig.savefig(pdf_path, bbox_inches="tight") + fig.savefig(png_path, bbox_inches="tight") + print(pdf_path) + print(png_path) + + +if __name__ == "__main__": + main() From a5859105be4a6aff20df05e0f4c14e521f583210 Mon Sep 17 00:00:00 2001 From: XuweiDing Date: Thu, 25 Jun 2026 13:16:34 +0800 Subject: [PATCH 2/2] Optimize rerun workflow with hard pre-check --- README.md | 21 +++- README_ZH.md | 20 +++- browseruse_bench/cli/eval.py | 30 ++++++ browseruse_bench/cli/run_eval.py | 1 + browseruse_bench/eval/base.py | 22 ++++- .../eval/lexbench_browser/evaluator.py | 6 ++ docs/lexbench-automated-evaluation-system.md | 99 ++++++++++++------- docs/rerun-rule-validation-12-models.md | 26 ++--- docs/result-rerun-check-rules.md | 75 +++++++++----- scripts/collect_lexbench_rerun_candidates.py | 8 +- 
scripts/judge_lexbench_failure_taxonomy.py | 18 ++++ 11 files changed, 243 insertions(+), 83 deletions(-) diff --git a/README.md b/README.md index 5c29ced..603433c 100644 --- a/README.md +++ b/README.md @@ -206,19 +206,30 @@ bubench eval --agent browser-use --data LexBench-Browser --model-id gpt-4.1 For LexBench-Browser result analysis, use the automated post-run workflow: ```text -run benchmark -> eval -> failure attribution -> post-attribution rerun check +run benchmark -> hard artifact pre-check -> eval remaining results +-> failure attribution excluding hard-hit tasks -> post-attribution rerun check -> rerun selected tasks -> re-eval -> final attribution / visualization ``` The final rerun candidate set is: ```text -result_json_hard -∪ latest_agent_run_log_hard -∪ taxonomy_primary_M3.2_or_M3.3 +hard_artifact_rerun +∪ taxonomy_primary_M3.2_or_M3.3_on_non_hard_tasks ``` -Generate rerun task ids after evaluation and failure attribution: +First collect deterministic hard failures, which can be excluded from judge calls: + +```bash +PYTHONPATH=. python scripts/collect_lexbench_rerun_candidates.py \ + --model MODEL_DIR_NAME \ + --timestamp TIMESTAMP \ + --artifact-mode hard \ + --out-dir experiments/LexBench-Browser/All/browser-use/MODEL_DIR_NAME/TIMESTAMP/rerun_candidates_hard +``` + +Then run eval / failure attribution on non-hard tasks and generate the final +rerun task ids: ```bash PYTHONPATH=. 
python scripts/collect_lexbench_rerun_candidates.py \ diff --git a/README_ZH.md b/README_ZH.md index 354ad10..2b191c9 100644 --- a/README_ZH.md +++ b/README_ZH.md @@ -205,19 +205,29 @@ bubench eval --agent browser-use --data LexBench-Browser --model-id gpt-4.1 LexBench-Browser 结果分析推荐使用这套自动化 post-run 流程: ```text -run benchmark -> eval -> failure attribution -> post-attribution rerun check +run benchmark -> hard artifact pre-check -> eval remaining results +-> failure attribution excluding hard-hit tasks -> post-attribution rerun check -> rerun selected tasks -> re-eval -> final attribution / visualization ``` 最终 rerun candidate 集合是: ```text -result_json_hard -∪ latest_agent_run_log_hard -∪ taxonomy_primary_M3.2_or_M3.3 +hard_artifact_rerun +∪ taxonomy_primary_M3.2_or_M3.3_on_non_hard_tasks ``` -在 eval 和 failure attribution 之后生成 rerun task ids: +先收集确定性的 hard failures,这部分可以直接进入 rerun,并从 judge 调用中排除: + +```bash +PYTHONPATH=. python scripts/collect_lexbench_rerun_candidates.py \ + --model MODEL_DIR_NAME \ + --timestamp TIMESTAMP \ + --artifact-mode hard \ + --out-dir experiments/LexBench-Browser/All/browser-use/MODEL_DIR_NAME/TIMESTAMP/rerun_candidates_hard +``` + +然后对 non-hard tasks 跑 eval / failure attribution,再生成最终 rerun task ids: ```bash PYTHONPATH=. 
python scripts/collect_lexbench_rerun_candidates.py \
diff --git a/browseruse_bench/cli/eval.py b/browseruse_bench/cli/eval.py
index 360df25..430f105 100644
--- a/browseruse_bench/cli/eval.py
+++ b/browseruse_bench/cli/eval.py
@@ -229,6 +229,20 @@ def _parse_extra_args(extra_args: list[str]) -> dict[str, Any]:
     return extra
 
 
+def _read_task_ids_file(path: Path | None) -> list[str]:
+    """Read whitespace-separated task IDs from ``path``.
+
+    Returns an empty list when ``path`` is ``None``. Exits with a ``[FAILED]``
+    message unless ``path`` is an existing regular file, so a directory passed
+    by mistake fails cleanly instead of raising from ``read_text``.
+    """
+    if path is None:
+        return []
+    if not path.is_file():
+        raise SystemExit(f"[FAILED] Task id file does not exist or is not a file: {path}")
+    return path.read_text(encoding="utf-8").split()
+
+
 def run_evaluation(
     agent_name: str,
     benchmark_name: str,
@@ -303,6 +317,14 @@ def run_evaluation(
         "force_download": bool(getattr(args, "force_download", False)),
     }
     extra.update(_parse_extra_args(extra_args))
+    task_ids = [str(task_id) for task_id in (getattr(args, "task_ids", None) or [])]
+    task_ids.extend(_read_task_ids_file(getattr(args, "task_ids_file", None)))
+    exclude_task_ids = [str(task_id) for task_id in (getattr(args, "exclude_task_ids", None) or [])]
+    exclude_task_ids.extend(_read_task_ids_file(getattr(args, "exclude_task_ids_file", None)))
+    if task_ids:
+        extra["task_ids"] = task_ids
+    if exclude_task_ids:
+        extra["exclude_task_ids"] = exclude_task_ids
 
     if max_tokens is not None:
         extra["max_tokens"] = max_tokens
@@ -425,6 +447,20 @@ def configure_eval_parser(parser: argparse.ArgumentParser, config: dict[str, Any
         action="store_true",
         help="Rerun evaluation (default reuses existing results, only runs failure classification)",
     )
+    parser.add_argument("--task-ids", nargs="*", default=[], help="Only evaluate these task IDs.")
+    parser.add_argument(
+        "--task-ids-file",
+        type=Path,
+        default=None,
+        help="Whitespace-separated task IDs to evaluate.",
+    )
+    parser.add_argument("--exclude-task-ids", nargs="*", default=[], help="Do not evaluate these task IDs.")
+    parser.add_argument(
+        "--exclude-task-ids-file",
+        type=Path,
+        default=None,
+        help="Whitespace-separated task IDs to skip during evaluation.",
+
) parser.add_argument( "--agent-config", type=Path, diff --git a/browseruse_bench/cli/run_eval.py b/browseruse_bench/cli/run_eval.py index 1f59e73..aac5bcf 100644 --- a/browseruse_bench/cli/run_eval.py +++ b/browseruse_bench/cli/run_eval.py @@ -117,6 +117,7 @@ def _report_output_dir(report_file: str, run_dir: Path) -> None: # stay on both.) _EVAL_ONLY_VALUE_FLAGS = { "--score-threshold", "--num-worker", "--api-key", "--base-url", "--eval-strategy", + "--task-ids-file", "--exclude-task-ids-file", } _EVAL_ONLY_BOOL_FLAGS = {"--force-reeval"} diff --git a/browseruse_bench/eval/base.py b/browseruse_bench/eval/base.py index 5e92ef3..2326873 100644 --- a/browseruse_bench/eval/base.py +++ b/browseruse_bench/eval/base.py @@ -122,11 +122,21 @@ def run(self) -> int: tasks = self.load_tasks() completed = self.list_completed_tasks() already = self._resume_skip_set() + include_task_ids = self._task_id_filter("task_ids") + exclude_task_ids = self._task_id_filter("exclude_task_ids") pending = [ p.name for p in completed if p.name not in already and p.name in tasks + and (not include_task_ids or p.name in include_task_ids) + and p.name not in exclude_task_ids ] - logger.info("Evaluating %d tasks (skip %d already done)", len(pending), len(already)) + logger.info( + "Evaluating %d tasks (skip %d already done, include_filter=%d, exclude_filter=%d)", + len(pending), + len(already), + len(include_task_ids), + len(exclude_task_ids), + ) for result in self._run_iteration(pending, tasks): self._append_result(result) # Hook runs before summary so subclasses (e.g. 
LexBench coverage backfill)
@@ -138,6 +148,22 @@ def run(self) -> int:
         self._generate_summary(records)
         return 0
 
+    def _task_id_filter(self, key: str) -> Set[str]:
+        """Return ``self.args.extra[key]`` normalized to a set of task-id strings.
+
+        A missing or ``None`` value gives an empty set, a string is split on
+        whitespace, and a list/tuple/set is stringified item by item with empty
+        strings dropped. Any other value becomes a single stringified entry.
+        """
+        raw = self.args.extra.get(key)
+        if raw is None:
+            return set()
+        if isinstance(raw, str):
+            return {item for item in raw.split() if item}
+        if isinstance(raw, (list, tuple, set)):
+            return {str(item) for item in raw if str(item)}
+        return {str(raw)}
+
     def _load_all_records(self) -> List[Dict[str, Any]]:
         """Load every record currently appended to the results JSONL on disk."""
         path = self.results_path()
diff --git a/browseruse_bench/eval/lexbench_browser/evaluator.py b/browseruse_bench/eval/lexbench_browser/evaluator.py
index f41f665..7f71c57 100644
--- a/browseruse_bench/eval/lexbench_browser/evaluator.py
+++ b/browseruse_bench/eval/lexbench_browser/evaluator.py
@@ -568,6 +568,12 @@ def post_eval_hook(self, records: list[dict[str, Any]]) -> None:
             return
         attempted_ids = {d.name for d in self.args.trajectories_dir.iterdir() if d.is_dir()}
         expected = [tid for tid in self._expected_task_ids if tid in attempted_ids]
+        include_task_ids = self._task_id_filter("task_ids")
+        exclude_task_ids = self._task_id_filter("exclude_task_ids")
+        if include_task_ids:
+            expected = [tid for tid in expected if tid in include_task_ids]
+        if exclude_task_ids:
+            expected = [tid for tid in expected if tid not in exclude_task_ids]
         if not expected:
             return
         records_by_task_id = {
diff --git a/docs/lexbench-automated-evaluation-system.md b/docs/lexbench-automated-evaluation-system.md
index cbac5a3..3fcf3ad 100644
--- a/docs/lexbench-automated-evaluation-system.md
+++ b/docs/lexbench-automated-evaluation-system.md
@@ -5,17 +5,19 @@ automation pipeline:
 
 ```text
 run benchmark
-→ eval
-→ failure attribution
+→ hard artifact pre-check
+→ eval non-hard tasks
+→ failure attribution on non-hard failures
 → post-attribution rerun check
 → rerun selected tasks
 → re-eval
 → final failure attribution / visualization
 ```
 
-The final rerun check is
intentionally **post-attribution**. Artifact-only -signals are still useful as hard infrastructure checks, but they are not enough -to get high M3.2/M3.3 recall with bounded false positives. +The workflow intentionally runs deterministic hard-artifact rules before judge +calls. Hard-hit tasks go straight into the rerun set and can be excluded from +eval/failure-attribution to save judge tokens. Attribution is then used only to +catch non-hard failures whose primary cause is M3.2/M3.3. ## File Map @@ -61,9 +63,35 @@ experiments/LexBench-Browser/All/browser-use/MODEL/TIMESTAMP/tasks/*/api_logs/ output/logs/run/*.log ``` -## Stage 2: Evaluate Run Results +## Stage 2: Hard Artifact Pre-Check -Run the normal LexBench-Browser evaluator: +Run deterministic hard rules before any judge call: + +```zsh +PYTHONPATH=. python scripts/collect_lexbench_rerun_candidates.py \ + --model MODEL_DIR_NAME \ + --timestamp TIMESTAMP \ + --artifact-mode hard \ + --out-dir experiments/LexBench-Browser/All/browser-use/MODEL_DIR_NAME/TIMESTAMP/rerun_candidates_hard +``` + +This collects only: + +```text +result_json_hard +∪ latest_agent_run_log_hard +``` + +These tasks are definite run/infrastructure failures and do not need eval or +failure attribution before rerun. Their ids are written to: + +```text +experiments/LexBench-Browser/All/browser-use/MODEL/TIMESTAMP/rerun_candidates_hard/rerun_task_ids.txt +``` + +## Stage 3: Evaluate Non-Hard Tasks + +Run the normal LexBench-Browser evaluator, excluding hard-hit tasks: ```zsh PYTHONPATH=. ./.venvs/browser_use/bin/python scripts/eval.py \ @@ -71,7 +99,8 @@ PYTHONPATH=. 
./.venvs/browser_use/bin/python scripts/eval.py \ --split All \ --agent browser-use \ --model MODEL_CONFIG_KEY \ - --timestamp TIMESTAMP + --timestamp TIMESTAMP \ + --exclude-task-ids-file experiments/LexBench-Browser/All/browser-use/MODEL_DIR_NAME/TIMESTAMP/rerun_candidates_hard/rerun_task_ids.txt ``` The expected eval output is: @@ -81,10 +110,11 @@ experiments/LexBench-Browser/All/browser-use/MODEL/TIMESTAMP/tasks_eval_result/ task_gpt-4.1_per_task_threshold_stepwise_eval_results.json ``` -## Stage 3: Failure Attribution +## Stage 4: Failure Attribution on Non-Hard Failures -Run failure attribution after evaluation. This classifies evaluator-failed tasks -into the M1/M2/M3 taxonomy. +Run failure attribution after evaluation, again excluding hard-hit tasks. This +keeps judge tokens focused on failures that are not already deterministic +reruns. Prompt: @@ -101,6 +131,7 @@ PYTHONPATH=. python scripts/judge_lexbench_failure_taxonomy.py \ --eval-filename task_gpt-4.1_per_task_threshold_stepwise_eval_results.json \ --model gpt-5.5-judge \ --include-judge-in-output \ + --exclude-task-ids-file experiments/LexBench-Browser/All/browser-use/MODEL_DIR_NAME/TIMESTAMP/rerun_candidates_hard/rerun_task_ids.txt \ --num-workers 4 ``` @@ -112,9 +143,9 @@ tasks_eval_result/ task_gpt-4.1_per_task_threshold_stepwise_failure_taxonomy_gpt-5.5-judge_summary.json ``` -## Stage 4: Post-Attribution Rerun Check +## Stage 5: Post-Attribution Rerun Check -This is the recommended final rerun/review pool for reducing M3.2/M3.3: +This is the recommended final rerun pool for reducing M3.2/M3.3: ```zsh PYTHONPATH=. python scripts/collect_lexbench_rerun_candidates.py \ @@ -127,16 +158,14 @@ PYTHONPATH=. 
python scripts/collect_lexbench_rerun_candidates.py \ The final post-attribution set is: ```text -result_json_hard -∪ latest_agent_run_log_hard -∪ taxonomy_primary_M3.2_or_M3.3 +hard_artifact_rerun +∪ taxonomy_primary_M3.2_or_M3.3_on_non_hard_tasks ``` Where: -- `result_json_hard` catches missing/invalid results, `agent_done == error`, `env_status == failed`, early `max_steps`, and suspicious early `timeout`. -- `latest_agent_run_log_hard` catches `Stopping due to 5 consecutive failures`, `Result failed 6/6 times: LLM call timed out`, and `ERR_TUNNEL_CONNECTION_FAILED` from the latest matching agent execution log. -- `taxonomy_primary_M3.2_or_M3.3` catches attribution primary-code `M3.2 Access Barrier` or `M3.3 Site Limitation`. +- `hard_artifact_rerun` is the Stage 2 set: result-json hard failures plus latest run-log hard failures. +- `taxonomy_primary_M3.2_or_M3.3_on_non_hard_tasks` catches attribution primary-code `M3.2 Access Barrier` or `M3.3 Site Limitation` among the remaining evaluated failures. Outputs are written to: @@ -158,7 +187,7 @@ total candidates: 219 false positives vs primary M3.2/M3.3: 48 ``` -There is also a provisional artifact-only mode for debugging before attribution +There is also a broader artifact-only mode for debugging before attribution exists: ```zsh @@ -185,7 +214,7 @@ even broader debugging pool, add: --include-protocol-only ``` -## Stage 5: Rerun Candidates +## Stage 6: Rerun Candidates Read task ids from: @@ -211,7 +240,7 @@ PYTHONPATH=. ./.venvs/browser_use/bin/python scripts/run.py \ Do not use `--skip-completed` for this rerun. These tasks are intentionally being overwritten/retested. 
-## Stage 6: Re-Evaluate Rerun Results +## Stage 7: Re-Evaluate Rerun Results After rerun, run the evaluator again: @@ -231,7 +260,7 @@ experiments/LexBench-Browser/All/browser-use/MODEL/TIMESTAMP/tasks_eval_result/ task_gpt-4.1_per_task_threshold_stepwise_eval_results.json ``` -## Stage 7: Re-Run Failure Attribution +## Stage 8: Re-Run Failure Attribution After rerun and re-eval, run failure attribution again so final analysis uses the latest task outcomes. @@ -265,7 +294,7 @@ tasks_eval_result/ The taxonomy output is both part of the post-attribution rerun check and the input to model capability analysis. -## Stage 8: Validate Rerun Rule Recall +## Stage 9: Validate Rerun Rule Recall Use taxonomy output to measure whether the post-attribution rerun rule covers M3.2/M3.3 while keeping false positives bounded. The current validation record is: @@ -303,7 +332,7 @@ experiments/LexBench-Browser/All/browser-use/failure_taxonomy_review/ This audit is for diagnosis and rule validation. It is not the final rerun selection rule. -## Stage 9: Visualize Failure Attribution +## Stage 10: Visualize Failure Attribution Main failure taxonomy figure: @@ -340,17 +369,19 @@ reports/assets/ ```text 1. Run benchmark tasks. -2. Evaluate task results. -3. Run failure attribution on evaluator-failed tasks. -4. Run post-attribution rerun check. -5. Rerun selected candidates. -6. Re-evaluate rerun results. -7. Re-run failure attribution on final failures. -8. Generate taxonomy figures/reports. -9. Optionally cross-check rerun-rule recall against M3.2/M3.3. +2. Run hard artifact pre-check. +3. Evaluate non-hard task results. +4. Run failure attribution on non-hard evaluator-failed tasks. +5. Run post-attribution rerun check. +6. Rerun selected candidates. +7. Re-evaluate rerun results. +8. Re-run failure attribution on final failures. +9. Generate taxonomy figures/reports. +10. Optionally cross-check rerun-rule recall against M3.2/M3.3. 
``` Keep these two concepts separate: -- **Post-attribution rerun check** answers: "Which tasks should be rerun to reduce M3.2/M3.3 and hard run-artifact failures?" +- **Hard artifact pre-check** answers: "Which tasks are deterministic infrastructure/run failures and can skip judge calls?" +- **Post-attribution rerun check** answers: "Which additional non-hard tasks should be rerun to reduce M3.2/M3.3?" - **Failure attribution** answers: "For the evaluated failed trajectory, what capability or web-constraint category best explains the failure?" diff --git a/docs/rerun-rule-validation-12-models.md b/docs/rerun-rule-validation-12-models.md index 2e7d211..c9ed7c7 100644 --- a/docs/rerun-rule-validation-12-models.md +++ b/docs/rerun-rule-validation-12-models.md @@ -65,14 +65,13 @@ false positives vs M3.2/M3.3: 99 This is acceptable as a pre-attribution artifact scan, but not enough for the final high-recall rerun pool. -### Final Post-Attribution Rule +### Final Token-Efficient Rule Definition: ```text -result_json_hard -∪ latest_agent_run_log_hard -∪ taxonomy_primary_M3.2_or_M3.3 +hard_artifact_rerun +∪ taxonomy_primary_M3.2_or_M3.3_on_non_hard_tasks ``` Command: @@ -129,14 +128,19 @@ such as consecutive failures, early max-steps, tunnel errors, or LLM timeout 6/6 ## Final Recommendation -Use two phases: +Use hard pre-check first, then attribution only for the remaining ambiguous +failures: + +1. **Hard artifact pre-check**: catch deterministic infrastructure failures and + send them directly to rerun without judge calls. +2. **Eval/failure attribution on non-hard tasks**: classify only the remaining + failed tasks. +3. **Post-attribution rerun check**: add non-hard tasks whose attribution + primary code is M3.2 or M3.3. -1. **Before failure attribution**: run artifact-only scanner as a provisional - rerun detector. It catches deterministic infrastructure failures without - needing judge outputs. -2. 
**After failure attribution**: use the final post-attribution rule above. - This is the rule that satisfies high M3.2/M3.3 recall while keeping false - positives bounded. +This is the rule that satisfies high M3.2/M3.3 recall while keeping false +positives bounded and avoiding unnecessary attribution tokens for deterministic +hard failures. Do not use broad api-log render/session evidence as a default hard rerun rule. It should remain constrained or optional because transient empty DOM/loading diff --git a/docs/result-rerun-check-rules.md b/docs/result-rerun-check-rules.md index 7dd98f1..e5d1507 100644 --- a/docs/result-rerun-check-rules.md +++ b/docs/result-rerun-check-rules.md @@ -197,41 +197,59 @@ Older M3.3 audit reports may contain `manual_review`. For validation-only M3.3 recall studies, merge those rows into `rerun_candidate`. For the final rerun set, use the post-attribution rule below instead. -Use the independent rerun scanner to collect final rerun ids for a target run: +Use the rerun scanner in hard mode first to collect deterministic rerun ids for +a target run: ```zsh PYTHONPATH=. python scripts/collect_lexbench_rerun_candidates.py \ --root /Users/abc/Desktop/lexmount/browseruse-agent-bench/experiments/LexBench-Browser/All/browser-use \ --model MODEL \ - --timestamp TIMESTAMP + --timestamp TIMESTAMP \ + --artifact-mode hard \ + --out-dir experiments/LexBench-Browser/All/browser-use/MODEL/TIMESTAMP/rerun_candidates_hard ``` -This pre-attribution mode does not require failure-attribution results. Outputs -are written to: +This hard pre-check does not require failure-attribution results. 
Outputs are +written to: ```text -experiments/LexBench-Browser/All/browser-use/MODEL/TIMESTAMP/rerun_candidates/ +experiments/LexBench-Browser/All/browser-use/MODEL/TIMESTAMP/rerun_candidates_hard/ rerun_candidates.json rerun_candidates.csv rerun_candidates_summary.md rerun_task_ids.txt ``` -By default, this pre-attribution scanner includes result hard rules, latest -run-log hard rules, and constrained api-log access/render/session evidence. -Repeated parse or LLM-timeout-only api-log evidence can be added to a broader +These hard-hit tasks should go directly into the rerun set and can be excluded +from eval and failure attribution. + +There is also an optional strict artifact diagnostic mode: + +```zsh +PYTHONPATH=. python scripts/collect_lexbench_rerun_candidates.py \ + --root /Users/abc/Desktop/lexmount/browseruse-agent-bench/experiments/LexBench-Browser/All/browser-use \ + --model MODEL \ + --timestamp TIMESTAMP +``` + +The strict mode includes result hard rules, latest run-log hard rules, and +constrained api-log access/render/session evidence. It is useful for debugging +before attribution exists, but it is not the final default rerun rule. Repeated +parse or LLM-timeout-only api-log evidence can be added to an even broader debugging pool with: ```zsh --include-protocol-only ``` -The `api_logs` part is applied only to unsuccessful task results by default. -This avoids rerunning tasks that recovered from transient loading/empty-DOM -states and finished successfully. `result.json` hard rules and latest run-log -hard rules still take precedence even if `agent_success == true`. +In strict mode, the `api_logs` part is applied only to unsuccessful task results +by default. This avoids rerunning tasks that recovered from transient +loading/empty-DOM states and finished successfully. `result.json` hard rules and +latest run-log hard rules still take precedence even if `agent_success == true`. 
-After failure attribution is available, use the final high-recall mode: +After failure attribution is available, use the final high-recall mode. In the +token-efficient workflow, run this after the hard artifact pre-check and +attribution on the remaining non-hard failures: ```zsh PYTHONPATH=. python scripts/collect_lexbench_rerun_candidates.py \ @@ -245,9 +263,8 @@ PYTHONPATH=. python scripts/collect_lexbench_rerun_candidates.py \ This mode uses: ```text -result_json_hard -∪ latest_agent_run_log_hard -∪ taxonomy_primary_M3.2_or_M3.3 +hard_artifact_rerun +∪ taxonomy_primary_M3.2_or_M3.3_on_non_hard_tasks ``` On the 12 current model runs, this rule covered `171/171` primary M3.2/M3.3 @@ -268,19 +285,28 @@ prove that logs contain these errors, but they cannot guarantee that rerunning will fix the task or that the original attribution is wrong. This rule intentionally prioritizes recall over precision. -## Final High-Recall Rerun Set +## Final Token-Efficient Rerun Set For each `MODEL/TIMESTAMP`, the final rerun candidate set is: ```text -result_json_hard -∪ latest_agent_run_log_hard -∪ taxonomy_primary_M3.2_or_M3.3 +hard_artifact_rerun +∪ taxonomy_primary_M3.2_or_M3.3_on_non_hard_tasks +``` + +Recommended order: + +```text +1. Run hard artifact pre-check. +2. Put hard-hit tasks directly into the rerun set. +3. Exclude hard-hit tasks from eval/failure attribution when possible. +4. Run eval and failure attribution on the remaining tasks. +5. Add remaining tasks whose attribution primary_code is M3.2 or M3.3. ``` +This avoids spending judge tokens on deterministic run/infrastructure failures. When failure attribution is not available yet, use the pre-attribution scanner -output as a provisional artifact-only rerun set. When attribution is available, -prefer the final set above. +output as a provisional artifact-only rerun set. 
If asking Codex or another agent to collect ids, give this instruction: @@ -289,9 +315,8 @@ For the target MODEL/TIMESTAMP under /Users/abc/Desktop/lexmount/browseruse-agent-bench/experiments/LexBench-Browser/All/browser-use, run scripts/collect_lexbench_rerun_candidates.py with `--artifact-mode hard --include-taxonomy-web-constraints` and return the union of: -1. result.json hard rerun ids, -2. hard ids from the latest matching agent execution log under output/logs/run, -3. failure-taxonomy ids whose primary_code is M3.2 or M3.3. +1. hard artifact rerun ids from result.json and latest matching run logs, +2. failure-taxonomy ids whose primary_code is M3.2 or M3.3 among non-hard tasks. Return sorted unique task ids and the reason for each id. ``` diff --git a/scripts/collect_lexbench_rerun_candidates.py b/scripts/collect_lexbench_rerun_candidates.py index a2c9140..3c33ade 100644 --- a/scripts/collect_lexbench_rerun_candidates.py +++ b/scripts/collect_lexbench_rerun_candidates.py @@ -1,12 +1,16 @@ #!/usr/bin/env python3 """Collect high-recall LexBench-Browser rerun task candidates. -This scanner is intentionally independent of failure-taxonomy attribution. It -uses only run artifacts: +By default, this scanner uses only run artifacts: - tasks//result.json - tasks//api_logs/step_*.json - output/logs/run/*.log matching the target MODEL/TIMESTAMP output directory + +When ``--include-taxonomy-web-constraints`` is set, it also merges tasks whose +failure taxonomy primary code is M3.2 or M3.3. The recommended workflow is to +run hard artifact mode first, exclude those hard-hit tasks from judge calls, +then run this script again to union hard artifacts with non-hard taxonomy hits. 
""" from __future__ import annotations diff --git a/scripts/judge_lexbench_failure_taxonomy.py b/scripts/judge_lexbench_failure_taxonomy.py index e49f6a5..21e7ae6 100644 --- a/scripts/judge_lexbench_failure_taxonomy.py +++ b/scripts/judge_lexbench_failure_taxonomy.py @@ -697,6 +697,9 @@ def _process_eval_file(eval_file: Path, system_prompt: str, model: Any, args: ar if args.task_ids: keep_task_ids = {str(task_id) for task_id in args.task_ids} records = [row for row in records if str(row.get("task_id", "")) in keep_task_ids] + if args.exclude_task_ids: + exclude_task_ids = {str(task_id) for task_id in args.exclude_task_ids} + records = [row for row in records if str(row.get("task_id", "")) not in exclude_task_ids] if args.max_samples is not None: records = records[: args.max_samples] @@ -830,6 +833,13 @@ def parse_args() -> argparse.Namespace: ) parser.add_argument("--max-samples", type=int, default=None, help="Max failed samples per eval file.") parser.add_argument("--task-ids", nargs="*", default=[], help="Only classify these task IDs.") + parser.add_argument("--exclude-task-ids", nargs="*", default=[], help="Do not classify these task IDs.") + parser.add_argument( + "--exclude-task-ids-file", + type=Path, + default=None, + help="Whitespace-separated task IDs to exclude from classification.", + ) parser.add_argument("--max-screenshots", type=int, default=3, help="Number of final screenshots to include.") parser.add_argument("--image-scale-factor", type=float, default=0.6) parser.add_argument("--trace-char-budget", type=int, default=18000) @@ -914,6 +924,14 @@ def main() -> int: logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s") _load_env_file(REPO_ROOT / ".env") args = parse_args() + if args.exclude_task_ids_file: + if not args.exclude_task_ids_file.exists(): + LOGGER.error("--exclude-task-ids-file does not exist: %s", args.exclude_task_ids_file) + return 2 + args.exclude_task_ids = [ + *args.exclude_task_ids, + 
*args.exclude_task_ids_file.read_text(encoding="utf-8").split(), + ] eval_files = _find_eval_files(args) args.eval_files = [str(path) for path in eval_files]