# Patch summary
# -------------
# browseruse_bench/utils/stats.py
#   * imports math.fsum
#   * in calculate_usage_stats, the per-field dict returned by _calc_stats(vals)
#     gains a "sum" key holding fsum(vals) before it is stored in usage_stats.
# tests/browseruse_bench/test_stats.py
#   * adds TestGenerateEvaluationSummary.test_usage_statistics_include_sum, which
#     builds two task results and asserts that
#     summary["metrics_statistics"]["usage"][<field>]["sum"] equals 300, 100,
#     400 and 3 for total_prompt_tokens, total_completion_tokens, total_tokens
#     and entry_count respectively.
#
# NOTE(review): the test calls generate_evaluation_summary, but this patch adds
#   no import for it in test_stats.py -- presumably it is already imported
#   there; confirm before merging.
# NOTE(review): this patch was re-flowed from a whitespace-collapsed copy. The
#   indentation of context lines and the placement of blank context lines in
#   the stats.py hunks are assumptions inferred from the hunk line counts and
#   Python syntax -- verify against the target file (or apply with
#   whitespace-tolerant options) before relying on it.
diff --git a/browseruse_bench/utils/stats.py b/browseruse_bench/utils/stats.py
index eb04f29..893fde6 100644
--- a/browseruse_bench/utils/stats.py
+++ b/browseruse_bench/utils/stats.py
@@ -1,6 +1,7 @@
 """Statistical utility functions."""
 
 from __future__ import annotations
+from math import fsum
 from typing import Any, Dict, List, Optional
 
 
@@ -98,7 +99,9 @@ def calculate_usage_stats(tasks: List[Dict[str, Any]], path: str = "evaluation_d
                 vals.append(float(usage[field]))
 
         if vals:
-            usage_stats[field] = _calc_stats(vals)
+            field_stats = _calc_stats(vals)
+            field_stats["sum"] = fsum(vals)
+            usage_stats[field] = field_stats
 
     return usage_stats
 
diff --git a/tests/browseruse_bench/test_stats.py b/tests/browseruse_bench/test_stats.py
index da230cf..3a0a658 100644
--- a/tests/browseruse_bench/test_stats.py
+++ b/tests/browseruse_bench/test_stats.py
@@ -33,3 +33,48 @@ def test_filter_empty_list(self):
         """Test filtering empty list returns empty."""
         result = filter_tasks_by_label([], key="predicted_label", val=1)
         assert result == []
+
+
+class TestGenerateEvaluationSummary:
+    """Tests for generated summary statistics."""
+
+    def test_usage_statistics_include_sum(self):
+        """Usage fields should expose a total across evaluated tasks."""
+        results = [
+            {
+                "task_id": "1",
+                "predicted_label": 1,
+                "evaluation_details": {
+                    "agent_metrics": {
+                        "usage": {
+                            "total_prompt_tokens": 100,
+                            "total_completion_tokens": 20,
+                            "total_tokens": 120,
+                            "entry_count": 1,
+                        }
+                    }
+                },
+            },
+            {
+                "task_id": "2",
+                "predicted_label": 0,
+                "evaluation_details": {
+                    "agent_metrics": {
+                        "usage": {
+                            "total_prompt_tokens": 200,
+                            "total_completion_tokens": 80,
+                            "total_tokens": 280,
+                            "entry_count": 2,
+                        }
+                    }
+                },
+            },
+        ]
+
+        summary = generate_evaluation_summary(results, total=2)
+
+        usage_stats = summary["metrics_statistics"]["usage"]
+        assert usage_stats["total_prompt_tokens"]["sum"] == 300
+        assert usage_stats["total_completion_tokens"]["sum"] == 100
+        assert usage_stats["total_tokens"]["sum"] == 400
+        assert usage_stats["entry_count"]["sum"] == 3