From 7b6fd9366839caa9f903dde09d6700fdddecf031 Mon Sep 17 00:00:00 2001
From: WatchTree-19 <119982314+WatchTree-19@users.noreply.github.com>
Date: Wed, 24 Jun 2026 14:18:35 +0000
Subject: [PATCH] fix(benchmark_runner): record failed-item exceptions on
 TestRecord so # errors is accurate (#1353)

_make_test_record hardcoded test_item_exceptions=[], so exceptions captured on
failed items never reached the TestRecord. HazardScore.score() sums
len(test_record.test_item_exceptions) into HazardScore.exceptions, which feeds the
'# errors' column of the results table, so it always read 0 even when items raised.

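Populate test_item_exceptions from run.failed_items_for(sut, test): each failed
item that captured at least one exception yields one TestItemExceptionRecord, built
from the last exception in item.exceptions (mirrors simple_test_runner). Failed
items with no captured exception are skipped.
Adds a regression test for the failure path plus a clean-run test.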
---
 src/modelbench/benchmark_runner.py            | 17 +++++++--
 .../modelbench_tests/test_benchmark_runner.py | 36 +++++++++++++++++++
 2 files changed, 51 insertions(+), 2 deletions(-)

diff --git a/src/modelbench/benchmark_runner.py b/src/modelbench/benchmark_runner.py
index 494e2917..8675f408 100644
--- a/src/modelbench/benchmark_runner.py
+++ b/src/modelbench/benchmark_runner.py
@@ -23,7 +23,7 @@
 from modelgauge.monitoring import PROMETHEUS
 from modelgauge.pipeline import NullCache, Pipe, Pipeline, Sink, Source
 from modelgauge.pipeline_runner import PipelineRunner
-from modelgauge.records import TestRecord
+from modelgauge.records import TestItemExceptionRecord, TestRecord
 from modelgauge.single_turn_prompt_response import TestItem
 from modelgauge.sut import PromptResponseSUT
 
@@ -540,6 +540,19 @@ def _calculate_test_results(self, test_run):
             )
 
     def _make_test_record(self, run, sut, test, test_result):
+        # Collect the exceptions captured on failed items for this (sut, test) so they
+        # are recorded on the TestRecord. Previously this was hardcoded to an empty list,
+        # which meant downstream consumers (e.g. HazardScore.exceptions and the "# errors"
+        # column of the results table) always reported zero even when items errored.
+        test_item_exceptions = [
+            TestItemExceptionRecord(
+                test_item=item.test_item,
+                error_message=str(item.exceptions[-1]),
+                cause=str(item.exceptions[-1].__cause__),
+            )
+            for item in run.failed_items_for(sut, test)
+            if item.exceptions
+        ]
         return TestRecord(
             test_uid=test.uid,
             test_initialization=test.initialization_record,
@@ -548,7 +561,7 @@ def _make_test_record(self, run, sut, test, test_result):
             sut_uid=sut.uid,
             sut_initialization=sut.initialization_record,
             test_item_records=[],
-            test_item_exceptions=[],
+            test_item_exceptions=test_item_exceptions,
             result=TestResult.from_instance(test_result),
         )
 
diff --git a/tests/modelbench_tests/test_benchmark_runner.py b/tests/modelbench_tests/test_benchmark_runner.py
index 27bb4183..745bd5bd 100644
--- a/tests/modelbench_tests/test_benchmark_runner.py
+++ b/tests/modelbench_tests/test_benchmark_runner.py
@@ -322,6 +322,42 @@ def test_benchmark_results_collector_handles_failed(self, a_sut, tmp_path, a_wra
         assert run.finished_items_for(a_sut, a_wrapped_test) == []
         assert run.failed_items_for(a_sut, a_wrapped_test) == [item]
 
+    def test_make_test_record_records_failed_item_exceptions(self, a_sut, tmp_path, a_wrapped_test, item_from_test):
+        """Regression test for #1353: exceptions captured on failed items must be recorded
+        on the TestRecord, so HazardScore.exceptions and the "# errors" column are non-zero."""
+        runner = BenchmarkRunner(tmp_path / "run")
+        runner.secrets = fake_all_secrets()
+        run = BenchmarkRun(runner)
+        brc = TestRunResultsCollector(run)
+
+        failed = TestRunItem(a_wrapped_test, item_from_test, a_sut)
+        failed.failed = True
+        failed.exceptions.append(ValueError("sut exploded"))
+        brc.handle_item(failed)
+        assert run.failed_items_for(a_sut, a_wrapped_test) == [failed]
+
+        test_result = a_wrapped_test.aggregate_measurements([])
+        record = runner._make_test_record(run, a_sut, a_wrapped_test, test_result)
+
+        assert len(record.test_item_exceptions) == 1
+        assert "sut exploded" in record.test_item_exceptions[0].error_message
+
+    def test_make_test_record_has_no_exceptions_when_all_items_succeed(
+        self, a_sut, tmp_path, a_wrapped_test, item_from_test, sut_response
+    ):
+        """A clean run records no test item exceptions."""
+        runner = BenchmarkRunner(tmp_path / "run")
+        runner.secrets = fake_all_secrets()
+        run = BenchmarkRun(runner)
+        brc = TestRunResultsCollector(run)
+
+        item = TestRunItem(a_wrapped_test, item_from_test, a_sut, sut_response, {"a": MagicMock()})
+        brc.handle_item(item)
+
+        test_result = a_wrapped_test.aggregate_measurements([])
+        record = runner._make_test_record(run, a_sut, a_wrapped_test, test_result)
+        assert record.test_item_exceptions == []
+
     def test_basic_test_run(self, tmp_path, fake_secrets, a_test, a_sut):
         runner = TestRunner(tmp_path)
         runner.secrets = fake_secrets