mlcommons · WatchTree-19 · Jun 24, 2026
@@ -23,7 +23,7 @@
 from modelgauge.monitoring import PROMETHEUS
 from modelgauge.pipeline import NullCache, Pipe, Pipeline, Sink, Source
 from modelgauge.pipeline_runner import PipelineRunner
-from modelgauge.records import TestRecord
+from modelgauge.records import TestItemExceptionRecord, TestRecord
 from modelgauge.single_turn_prompt_response import TestItem
 from modelgauge.sut import PromptResponseSUT
 
@@ -540,6 +540,19 @@ def _calculate_test_results(self, test_run):
             )
 
     def _make_test_record(self, run, sut, test, test_result):
+        # Record one TestItemExceptionRecord per failed item of this (sut, test), built from
+        # the item's last exception and its __cause__ (items with no exception are skipped).
+        # An empty list here makes downstream consumers (e.g. HazardScore.exceptions and the
+        # "# errors" column of the results table) report zero even when items errored.
+        test_item_exceptions = [
+            TestItemExceptionRecord(
+                test_item=item.test_item,
+                error_message=str(item.exceptions[-1]),
+                cause=str(item.exceptions[-1].__cause__),
+            )
+            for item in run.failed_items_for(sut, test)
+            if item.exceptions
+        ]
         return TestRecord(
             test_uid=test.uid,
             test_initialization=test.initialization_record,
@@ -548,7 +561,7 @@ def _make_test_record(self, run, sut, test, test_result):
             sut_uid=sut.uid,
             sut_initialization=sut.initialization_record,
             test_item_records=[],
-            test_item_exceptions=[],
+            test_item_exceptions=test_item_exceptions,
             result=TestResult.from_instance(test_result),
         )
 

@@ -322,6 +322,42 @@ def test_benchmark_results_collector_handles_failed(self, a_sut, tmp_path, a_wra
         assert run.finished_items_for(a_sut, a_wrapped_test) == []
         assert run.failed_items_for(a_sut, a_wrapped_test) == [item]
 
+    def test_make_test_record_records_failed_item_exceptions(self, a_sut, tmp_path, a_wrapped_test, item_from_test):
+        """Regression test for #1353: an exception captured on a failed item must reach the
+        TestRecord, so HazardScore.exceptions and the "# errors" column are non-zero."""
+        benchmark_runner = BenchmarkRunner(tmp_path / "run")
+        benchmark_runner.secrets = fake_all_secrets()
+        benchmark_run = BenchmarkRun(benchmark_runner)
+        collector = TestRunResultsCollector(benchmark_run)
+
+        failed_item = TestRunItem(a_wrapped_test, item_from_test, a_sut)
+        failed_item.failed = True
+        failed_item.exceptions.append(ValueError("sut exploded"))
+        collector.handle_item(failed_item)
+        assert benchmark_run.failed_items_for(a_sut, a_wrapped_test) == [failed_item]
+
+        empty_result = a_wrapped_test.aggregate_measurements([])
+        record = benchmark_runner._make_test_record(benchmark_run, a_sut, a_wrapped_test, empty_result)
+
+        assert len(record.test_item_exceptions) == 1
+        assert "sut exploded" in record.test_item_exceptions[0].error_message
+
+    def test_make_test_record_has_no_exceptions_when_all_items_succeed(
+        self, a_sut, tmp_path, a_wrapped_test, item_from_test, sut_response
+    ):
+        """When every item is handled successfully, the TestRecord lists no item exceptions."""
+        benchmark_runner = BenchmarkRunner(tmp_path / "run")
+        benchmark_runner.secrets = fake_all_secrets()
+        benchmark_run = BenchmarkRun(benchmark_runner)
+        collector = TestRunResultsCollector(benchmark_run)
+
+        ok_item = TestRunItem(a_wrapped_test, item_from_test, a_sut, sut_response, {"a": MagicMock()})
+        collector.handle_item(ok_item)
+
+        empty_result = a_wrapped_test.aggregate_measurements([])
+        record = benchmark_runner._make_test_record(benchmark_run, a_sut, a_wrapped_test, empty_result)
+        assert record.test_item_exceptions == []
+
     def test_basic_test_run(self, tmp_path, fake_secrets, a_test, a_sut):
         runner = TestRunner(tmp_path)
         runner.secrets = fake_secrets