From 7b6fd9366839caa9f903dde09d6700fdddecf031 Mon Sep 17 00:00:00 2001
From: WatchTree-19 <119982314+WatchTree-19@users.noreply.github.com>
Date: Wed, 24 Jun 2026 14:18:35 +0000
Subject: [PATCH] fix(benchmark_runner): record failed-item exceptions on TestRecord so # errors is accurate (#1353)

_make_test_record hardcoded test_item_exceptions=[], so exceptions
captured on failed items never reached the TestRecord.
HazardScore.score() sums len(test_record.test_item_exceptions) into
HazardScore.exceptions, which feeds the '# errors' column of the results
table, so it always read 0 even when items raised.

Populate test_item_exceptions from run.failed_items_for(sut, test), one
TestItemExceptionRecord per failed item carrying exceptions (mirrors
simple_test_runner). Adds a regression test plus a clean-run test.
---
 src/modelbench/benchmark_runner.py            | 17 +++++++--
 .../modelbench_tests/test_benchmark_runner.py | 36 +++++++++++++++++++
 2 files changed, 51 insertions(+), 2 deletions(-)

diff --git a/src/modelbench/benchmark_runner.py b/src/modelbench/benchmark_runner.py
index 494e2917..8675f408 100644
--- a/src/modelbench/benchmark_runner.py
+++ b/src/modelbench/benchmark_runner.py
@@ -23,7 +23,7 @@
 from modelgauge.monitoring import PROMETHEUS
 from modelgauge.pipeline import NullCache, Pipe, Pipeline, Sink, Source
 from modelgauge.pipeline_runner import PipelineRunner
-from modelgauge.records import TestRecord
+from modelgauge.records import TestItemExceptionRecord, TestRecord
 from modelgauge.single_turn_prompt_response import TestItem
 from modelgauge.sut import PromptResponseSUT
 
@@ -540,6 +540,19 @@ def _calculate_test_results(self, test_run):
                 )
 
     def _make_test_record(self, run, sut, test, test_result):
+        # Collect the exceptions captured on failed items for this (sut, test) so they
+        # are recorded on the TestRecord. Previously this was hardcoded to an empty list,
+        # which meant downstream consumers (e.g. HazardScore.exceptions and the "# errors"
+        # column of the results table) always reported zero even when items errored.
+        test_item_exceptions = [
+            TestItemExceptionRecord(
+                test_item=item.test_item,
+                error_message=str(item.exceptions[-1]),
+                cause=str(item.exceptions[-1].__cause__),
+            )
+            for item in run.failed_items_for(sut, test)
+            if item.exceptions
+        ]
         return TestRecord(
             test_uid=test.uid,
             test_initialization=test.initialization_record,
@@ -548,7 +561,7 @@ def _make_test_record(self, run, sut, test, test_result):
             sut_uid=sut.uid,
             sut_initialization=sut.initialization_record,
             test_item_records=[],
-            test_item_exceptions=[],
+            test_item_exceptions=test_item_exceptions,
             result=TestResult.from_instance(test_result),
         )
 
diff --git a/tests/modelbench_tests/test_benchmark_runner.py b/tests/modelbench_tests/test_benchmark_runner.py
index 27bb4183..745bd5bd 100644
--- a/tests/modelbench_tests/test_benchmark_runner.py
+++ b/tests/modelbench_tests/test_benchmark_runner.py
@@ -322,6 +322,42 @@ def test_benchmark_results_collector_handles_failed(self, a_sut, tmp_path, a_wra
         assert run.finished_items_for(a_sut, a_wrapped_test) == []
         assert run.failed_items_for(a_sut, a_wrapped_test) == [item]
 
+    def test_make_test_record_records_failed_item_exceptions(self, a_sut, tmp_path, a_wrapped_test, item_from_test):
+        """Regression test for #1353: exceptions captured on failed items must be recorded
+        on the TestRecord, so HazardScore.exceptions and the "# errors" column are non-zero."""
+        runner = BenchmarkRunner(tmp_path / "run")
+        runner.secrets = fake_all_secrets()
+        run = BenchmarkRun(runner)
+        brc = TestRunResultsCollector(run)
+
+        failed = TestRunItem(a_wrapped_test, item_from_test, a_sut)
+        failed.failed = True
+        failed.exceptions.append(ValueError("sut exploded"))
+        brc.handle_item(failed)
+        assert run.failed_items_for(a_sut, a_wrapped_test) == [failed]
+
+        test_result = a_wrapped_test.aggregate_measurements([])
+        record = runner._make_test_record(run, a_sut, a_wrapped_test, test_result)
+
+        assert len(record.test_item_exceptions) == 1
+        assert "sut exploded" in record.test_item_exceptions[0].error_message
+
+    def test_make_test_record_has_no_exceptions_when_all_items_succeed(
+        self, a_sut, tmp_path, a_wrapped_test, item_from_test, sut_response
+    ):
+        """A clean run records no test item exceptions."""
+        runner = BenchmarkRunner(tmp_path / "run")
+        runner.secrets = fake_all_secrets()
+        run = BenchmarkRun(runner)
+        brc = TestRunResultsCollector(run)
+
+        item = TestRunItem(a_wrapped_test, item_from_test, a_sut, sut_response, {"a": MagicMock()})
+        brc.handle_item(item)
+
+        test_result = a_wrapped_test.aggregate_measurements([])
+        record = runner._make_test_record(run, a_sut, a_wrapped_test, test_result)
+        assert record.test_item_exceptions == []
+
     def test_basic_test_run(self, tmp_path, fake_secrets, a_test, a_sut):
         runner = TestRunner(tmp_path)
         runner.secrets = fake_secrets