@@ -128,7 +128,8 @@ def evaluation_test( # noqa: C901
128128 rollout_processor_kwargs: Kwargs for the rollout processor.
129129 aggregation_method: How to aggregate scores across rows.
130130 passed_threshold: Threshold configuration for test success. Must be a float or EvaluationThreshold object.
131- Success rate must be above success, and if set, standard deviation must be below standard_deviation.
131+ Success rate must be at or above success, and if set, standard error must be at or below standard_error.
132+ Success rate +/- one standard_error corresponds to an approximately 68% confidence interval.
132133 num_runs: Number of times to repeat the rollout and evaluations.
133134 max_dataset_rows: Limit dataset to the first N rows.
134135 mcp_config_path: Path to MCP config file that follows MCPMultiClientConfiguration schema
@@ -436,20 +437,22 @@ async def _execute_with_semaphore(row):
436437 )
437438 all_results [i ] = results
438439
440+ for r in results :
441+ r .eval_metadata .status = "finished"
442+
439443 scores = [
440444 sum ([r .evaluation_result .score for r in result if r .evaluation_result ]) / len (result )
441445 for result in all_results
442446 ]
443447 agg_score = aggregate (scores , aggregation_method )
444- score_std = statistics .stdev (scores ) if len (scores ) > 1 else 0.0
445448
446449 # Compute 95% confidence interval for the fixed-set mean μ (by-question, using repeats)
447450 ci_low : float | None = None
448451 ci_high : float | None = None
449452 if aggregation_method == "mean" :
450453 try :
451454 result_ci = compute_fixed_set_mu_ci ([item for sublist in all_results for item in sublist ])
452- mu_ci_low , mu_ci_high = result_ci [ 1 ], result_ci [ 2 ]
455+ _ , mu_ci_low , mu_ci_high , standard_error = result_ci
453456 if mu_ci_low is not None and mu_ci_high is not None :
454457 ci_low = float (mu_ci_low )
455458 ci_high = float (mu_ci_high )
@@ -466,17 +469,18 @@ async def _execute_with_semaphore(row):
466469
467470 success_passed = agg_score >= threshold .success
468471
469- if threshold .standard_deviation is not None :
470- std_passed = score_std <= threshold .standard_deviation
472+ if threshold .standard_error is not None :
473+ std_passed = standard_error <= threshold .standard_error
471474
472475 passed = success_passed and std_passed
473476
474477 # Update eval metadata passed field for all results
475478 for result in all_results :
476479 for r in result :
477480 if r .eval_metadata is not None :
478- r .eval_metadata .status = "finished"
479481 r .eval_metadata .passed = passed
482+ r .evaluation_result .agg_score = agg_score
483+ r .evaluation_result .standard_error = standard_error
480484 active_logger .log (r )
481485
482486 # Optional: print and/or persist a summary artifact for CI
@@ -593,10 +597,10 @@ async def _execute_with_semaphore(row):
593597 assert (
594598 agg_score >= threshold .success
595599 ), f"Aggregated score { agg_score :.3f} below threshold { threshold .success } "
596- if threshold .standard_deviation is not None :
600+ if threshold .standard_error is not None :
597601 assert (
598- score_std <= threshold .standard_deviation
599- ), f"Standard deviation { score_std :.3f} above threshold { threshold .standard_deviation } "
602+ standard_error <= threshold .standard_error
603+ ), f"Standard error { standard_error :.3f} above threshold { threshold .standard_error } "
600604
601605 except AssertionError :
602606 _log_eval_error ("finished" , data if "data" in locals () else None , passed = False )
0 commit comments