@@ -128,7 +128,8 @@ def evaluation_test( # noqa: C901
128128 rollout_processor_kwargs: Kwargs for the rollout processor.
129129 aggregation_method: How to aggregate scores across rows.
130130 passed_threshold: Threshold configuration for test success. Must be a float or EvaluationThreshold object.
131- Success rate must be above success, and if set, standard deviation must be below standard_deviation.
131+ Success rate must be at or above success, and if set, standard error must be at or below standard_error.
132+ Success rate +/- one standard_error corresponds to an approximately 68% confidence interval.
132133 num_runs: Number of times to repeat the rollout and evaluations.
133134 max_dataset_rows: Limit dataset to the first N rows.
134135 mcp_config_path: Path to MCP config file that follows MCPMultiClientConfiguration schema
@@ -402,7 +403,9 @@ async def _execute_with_semaphore(row):
402403 ):
403404 tasks .append (asyncio .create_task (_execute_with_semaphore (row )))
404405
405- all_results [i ] = await asyncio .gather (* tasks )
406+ results = await asyncio .gather (* tasks )
407+
408+ all_results [i ] = results
406409
407410 else :
408411 # Batch mode: collect all results first, then evaluate (no pipelining)
@@ -436,20 +439,24 @@ async def _execute_with_semaphore(row):
436439 )
437440 all_results [i ] = results
438441
442+ for r in results :
443+ if r .eval_metadata is not None :
444+ r .eval_metadata .status = "finished"
445+ active_logger .log (r )
446+
439447 scores = [
440448 sum ([r .evaluation_result .score for r in result if r .evaluation_result ]) / len (result )
441449 for result in all_results
442450 ]
443451 agg_score = aggregate (scores , aggregation_method )
444- score_std = statistics .stdev (scores ) if len (scores ) > 1 else 0.0
445452
446453 # Compute 95% confidence interval for the fixed-set mean μ (by-question, using repeats)
447454 ci_low : float | None = None
448455 ci_high : float | None = None
449456 if aggregation_method == "mean" :
450457 try :
451458 result_ci = compute_fixed_set_mu_ci ([item for sublist in all_results for item in sublist ])
452- mu_ci_low , mu_ci_high = result_ci [ 1 ], result_ci [ 2 ]
459+ _ , mu_ci_low , mu_ci_high , standard_error = result_ci
453460 if mu_ci_low is not None and mu_ci_high is not None :
454461 ci_low = float (mu_ci_low )
455462 ci_high = float (mu_ci_high )
@@ -462,21 +469,23 @@ async def _execute_with_semaphore(row):
462469 passed = None
463470
464471 if threshold is not None :
465- success_passed , std_passed = True , True
472+ success_passed , standard_error_passed = True , True
466473
467474 success_passed = agg_score >= threshold .success
468475
469- if threshold .standard_deviation is not None :
470- std_passed = score_std <= threshold .standard_deviation
476+ if threshold .standard_error is not None and standard_error is not None :
477+ standard_error_passed = standard_error <= threshold .standard_error
471478
472- passed = success_passed and std_passed
479+ passed = success_passed and standard_error_passed
473480
474481 # Update eval metadata passed field for all results
475482 for result in all_results :
476483 for r in result :
477484 if r .eval_metadata is not None :
478- r .eval_metadata .status = "finished"
479485 r .eval_metadata .passed = passed
486+ if r .evaluation_result is not None :
487+ r .evaluation_result .agg_score = agg_score
488+ r .evaluation_result .standard_error = standard_error
480489 active_logger .log (r )
481490
482491 # Optional: print and/or persist a summary artifact for CI
@@ -593,9 +602,9 @@ async def _execute_with_semaphore(row):
593602 assert agg_score >= threshold .success , (
594603 f"Aggregated score { agg_score :.3f} below threshold { threshold .success } "
595604 )
596- if threshold .standard_deviation is not None :
597- assert score_std <= threshold .standard_deviation , (
598- f"Standard deviation { score_std :.3f} above threshold { threshold .standard_deviation } "
605+ if threshold .standard_error is not None and standard_error is not None :
606+ assert standard_error <= threshold .standard_error , (
607+ f"Standard error { standard_error :.3f} above threshold { threshold .standard_error } "
599608 )
600609
601610 except AssertionError :
0 commit comments