@@ -299,13 +299,6 @@ def wrapper_body(**kwargs):
299299
300300 cohort_id = generate_id ()
301301
302- def _log_eval_error (
303- status : Literal ["finished" , "error" ], rows : Optional [List [EvaluationRow ]] | None , passed : bool
304- ) -> None :
305- log_eval_status_and_rows (eval_metadata , rows , status , passed , default_logger )
306-
307- cohort_id = generate_id ()
308-
309302 def _log_eval_error (
310303 status : Literal ["finished" , "error" ], rows : Optional [List [EvaluationRow ]] | None , passed : bool
311304 ) -> None :
@@ -461,25 +454,9 @@ def _log_eval_error(
461454 sum ([r .evaluation_result .score for r in result if r .evaluation_result ]) / len (result )
462455 for result in all_results
463456 ]
464- print (f"SCORES: { scores } " )
465457 agg_score = aggregate (scores , aggregation_method )
466458 score_std = statistics .stdev (scores ) if len (scores ) > 1 else 0.0
467459
468- # Compute 95% confidence interval for the fixed-set mean μ (by-question, using repeats)
469- ci_low : float | None = None
470- ci_high : float | None = None
471- if aggregation_method == "mean" :
472- try :
473- result_ci = compute_fixed_set_mu_ci ([item for sublist in all_results for item in sublist ])
474- mu_ci_low , mu_ci_high = result_ci [1 ], result_ci [2 ]
475- if mu_ci_low is not None and mu_ci_high is not None :
476- ci_low = float (mu_ci_low )
477- ci_high = float (mu_ci_high )
478- # Keep agg_score as-is (mean over scores). For equal repeats per question these match.
479- except Exception :
480- ci_low = None
481- ci_high = None
482-
483460 # Compute 95% confidence interval for the fixed-set mean μ (by-question, using repeats)
484461 ci_low : float | None = None
485462 ci_high : float | None = None