@@ -592,11 +592,30 @@ async def _collect_result(config, lst):
592592 run_id = run_id ,
593593 rollout_ids = group_rollout_ids or None ,
594594 ):
595- results = await execute_pytest (
596- test_func ,
597- processed_dataset = input_dataset ,
598- evaluation_test_kwargs = kwargs .get ("evaluation_test_kwargs" ) or {},
599- )
595+ try :
596+ results = await execute_pytest (
597+ test_func ,
598+ processed_dataset = input_dataset ,
599+ evaluation_test_kwargs = kwargs .get ("evaluation_test_kwargs" ) or {},
600+ )
601+ except AssertionError :
602+ raise
603+ except Exception as e :
604+ # Default: capture non-assert exceptions unless explicitly disabled
605+ if os .getenv ("EP_CAPTURE_EVAL_EXCEPTIONS" , "false" ).strip () == "false" :
606+ results = input_dataset
607+ for row in results :
608+ row .evaluation_result = EvaluateResult (
609+ score = 0.0 ,
610+ is_score_valid = False ,
611+ reason = f"Error during evaluation: { type (e ).__name__ } : { e } " ,
612+ )
613+ if row .eval_metadata is not None :
614+ row .eval_metadata .status = Status .error (
615+ f"Error during evaluation: { type (e ).__name__ } : { e } " ,
616+ )
617+ else :
618+ raise
600619 if (
601620 results is None
602621 or not isinstance (results , list )
@@ -624,7 +643,7 @@ async def _collect_result(config, lst):
624643 # if the eval_metadata status code has not been set to something else, consider it as finished
625644 r .eval_metadata .status = Status .eval_finished ()
626645 # Optional debug print for assistant/tool sequence
627- if os .getenv ("EP_DEBUG_SERIALIZATION" , "false " ).strip () == "false " :
646+ if os .getenv ("EP_DEBUG_SERIALIZATION" , "0 " ).strip () == "1 " :
628647 try :
629648 preview = [
630649 {
0 commit comments