Skip to content

Commit 94b1b9f

Browse files
committed
same logic for all modes
1 parent aa1ce37 commit 94b1b9f

1 file changed

Lines changed: 25 additions & 6 deletions

File tree

eval_protocol/pytest/evaluation_test.py

Lines changed: 25 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -592,11 +592,30 @@ async def _collect_result(config, lst):
592592
run_id=run_id,
593593
rollout_ids=group_rollout_ids or None,
594594
):
595-
results = await execute_pytest(
596-
test_func,
597-
processed_dataset=input_dataset,
598-
evaluation_test_kwargs=kwargs.get("evaluation_test_kwargs") or {},
599-
)
595+
try:
596+
results = await execute_pytest(
597+
test_func,
598+
processed_dataset=input_dataset,
599+
evaluation_test_kwargs=kwargs.get("evaluation_test_kwargs") or {},
600+
)
601+
except AssertionError:
602+
raise
603+
except Exception as e:
604+
# Default: capture non-assert exceptions unless explicitly disabled
605+
if os.getenv("EP_CAPTURE_EVAL_EXCEPTIONS", "false").strip() == "false":
606+
results = input_dataset
607+
for row in results:
608+
row.evaluation_result = EvaluateResult(
609+
score=0.0,
610+
is_score_valid=False,
611+
reason=f"Error during evaluation: {type(e).__name__}: {e}",
612+
)
613+
if row.eval_metadata is not None:
614+
row.eval_metadata.status = Status.error(
615+
f"Error during evaluation: {type(e).__name__}: {e}",
616+
)
617+
else:
618+
raise
600619
if (
601620
results is None
602621
or not isinstance(results, list)
@@ -624,7 +643,7 @@ async def _collect_result(config, lst):
624643
# if the eval_metadata status code has not been set to something else, consider it as finished
625644
r.eval_metadata.status = Status.eval_finished()
626645
# Optional debug print for assistant/tool sequence
627-
if os.getenv("EP_DEBUG_SERIALIZATION", "false").strip() == "false":
646+
if os.getenv("EP_DEBUG_SERIALIZATION", "0").strip() == "1":
628647
try:
629648
preview = [
630649
{

0 commit comments

Comments
 (0)