Skip to content

Commit b06c608

Browse files
committed
bug
1 parent d2f132d commit b06c608

File tree

2 files changed

+7
-3
lines changed

2 files changed

+7
-3
lines changed

eval_protocol/pytest/evaluation_test.py

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -403,7 +403,9 @@ async def _execute_with_semaphore(row):
403403
):
404404
tasks.append(asyncio.create_task(_execute_with_semaphore(row)))
405405

406-
all_results[i] = await asyncio.gather(*tasks)
406+
results = await asyncio.gather(*tasks)
407+
408+
all_results[i] = results
407409

408410
else:
409411
# Batch mode: collect all results first, then evaluate (no pipelining)
@@ -438,7 +440,9 @@ async def _execute_with_semaphore(row):
438440
all_results[i] = results
439441

440442
for r in results:
441-
r.eval_metadata.status = "finished"
443+
if r.eval_metadata is not None:
444+
r.eval_metadata.status = "finished"
445+
active_logger.log(r)
442446

443447
scores = [
444448
sum([r.evaluation_result.score for r in result if r.evaluation_result]) / len(result)

tests/pytest/test_tau_bench_airline.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -73,7 +73,7 @@ def tau_bench_airline_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Eval
7373
}
7474
],
7575
rollout_processor=MCPGymRolloutProcessor(),
76-
passed_threshold={"success": 0.4, "standard_error": 0.1},
76+
passed_threshold={"success": 0.4, "standard_error": 0.02},
7777
num_runs=8,
7878
mode="pointwise",
7979
max_concurrent_rollouts=50,

0 commit comments

Comments (0)