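Bug fix: bind `results` in the pointwise branch before the status loop, skip rows whose `eval_metadata` is None, log each finished row, and tighten the tau-bench airline `standard_error` threshold from 0.1 to 0.02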

xzrderek · xzrderek · commit b06c608936cc · 2025-08-15T14:46:45.000-07:00
diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py
@@ -403,7 +403,9 @@ async def _execute_with_semaphore(row):
                             ):
                                 tasks.append(asyncio.create_task(_execute_with_semaphore(row)))
 
-                            all_results[i] = await asyncio.gather(*tasks)
+                            results = await asyncio.gather(*tasks)
+
+                            all_results[i] = results
 
                         else:
                             # Batch mode: collect all results first, then evaluate (no pipelining)
@@ -438,7 +440,9 @@ async def _execute_with_semaphore(row):
                             all_results[i] = results
 
                         for r in results:
-                            r.eval_metadata.status = "finished"
+                            if r.eval_metadata is not None:
+                                r.eval_metadata.status = "finished"
+                            active_logger.log(r)
 
                     scores = [
                         sum([r.evaluation_result.score for r in result if r.evaluation_result]) / len(result)
diff --git a/tests/pytest/test_tau_bench_airline.py b/tests/pytest/test_tau_bench_airline.py
@@ -73,7 +73,7 @@ def tau_bench_airline_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Eval
         }
     ],
     rollout_processor=MCPGymRolloutProcessor(),
-    passed_threshold={"success": 0.4, "standard_error": 0.1},
+    passed_threshold={"success": 0.4, "standard_error": 0.02},
     num_runs=8,
     mode="pointwise",
     max_concurrent_rollouts=50,

Original file line number	Diff line number	Diff line change
`@@ -73,7 +73,7 @@ def tau_bench_airline_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Eval`
`73`	`73`	`}`
`74`	`74`	`],`
`75`	`75`	`rollout_processor=MCPGymRolloutProcessor(),`
`76`		`- passed_threshold={"success": 0.4, "standard_error": 0.1},`
	`76`	`+ passed_threshold={"success": 0.4, "standard_error": 0.02},`
`77`	`77`	`num_runs=8,`
`78`	`78`	`mode="pointwise",`
`79`	`79`	`max_concurrent_rollouts=50,`