We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
1 parent df26606 commit 1505018 Copy full SHA for 1505018
tests/test_tau_bench_airline_smoke.py
@@ -68,7 +68,7 @@ def tau_bench_airline_smoke_to_evaluation_row(data: List[Dict[str, Any]]) -> Lis
68
model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
69
rollout_input_params=[{"temperature": 0.8, "extra_body": {"reasoning_effort": "medium"}}],
70
rollout_processor=default_mcp_gym_rollout_processor,
71
- passed_threshold=0.4,
+ passed_threshold=0.36,
72
num_runs=1, # Smoke test: single run for quick feedback
73
mode="pointwise",
74
max_concurrent_rollouts=50, # Standard concurrency
0 commit comments