Skip to content

Commit cd14c75

Browse files
committed
Change defaults of tau benchmarks and unpack extra_body more explicitly
1 parent a20f233 commit cd14c75

File tree

3 files changed

+3
-3
lines changed

3 files changed

+3
-3
lines changed

eval_protocol/benchmarks/test_tau_bench_airline.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -119,7 +119,7 @@ def tau_bench_airline_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Eval
119119
rollout_processor=MCPGymRolloutProcessor(),
120120
rollout_processor_kwargs={"domain": "airline"},
121121
passed_threshold={"success": 0.4, "standard_error": 0.02},
122-
num_runs=8,
122+
num_runs=4,
123123
mode="pointwise",
124124
max_concurrent_rollouts=50,
125125
server_script_path=_get_server_script_path(),

eval_protocol/benchmarks/test_tau_bench_retail.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -109,7 +109,7 @@ def tau_bench_retail_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evalu
109109
rollout_processor=MCPGymRolloutProcessor(),
110110
rollout_processor_kwargs={"domain": "retail"},
111111
passed_threshold={"success": 0.65, "standard_error": 0.02},
112-
num_runs=8,
112+
num_runs=4,
113113
mode="pointwise",
114114
max_concurrent_rollouts=50,
115115
server_script_path=get_server_script_path(),

eval_protocol/pytest/default_mcp_gym_rollout_processor.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -223,7 +223,7 @@ def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) ->
223223
model_id=config.completion_params.get("model", None),
224224
temperature=config.completion_params.get("temperature", 0.0),
225225
max_tokens=config.completion_params.get("max_tokens", 4096),
226-
reasoning_effort=config.completion_params.get("reasoning_effort", None),
226+
**(config.completion_params.get("extra_body", {}) or {}),
227227
)
228228

229229
except Exception as e:

0 commit comments

Comments
 (0)