Skip to content

Commit d9fd14c

Browse files
committed
add aime benchmark
1 parent 7438609 commit d9fd14c

File tree

4 files changed: 24 additions (+24) and 11 deletions (-11).

eval_protocol/benchmarks/suites/aime25.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -70,7 +70,7 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
70 70
"https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl",
71 71
],
72 72
dataset_adapter=aime2025_dataset_adapter,
73-
rollout_input_params=[{"extra_body": {"reasoning_effort": "low"}}],
73+
rollout_input_params=[{"max_tokens": 131000, "extra_body": {"reasoning_effort": "high"}}],
74 74
rollout_processor=default_single_turn_rollout_processor,
75 75
aggregation_method="mean",
76 76
threshold_of_success=None,
@@ -79,7 +79,7 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
79 79
max_concurrent_rollouts=4,
80 80
mode="pointwise",
81 81
)
82-
def aime25_pointwise(row: EvaluationRow) -> EvaluationRow:
82+
def test_aime25_pointwise(row: EvaluationRow) -> EvaluationRow:
83 83
assistant_msgs = [m for m in row.messages if m.role == "assistant"]
84 84
content = assistant_msgs[-1].content if assistant_msgs else ""
85 85

eval_protocol/pytest/default_single_turn_rollout_process.py

Lines changed: 19 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -36,11 +36,22 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
36 36
request_params = {"model": config.model, "messages": messages_payload, **config.input_params}
37 37
# Ensure caching is disabled only for this request (review feedback)
38 38
request_params["cache"] = {"no-cache": True}
39-
# Allow passing reasoning effort to Fireworks via LiteLLM using extra_body
40-
# Expected: config.input_params may contain {"reasoning": {"effort": "low|medium|high"}}
41-
if "reasoning" in config.input_params:
39+
# Single-level reasoning effort: expect `reasoning_effort` only
40+
effort_val = None
41+
if isinstance(config.input_params, dict):
42+
if "reasoning_effort" in config.input_params:
43+
effort_val = str(config.input_params["reasoning_effort"]) # flat shape
44+
elif isinstance(config.input_params.get("extra_body"), dict) and "reasoning_effort" in config.input_params["extra_body"]:
45+
# Accept if user passed it directly inside extra_body
46+
effort_val = str(config.input_params["extra_body"]["reasoning_effort"]) # already in extra_body
47+
48+
if effort_val:
49+
# Always under extra_body so LiteLLM forwards to provider-specific param set
42 50
request_params.setdefault("extra_body", {})
43-
request_params["extra_body"]["reasoning"] = config.input_params["reasoning"]
51+
request_params["extra_body"]["reasoning_effort"] = effort_val
52+
# Ensure unsupported top-level keys are not present
53+
if "reasoning_effort" in request_params:
54+
request_params.pop("reasoning_effort", None)
44 55

45 56
if row.tools is not None:
46 57
request_params["tools"] = row.tools
@@ -87,7 +98,10 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
87 98

88 99
async def _sem_wrapper(r: EvaluationRow) -> EvaluationRow:
89 100
async with semaphore:
90-
return await process_row(r)
101+
try:
102+
return await process_row(r)
103+
except Exception as e:
104+
return r
91 105

92 106
tasks = [_sem_wrapper(row) for row in rows]
93 107
dataset = list(await asyncio.gather(*tasks))

eval_protocol/pytest/plugin.py

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -131,10 +131,9 @@ def pytest_configure(config) -> None:
131 131
merged[k] = v
132 132
reasoning_effort = config.getoption("--ep-reasoning-effort")
133 133
if reasoning_effort:
134-
# Standardize into extra_body.reasoning.effort in EP_INPUT_PARAMS_JSON
134+
# Always place under extra_body to avoid LiteLLM rejecting top-level params
135 135
eb = merged.setdefault("extra_body", {})
136-
reasoning = eb.setdefault("reasoning", {})
137-
reasoning["effort"] = str(reasoning_effort)
136+
eb["reasoning_effort"] = str(reasoning_effort)
138 137
if merged:
139 138
os.environ["EP_INPUT_PARAMS_JSON"] = _json.dumps(merged)
140 139
except Exception:

examples/aime2025_chat_completion/tests/test_aime2025.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -57,7 +57,7 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
57 57
"https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl",
58 58
],
59 59
dataset_adapter=aime2025_dataset_adapter,
60-
rollout_input_params=[{"extra_body": {"reasoning_effort": "low"}}],
60+
rollout_input_params=[{"reasoning_effort": "low"}],
61 61
rollout_processor=default_single_turn_rollout_processor,
62 62
aggregation_method="mean",
63 63
passed_threshold=None,

0 commit comments

Comments
 (0)