File tree Expand file tree Collapse file tree
examples/aime2025_chat_completion Expand file tree Collapse file tree Original file line number Diff line number Diff line change 1212
1313 from eval_protocol.benchmarks.registry import export_benchmark
1414
15- @export_benchmark("aime25_low ")
15+ @export_benchmark("aime25 ")
1616 @evaluation_test(...)
1717 def test_aime_pointwise(row: EvaluationRow) -> EvaluationRow:
1818 ...
1919
2020Programmatic run:
2121
2222 from eval_protocol.benchmarks.registry import get_benchmark_runner
23- get_benchmark_runner("aime25_low ")(model="fireworks_ai/...", print_summary=True, out="artifacts/aime.json")
23+ get_benchmark_runner("aime25 ")(model="fireworks_ai/...", print_summary=True, out="artifacts/aime.json")
2424"""
2525
2626from __future__ import annotations
Original file line number Diff line number Diff line change 33
44Usage:
55
6- python -m eval_protocol.benchmarks.run aime25_low \
6+ python -m eval_protocol.benchmarks.run aime25 \
77 --model fireworks_ai/accounts/fireworks/models/gpt-oss-120b \
88 --print-summary \
9- --out artifacts/aime25_low .json \
9+ --out artifacts/aime25 .json \
1010 --max-rows 50 \
1111 --reasoning-effort low
1212"""
1313
1414from __future__ import annotations
1515
1616import argparse
17- from typing import Any
1817
1918from importlib import import_module
2019import pkgutil
@@ -60,7 +59,7 @@ def main() -> int:
6059 # Fallback: if nothing registered yet and a known suite was requested, try explicit import
6160 if not list_benchmarks ():
6261 known_map = {
63- "aime25_low " : "eval_protocol.benchmarks.suites.aime25" ,
62+ "aime25 " : "eval_protocol.benchmarks.suites.aime25" ,
6463 }
6564 forced = known_map .get (args .name )
6665 if forced :
@@ -73,7 +72,7 @@ def main() -> int:
7372 if args .max_rows is not None :
7473 try :
7574 max_rows = int (args .max_rows )
76- except Exception :
75+ except ValueError :
7776 max_rows = str (args .max_rows )
7877 # Build input params override if needed
7978 ip_override = {}
Load Diff This file was deleted.
Load Diff This file was deleted.
Load Diff This file was deleted.
You can’t perform that action at this time.
0 commit comments