Skip to content

Commit 87c3dcb

Browse files
author
Dylan Huang
committed
rename as it's causing issues in pytest collection
1 parent c90f4c6 commit 87c3dcb

File tree

2 files changed

+9
-19
lines changed

2 files changed

+9
-19
lines changed

examples/aime2025_chat_completion/tests/test_evaluation.py renamed to examples/aime2025_chat_completion/tests/test_aime2025.py

Lines changed: 3 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,15 @@
1-
from typing import Any, Dict, List
21
import os
2+
from typing import Any, Dict, List
33

44
from eval_protocol.models import EvaluateResult, EvaluationRow, Message, MetricResult
55
from eval_protocol.pytest.default_single_turn_rollout_process import (
66
default_single_turn_rollout_processor,
77
)
88
from eval_protocol.pytest.evaluation_test import evaluation_test
9-
109
from examples.aime2025_chat_completion.main import _extract_boxed_text, _normalize_to_int_or_none
1110

12-
1311
SYSTEM_PROMPT = (
14-
"You are a helpful math assistant. Please reason step by step, and put your "
15-
"final answer within \\boxed{...}."
12+
"You are a helpful math assistant. Please reason step by step, and put your " "final answer within \\boxed{...}."
1613
)
1714

1815
"""
@@ -36,8 +33,6 @@ def _ep_int(var_name: str, default_value: int | None) -> int | None:
3633
return default_value
3734

3835

39-
40-
4136
def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
4237
"""
4338
Convert raw AIME2025 rows (with keys 'question' and 'answer') to EvaluationRow.
@@ -94,9 +89,7 @@ def test_aime2025_pointwise(row: EvaluationRow) -> EvaluationRow:
9489
reason=(
9590
"Parsed both integers and they matched"
9691
if score == 1.0
97-
else (
98-
"Parsed integers did not match" if is_valid else "Failed to parse integer"
99-
)
92+
else ("Parsed integers did not match" if is_valid else "Failed to parse integer")
10093
),
10194
data={
10295
"extracted_text": extracted_text,
@@ -113,5 +106,3 @@ def test_aime2025_pointwise(row: EvaluationRow) -> EvaluationRow:
113106
metrics=metrics,
114107
)
115108
return row
116-
117-
Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,15 @@
1-
from typing import List
2-
31
import csv
42
import io
53
import re
4+
from typing import List
5+
66
import requests
77

88
from eval_protocol.models import EvaluateResult, EvaluationRow, Message, MetricResult
9-
from eval_protocol.pytest.evaluation_test import evaluation_test
109
from eval_protocol.pytest.default_single_turn_rollout_process import (
1110
default_single_turn_rollout_processor,
1211
)
13-
12+
from eval_protocol.pytest.evaluation_test import evaluation_test
1413

1514
SYSTEM_PROMPT = (
1615
"You are a helpful assistant. Read the question and options carefully. "
@@ -65,7 +64,9 @@ def _load_gpqa_messages_from_csv() -> List[List[Message]]:
6564
@evaluation_test(
6665
model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
6766
input_messages=_GPQA_INPUT_MESSAGES,
68-
rollout_input_params=[{"extra_body": {"reasoning_effort": "low"}}], # default to low effort; override via CLI plugin
67+
rollout_input_params=[
68+
{"extra_body": {"reasoning_effort": "low"}}
69+
], # default to low effort; override via CLI plugin
6970
rollout_processor=default_single_turn_rollout_processor,
7071
aggregation_method="mean",
7172
threshold_of_success=None,
@@ -98,5 +99,3 @@ def test_gpqa_pointwise(row: EvaluationRow) -> EvaluationRow:
9899
},
99100
)
100101
return row
101-
102-

0 commit comments

Comments
 (0)