@@ -113,69 +113,3 @@ def test_aime25_pointwise(row: EvaluationRow) -> EvaluationRow:
113113 metrics = metrics ,
114114 )
115115 return row
116-
117-
118- # @evaluation_test(
119- # input_dataset=[
120- # "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl",
121- # # "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl",
122- # ],
123- # dataset_adapter=aime2025_dataset_adapter,
124- # completion_params=[
125- # {
126- # "max_tokens": 131000,
127- # "extra_body": {"reasoning_effort": "low"},
128- # "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
129- # },
130- # {
131- # "max_tokens": 131000,
132- # "extra_body": {"reasoning_effort": "low"},
133- # "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-20b",
134- # }
135- # ],
136- # rollout_processor=SingleTurnRolloutProcessor(),
137- # aggregation_method="mean",
138- # passed_threshold=None,
139- # num_runs=1,
140- # max_dataset_rows=2,
141- # max_concurrent_rollouts=4,
142- # mode="groupwise",
143- # )
144- # def test_aime25_groupwise(rows: List[EvaluationRow]) -> List[EvaluationRow]:
145- # output = []
146- # for row in rows:
147- # assistant_msgs = [m for m in row.messages if m.role == "assistant"]
148- # content = assistant_msgs[-1].content if assistant_msgs else ""
149-
150- # extracted_text = _extract_boxed_text(content or "")
151- # extracted_int = _normalize_to_int_or_none(extracted_text)
152- # gt_int = _normalize_to_int_or_none(row.ground_truth or "")
153-
154- # is_valid = extracted_int is not None and gt_int is not None
155- # score = 1.0 if (is_valid and extracted_int == gt_int) else 0.0
156-
157- # metrics = {
158- # "exact_match": MetricResult(
159- # score=score,
160- # is_score_valid=is_valid,
161- # reason=(
162- # "Parsed both integers and they matched"
163- # if score == 1.0
164- # else ("Parsed integers did not match" if is_valid else "Failed to parse integer")
165- # ),
166- # data={
167- # "extracted_text": extracted_text,
168- # "extracted_int": extracted_int,
169- # "ground_truth_int": gt_int,
170- # },
171- # )
172- # }
173-
174- # row.evaluation_result = EvaluateResult(
175- # score=score,
176- # reason=("Answer correct" if score == 1.0 else "Answer incorrect"),
177- # is_score_valid=is_valid,
178- # metrics=metrics,
179- # )
180- # output.append(row)
181- # return output
0 commit comments