Skip to content

Commit 3406889

Browse files
committed
remove useless test
1 parent 1b8032d commit 3406889

File tree

1 file changed

+0
-66
lines changed

1 file changed

+0
-66
lines changed

eval_protocol/benchmarks/test_aime25.py

Lines changed: 0 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -113,69 +113,3 @@ def test_aime25_pointwise(row: EvaluationRow) -> EvaluationRow:
113113
metrics=metrics,
114114
)
115115
return row
116-
117-
118-
# @evaluation_test(
119-
# input_dataset=[
120-
# "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl",
121-
# # "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl",
122-
# ],
123-
# dataset_adapter=aime2025_dataset_adapter,
124-
# completion_params=[
125-
# {
126-
# "max_tokens": 131000,
127-
# "extra_body": {"reasoning_effort": "low"},
128-
# "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
129-
# },
130-
# {
131-
# "max_tokens": 131000,
132-
# "extra_body": {"reasoning_effort": "low"},
133-
# "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-20b",
134-
# }
135-
# ],
136-
# rollout_processor=SingleTurnRolloutProcessor(),
137-
# aggregation_method="mean",
138-
# passed_threshold=None,
139-
# num_runs=1,
140-
# max_dataset_rows=2,
141-
# max_concurrent_rollouts=4,
142-
# mode="groupwise",
143-
# )
144-
# def test_aime25_groupwise(rows: List[EvaluationRow]) -> List[EvaluationRow]:
145-
# output = []
146-
# for row in rows:
147-
# assistant_msgs = [m for m in row.messages if m.role == "assistant"]
148-
# content = assistant_msgs[-1].content if assistant_msgs else ""
149-
150-
# extracted_text = _extract_boxed_text(content or "")
151-
# extracted_int = _normalize_to_int_or_none(extracted_text)
152-
# gt_int = _normalize_to_int_or_none(row.ground_truth or "")
153-
154-
# is_valid = extracted_int is not None and gt_int is not None
155-
# score = 1.0 if (is_valid and extracted_int == gt_int) else 0.0
156-
157-
# metrics = {
158-
# "exact_match": MetricResult(
159-
# score=score,
160-
# is_score_valid=is_valid,
161-
# reason=(
162-
# "Parsed both integers and they matched"
163-
# if score == 1.0
164-
# else ("Parsed integers did not match" if is_valid else "Failed to parse integer")
165-
# ),
166-
# data={
167-
# "extracted_text": extracted_text,
168-
# "extracted_int": extracted_int,
169-
# "ground_truth_int": gt_int,
170-
# },
171-
# )
172-
# }
173-
174-
# row.evaluation_result = EvaluateResult(
175-
# score=score,
176-
# reason=("Answer correct" if score == 1.0 else "Answer incorrect"),
177-
# is_score_valid=is_valid,
178-
# metrics=metrics,
179-
# )
180-
# output.append(row)
181-
# return output

0 commit comments

Comments (0)