|
| 1 | +from typing import List |
| 2 | + |
| 3 | +import csv |
| 4 | +import io |
| 5 | +import re |
| 6 | +import requests |
| 7 | + |
| 8 | +from eval_protocol.models import EvaluateResult, EvaluationRow, Message, MetricResult |
| 9 | +from eval_protocol.pytest.evaluation_test import evaluation_test |
| 10 | +from eval_protocol.pytest.default_single_turn_rollout_process import ( |
| 11 | + default_single_turn_rollout_processor, |
| 12 | +) |
| 13 | +from eval_protocol.benchmarks.registry import export_benchmark |
| 14 | + |
| 15 | + |
# System instruction prepended to every GPQA conversation; it pins the reply
# format to a single option letter so the scorer can parse it.
SYSTEM_PROMPT = " ".join(
    (
        "You are a helpful assistant.",
        "Read the question and options carefully.",
        "Express your final answer strictly as a single letter: A, B, C, or D.",
    )
)
| 20 | + |
| 21 | + |
def _load_gpqa_messages_from_csv() -> List[List[Message]]:
    """Download the GPQA-diamond CSV and build one conversation per question.

    Each conversation holds three messages: the shared system prompt, the
    user question with its four lettered options, and a trailing system
    message ``__GT__:<letter>`` carrying the ground-truth option letter for
    the scorer to read back.

    The four options are shuffled per question with a fixed-seed RNG, so the
    correct answer is not always option A (which would let a constant "A"
    reply score 100%) while the dataset stays identical across runs.

    Returns:
        A list of message lists, one per CSV row.

    Raises:
        requests.HTTPError: If the download returns an error status.
        RuntimeError: If the CSV contains no rows.
    """
    # Function-scope import keeps this block self-contained.
    import random

    url = "https://openaipublic.blob.core.windows.net/simple-evals/gpqa_diamond.csv"
    resp = requests.get(url, timeout=60)
    resp.raise_for_status()
    # NOTE(review): resp.text relies on requests' charset detection; confirm
    # the blob is decoded as UTF-8 if non-ASCII characters look garbled.

    letters = "ABCD"
    rng = random.Random(0)  # fixed seed -> reproducible option order
    messages_list: List[List[Message]] = []
    reader = csv.DictReader(io.StringIO(resp.text))
    for ex in reader:
        # `or ""` guards against None, which DictReader yields for short rows.
        question = str(ex.get("Question") or "")
        # Index 0 is the correct answer; all options are stripped uniformly so
        # whitespace cannot distinguish the correct one.
        choices = [str(ex.get("Correct Answer") or "").strip()]
        choices.extend(
            str(ex.get(f"Incorrect Answer {i}") or "").strip() for i in (1, 2, 3)
        )

        order = rng.sample(range(len(choices)), len(choices))
        gt_letter = letters[order.index(0)]
        options = "\n".join(
            f"({letter}) {choices[idx]}" for letter, idx in zip(letters, order)
        )
        user_content = f"{question}\n\n{options}\n\nAnswer with one letter."

        messages_list.append(
            [
                Message(role="system", content=SYSTEM_PROMPT),
                Message(role="user", content=user_content),
                # Ground truth travels with the row as a tagged system message.
                Message(role="system", content=f"__GT__:{gt_letter}"),
            ]
        )
    if not messages_list:
        raise RuntimeError("Failed to load GPQA messages: no rows found from source")
    return messages_list
| 50 | + |
| 51 | + |
| 52 | +def _extract_abcd_letter(text: str) -> str | None: |
| 53 | + if not text: |
| 54 | + return None |
| 55 | + m = re.search(r"\b([ABCD])\b", text.upper()) |
| 56 | + return m.group(1) if m else None |
| 57 | + |
| 58 | + |
# Loaded eagerly at import time because the @evaluation_test decorator below
# takes the dataset as an argument.
_GPQA_INPUT_MESSAGES: List[List[Message]] = _load_gpqa_messages_from_csv()
| 60 | + |
| 61 | + |
@export_benchmark("gpqa")
@evaluation_test(
    model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
    input_messages=_GPQA_INPUT_MESSAGES,
    rollout_input_params=[{"extra_body": {"reasoning_effort": "low"}}],
    rollout_processor=default_single_turn_rollout_processor,
    aggregation_method="mean",
    threshold_of_success=None,
    num_runs=8,
    mode="pointwise",
)
def gpqa_pointwise(row: EvaluationRow) -> EvaluationRow:
    """Score one GPQA row by exact match on the chosen option letter.

    The prediction is parsed from the last assistant message and the ground
    truth from the last system message tagged ``__GT__:``. The score is 1.0
    when both are present and equal, otherwise 0.0. The result is marked
    invalid when no letter could be parsed or the ground truth is not one of
    A-D. The outcome is stored on ``row.evaluation_result`` and the same row
    is returned.
    """
    reply = ""
    gt = None
    # One pass over the conversation; later messages overwrite earlier ones,
    # so the last assistant reply and the last ground-truth tag win.
    for msg in row.messages:
        if msg.role == "assistant":
            reply = msg.content
        elif msg.role == "system" and (msg.content or "").startswith("__GT__:"):
            gt = msg.content.partition(":")[2].strip()

    pred = _extract_abcd_letter(reply or "")

    is_valid = pred is not None and gt in {"A", "B", "C", "D"}
    matched = is_valid and pred == gt
    score = 1.0 if matched else 0.0

    exact_match = MetricResult(
        score=score,
        is_score_valid=is_valid,
        reason="Matched" if matched else "Not matched",
        data={"pred": pred, "gt": gt},
    )
    row.evaluation_result = EvaluateResult(
        score=score,
        reason="Correct option" if matched else "Incorrect option",
        is_score_valid=is_valid,
        metrics={"exact_match": exact_match},
    )
    return row
| 99 | + |
| 100 | + |
0 commit comments