Skip to content

Commit 508eeeb

Browse files
committed
bench: add GPQA exported benchmark suite; default low effort, num_runs=8
1 parent d9fd14c commit 508eeeb

File tree

1 file changed

+100
-0
lines changed
  • eval_protocol/benchmarks/suites

1 file changed

+100
-0
lines changed
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
import csv
import io
import random
import re
from typing import List

import requests

from eval_protocol.benchmarks.registry import export_benchmark
from eval_protocol.models import EvaluateResult, EvaluationRow, Message, MetricResult
from eval_protocol.pytest.default_single_turn_rollout_process import (
    default_single_turn_rollout_processor,
)
from eval_protocol.pytest.evaluation_test import evaluation_test
14+
15+
16+
# System prompt prepended to every GPQA question. It instructs the model to
# emit a single option letter so _extract_abcd_letter can parse the reply.
SYSTEM_PROMPT = (
    "You are a helpful assistant. Read the question and options carefully. "
    "Express your final answer strictly as a single letter: A, B, C, or D."
)
20+
21+
22+
def _load_gpqa_messages_from_csv() -> List[List[Message]]:
    """Download the GPQA Diamond CSV and build per-question chat message lists.

    Each entry is ``[system prompt, user question with (A)-(D) options,
    ground-truth marker]``. The marker is a trailing system message of the
    form ``"__GT__:<letter>"`` that the grader (``gpqa_pointwise``) reads back.

    Returns:
        A non-empty list of message lists, one per GPQA question.

    Raises:
        requests.HTTPError: if the download fails.
        RuntimeError: if the CSV yields no rows.
    """
    url = "https://openaipublic.blob.core.windows.net/simple-evals/gpqa_diamond.csv"
    resp = requests.get(url, timeout=60)
    resp.raise_for_status()

    # Fixed seed so the option order (and thus the GT letters) is reproducible
    # across runs and across the benchmark's num_runs repetitions.
    rng = random.Random(0)

    messages_list: List[List[Message]] = []
    reader = csv.DictReader(io.StringIO(resp.text))
    for ex in reader:
        q = str(ex.get("Question", ""))
        correct = str(ex.get("Correct Answer", "")).strip()
        incorrect = [
            str(ex.get("Incorrect Answer 1", "")),
            str(ex.get("Incorrect Answer 2", "")),
            str(ex.get("Incorrect Answer 3", "")),
        ]
        # Shuffle the options: if the correct answer were always option (A),
        # the benchmark would reward positional bias instead of knowledge.
        choices = [correct] + incorrect
        rng.shuffle(choices)
        gt_letter = "ABCD"[choices.index(correct)]
        user_content = (
            f"{q}\n\n(A) {choices[0]}\n(B) {choices[1]}\n(C) {choices[2]}\n(D) {choices[3]}\n\nAnswer with one letter."
        )
        messages_list.append(
            [
                Message(role="system", content=SYSTEM_PROMPT),
                Message(role="user", content=user_content),
                # Carry the ground truth in-band; gpqa_pointwise parses it out.
                Message(role="system", content=f"__GT__:{gt_letter}"),
            ]
        )
    if not messages_list:
        raise RuntimeError("Failed to load GPQA messages: no rows found from source")
    return messages_list
50+
51+
52+
def _extract_abcd_letter(text: str) -> str | None:
53+
if not text:
54+
return None
55+
m = re.search(r"\b([ABCD])\b", text.upper())
56+
return m.group(1) if m else None
57+
58+
59+
# NOTE(review): evaluated eagerly at import time, so importing this module
# performs a network download. The @evaluation_test decorator below needs the
# materialized list at decoration time, which is presumably why it is not lazy.
_GPQA_INPUT_MESSAGES = _load_gpqa_messages_from_csv()
60+
61+
62+
@export_benchmark("gpqa")
@evaluation_test(
    model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
    input_messages=_GPQA_INPUT_MESSAGES,
    rollout_input_params=[{"extra_body": {"reasoning_effort": "low"}}],
    rollout_processor=default_single_turn_rollout_processor,
    aggregation_method="mean",
    threshold_of_success=None,
    num_runs=8,
    mode="pointwise",
)
def gpqa_pointwise(row: EvaluationRow) -> EvaluationRow:
    """Grade one GPQA rollout: score 1.0 iff the model's option letter
    matches the ground-truth letter carried in the trailing "__GT__:<letter>"
    system message, and attach the result to ``row.evaluation_result``."""
    # Take the content of the last assistant message, if any.
    last_reply = ""
    for msg in row.messages:
        if msg.role == "assistant":
            last_reply = msg.content or ""

    pred = _extract_abcd_letter(last_reply)

    # Recover the ground-truth letter from the last "__GT__:" system message
    # appended by the dataset loader.
    gt = None
    for msg in row.messages:
        if msg.role == "system" and (msg.content or "").startswith("__GT__:"):
            gt = msg.content.split(":", 1)[1].strip()

    # The score only counts when we parsed a letter AND have a sane GT.
    is_valid = pred is not None and gt in {"A", "B", "C", "D"}
    score = 1.0 if (is_valid and pred == gt) else 0.0

    exact_match = MetricResult(
        score=score,
        is_score_valid=is_valid,
        reason=("Matched" if score == 1.0 else "Not matched"),
        data={"pred": pred, "gt": gt},
    )
    row.evaluation_result = EvaluateResult(
        score=score,
        reason=("Correct option" if score == 1.0 else "Incorrect option"),
        is_score_valid=is_valid,
        metrics={"exact_match": exact_match},
    )
    return row
99+
100+

0 commit comments

Comments
 (0)