Skip to content

Commit 10a4381

Browse files
committed
fix live bench and rollout processor
1 parent 971d6e4 commit 10a4381

File tree

3 files changed

+47
-38
lines changed

3 files changed

+47
-38
lines changed

eval_protocol/benchmarks/suites/gpqa.py

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -56,13 +56,29 @@ def _extract_abcd_letter(text: str) -> str | None:
5656

5757
_GPQA_INPUT_MESSAGES = _load_gpqa_messages_from_csv()
5858

59+
def _strip_gt_messages(msgs: List[Message]) -> List[Message]:
60+
return [m for m in msgs if not (m.role == "system" and (m.content or "").startswith("__GT__:"))]
61+
62+
63+
async def gpqa_strip_gt_rollout_processor(rows: List[EvaluationRow], config) -> List[EvaluationRow]:
64+
"""Preprocess rows to set ground_truth and remove __GT__ messages, then delegate to default processor."""
65+
processed: List[EvaluationRow] = []
66+
for r in rows:
67+
gt_tokens = [m.content for m in r.messages if m.role == "system" and (m.content or "").startswith("__GT__:")]
68+
if gt_tokens:
69+
gt_val = gt_tokens[-1].split(":", 1)[1].strip()
70+
r.ground_truth = gt_val
71+
r.messages = [m for m in r.messages if not (m.role == "system" and (m.content or "").startswith("__GT__:"))]
72+
processed.append(r)
73+
return await default_single_turn_rollout_processor(processed, config)
74+
5975

6076
@export_benchmark("gpqa")
6177
@evaluation_test(
6278
model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
6379
input_messages=_GPQA_INPUT_MESSAGES,
6480
rollout_input_params=[{"extra_body": {"reasoning_effort": "low"}}],
65-
rollout_processor=default_single_turn_rollout_processor,
81+
rollout_processor=gpqa_strip_gt_rollout_processor,
6682
aggregation_method="mean",
6783
passed_threshold=None,
6884
num_runs=8,
@@ -73,9 +89,8 @@ def gpqa_pointwise(row: EvaluationRow) -> EvaluationRow:
7389
content = assistant_msgs[-1].content if assistant_msgs else ""
7490

7591
pred = _extract_abcd_letter(content or "")
76-
# Retrieve GT from the trailing system message we appended
77-
gt_tokens = [m.content for m in row.messages if m.role == "system" and (m.content or "").startswith("__GT__:")]
78-
gt = gt_tokens[-1].split(":", 1)[1].strip() if gt_tokens else None
92+
# GPQA diamond CSV constructs options so that the correct answer is always A
93+
gt = "A"
7994

8095
is_valid = pred is not None and gt in {"A", "B", "C", "D"}
8196
score = 1.0 if (is_valid and pred == gt) else 0.0

eval_protocol/benchmarks/suites/livebench_data_analysis.py

Lines changed: 27 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -315,7 +315,7 @@ def _read_jsonl_table_from_text(text: str, header_cols: List[str]):
315315
SYSTEM_PROMPT = "You are a helpful data analyst. Read the task and answer precisely."
316316

317317

318-
def _load_livebench_da_messages(task_name: str) -> List[List[Message]]:
318+
def _load_livebench_da_messages(task_name: str) -> List[EvaluationRow]:
319319
try:
320320
from datasets import load_dataset # type: ignore
321321
except Exception as e: # pragma: no cover
@@ -324,58 +324,57 @@ def _load_livebench_da_messages(task_name: str) -> List[List[Message]]:
324324
) from e
325325

326326
ds = load_dataset("livebench/data_analysis", split="test")
327-
rows: List[List[Message]] = []
327+
rows: List[EvaluationRow] = []
328328
for ex in ds:
329329
if str(ex.get("task", "")) != task_name:
330330
continue
331331
question_text = str(ex.get("turns", [""])[0])
332332
ground_truth = ex.get("ground_truth")
333+
release = ex.get("livebench_release_date", "")
333334
try:
334-
gt_json = json.dumps({
335-
"ground_truth": ground_truth,
336-
"release": ex.get("livebench_release_date", ""),
337-
}, ensure_ascii=False)
335+
gt_payload = json.dumps({"ground_truth": ground_truth, "release": release}, ensure_ascii=False)
338336
except TypeError:
339-
# Some rows may include non-serializable types; fall back to string cast
340-
gt_json = json.dumps({"ground_truth": str(ground_truth), "release": str(ex.get("livebench_release_date", ""))})
337+
gt_payload = json.dumps({"ground_truth": str(ground_truth), "release": str(release)})
341338
rows.append(
342-
[
343-
Message(role="system", content=SYSTEM_PROMPT),
344-
Message(role="user", content=question_text),
345-
Message(role="system", content=f"__GT__:{gt_json}"),
346-
]
339+
EvaluationRow(
340+
messages=[
341+
Message(role="system", content=SYSTEM_PROMPT),
342+
Message(role="user", content=question_text),
343+
],
344+
ground_truth=gt_payload,
345+
)
347346
)
348347
if not rows:
349348
raise RuntimeError(f"No rows found for LiveBench data_analysis task '{task_name}'")
350349
return rows
351350

352351

353352
def _extract_gt(row: EvaluationRow) -> Dict[str, Any]:
354-
gt_tokens = [
355-
m.content
356-
for m in row.messages
357-
if m.role == "system" and (m.content or "").startswith("__GT__:")
358-
]
359-
if not gt_tokens:
353+
# For LiveBench Data Analysis, we fetch the ground truth from the HF dataset
354+
# and store it in the top-level ground_truth field in the adapter below.
355+
# Here, just parse row.ground_truth if it contains a JSON payload, else string.
356+
if row.ground_truth is None:
360357
return {"ground_truth": None, "release": None}
361358
try:
362-
payload = json.loads(gt_tokens[-1].split(":", 1)[1])
363-
return payload if isinstance(payload, dict) else {"ground_truth": payload, "release": None}
359+
payload = json.loads(row.ground_truth)
360+
if isinstance(payload, dict):
361+
return payload
364362
except Exception:
365-
return {"ground_truth": gt_tokens[-1].split(":", 1)[1], "release": None}
363+
pass
364+
return {"ground_truth": row.ground_truth, "release": None}
366365

367366

368367
# -------------------------
369368
# CTA
370369
# -------------------------
371370

372-
_CTA_MESSAGES = _load_livebench_da_messages("cta")
371+
_CTA_ROWS = _load_livebench_da_messages("cta")
373372

374373

375374
@export_benchmark("live_bench/data_analysis/cta")
376375
@evaluation_test(
377376
model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
378-
input_messages=_CTA_MESSAGES,
377+
input_messages=[[m for m in r.messages] for r in _CTA_ROWS],
379378
rollout_input_params=[{"extra_body": {"reasoning_effort": "low"}}],
380379
rollout_processor=default_single_turn_rollout_processor,
381380
aggregation_method="mean",
@@ -412,13 +411,13 @@ def livebench_cta_pointwise(row: EvaluationRow) -> EvaluationRow:
412411
# Table Join
413412
# -------------------------
414413

415-
_TABLEJOIN_MESSAGES = _load_livebench_da_messages("tablejoin")
414+
_TABLEJOIN_ROWS = _load_livebench_da_messages("tablejoin")
416415

417416

418417
@export_benchmark("live_bench/data_analysis/tablejoin")
419418
@evaluation_test(
420419
model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
421-
input_messages=_TABLEJOIN_MESSAGES,
420+
input_messages=[[m for m in r.messages] for r in _TABLEJOIN_ROWS],
422421
rollout_input_params=[{"extra_body": {"reasoning_effort": "low"}}],
423422
rollout_processor=default_single_turn_rollout_processor,
424423
aggregation_method="mean",
@@ -456,13 +455,13 @@ def livebench_tablejoin_pointwise(row: EvaluationRow) -> EvaluationRow:
456455
# Table Reformat
457456
# -------------------------
458457

459-
_TABLEREFORMAT_MESSAGES = _load_livebench_da_messages("tablereformat")
458+
_TABLEREFORMAT_ROWS = _load_livebench_da_messages("tablereformat")
460459

461460

462461
@export_benchmark("live_bench/data_analysis/tablereformat")
463462
@evaluation_test(
464463
model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
465-
input_messages=_TABLEREFORMAT_MESSAGES,
464+
input_messages=[[m for m in r.messages] for r in _TABLEREFORMAT_ROWS],
466465
rollout_input_params=[{"extra_body": {"reasoning_effort": "low"}}],
467466
rollout_processor=default_single_turn_rollout_processor,
468467
aggregation_method="mean",

eval_protocol/pytest/default_single_turn_rollout_process.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -31,12 +31,7 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
3131
if len(row.messages) == 0:
3232
raise ValueError("Messages is empty. Please provide a non-empty dataset")
3333

34-
# Filter out any sentinel ground-truth system messages (e.g., "__GT__:") before sending to the model
35-
messages_payload = [
36-
{"role": m.role, "content": m.content}
37-
for m in row.messages
38-
if not (m.role == "system" and (m.content or "").startswith("__GT__:"))
39-
]
34+
messages_payload = [{"role": m.role, "content": m.content} for m in row.messages]
4035

4136
request_params = {"model": config.model, "messages": messages_payload, **config.input_params}
4237
# Ensure caching is disabled only for this request (review feedback)

0 commit comments

Comments (0)