Skip to content

Commit 2a8be50

Browse files
author
Dylan Huang
committed
fix
1 parent b39ac57 commit 2a8be50

1 file changed

Lines changed: 18 additions & 21 deletions

File tree

eval_protocol/benchmarks/suites/livebench_data_analysis.py

Lines changed: 18 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -1,20 +1,19 @@
1-
from typing import Any, Dict, List, Optional
2-
31
import json
42
import re
3+
from typing import Any, Dict, List, Optional
54

5+
from eval_protocol.benchmarks.registry import export_benchmark, register_composite_benchmark
66
from eval_protocol.models import EvaluateResult, EvaluationRow, Message, MetricResult
77
from eval_protocol.pytest.default_single_turn_rollout_process import (
88
default_single_turn_rollout_processor,
99
)
1010
from eval_protocol.pytest.evaluation_test import evaluation_test
11-
from eval_protocol.benchmarks.registry import export_benchmark, register_composite_benchmark
12-
1311

1412
# -------------------------
1513
# Lightweight ports of LiveBench scoring utilities for data_analysis tasks
1614
# -------------------------
1715

16+
1817
def _lb_clean_text(text: str) -> str:
1918
text = text.lower().strip()
2019
text = re.sub(r"[^\w]", "", text)
@@ -36,9 +35,7 @@ def _cta_process_results(ground_truth: str, llm_answer: str) -> int:
3635
boxed = _extract_last_boxed_segment(parsed_answer)
3736
if boxed is not None:
3837
parsed_answer = boxed
39-
parsed_answer = (
40-
parsed_answer.replace("\\text{", "").replace("}", "").replace("\\", "")
41-
)
38+
parsed_answer = parsed_answer.replace("\\text{", "").replace("}", "").replace("\\", "")
4239

4340
gt_clean = _lb_clean_text(ground_truth)
4441
ans_clean = _lb_clean_text(parsed_answer)
@@ -132,17 +129,15 @@ def _tablejoin_process_results(ground_truth: Any, llm_answer: str) -> float:
132129
return round((2 * tp) / denom, 2)
133130

134131

135-
def _tablereformat_process_results(
136-
input_command: str, ground_truth: str, llm_answer: str, version: str
137-
) -> int:
132+
def _tablereformat_process_results(input_command: str, ground_truth: str, llm_answer: str, version: str) -> int:
138133
try:
139134
import pandas as pd # type: ignore
140135
except Exception:
141136
return 0
142137

143-
from io import StringIO
144138
import math as _math
145139
import traceback as _traceback
140+
from io import StringIO
146141

147142
def _read_df_v1(df_type: str, df_str: str):
148143
if df_type == "json":
@@ -252,8 +247,12 @@ def _read_jsonl_table_from_text(text: str, header_cols: List[str]):
252247
)
253248
else:
254249
lines = input_command.split("\n")
255-
input_fmt = [l for l in lines if "Source Format" in l][-1].split("Source Format: ")[-1].strip().lower()
256-
output_fmt = [l for l in lines if "Target Format" in l][-1].split("Target Format: ")[-1].strip().lower()
250+
input_fmt = (
251+
[line for line in lines if "Source Format" in line][-1].split("Source Format: ")[-1].strip().lower()
252+
)
253+
output_fmt = (
254+
[line for line in lines if "Target Format" in line][-1].split("Target Format: ")[-1].strip().lower()
255+
)
257256

258257
reader = _read_df_v1 if version == "v1" else _read_df_v2
259258
gt_df = reader(output_fmt, ground_truth)
@@ -373,9 +372,9 @@ def _extract_gt(row: EvaluationRow) -> Dict[str, Any]:
373372

374373
@export_benchmark("live_bench/data_analysis/cta")
375374
@evaluation_test(
376-
model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
375+
completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}],
377376
input_messages=[[m for m in r.messages] for r in _CTA_ROWS],
378-
rollout_input_params=[{"extra_body": {"reasoning_effort": "low"}}],
377+
rollout_processor_kwargs=[{"extra_body": {"reasoning_effort": "low"}}],
379378
rollout_processor=default_single_turn_rollout_processor,
380379
aggregation_method="mean",
381380
passed_threshold=None,
@@ -416,9 +415,9 @@ def livebench_cta_pointwise(row: EvaluationRow) -> EvaluationRow:
416415

417416
@export_benchmark("live_bench/data_analysis/tablejoin")
418417
@evaluation_test(
419-
model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
418+
completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}],
420419
input_messages=[[m for m in r.messages] for r in _TABLEJOIN_ROWS],
421-
rollout_input_params=[{"extra_body": {"reasoning_effort": "low"}}],
420+
rollout_processor_kwargs=[{"extra_body": {"reasoning_effort": "low"}}],
422421
rollout_processor=default_single_turn_rollout_processor,
423422
aggregation_method="mean",
424423
passed_threshold=None,
@@ -460,9 +459,9 @@ def livebench_tablejoin_pointwise(row: EvaluationRow) -> EvaluationRow:
460459

461460
@export_benchmark("live_bench/data_analysis/tablereformat")
462461
@evaluation_test(
463-
model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
462+
completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}],
464463
input_messages=[[m for m in r.messages] for r in _TABLEREFORMAT_ROWS],
465-
rollout_input_params=[{"extra_body": {"reasoning_effort": "low"}}],
464+
rollout_processor_kwargs=[{"extra_body": {"reasoning_effort": "low"}}],
466465
rollout_processor=default_single_turn_rollout_processor,
467466
aggregation_method="mean",
468467
passed_threshold=None,
@@ -508,5 +507,3 @@ def livebench_tablereformat_pointwise(row: EvaluationRow) -> EvaluationRow:
508507
"live_bench/data_analysis/tablereformat",
509508
],
510509
)
511-
512-

0 commit comments

Comments
 (0)