1- from typing import Any , Dict , List , Optional
2-
31import json
42import re
3+ from typing import Any , Dict , List , Optional
54
5+ from eval_protocol .benchmarks .registry import export_benchmark , register_composite_benchmark
66from eval_protocol .models import EvaluateResult , EvaluationRow , Message , MetricResult
77from eval_protocol .pytest .default_single_turn_rollout_process import (
88 default_single_turn_rollout_processor ,
99)
1010from eval_protocol .pytest .evaluation_test import evaluation_test
11- from eval_protocol .benchmarks .registry import export_benchmark , register_composite_benchmark
12-
1311
1412# -------------------------
1513# Lightweight ports of LiveBench scoring utilities for data_analysis tasks
1614# -------------------------
1715
16+
1817def _lb_clean_text (text : str ) -> str :
1918 text = text .lower ().strip ()
2019 text = re .sub (r"[^\w]" , "" , text )
@@ -36,9 +35,7 @@ def _cta_process_results(ground_truth: str, llm_answer: str) -> int:
3635 boxed = _extract_last_boxed_segment (parsed_answer )
3736 if boxed is not None :
3837 parsed_answer = boxed
39- parsed_answer = (
40- parsed_answer .replace ("\\ text{" , "" ).replace ("}" , "" ).replace ("\\ " , "" )
41- )
38+ parsed_answer = parsed_answer .replace ("\\ text{" , "" ).replace ("}" , "" ).replace ("\\ " , "" )
4239
4340 gt_clean = _lb_clean_text (ground_truth )
4441 ans_clean = _lb_clean_text (parsed_answer )
@@ -132,17 +129,15 @@ def _tablejoin_process_results(ground_truth: Any, llm_answer: str) -> float:
132129 return round ((2 * tp ) / denom , 2 )
133130
134131
135- def _tablereformat_process_results (
136- input_command : str , ground_truth : str , llm_answer : str , version : str
137- ) -> int :
132+ def _tablereformat_process_results (input_command : str , ground_truth : str , llm_answer : str , version : str ) -> int :
138133 try :
139134 import pandas as pd # type: ignore
140135 except Exception :
141136 return 0
142137
143- from io import StringIO
144138 import math as _math
145139 import traceback as _traceback
140+ from io import StringIO
146141
147142 def _read_df_v1 (df_type : str , df_str : str ):
148143 if df_type == "json" :
@@ -252,8 +247,12 @@ def _read_jsonl_table_from_text(text: str, header_cols: List[str]):
252247 )
253248 else :
254249 lines = input_command .split ("\n " )
255- input_fmt = [l for l in lines if "Source Format" in l ][- 1 ].split ("Source Format: " )[- 1 ].strip ().lower ()
256- output_fmt = [l for l in lines if "Target Format" in l ][- 1 ].split ("Target Format: " )[- 1 ].strip ().lower ()
250+ input_fmt = (
251+ [line for line in lines if "Source Format" in line ][- 1 ].split ("Source Format: " )[- 1 ].strip ().lower ()
252+ )
253+ output_fmt = (
254+ [line for line in lines if "Target Format" in line ][- 1 ].split ("Target Format: " )[- 1 ].strip ().lower ()
255+ )
257256
258257 reader = _read_df_v1 if version == "v1" else _read_df_v2
259258 gt_df = reader (output_fmt , ground_truth )
@@ -373,9 +372,9 @@ def _extract_gt(row: EvaluationRow) -> Dict[str, Any]:
373372
374373@export_benchmark ("live_bench/data_analysis/cta" )
375374@evaluation_test (
376- model = [ " fireworks_ai/accounts/fireworks/models/gpt-oss-120b" ],
375+ completion_params = [{ "model" : " fireworks_ai/accounts/fireworks/models/gpt-oss-120b"} ],
377376 input_messages = [[m for m in r .messages ] for r in _CTA_ROWS ],
378- rollout_input_params = [{"extra_body" : {"reasoning_effort" : "low" }}],
377+ rollout_processor_kwargs = [{"extra_body" : {"reasoning_effort" : "low" }}],
379378 rollout_processor = default_single_turn_rollout_processor ,
380379 aggregation_method = "mean" ,
381380 passed_threshold = None ,
@@ -416,9 +415,9 @@ def livebench_cta_pointwise(row: EvaluationRow) -> EvaluationRow:
416415
417416@export_benchmark ("live_bench/data_analysis/tablejoin" )
418417@evaluation_test (
419- model = [ " fireworks_ai/accounts/fireworks/models/gpt-oss-120b" ],
418+ completion_params = [{ "model" : " fireworks_ai/accounts/fireworks/models/gpt-oss-120b"} ],
420419 input_messages = [[m for m in r .messages ] for r in _TABLEJOIN_ROWS ],
421- rollout_input_params = [{"extra_body" : {"reasoning_effort" : "low" }}],
420+ rollout_processor_kwargs = [{"extra_body" : {"reasoning_effort" : "low" }}],
422421 rollout_processor = default_single_turn_rollout_processor ,
423422 aggregation_method = "mean" ,
424423 passed_threshold = None ,
@@ -460,9 +459,9 @@ def livebench_tablejoin_pointwise(row: EvaluationRow) -> EvaluationRow:
460459
461460@export_benchmark ("live_bench/data_analysis/tablereformat" )
462461@evaluation_test (
463- model = [ " fireworks_ai/accounts/fireworks/models/gpt-oss-120b" ],
462+ completion_params = [{ "model" : " fireworks_ai/accounts/fireworks/models/gpt-oss-120b"} ],
464463 input_messages = [[m for m in r .messages ] for r in _TABLEREFORMAT_ROWS ],
465- rollout_input_params = [{"extra_body" : {"reasoning_effort" : "low" }}],
464+ rollout_processor_kwargs = [{"extra_body" : {"reasoning_effort" : "low" }}],
466465 rollout_processor = default_single_turn_rollout_processor ,
467466 aggregation_method = "mean" ,
468467 passed_threshold = None ,
@@ -508,5 +507,3 @@ def livebench_tablereformat_pointwise(row: EvaluationRow) -> EvaluationRow:
508507 "live_bench/data_analysis/tablereformat" ,
509508 ],
510509)
511-
512-
0 commit comments