88
99import pytest
1010
11- from eval_protocol .models import EvaluateResult , EvaluationRow , MetricResult , Message
11+ from eval_protocol .models import EvaluateResult , EvaluationRow , MetricResult
1212from eval_protocol .pytest import evaluation_test
1313from eval_protocol .pytest .default_single_turn_rollout_process import SingleTurnRolloutProcessor
1414from eval_protocol .quickstart .utils import (
2323from openai import AsyncOpenAI
2424
2525
def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
    """Convert raw AIME 2025 records into evaluation rows.

    Each record is read for its ``"question"`` and ``"answer"`` keys. The
    question (empty string when missing) is stringified into the user
    message, preceded by a fixed system prompt. The answer is stringified
    into ``ground_truth``; a missing or ``None`` answer yields
    ``ground_truth=None``.

    Args:
        rows: Dataset records as dictionaries.

    Returns:
        One ``EvaluationRow`` per input record, in input order.
    """
    # The final answer must be requested inside LaTeX ``\boxed{...}``; the
    # backslash is escaped so the prompt contains a literal ``\boxed``.
    system_prompt = (
        "You are a helpful math assistant. Please reason step by step, "
        "and put your final answer within \\boxed{...}."
    )

    def _to_row(record: Dict[str, Any]) -> EvaluationRow:
        # Build the two-message conversation for a single dataset record.
        answer = record.get("answer")
        return EvaluationRow(
            messages=[
                Message(role="system", content=system_prompt),
                Message(role="user", content=str(record.get("question", ""))),
            ],
            ground_truth=None if answer is None else str(answer),
        )

    return [_to_row(record) for record in rows]
40-
41-
4226@pytest .mark .asyncio
4327@evaluation_test (
44- input_dataset = [
45- "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl" ,
46- "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl" ,
28+ input_rows = [
29+ fetch_langfuse_traces_as_evaluation_rows (
30+ hours_back = 24 ,
31+ limit = 1 ,
32+ page_size = 10 ,
33+ sleep_between_gets = 3.0 ,
34+ max_retries = 5 ,
35+ )
4736 ],
48- dataset_adapter = aime2025_dataset_adapter ,
4937 completion_params = [
5038 # {
5139 # "model": "fireworks_ai/accounts/fireworks/models/qwen3-235b-a22b-instruct-2507",
5240 # },
53- # {"model": "gpt-4.1"},
41+ {"model" : "gpt-4.1" },
5442 {
5543 "max_tokens" : 131000 ,
5644 "extra_body" : {"reasoning_effort" : "medium" },
@@ -63,14 +51,11 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
6351 },
6452 ],
6553 rollout_processor = SingleTurnRolloutProcessor (),
66- aggregation_method = "mean" ,
67- passed_threshold = 0.8 ,
68- num_runs = 1 ,
69- max_dataset_rows = 1 ,
54+ # preprocess_fn=split_multi_turn_rows,
7055 max_concurrent_rollouts = 64 ,
71- mode = "pointwise " ,
56+ mode = "all " ,
7257)
73- async def test_llm_judge (row : EvaluationRow ) -> EvaluationRow :
58+ async def test_llm_judge (rows : list [ EvaluationRow ] ) -> list [ EvaluationRow ] :
7459 """
7560 LLM Judge evaluation using Arena-Hard-Auto style pairwise comparisons.
7661
@@ -87,69 +72,68 @@ async def test_llm_judge(row: EvaluationRow) -> EvaluationRow:
8772 Returns:
8873 Same rows with updated evaluation_result containing scores and judgments
8974 """
90- return row
9175
92- # # judge_name = "gemini-2.5-pro" # Edit to which judge you'd like to use. Configs are in utils.py.
93- # judge_name = "gpt-4.1"
76+ # judge_name = "gemini-2.5-pro" # Edit to which judge you'd like to use. Configs are in utils.py.
77+ judge_name = "gpt-4.1"
9478
95- # if not rows:
96- # print("❌ No evaluation rows provided")
97- # return rows
79+ if not rows :
80+ print ("❌ No evaluation rows provided" )
81+ return rows
9882
99- # print(f"🔄 Processing {len(rows)} evaluation rows for LLM judging...")
83+ print (f"🔄 Processing { len (rows )} evaluation rows for LLM judging..." )
10084
101- # model_name = rows[0].input_metadata.completion_params.get("model", "unknown_model")
85+ model_name = rows [0 ].input_metadata .completion_params .get ("model" , "unknown_model" )
10286
103- # judgments = []
104- # max_concurrency = JUDGE_CONFIGS[judge_name]["max_concurrency"]
87+ judgments = []
88+ max_concurrency = JUDGE_CONFIGS [judge_name ]["max_concurrency" ]
10589
106- # judge_config = JUDGE_CONFIGS[judge_name]
90+ judge_config = JUDGE_CONFIGS [judge_name ]
10791
108- # async with AsyncOpenAI(
109- # api_key=judge_config.get("api_key"), base_url=judge_config.get("base_url")
110- # ) as shared_client:
111- # semaphore = asyncio.Semaphore(max_concurrency)
92+ async with AsyncOpenAI (
93+ api_key = judge_config .get ("api_key" ), base_url = judge_config .get ("base_url" )
94+ ) as shared_client :
95+ semaphore = asyncio .Semaphore (max_concurrency )
11296
113- # async def run_judgment_with_semaphore(row):
114- # async with semaphore:
115- # return await run_judgment_async_with_shared_client(row, model_name, judge_name, shared_client)
97+ async def run_judgment_with_semaphore (row ):
98+ async with semaphore :
99+ return await run_judgment_async_with_shared_client (row , model_name , judge_name , shared_client )
116100
117- # tasks = [run_judgment_with_semaphore(row) for row in rows]
101+ tasks = [run_judgment_with_semaphore (row ) for row in rows ]
118102
119- # for coro in tqdm(asyncio.as_completed(tasks), total=len(tasks), desc="Generating judgments"):
120- # result = await coro
121- # if result and result["games"][0] and result["games"][1]:
122- # judgments.append(result)
103+ for coro in tqdm (asyncio .as_completed (tasks ), total = len (tasks ), desc = "Generating judgments" ):
104+ result = await coro
105+ if result and result ["games" ][0 ] and result ["games" ][1 ]:
106+ judgments .append (result )
123107
124- # if not judgments:
125- # print("❌ No valid judgments generated")
126- # return rows
108+ if not judgments :
109+ print ("❌ No valid judgments generated" )
110+ return rows
127111
128- # print(f"✅ Generated {len(judgments)} valid judgments")
112+ print (f"✅ Generated { len (judgments )} valid judgments" )
129113
130- # # Calculate bootstrap scores
131- # mean_score, lower_score, upper_score = calculate_bootstrap_scores(judgments)
114+ # Calculate bootstrap scores
115+ mean_score , lower_score , upper_score = calculate_bootstrap_scores (judgments )
132116
133- # if mean_score == 0.0:
134- # print("❌ No valid scores extracted")
135- # return rows
117+ if mean_score == 0.0 :
118+ print ("❌ No valid scores extracted" )
119+ return rows
136120
137- # # Print leaderboard
138- # print("\n##### LLM Judge Results (90th percentile CI) #####")
121+ # Print leaderboard
122+ print ("\n ##### LLM Judge Results (90th percentile CI) #####" )
139123
140- # clean_model_name = model_name.split("/")[-1] # Clean model name
124+ clean_model_name = model_name .split ("/" )[- 1 ] # Clean model name
141125
142- # print(f"{clean_model_name}: {mean_score:.1%} (CI: {lower_score:.1%} - {upper_score:.1%})")
143- # print("original: 50.0% (CI: 50.0% - 50.0%)")
126+ print (f"{ clean_model_name } : { mean_score :.1%} (CI: { lower_score :.1%} - { upper_score :.1%} )" )
127+ print ("original: 50.0% (CI: 50.0% - 50.0%)" )
144128
145- # for row in rows:
146- # if row.evaluation_result:
147- # row.evaluation_result.score = mean_score
148- # row.evaluation_result.standard_error = (upper_score - lower_score) / (
149- # 2 * 1.645
150- # ) # Standard error approximation from 90% CI
129+ for row in rows :
130+ if row .evaluation_result :
131+ row .evaluation_result .score = mean_score
132+ row .evaluation_result .standard_error = (upper_score - lower_score ) / (
133+ 2 * 1.645
134+ ) # Standard error approximation from 90% CI
151135
152- # # Optional, push scores back to Langfuse. Note that one score per model will be pushed back onto same trace.
153- # # push_scores_to_langfuse(rows, model_name, mean_score)
136+ # Optional, push scores back to Langfuse. Note that one score per model will be pushed back onto same trace.
137+ # push_scores_to_langfuse(rows, model_name, mean_score)
154138
155- # return rows
139+ return rows
0 commit comments