Skip to content

Commit c5d17e4

Browse files

Commit message: "try to mute and see what happens"
1 parent: 620611f — commit c5d17e4

2 files changed: 67 additions & 74 deletions

File tree

eval_protocol/pytest/default_single_turn_rollout_process.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,16 @@
55
from typing import List
66

77
from litellm import acompletion
8+
import litellm
89
from typing import Dict
910

11+
# Fix LiteLLM event loop binding issues by setting logging to ERROR level
12+
# This disables the logging worker that causes event loop binding problems
13+
import os
14+
15+
if os.environ.get("LITELLM_LOG") is None:
16+
os.environ["LITELLM_LOG"] = "ERROR"
17+
1018
from eval_protocol.dataset_logger import default_logger
1119
from eval_protocol.models import EvaluationRow, Message
1220
from openai.types import CompletionUsage
@@ -21,6 +29,7 @@ class SingleTurnRolloutProcessor(RolloutProcessor):
2129

2230
def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) -> List[asyncio.Task[EvaluationRow]]:
2331
"""Generate single turn rollout tasks and return them for external handling."""
32+
2433
# Do not modify global LiteLLM cache. Disable caching per-request instead.
2534

2635
async def process_row(row: EvaluationRow) -> EvaluationRow:

eval_protocol/quickstart/llm_judge.py

Lines changed: 58 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
import pytest
1010

11-
from eval_protocol.models import EvaluateResult, EvaluationRow, MetricResult, Message
11+
from eval_protocol.models import EvaluateResult, EvaluationRow, MetricResult
1212
from eval_protocol.pytest import evaluation_test
1313
from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor
1414
from eval_protocol.quickstart.utils import (
@@ -23,34 +23,22 @@
2323
from openai import AsyncOpenAI
2424

2525

26-
def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
27-
converted: List[EvaluationRow] = []
28-
for r in rows:
29-
question = r.get("question", "")
30-
answer = r.get("answer", None)
31-
messages = [
32-
Message(
33-
role="system",
34-
content="You are a helpful math assistant. Please reason step by step, and put your final answer within \\boxed{...}.",
35-
),
36-
Message(role="user", content=str(question)),
37-
]
38-
converted.append(EvaluationRow(messages=messages, ground_truth=str(answer) if answer is not None else None))
39-
return converted
40-
41-
4226
@pytest.mark.asyncio
4327
@evaluation_test(
44-
input_dataset=[
45-
"https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl",
46-
"https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl",
28+
input_rows=[
29+
fetch_langfuse_traces_as_evaluation_rows(
30+
hours_back=24,
31+
limit=1,
32+
page_size=10,
33+
sleep_between_gets=3.0,
34+
max_retries=5,
35+
)
4736
],
48-
dataset_adapter=aime2025_dataset_adapter,
4937
completion_params=[
5038
# {
5139
# "model": "fireworks_ai/accounts/fireworks/models/qwen3-235b-a22b-instruct-2507",
5240
# },
53-
# {"model": "gpt-4.1"},
41+
{"model": "gpt-4.1"},
5442
{
5543
"max_tokens": 131000,
5644
"extra_body": {"reasoning_effort": "medium"},
@@ -63,14 +51,11 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
6351
},
6452
],
6553
rollout_processor=SingleTurnRolloutProcessor(),
66-
aggregation_method="mean",
67-
passed_threshold=0.8,
68-
num_runs=1,
69-
max_dataset_rows=1,
54+
# preprocess_fn=split_multi_turn_rows,
7055
max_concurrent_rollouts=64,
71-
mode="pointwise",
56+
mode="all",
7257
)
73-
async def test_llm_judge(row: EvaluationRow) -> EvaluationRow:
58+
async def test_llm_judge(rows: list[EvaluationRow]) -> list[EvaluationRow]:
7459
"""
7560
LLM Judge evaluation using Arena-Hard-Auto style pairwise comparisons.
7661
@@ -87,69 +72,68 @@ async def test_llm_judge(row: EvaluationRow) -> EvaluationRow:
8772
Returns:
8873
Same rows with updated evaluation_result containing scores and judgments
8974
"""
90-
return row
9175

92-
# # judge_name = "gemini-2.5-pro" # Edit to which judge you'd like to use. Configs are in utils.py.
93-
# judge_name = "gpt-4.1"
76+
# judge_name = "gemini-2.5-pro" # Edit to which judge you'd like to use. Configs are in utils.py.
77+
judge_name = "gpt-4.1"
9478

95-
# if not rows:
96-
# print("❌ No evaluation rows provided")
97-
# return rows
79+
if not rows:
80+
print("❌ No evaluation rows provided")
81+
return rows
9882

99-
# print(f"🔄 Processing {len(rows)} evaluation rows for LLM judging...")
83+
print(f"🔄 Processing {len(rows)} evaluation rows for LLM judging...")
10084

101-
# model_name = rows[0].input_metadata.completion_params.get("model", "unknown_model")
85+
model_name = rows[0].input_metadata.completion_params.get("model", "unknown_model")
10286

103-
# judgments = []
104-
# max_concurrency = JUDGE_CONFIGS[judge_name]["max_concurrency"]
87+
judgments = []
88+
max_concurrency = JUDGE_CONFIGS[judge_name]["max_concurrency"]
10589

106-
# judge_config = JUDGE_CONFIGS[judge_name]
90+
judge_config = JUDGE_CONFIGS[judge_name]
10791

108-
# async with AsyncOpenAI(
109-
# api_key=judge_config.get("api_key"), base_url=judge_config.get("base_url")
110-
# ) as shared_client:
111-
# semaphore = asyncio.Semaphore(max_concurrency)
92+
async with AsyncOpenAI(
93+
api_key=judge_config.get("api_key"), base_url=judge_config.get("base_url")
94+
) as shared_client:
95+
semaphore = asyncio.Semaphore(max_concurrency)
11296

113-
# async def run_judgment_with_semaphore(row):
114-
# async with semaphore:
115-
# return await run_judgment_async_with_shared_client(row, model_name, judge_name, shared_client)
97+
async def run_judgment_with_semaphore(row):
98+
async with semaphore:
99+
return await run_judgment_async_with_shared_client(row, model_name, judge_name, shared_client)
116100

117-
# tasks = [run_judgment_with_semaphore(row) for row in rows]
101+
tasks = [run_judgment_with_semaphore(row) for row in rows]
118102

119-
# for coro in tqdm(asyncio.as_completed(tasks), total=len(tasks), desc="Generating judgments"):
120-
# result = await coro
121-
# if result and result["games"][0] and result["games"][1]:
122-
# judgments.append(result)
103+
for coro in tqdm(asyncio.as_completed(tasks), total=len(tasks), desc="Generating judgments"):
104+
result = await coro
105+
if result and result["games"][0] and result["games"][1]:
106+
judgments.append(result)
123107

124-
# if not judgments:
125-
# print("❌ No valid judgments generated")
126-
# return rows
108+
if not judgments:
109+
print("❌ No valid judgments generated")
110+
return rows
127111

128-
# print(f"✅ Generated {len(judgments)} valid judgments")
112+
print(f"✅ Generated {len(judgments)} valid judgments")
129113

130-
# # Calculate bootstrap scores
131-
# mean_score, lower_score, upper_score = calculate_bootstrap_scores(judgments)
114+
# Calculate bootstrap scores
115+
mean_score, lower_score, upper_score = calculate_bootstrap_scores(judgments)
132116

133-
# if mean_score == 0.0:
134-
# print("❌ No valid scores extracted")
135-
# return rows
117+
if mean_score == 0.0:
118+
print("❌ No valid scores extracted")
119+
return rows
136120

137-
# # Print leaderboard
138-
# print("\n##### LLM Judge Results (90th percentile CI) #####")
121+
# Print leaderboard
122+
print("\n##### LLM Judge Results (90th percentile CI) #####")
139123

140-
# clean_model_name = model_name.split("/")[-1] # Clean model name
124+
clean_model_name = model_name.split("/")[-1] # Clean model name
141125

142-
# print(f"{clean_model_name}: {mean_score:.1%} (CI: {lower_score:.1%} - {upper_score:.1%})")
143-
# print("original: 50.0% (CI: 50.0% - 50.0%)")
126+
print(f"{clean_model_name}: {mean_score:.1%} (CI: {lower_score:.1%} - {upper_score:.1%})")
127+
print("original: 50.0% (CI: 50.0% - 50.0%)")
144128

145-
# for row in rows:
146-
# if row.evaluation_result:
147-
# row.evaluation_result.score = mean_score
148-
# row.evaluation_result.standard_error = (upper_score - lower_score) / (
149-
# 2 * 1.645
150-
# ) # Standard error approximation from 90% CI
129+
for row in rows:
130+
if row.evaluation_result:
131+
row.evaluation_result.score = mean_score
132+
row.evaluation_result.standard_error = (upper_score - lower_score) / (
133+
2 * 1.645
134+
) # Standard error approximation from 90% CI
151135

152-
# # Optional, push scores back to Langfuse. Note that one score per model will be pushed back onto same trace.
153-
# # push_scores_to_langfuse(rows, model_name, mean_score)
136+
# Optional, push scores back to Langfuse. Note that one score per model will be pushed back onto same trace.
137+
# push_scores_to_langfuse(rows, model_name, mean_score)
154138

155-
# return rows
139+
return rows

0 commit comments

Comments (0)