Skip to content

Commit 15cdda9

Browse files
committed
first step
1 parent 95307e1 commit 15cdda9

File tree

1 file changed

+66
-0
lines changed

1 file changed

+66
-0
lines changed
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
"""
LLM Judge quickstart that pulls data from the OpenAI Responses API and persists
results locally via Eval Protocol.

This mirrors `eval_protocol/quickstart/llm_judge.py` (Langfuse source), but uses
the OpenAI Responses API as the source of evaluation rows.

Env vars:
  export OPENAI_API_KEY=...  # required to fetch examples

Judge model keys:
  - The default judge is "gemini-2.5-pro" from utils; it requires GEMINI_API_KEY
  - Or set the judge in the code to "gpt-4.1" and export OPENAI_API_KEY

Run:
  pytest python-sdk/eval_protocol/quickstart/llm_judge_openai_responses.py -q -s
"""
17+
18+
import os
19+
20+
import pytest
21+
22+
from eval_protocol import (
23+
evaluation_test,
24+
aha_judge,
25+
EvaluationRow,
26+
SingleTurnRolloutProcessor,
27+
OpenAIResponsesAdapter,
28+
DynamicDataLoader,
29+
)
30+
from eval_protocol import multi_turn_assistant_to_ground_truth
31+
32+
33+
def openai_responses_data_generator():
    """Fetch evaluation rows for a fixed set of OpenAI Responses API responses.

    Requires OPENAI_API_KEY in the environment; uncomment additional response
    IDs below to pull more examples into the dataset.
    """
    response_ids = [
        "resp_0e1b7db5d96e92470068c99506443c819e9305e92915d2405f",
        # "resp_05639dcaca074fbc0068c9946593b481908cac70075926d85c",
        # "resp_0c96a910416e87aa0068c994d0b34c81a3bda0eddf22445aec",
        # "resp_0efe023280e986f90068c994b85e088190bc8d8263fa603e02",
    ]
    return OpenAIResponsesAdapter().get_evaluation_rows(response_ids=response_ids)
43+
44+
45+
@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI")
@pytest.mark.parametrize(
    "completion_params",
    # One parametrized run per candidate model.
    [
        {"model": model_id}
        for model_id in (
            "fireworks_ai/accounts/fireworks/models/deepseek-v3p1",
            "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
        )
    ],
)
@evaluation_test(
    data_loaders=DynamicDataLoader(
        generators=[openai_responses_data_generator],
        preprocess_fn=multi_turn_assistant_to_ground_truth,
    ),
    rollout_processor=SingleTurnRolloutProcessor(),
    max_concurrent_evaluations=2,
)
async def test_llm_judge_openai_responses(row: EvaluationRow) -> EvaluationRow:
    """Score one evaluation row with the `aha_judge` LLM judge and return it."""
    return await aha_judge(row)

0 commit comments

Comments
 (0)