|
"""
LLM Judge quickstart that pulls data from the OpenAI Responses API and persists results locally via Eval Protocol.

This mirrors `eval_protocol/quickstart/llm_judge.py` (Langfuse source), but uses
the OpenAI Responses API as the source of evaluation rows.

Env vars:
    export OPENAI_API_KEY=...  # required to fetch examples

Judge model keys:
    - Default judge is "gemini-2.5-pro" from utils; requires GEMINI_API_KEY
    - Or set the judge in the code to "gpt-4.1" and export OPENAI_API_KEY

Run:
    pytest python-sdk/eval_protocol/quickstart/llm_judge_openai_responses.py -q -s
"""
| 17 | + |
| 18 | +import os |
| 19 | + |
| 20 | +import pytest |
| 21 | + |
| 22 | +from eval_protocol import ( |
| 23 | + evaluation_test, |
| 24 | + aha_judge, |
| 25 | + EvaluationRow, |
| 26 | + SingleTurnRolloutProcessor, |
| 27 | + OpenAIResponsesAdapter, |
| 28 | + DynamicDataLoader, |
| 29 | +) |
| 30 | +from eval_protocol import multi_turn_assistant_to_ground_truth |
| 31 | + |
| 32 | + |
def openai_responses_data_generator():
    """Fetch evaluation rows from the OpenAI Responses API.

    Retrieves the responses listed below through ``OpenAIResponsesAdapter``
    and returns them converted into Eval Protocol evaluation rows.
    Requires ``OPENAI_API_KEY`` to be set (see the module docstring).
    """
    # Only the first response ID is active; the commented-out IDs are kept
    # around so additional examples can be re-enabled quickly.
    wanted_ids = [
        "resp_0e1b7db5d96e92470068c99506443c819e9305e92915d2405f",
        # "resp_05639dcaca074fbc0068c9946593b481908cac70075926d85c",
        # "resp_0c96a910416e87aa0068c994d0b34c81a3bda0eddf22445aec",
        # "resp_0efe023280e986f90068c994b85e088190bc8d8263fa603e02",
    ]
    return OpenAIResponsesAdapter().get_evaluation_rows(response_ids=wanted_ids)
| 43 | + |
| 44 | + |
# Skip in CI: this test performs live API calls (OpenAI fetch + judge model).
@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI")
# Run the evaluation once per candidate model; `completion_params` is consumed
# by the evaluation_test harness (presumably to configure the rollout model —
# TODO confirm against evaluation_test's docs).
@pytest.mark.parametrize(
    "completion_params",
    [
        {
            "model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1",
        },
        {
            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
        },
    ],
)
@evaluation_test(
    # Rows come from the OpenAI Responses API via the generator above;
    # preprocess_fn moves the assistant turn into ground truth.
    data_loaders=DynamicDataLoader(
        generators=[openai_responses_data_generator],
        preprocess_fn=multi_turn_assistant_to_ground_truth,
    ),
    rollout_processor=SingleTurnRolloutProcessor(),
    # Throttle concurrent judge calls to stay within API rate limits.
    max_concurrent_evaluations=2,
)
async def test_llm_judge_openai_responses(row: EvaluationRow) -> EvaluationRow:
    """Score a single evaluation row with the `aha_judge` LLM judge.

    The harness calls this once per row produced by the data loader; the
    judge model used by `aha_judge` is configured elsewhere (default
    "gemini-2.5-pro" per the module docstring — requires GEMINI_API_KEY).
    """
    return await aha_judge(row)
0 commit comments