Skip to content

Commit 23cca87

Browse files
committed
Adding push scores back to langfuse in quickstart
1 parent 579d048 commit 23cca87

2 files changed

Lines changed: 35 additions & 7 deletions

File tree

eval_protocol/adapters/langfuse.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,7 @@ def get_evaluation_rows(
100100

101101
for trace in traces.data:
102102
try:
103+
trace: TraceWithFullDetails = self.client.api.trace.get(trace.id)
103104
eval_row = self._convert_trace_to_evaluation_row(trace, include_tool_calls)
104105
if eval_row:
105106
eval_rows.append(eval_row)
@@ -135,7 +136,7 @@ def get_evaluation_rows_by_ids(
135136
return eval_rows
136137

137138
def _convert_trace_to_evaluation_row(
138-
self, trace: Trace, include_tool_calls: bool = True
139+
self, trace: TraceWithFullDetails, include_tool_calls: bool = True
139140
) -> Optional[EvaluationRow]:
140141
"""Convert a Langfuse trace to EvaluationRow format.
141142
@@ -147,8 +148,6 @@ def _convert_trace_to_evaluation_row(
147148
EvaluationRow or None if conversion fails
148149
"""
149150
try:
150-
trace = self.client.api.trace.get("2d9f3474-83ab-4431-9788-049ca4219023")
151-
152151
# Extract messages from trace input and output
153152
messages = self._extract_messages_from_trace(trace, include_tool_calls)
154153

@@ -163,13 +162,20 @@ def _convert_trace_to_evaluation_row(
163162
return EvaluationRow(
164163
messages=messages,
165164
tools=tools,
165+
input_metadata=InputMetadata(
166+
session_data={
167+
"langfuse_trace_id": trace.id, # Store the trace ID here
168+
}
169+
),
166170
)
167171

168172
except (AttributeError, ValueError, KeyError) as e:
169173
logger.error("Error converting trace %s: %s", trace.id, e)
170174
return None
171175

172-
def _extract_messages_from_trace(self, trace: Any, include_tool_calls: bool = True) -> List[Message]:
176+
def _extract_messages_from_trace(
177+
self, trace: TraceWithFullDetails, include_tool_calls: bool = True
178+
) -> List[Message]:
173179
"""Extract messages from Langfuse trace input and output.
174180
175181
Args:

eval_protocol/quickstart/llm_judge.py

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
import concurrent.futures
2020
from concurrent.futures import ThreadPoolExecutor
2121

22+
# Judge configs from the original Arena-Hard-Auto paper, feel free to add your own judge!
2223
JUDGE_CONFIGS = {
2324
"gpt-4.1": {
2425
"model": "gpt-4.1",
@@ -67,12 +68,14 @@ def fetch_langfuse_traces_as_evaluation_rows(
6768
@evaluation_test(
6869
input_rows=[fetch_langfuse_traces_as_evaluation_rows()],
6970
completion_params=[
70-
{"model": "gpt-5"},
7171
{
72-
# "max_tokens": 131000,
73-
# "extra_body": {"reasoning_effort": "low"},
7472
"model": "fireworks_ai/accounts/fireworks/models/qwen3-235b-a22b-instruct-2507",
7573
},
74+
{
75+
"max_tokens": 131000,
76+
"extra_body": {"reasoning_effort": "low"},
77+
"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
78+
},
7679
],
7780
rollout_processor=SingleTurnRolloutProcessor(),
7881
preprocess_fn=split_multi_turn_rows,
@@ -218,4 +221,23 @@ def run_judgment(row: EvaluationRow) -> Optional[Dict[str, Any]]:
218221
2 * 1.645
219222
) # Standard error approximation from 90% CI
220223

224+
# Optionally push scores back to Langfuse. Note that one score per model will be pushed back onto the same trace.
225+
try:
226+
langfuse = create_langfuse_adapter().client
227+
except Exception:
228+
langfuse = None
229+
230+
if langfuse:
231+
for trace_id in set(
232+
row.input_metadata.session_data["langfuse_trace_id"]
233+
for row in rows
234+
if row.evaluation_result and row.input_metadata and row.input_metadata.session_data
235+
):
236+
if trace_id:
237+
langfuse.create_score(
238+
trace_id=trace_id,
239+
name=model_name,
240+
value=mean_score,
241+
)
242+
221243
return rows

0 commit comments

Comments
 (0)