Skip to content

Commit 23cca87

Browse files
committed
Adding push scores back to langfuse in quickstart
1 parent 579d048 commit 23cca87

2 files changed

Lines changed: 35 additions & 7 deletions

File tree

eval_protocol/adapters/langfuse.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,7 @@ def get_evaluation_rows(
100100

101101
for trace in traces.data:
102102
try:
103+
trace: TraceWithFullDetails = self.client.api.trace.get(trace.id)
103104
eval_row = self._convert_trace_to_evaluation_row(trace, include_tool_calls)
104105
if eval_row:
105106
eval_rows.append(eval_row)
@@ -135,7 +136,7 @@ def get_evaluation_rows_by_ids(
135136
return eval_rows
136137

137138
def _convert_trace_to_evaluation_row(
138-
self, trace: Trace, include_tool_calls: bool = True
139+
self, trace: TraceWithFullDetails, include_tool_calls: bool = True
139140
) -> Optional[EvaluationRow]:
140141
"""Convert a Langfuse trace to EvaluationRow format.
141142
@@ -147,8 +148,6 @@ def _convert_trace_to_evaluation_row(
147148
EvaluationRow or None if conversion fails
148149
"""
149150
try:
150-
trace = self.client.api.trace.get("2d9f3474-83ab-4431-9788-049ca4219023")
151-
152151
# Extract messages from trace input and output
153152
messages = self._extract_messages_from_trace(trace, include_tool_calls)
154153

@@ -163,13 +162,20 @@ def _convert_trace_to_evaluation_row(
163162
return EvaluationRow(
164163
messages=messages,
165164
tools=tools,
165+
input_metadata=InputMetadata(
166+
session_data={
167+
"langfuse_trace_id": trace.id, # Store the trace ID here
168+
}
169+
),
166170
)
167171

168172
except (AttributeError, ValueError, KeyError) as e:
169173
logger.error("Error converting trace %s: %s", trace.id, e)
170174
return None
171175

172-
def _extract_messages_from_trace(self, trace: Any, include_tool_calls: bool = True) -> List[Message]:
176+
def _extract_messages_from_trace(
177+
self, trace: TraceWithFullDetails, include_tool_calls: bool = True
178+
) -> List[Message]:
173179
"""Extract messages from Langfuse trace input and output.
174180
175181
Args:

eval_protocol/quickstart/llm_judge.py

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
import concurrent.futures
2020
from concurrent.futures import ThreadPoolExecutor
2121

22+
# Judge configs from the original Arena-Hard-Auto paper, feel free to add your own judge!
2223
JUDGE_CONFIGS = {
2324
"gpt-4.1": {
2425
"model": "gpt-4.1",
@@ -67,12 +68,14 @@ def fetch_langfuse_traces_as_evaluation_rows(
6768
@evaluation_test(
6869
input_rows=[fetch_langfuse_traces_as_evaluation_rows()],
6970
completion_params=[
70-
{"model": "gpt-5"},
7171
{
72-
# "max_tokens": 131000,
73-
# "extra_body": {"reasoning_effort": "low"},
7472
"model": "fireworks_ai/accounts/fireworks/models/qwen3-235b-a22b-instruct-2507",
7573
},
74+
{
75+
"max_tokens": 131000,
76+
"extra_body": {"reasoning_effort": "low"},
77+
"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
78+
},
7679
],
7780
rollout_processor=SingleTurnRolloutProcessor(),
7881
preprocess_fn=split_multi_turn_rows,
@@ -218,4 +221,23 @@ def run_judgment(row: EvaluationRow) -> Optional[Dict[str, Any]]:
218221
2 * 1.645
219222
) # Standard error approximation from 90% CI
220223

224+
# Optionally push scores back to Langfuse. Note that one score per model will be pushed back onto the same trace.
225+
try:
226+
langfuse = create_langfuse_adapter().client
227+
except Exception:
228+
langfuse = None
229+
230+
if langfuse:
231+
for trace_id in set(
232+
row.input_metadata.session_data["langfuse_trace_id"]
233+
for row in rows
234+
if row.evaluation_result and row.input_metadata and row.input_metadata.session_data
235+
):
236+
if trace_id:
237+
langfuse.create_score(
238+
trace_id=trace_id,
239+
name=model_name,
240+
value=mean_score,
241+
)
242+
221243
return rows

0 commit comments

Comments
 (0)