Skip to content

Commit df0fc3a

Browse files
authored
Braintrust Example (#163)
* braintrust example * fix uv lock * braintrust example
1 parent 0d11ed8 commit df0fc3a

10 files changed

Lines changed: 967 additions & 125 deletions

File tree

Lines changed: 235 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,240 @@
1-
"""Deprecated adapter wrappers for Braintrust.
1+
"""Braintrust adapter for Eval Protocol.
22
3-
This module forwards imports to :mod:`eval_protocol.integrations.braintrust`.
3+
This adapter allows pulling data from Braintrust deployments and converting it
4+
to EvaluationRow format for use in evaluation pipelines.
45
"""
56

7+
import logging
8+
import os
9+
import random
10+
import time
11+
from datetime import datetime, timedelta
12+
from typing import Any, Dict, List, Optional, Protocol
13+
14+
import requests
15+
16+
from eval_protocol.models import EvaluationRow, InputMetadata, Message
17+
from .utils import extract_messages_from_data
18+
19+
# Keep backward compatibility
620
from ..integrations.braintrust import reward_fn_to_scorer, scorer_to_reward_fn
721

8-
__all__ = ["scorer_to_reward_fn", "reward_fn_to_scorer"]
22+
23+
logger = logging.getLogger(__name__)
24+
25+
26+
class TraceConverter(Protocol):
    """Structural type for pluggable Braintrust trace converters.

    Any callable that accepts a raw trace dictionary together with the
    tool-call flag, and hands back an ``EvaluationRow`` (or ``None`` to drop
    the trace), satisfies this protocol.
    """

    def __call__(
        self,
        trace: Dict[str, Any],
        include_tool_calls: bool,
    ) -> Optional[EvaluationRow]:
        """Turn a single Braintrust trace into an evaluation row.

        Args:
            trace: Raw Braintrust trace dictionary to convert.
            include_tool_calls: Whether tool-calling details should be kept.

        Returns:
            The converted ``EvaluationRow``, or ``None`` when the trace
            should be skipped.
        """
        ...
48+
49+
50+
def convert_trace_to_evaluation_row(trace: Dict[str, Any], include_tool_calls: bool = True) -> Optional[EvaluationRow]:
    """Convert a Braintrust trace to EvaluationRow format.

    Messages come from ``extract_messages_from_trace``. When tool calls are
    requested, the tool definitions are read from ``metadata["tools"]`` and,
    failing that, from ``metadata["hidden_params"]["optional_params"]["tools"]``.

    Args:
        trace: Braintrust trace object
        include_tool_calls: Whether to include tool calling information

    Returns:
        EvaluationRow carrying the messages, the tools (if any) and the
        originating trace id under ``session_data["braintrust_trace_id"]``;
        None if the trace has no messages or conversion fails.
    """
    try:
        messages = extract_messages_from_trace(trace, include_tool_calls)

        # Nothing to evaluate: bail out before touching metadata at all.
        if not messages:
            return None

        tools = None
        if include_tool_calls:
            # ``dict.get(key, {})`` still returns None when the key is present
            # with a null value, so normalise with ``or {}`` instead; otherwise
            # a null ``metadata`` would discard a trace that has valid messages.
            metadata = trace.get("metadata") or {}
            tools = metadata.get("tools")
            if not tools:
                hidden_params = metadata.get("hidden_params") or {}
                optional_params = hidden_params.get("optional_params") or {}
                tools = optional_params.get("tools")

        return EvaluationRow(
            messages=messages,
            tools=tools,
            input_metadata=InputMetadata(
                session_data={
                    "braintrust_trace_id": trace.get("id"),
                }
            ),
        )

    # TypeError covers payload fields of an unexpected type (e.g. a string
    # where a mapping was expected); one malformed trace must not propagate.
    except (AttributeError, ValueError, KeyError, TypeError) as e:
        logger.error("Error converting trace %s: %s", trace.get("id", "unknown"), e)
        return None
90+
91+
92+
def extract_messages_from_trace(trace: Dict[str, Any], include_tool_calls: bool = True) -> List[Message]:
    """Extract messages from Braintrust trace input and output.

    Only complete conversations are used: the trace must have a truthy
    ``input`` and an ``output`` list whose first element is a dict with a
    truthy ``"message"`` entry. Anything else yields an empty list.

    Args:
        trace: Braintrust trace object
        include_tool_calls: Whether to include tool calling information

    Returns:
        List of Message objects (input messages followed by output messages);
        empty when the trace carries no usable conversation. If extraction
        fails part-way, the messages gathered so far are returned.
    """
    messages: List[Message] = []

    try:
        input_data = trace.get("input")

        # Only a non-empty list can hold the expected first-choice dict. The
        # isinstance check keeps a scalar or mapping ``output`` from raising
        # on ``len()`` / ``[0]``.
        output_data = None
        output_list = trace.get("output")
        if isinstance(output_list, list) and output_list:
            first_output = output_list[0]
            if isinstance(first_output, dict):
                output_data = first_output.get("message")

        # Skip spans without meaningful conversation data
        if not input_data or not output_data:
            return messages

        messages.extend(extract_messages_from_data(input_data, include_tool_calls))
        messages.extend(extract_messages_from_data(output_data, include_tool_calls))

    except (AttributeError, ValueError, KeyError, TypeError) as e:
        logger.warning("Error processing trace %s: %s", trace.get("id", "unknown"), e)

    return messages
129+
130+
131+
class BraintrustAdapter:
    """Adapter to pull data from Braintrust and convert to EvaluationRow format.

    This adapter can pull both chat conversations and tool calling traces from
    Braintrust deployments and convert them into the EvaluationRow format expected
    by the evaluation protocol.

    Examples:
        Basic usage:
        >>> adapter = BraintrustAdapter(
        ...     api_key="your_api_key",
        ...     project_id="your_project_id"
        ... )
        >>> btql_query = "select: * from: project_logs('your_project_id') traces limit: 10"
        >>> rows = adapter.get_evaluation_rows(btql_query)

        Using BTQL for custom queries:
        >>> btql_query = '''
        ... select: *
        ... from: project_logs('your_project_id') traces
        ... filter: metadata.agent_name = 'agent_instance'
        ... limit: 50
        ... '''
        >>> rows = adapter.get_evaluation_rows(btql_query)
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        api_url: Optional[str] = None,
        project_id: Optional[str] = None,
        request_timeout: Optional[float] = 60.0,
    ):
        """Initialize the Braintrust adapter.

        Args:
            api_key: Braintrust API key (defaults to BRAINTRUST_API_KEY env var)
            api_url: Braintrust API URL (defaults to BRAINTRUST_API_URL env var,
                then to ``https://api.braintrust.dev``)
            project_id: Project ID to fetch logs from (defaults to BRAINTRUST_PROJECT_ID env var).
                It is validated here but not sent with the request; the BTQL
                query itself names the project.
            request_timeout: Seconds to wait for the BTQL endpoint before giving
                up. ``None`` disables the timeout.

        Raises:
            ValueError: If no API key or no project ID can be resolved.
        """
        self.api_key = api_key or os.getenv("BRAINTRUST_API_KEY")
        # ``or`` (rather than a getenv default) also covers an env var that is
        # set but empty. Trailing slashes are trimmed so the endpoint path
        # never becomes ``//btql``.
        resolved_url = api_url or os.getenv("BRAINTRUST_API_URL") or "https://api.braintrust.dev"
        self.api_url = resolved_url.rstrip("/")
        self.project_id = project_id or os.getenv("BRAINTRUST_PROJECT_ID")
        self.request_timeout = request_timeout

        if not self.api_key:
            raise ValueError("BRAINTRUST_API_KEY environment variable or api_key parameter required")
        if not self.project_id:
            raise ValueError("BRAINTRUST_PROJECT_ID environment variable or project_id parameter required")

    def get_evaluation_rows(
        self,
        btql_query: str,
        include_tool_calls: bool = True,
        converter: Optional[TraceConverter] = None,
    ) -> List[EvaluationRow]:
        """Get evaluation rows using a custom BTQL query.

        Posts the query to ``{api_url}/btql`` and converts every entry of the
        response's ``data`` list. Traces that fail to convert, or that the
        converter rejects by returning a falsy value, are skipped.

        Args:
            btql_query: The BTQL query string to execute
            include_tool_calls: Whether to include tool calling information
            converter: Optional custom converter implementing TraceConverter protocol;
                ``convert_trace_to_evaluation_row`` is used when omitted

        Returns:
            List[EvaluationRow]: Converted evaluation rows

        Raises:
            requests.HTTPError: If the endpoint answers with an error status
                (raised by ``response.raise_for_status()``).
        """
        eval_rows: List[EvaluationRow] = []

        headers = {"Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json"}

        # Without an explicit timeout, requests waits indefinitely on a stalled
        # connection.
        response = requests.post(
            f"{self.api_url}/btql",
            headers=headers,
            json={"query": btql_query, "fmt": "json"},
            timeout=self.request_timeout,
        )
        response.raise_for_status()
        query_response = response.json()

        # Tolerate a body that is not a JSON object instead of failing on .get()
        all_traces = query_response.get("data") if isinstance(query_response, dict) else None
        if not all_traces:
            logger.debug("No data returned from BTQL query")
            return eval_rows

        logger.debug("BTQL query returned %d traces", len(all_traces))

        convert = converter or convert_trace_to_evaluation_row

        # Process each selected trace; a failure on one must not abort the rest
        for trace in all_traces:
            try:
                eval_row = convert(trace, include_tool_calls)
            except (AttributeError, ValueError, KeyError, TypeError) as e:
                trace_id = trace.get("id", "unknown") if isinstance(trace, dict) else "unknown"
                logger.warning("Failed to convert trace %s: %s", trace_id, e)
                continue
            if eval_row:
                eval_rows.append(eval_row)

        logger.info("Successfully processed %d BTQL results into %d evaluation rows", len(all_traces), len(eval_rows))
        return eval_rows
225+
226+
227+
def create_braintrust_adapter(
    api_key: Optional[str] = None,
    api_url: Optional[str] = None,
    project_id: Optional[str] = None,
) -> BraintrustAdapter:
    """Build a :class:`BraintrustAdapter` from the given connection settings.

    Args:
        api_key: Braintrust API key, forwarded unchanged to the adapter.
        api_url: Braintrust API base URL, forwarded unchanged to the adapter.
        project_id: Braintrust project ID, forwarded unchanged to the adapter.

    Returns:
        A newly constructed ``BraintrustAdapter``.
    """
    settings = {"api_key": api_key, "api_url": api_url, "project_id": project_id}
    return BraintrustAdapter(**settings)
238+
239+
240+
__all__ = ["scorer_to_reward_fn", "reward_fn_to_scorer", "BraintrustAdapter", "create_braintrust_adapter"]

eval_protocol/adapters/langfuse.py

Lines changed: 34 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from typing import Any, Dict, List, Optional, Protocol
1313

1414
from eval_protocol.models import EvaluationRow, InputMetadata, Message
15+
from .utils import extract_messages_from_data
1516

1617
logger = logging.getLogger(__name__)
1718

@@ -112,7 +113,7 @@ def extract_messages_from_trace(
112113
if span_name: # Look for a generation tied to a span name
113114
try:
114115
# Find the final generation in the named span
115-
gen: ObservationsView | None = find_final_generation_in_span(trace, span_name)
116+
gen: ObservationsView | None = get_final_generation_in_span(trace, span_name)
116117
if not gen:
117118
return messages
118119

@@ -141,87 +142,8 @@ def extract_messages_from_trace(
141142
return messages
142143

143144

144-
def extract_messages_from_data(data, include_tool_calls: bool) -> List[Message]:
    """Build Message objects from a trace/generation payload (input or output).

    Supported payload shapes:
      * str: one user message holding the string.
      * list: each dict entry becomes a message; non-dict entries are ignored.
      * dict with "messages": every entry is converted (OpenAI-style).
      * dict with "role": the dict itself is a single message.
      * dict with "prompt": one user message with the stringified prompt.
      * dict with "content": one assistant message with the stringified content.
      * any other dict: converted as a single message.

    Args:
        data: Data from trace or generation (input or output)
        include_tool_calls: Whether to include tool calling information

    Returns:
        List of Message objects; empty for any other payload type.
    """
    if isinstance(data, str):
        # Role depends on context the payload does not carry; default to user
        return [Message(role="user", content=data)]

    if isinstance(data, list):
        return [dict_to_message(entry, include_tool_calls) for entry in data if isinstance(entry, dict)]

    if not isinstance(data, dict):
        return []

    if "messages" in data:
        return [dict_to_message(entry, include_tool_calls) for entry in data["messages"]]
    if "role" in data:
        return [dict_to_message(data, include_tool_calls)]
    if "prompt" in data:
        return [Message(role="user", content=str(data["prompt"]))]
    if "content" in data:
        return [Message(role="assistant", content=str(data["content"]))]

    # Unrecognised dict shape: treat it as one message
    return [dict_to_message(data, include_tool_calls)]
183-
184-
185-
def dict_to_message(msg_dict: Dict[str, Any], include_tool_calls: bool = True) -> Message:
    """Build a Message from a plain message dictionary.

    ``role`` falls back to ``"assistant"`` when absent; ``content`` and
    ``name`` fall back to ``None``. The tool-related fields
    (``tool_call_id``, ``tool_calls``, ``function_call``) are copied only when
    ``include_tool_calls`` is true and are ``None`` otherwise.

    Args:
        msg_dict: Dictionary containing message data
        include_tool_calls: Whether to include tool calling information

    Returns:
        Message object
    """
    tool_fields = {
        key: msg_dict.get(key) if include_tool_calls else None
        for key in ("tool_call_id", "tool_calls", "function_call")
    }

    return Message(
        role=msg_dict.get("role", "assistant"),
        content=msg_dict.get("content"),
        name=msg_dict.get("name"),
        **tool_fields,
    )
221-
222-
223-
def find_final_generation_in_span(trace: TraceWithFullDetails, span_name: str) -> ObservationsView | None:
224-
"""Find the final generation within a named span that contains full message history.
145+
def get_final_generation_in_span(trace: TraceWithFullDetails, span_name: str) -> ObservationsView | None:
146+
"""Get the final generation within a named span that contains full message history.
225147
226148
Args:
227149
trace: Langfuse trace object
@@ -511,6 +433,36 @@ def get_evaluation_rows_by_ids(
511433
continue
512434
return eval_rows
513435

436+
def push_scores(self, rows: List[EvaluationRow], model_name: str, mean_score: float) -> None:
    """Push evaluation scores back to Langfuse traces for tracking and analysis.

    Creates one score entry, via ``self.client.create_score``, for each unique
    ``langfuse_trace_id`` found in the session data of rows that have an
    evaluation result. Every trace receives the same ``mean_score``.

    Args:
        rows: List of EvaluationRow objects with session_data containing trace IDs
        model_name: Name of the model (used as the score name in Langfuse)
        mean_score: The calculated mean score to push to Langfuse

    Note:
        Best-effort: this method never raises. Rows without session data or
        without a ``langfuse_trace_id`` are skipped, and a failure on one
        trace is logged without preventing the remaining traces from being
        scored.
    """
    try:
        # ``.get`` rather than indexing: a row whose session data has no
        # Langfuse id must be skipped, not abort the whole push with a KeyError.
        trace_ids = {
            row.input_metadata.session_data.get("langfuse_trace_id")
            for row in rows
            if row.evaluation_result and row.input_metadata and row.input_metadata.session_data
        }
    except Exception as e:
        logger.warning("Failed to push scores to Langfuse: %s", e)
        return

    for trace_id in trace_ids:
        if not trace_id:
            continue
        # Guard each call separately so one rejected score does not stop the rest
        try:
            self.client.create_score(
                trace_id=trace_id,
                name=model_name,
                value=mean_score,
            )
        except Exception as e:
            logger.warning("Failed to push scores to Langfuse: %s", e)
465+
514466

515467
def create_langfuse_adapter() -> LangfuseAdapter:
516468
"""Factory function to create a Langfuse adapter."""

0 commit comments

Comments
 (0)