Skip to content

Commit 3e737fc

Browse files
Author: Dylan Huang (committed)
Merge branch 'main' into link-to-local-ui
2 parents (d495ea5 + 36e88b1) — commit 3e737fc

25 files changed: 1,219 lines added, 343 lines deleted

eval_protocol/__init__.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -37,19 +37,19 @@
3737
from .resources import create_llm_resource
3838
from .reward_function import RewardFunction
3939
from .typed_interface import reward_function
40-
from .quickstart import aha_judge, split_multi_turn_rows
40+
from .quickstart import aha_judge, multi_turn_assistant_to_ground_truth, assistant_to_ground_truth
4141
from .pytest import evaluation_test, SingleTurnRolloutProcessor
4242
from .pytest.parameterize import DefaultParameterIdGenerator
4343

4444
from .adapters import OpenAIResponsesAdapter
4545

4646
try:
47-
from .adapters import LangfuseAdapter
47+
from .adapters import LangfuseAdapter, create_langfuse_adapter
4848
except ImportError:
4949
LangfuseAdapter = None
5050

5151
try:
52-
from .adapters import BraintrustAdapter
52+
from .adapters import BraintrustAdapter, create_braintrust_adapter
5353
except ImportError:
5454
BraintrustAdapter = None
5555

@@ -64,12 +64,15 @@
6464
__all__ = [
6565
"DefaultParameterIdGenerator",
6666
"aha_judge",
67-
"split_multi_turn_rows",
67+
"multi_turn_assistant_to_ground_truth",
68+
"assistant_to_ground_truth",
6869
"evaluation_test",
6970
"SingleTurnRolloutProcessor",
7071
"OpenAIResponsesAdapter",
7172
"LangfuseAdapter",
73+
"create_langfuse_adapter",
7274
"BraintrustAdapter",
75+
"create_braintrust_adapter",
7376
"LangSmithAdapter",
7477
# Core interfaces
7578
"Message",

eval_protocol/adapters/base.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,7 @@ def get_evaluation_rows(self, *args, **kwargs) -> List[EvaluationRow]:
1919
def upload_scores(self, rows: List[EvaluationRow], model_name: str, mean_score: float) -> None:
2020
"""Upload evaluation scores back to the data source for tracking and analysis."""
2121
pass
22+
23+
def upload_score(self, row: EvaluationRow, model_name: str) -> None:
24+
"""Upload evaluation score for a single row back to the data source."""
25+
pass

eval_protocol/adapters/braintrust.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -264,6 +264,40 @@ def upload_scores(self, rows: List[EvaluationRow], model_name: str, mean_score:
264264
except Exception as e:
265265
logger.warning("Failed to push scores to Braintrust: %s", e)
266266

267+
def upload_score(self, row: EvaluationRow, model_name: str) -> None:
268+
"""Upload evaluation score for a single row back to Braintrust.
269+
270+
Args:
271+
row: Single EvaluationRow with evaluation_result and session_data containing trace ID
272+
model_name: Name of the model (used as the score name in Braintrust)
273+
"""
274+
try:
275+
if (
276+
row.evaluation_result
277+
and row.evaluation_result.is_score_valid
278+
and row.input_metadata
279+
and row.input_metadata.session_data
280+
and "braintrust_trace_id" in row.input_metadata.session_data
281+
):
282+
headers = {
283+
"Authorization": f"Bearer {self.api_key}",
284+
"Content-Type": "application/json",
285+
}
286+
287+
trace_id = row.input_metadata.session_data["braintrust_trace_id"]
288+
if trace_id:
289+
feedback_items = [{"id": trace_id, "scores": {model_name: row.evaluation_result.score}}]
290+
291+
response = requests.post(
292+
f"{self.api_url}/v1/feedback",
293+
headers=headers,
294+
json={"feedback": feedback_items},
295+
timeout=30,
296+
)
297+
response.raise_for_status()
298+
except Exception as e:
299+
logger.warning("Failed to upload single score to Braintrust: %s", e)
300+
267301

268302
def create_braintrust_adapter(
269303
api_key: Optional[str] = None,

0 commit comments

Comments
 (0)