45 changes: 45 additions & 0 deletions py/examples/evals/intent_classification_eval.py
@@ -0,0 +1,45 @@
from braintrust import Eval
from openai import OpenAI

client = OpenAI()

DATASET = [
    {
        "input": "What's your return policy?",
        "expected": "policy_question",
    },
    {
        "input": "I need help with my order",
        "expected": "support_request",
    },
]


def task(input):
    response = client.responses.create(
        model="gpt-5-mini",
        input=[{"role": "user", "content": input}],
    )
    return response.output_text


def intent_classifier(input, output, expected, metadata):
    keywords = {
        "policy_question": ["policy", "return", "refund", "warranty"],
        "support_request": ["help", "issue", "problem", "support"],
        "product_inquiry": ["price", "feature", "available", "buy"],
    }

    for intent, words in keywords.items():
        if any(word in input.lower() for word in words):
            return intent

    return "other"


Eval(
    "Intent Classification",
    data=DATASET,
    task=task,
    scores=[intent_classifier],
)
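Note on the example above: `intent_classifier` returns a bare string, which the framework changes below treat as a classification label (logged under `classifications`) rather than a numeric score. Judging from `_try_parse_classification_output` in framework.py, a scorer can also return a list of labels or a dict with an `id` plus optional `label`, `confidence`, and `metadata`. A minimal sketch of that richer shape — the function name, matching heuristic, and confidence values here are illustrative, not part of the PR:

def intent_classifier_detailed(input, output, expected, metadata):
    # Hypothetical variant of the scorer above: return one classification item as a dict.
    # Only "id" must be a string; "label" defaults to "id" when omitted.
    intent = "policy_question" if "return" in input.lower() else "other"
    return {
        "id": intent,
        "label": intent.replace("_", " ").title(),
        "confidence": 0.9 if intent != "other" else 0.3,  # illustrative values
        "metadata": {"matched_on": "keyword"},
    }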
107 changes: 96 additions & 11 deletions py/src/braintrust/framework.py
@@ -9,7 +9,7 @@
 import traceback
 import warnings
 from collections import defaultdict
-from collections.abc import Awaitable, Callable, Coroutine, Iterable, Iterator, Sequence
+from collections.abc import Awaitable, Callable, Coroutine, Iterable, Iterator, Mapping, Sequence
 from concurrent.futures import ThreadPoolExecutor
 from contextlib import contextmanager
 from multiprocessing import cpu_count
@@ -232,7 +232,64 @@ class EvalScorerArgs(SerializableDataClass, Generic[Input, Output]):
     metadata: Metadata | None = None
 
 
-OneOrMoreScores = Union[float, int, bool, None, Score, list[Score]]
+OneOrMoreScores = Union[
+    float,
+    int,
+    bool,
+    None,
+    Score,
+    list[Score],
+    str,
+    list[str],
+    Mapping[str, Any],
+    list[Mapping[str, Any]],
+]
+
+
+def _normalize_classification_item(item: Mapping[str, Any]) -> dict[str, Any]:
+    classification_id = item.get("id")
+    if not isinstance(classification_id, str):
+        raise ValueError(f"Classification item must include string id. Got: {item}")
+
+    label = item.get("label")
+    if label is not None and not isinstance(label, str):
+        raise ValueError(f"Classification item label must be a string when specified. Got: {item}")
+
+    confidence = item.get("confidence")
+    if confidence is not None and not isinstance(confidence, (int, float)):
+        raise ValueError(f"Classification item confidence must be a number when specified. Got: {item}")
+
+    metadata = item.get("metadata")
+    if metadata is not None and not isinstance(metadata, Mapping):
+        raise ValueError(f"Classification item metadata must be an object when specified. Got: {item}")
+
+    result = dict(item)
+    if result.get("label") is None:
+        result["label"] = classification_id
+    return result
+
+
+def _try_parse_classification_output(value: Any) -> list[dict[str, Any]] | None:
+    if isinstance(value, str):
+        return [{"id": value, "label": value}]
+
+    if isinstance(value, Mapping):
+        return [_normalize_classification_item(value)]
+
+    if not isinstance(value, Iterable):
+        return None
+
+    values = list(value)
+    if len(values) == 0:
+        return []
+
+    if all(isinstance(item, str) for item in values):
+        return [{"id": item, "label": item} for item in values]
+
+    if all(isinstance(item, Mapping) for item in values):
+        return [_normalize_classification_item(item) for item in values]
+
+    return None
 
 
 # Synchronous scorer interface - implements callable
@@ -1358,16 +1415,40 @@ async def await_or_run_scorer(root_span, scorer, name, **kwargs):
         if isinstance(result, dict):
             try:
                 result = Score.from_dict(result)
-            except Exception as e:
-                raise ValueError(f"When returning a dict, it must be a valid Score object. Got: {result}") from e
+            except Exception as score_parse_error:
+                try:
+                    classification_items = _try_parse_classification_output(result)
+                except Exception as classification_parse_error:
+                    raise ValueError(
+                        f"When returning a dict, it must be a valid Score object or classification item. Got: {result}"
+                    ) from classification_parse_error
+
+                if classification_items is None:
+                    raise ValueError(
+                        f"When returning a dict, it must be a valid Score object or classification item. Got: {result}"
+                    ) from score_parse_error
+
+                span.log(output=classification_items, classifications={name: classification_items})
+                return {"scores": [], "classifications": {name: classification_items}}
+
+        if isinstance(result, str):
+            classification_items = _try_parse_classification_output(result)
+            span.log(output=classification_items, classifications={name: classification_items})
+            return {"scores": [], "classifications": {name: classification_items}}
 
         if isinstance(result, Iterable):
-            for s in result:
-                if not is_score(s):
+            result = list(result)
+            if all(is_score(s) for s in result):
+                result = list(result)
+            else:
+                classification_items = _try_parse_classification_output(result)
+                if classification_items is None:
                     raise ValueError(
-                        f"When returning an array of scores, each score must be a valid Score object. Got: {s}"
+                        "When returning an array, each item must be a valid Score object "
+                        f"or classification item. Got: {result}"
                     )
-            result = list(result)
+                span.log(output=classification_items, classifications={name: classification_items})
+                return {"scores": [], "classifications": {name: classification_items}}
         elif is_score(result):
             result = [result]
         else:
@@ -1383,7 +1464,7 @@ def get_other_fields(s):
 
         scores = {r.name: r.score for r in result}
         span.log(output=result_output, metadata=result_metadata, scores=scores)
-        return result
+        return {"scores": result, "classifications": {}}
 
     # First, resolve the scorers if they are classes
     scorers = [scorer() if inspect.isclass(scorer) and is_scorer(scorer) else scorer for scorer in evaluator.scores]
Expand All @@ -1399,6 +1480,7 @@ async def run_evaluator_task(datum, trial_index=0):
error = None
exc_info = None
scores = {}
classifications = {}
tags = datum.tags

event_dataset = (
@@ -1559,10 +1641,11 @@ async def ensure_spans_flushed():
            failing_scorers_and_exceptions = []
            for name, p in zip(scorer_names, score_promises):
                try:
-                    scorer_result = await p
-                    for score in score_results:
+                    scorer_result = await p
+                    for score in scorer_result["scores"]:
                        passing_scorers_and_results.append((score.name, score))
                        scores[score.name] = score.score
+                    classifications.update(scorer_result["classifications"])
                except Exception as e:
                    exc_info = traceback.format_exc()
                    failing_scorers_and_exceptions.append((name, e, exc_info))
@@ -1582,6 +1665,8 @@ async def ensure_spans_flushed():
                    f"Found exceptions for the following scorers: {names}",
                    exceptions,
                )
+            if classifications:
+                root_span.log(classifications=classifications)
        except Exception as e:
            exc_type, exc_value, tb = sys.exc_info()
            root_span.log(error=stringify_exception(exc_type, exc_value, tb))
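For reference, the return shapes the new helpers accept and what `_try_parse_classification_output` normalizes them to — a sketch derived from the code above, not an exhaustive spec:

# A bare string becomes a single item whose label defaults to the id.
_try_parse_classification_output("animal")
# -> [{"id": "animal", "label": "animal"}]

# A list of strings becomes one item per label.
_try_parse_classification_output(["animal", "pet"])
# -> [{"id": "animal", "label": "animal"}, {"id": "pet", "label": "pet"}]

# A mapping (or list of mappings) is validated and passed through, with label defaulted to id.
_try_parse_classification_output({"id": "animal", "confidence": 0.8})
# -> [{"id": "animal", "confidence": 0.8, "label": "animal"}]

# A mapping without a string "id" raises ValueError (see the invalid-payload test below).
# Anything non-iterable and non-mapping (a number, a Score) returns None,
# so the existing Score handling applies unchanged.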
107 changes: 107 additions & 0 deletions py/src/braintrust/test_framework.py
@@ -371,6 +371,113 @@ def sometimes_none_scorer(input, output, expected):
    assert result.summary.scores["conditional"].score == 1.0  # Only the second score counts


@pytest.mark.asyncio
async def test_run_evaluator_classifier_scorer_logs_classifications(with_memory_logger, with_simulate_login):
    def identity_task(input_value):
        return input_value

    def classifier_scorer(input_value, output, expected):
        return "animal"

    evaluator = Evaluator(
        project_name="test-project",
        eval_name="test-classifier-scorer",
        data=[EvalCase(input="cat", expected="animal")],
        task=identity_task,
        scores=[classifier_scorer],
        experiment_name="test-classifier-scorer",
        metadata=None,
    )

    exp = init_test_exp("test-classifier-scorer", "test-project")
    result = await run_evaluator(experiment=exp, evaluator=evaluator, position=None, filters=[])

    assert len(result.results) == 1
    assert result.results[0].scores == {}
    assert result.summary.scores == {}

    logs = with_memory_logger.pop()
    scorer_spans = [log for log in logs if log.get("span_attributes", {}).get("type") == "score"]
    assert len(scorer_spans) == 1
    assert scorer_spans[0].get("classifications") == {
        "classifier_scorer": [{"id": "animal", "label": "animal"}]
    }

    root_spans = [log for log in logs if not log["span_parents"]]
    assert len(root_spans) == 1
    assert root_spans[0].get("classifications") == {
        "classifier_scorer": [{"id": "animal", "label": "animal"}]
    }


@pytest.mark.asyncio
async def test_eval_mixed_score_and_classifier_scorers(with_memory_logger, with_simulate_login):
    def identity_task(input_value):
        return input_value

    def numeric_scorer(input_value, output, expected):
        return 1.0 if output == expected else 0.0

    def classifier_scorer(input_value, output, expected):
        return ["animal", "pet"]

    evaluator = Evaluator(
        project_name="test-project",
        eval_name="test-mixed-scorers",
        data=[EvalCase(input="cat", expected="cat")],
        task=identity_task,
        scores=[numeric_scorer, classifier_scorer],
        experiment_name="test-mixed-scorers",
        metadata=None,
    )

    exp = init_test_exp("test-mixed-scorers", "test-project")
    result = await run_evaluator(experiment=exp, evaluator=evaluator, position=None, filters=[])

    assert len(result.results) == 1
    assert result.results[0].scores == {"numeric_scorer": 1.0}

    logs = with_memory_logger.pop()
    scorer_spans = [log for log in logs if log.get("span_attributes", {}).get("type") == "score"]
    assert len(scorer_spans) == 2

    numeric_spans = [log for log in scorer_spans if log.get("scores", {}).get("numeric_scorer") == 1.0]
    assert len(numeric_spans) == 1

    root_spans = [log for log in logs if not log["span_parents"]]
    assert len(root_spans) == 1
    assert root_spans[0].get("classifications") == {
        "classifier_scorer": [{"id": "animal", "label": "animal"}, {"id": "pet", "label": "pet"}]
    }


@pytest.mark.asyncio
async def test_eval_invalid_classifier_payload_returns_scorer_error():
    def identity_task(input_value):
        return input_value

    def invalid_classifier(input_value, output, expected):
        return {"label": "missing-id"}

    evaluator = Evaluator(
        project_name="test-project",
        eval_name="test-invalid-classifier",
        data=[EvalCase(input="cat", expected="animal")],
        task=identity_task,
        scores=[invalid_classifier],
        experiment_name=None,
        metadata=None,
    )

    result = await run_evaluator(experiment=None, evaluator=evaluator, position=None, filters=[])

    assert len(result.results) == 1
    assert result.results[0].scores == {}
    assert "scorer_errors" in result.results[0].metadata
    assert "invalid_classifier" in result.results[0].metadata["scorer_errors"]
    assert "valid Score object or classification item" in result.results[0].metadata["scorer_errors"]["invalid_classifier"]


@pytest.mark.asyncio
async def test_hooks_tags_append(with_memory_logger, with_simulate_login, simple_scorer):
    """Test that hooks.tags can be appended to and logged."""
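Taken together, each scorer invocation now returns a dict of the form {"scores": [...], "classifications": {scorer_name: items}}; run_evaluator_task merges the classification maps across scorers and logs the combined map once on the root span. A small sketch of that merge, with illustrative scorer names and values:

# How run_evaluator_task appears to aggregate per-scorer results (values are illustrative).
classifications = {}
scorer_results = [
    {"scores": [], "classifications": {"intent_classifier": [{"id": "policy_question", "label": "policy_question"}]}},
    {"scores": [], "classifications": {"topic_classifier": [{"id": "returns", "label": "returns"}]}},
]
for scorer_result in scorer_results:
    classifications.update(scorer_result["classifications"])
# classifications now holds both scorers' items, keyed by scorer name,
# and is logged via root_span.log(classifications=classifications).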