diff --git a/py/examples/evals/intent_classification_eval.py b/py/examples/evals/intent_classification_eval.py
new file mode 100644
index 00000000..9fe69296
--- /dev/null
+++ b/py/examples/evals/intent_classification_eval.py
@@ -0,0 +1,45 @@
+from braintrust import Eval
+from openai import OpenAI
+
+client = OpenAI()
+
+DATASET = [
+    {
+        "input": "What's your return policy?",
+        "expected": "policy_question",
+    },
+    {
+        "input": "I need help with my order",
+        "expected": "support_request",
+    },
+]
+
+
+def task(input):
+    response = client.responses.create(
+        model="gpt-5-mini",
+        input=[{"role": "user", "content": input}],
+    )
+    return response.output_text
+
+
+def intent_classifier(input, output, expected, metadata):
+    keywords = {
+        "policy_question": ["policy", "return", "refund", "warranty"],
+        "support_request": ["help", "issue", "problem", "support"],
+        "product_inquiry": ["price", "feature", "available", "buy"],
+    }
+
+    for intent, words in keywords.items():
+        if any(word in input.lower() for word in words):
+            return intent
+
+    return "other"
+
+
+Eval(
+    "Intent Classification",
+    data=DATASET,
+    task=task,
+    scores=[intent_classifier],
+)
diff --git a/py/src/braintrust/framework.py b/py/src/braintrust/framework.py
index 4794785a..94e842e4 100644
--- a/py/src/braintrust/framework.py
+++ b/py/src/braintrust/framework.py
@@ -9,7 +9,7 @@
 import traceback
 import warnings
 from collections import defaultdict
-from collections.abc import Awaitable, Callable, Coroutine, Iterable, Iterator, Sequence
+from collections.abc import Awaitable, Callable, Coroutine, Iterable, Iterator, Mapping, Sequence
 from concurrent.futures import ThreadPoolExecutor
 from contextlib import contextmanager
 from multiprocessing import cpu_count
@@ -232,7 +232,64 @@ class EvalScorerArgs(SerializableDataClass, Generic[Input, Output]):
     metadata: Metadata | None = None
 
 
-OneOrMoreScores = Union[float, int, bool, None, Score, list[Score]]
+OneOrMoreScores = Union[
+    float,
+    int,
+    bool,
+    None,
+    Score,
+    list[Score],
+    str,
+    list[str],
+    Mapping[str, Any],
+    list[Mapping[str, Any]],
+]
+
+
+def _normalize_classification_item(item: Mapping[str, Any]) -> dict[str, Any]:
+    classification_id = item.get("id")
+    if not isinstance(classification_id, str):
+        raise ValueError(f"Classification item must include string id. Got: {item}")
+
+    label = item.get("label")
+    if label is not None and not isinstance(label, str):
+        raise ValueError(f"Classification item label must be a string when specified. Got: {item}")
+
+    confidence = item.get("confidence")
+    if confidence is not None and not isinstance(confidence, (int, float)):
+        raise ValueError(f"Classification item confidence must be a number when specified. Got: {item}")
+
+    metadata = item.get("metadata")
+    if metadata is not None and not isinstance(metadata, Mapping):
+        raise ValueError(f"Classification item metadata must be an object when specified. Got: {item}")
+
+    result = dict(item)
+    if result.get("label") is None:
+        result["label"] = classification_id
+    return result
+
+
+def _try_parse_classification_output(value: Any) -> list[dict[str, Any]] | None:
+    if isinstance(value, str):
+        return [{"id": value, "label": value}]
+
+    if isinstance(value, Mapping):
+        return [_normalize_classification_item(value)]
+
+    if not isinstance(value, Iterable):
+        return None
+
+    values = list(value)
+    if len(values) == 0:
+        return []
+
+    if all(isinstance(item, str) for item in values):
+        return [{"id": item, "label": item} for item in values]
+
+    if all(isinstance(item, Mapping) for item in values):
+        return [_normalize_classification_item(item) for item in values]
+
+    return None
 
 
 # Synchronous scorer interface - implements callable
@@ -1358,16 +1415,40 @@ async def await_or_run_scorer(root_span, scorer, name, **kwargs):
         if isinstance(result, dict):
             try:
                 result = Score.from_dict(result)
-            except Exception as e:
-                raise ValueError(f"When returning a dict, it must be a valid Score object. Got: {result}") from e
+            except Exception as score_parse_error:
+                try:
+                    classification_items = _try_parse_classification_output(result)
+                except Exception as classification_parse_error:
+                    raise ValueError(
+                        f"When returning a dict, it must be a valid Score object or classification item. Got: {result}"
+                    ) from classification_parse_error
+
+                if classification_items is None:
+                    raise ValueError(
+                        f"When returning a dict, it must be a valid Score object or classification item. Got: {result}"
+                    ) from score_parse_error
+
+                span.log(output=classification_items, classifications={name: classification_items})
+                return {"scores": [], "classifications": {name: classification_items}}
+
+        if isinstance(result, str):
+            classification_items = _try_parse_classification_output(result)
+            span.log(output=classification_items, classifications={name: classification_items})
+            return {"scores": [], "classifications": {name: classification_items}}
 
         if isinstance(result, Iterable):
-            for s in result:
-                if not is_score(s):
+            result = list(result)
+            if all(is_score(s) for s in result):
+                result = list(result)
+            else:
+                classification_items = _try_parse_classification_output(result)
+                if classification_items is None:
                     raise ValueError(
-                        f"When returning an array of scores, each score must be a valid Score object. Got: {s}"
+                        "When returning an array, each item must be a valid Score object "
+                        f"or classification item. Got: {result}"
                     )
-            result = list(result)
+                span.log(output=classification_items, classifications={name: classification_items})
+                return {"scores": [], "classifications": {name: classification_items}}
         elif is_score(result):
             result = [result]
         else:
@@ -1383,7 +1464,7 @@ def get_other_fields(s):
 
         scores = {r.name: r.score for r in result}
         span.log(output=result_output, metadata=result_metadata, scores=scores)
-        return result
+        return {"scores": result, "classifications": {}}
 
    # First, resolve the scorers if they are classes
    scorers = [scorer() if inspect.isclass(scorer) and is_scorer(scorer) else scorer for scorer in evaluator.scores]
@@ -1399,6 +1480,7 @@ async def run_evaluator_task(datum, trial_index=0):
         error = None
         exc_info = None
         scores = {}
+        classifications = {}
         tags = datum.tags
 
         event_dataset = (
@@ -1559,10 +1641,11 @@ async def ensure_spans_flushed():
                 failing_scorers_and_exceptions = []
                 for name, p in zip(scorer_names, score_promises):
                     try:
-                        score_results = await p
-                        for score in score_results:
+                        scorer_result = await p
+                        for score in scorer_result["scores"]:
                             passing_scorers_and_results.append((score.name, score))
                             scores[score.name] = score.score
+                        classifications.update(scorer_result["classifications"])
                     except Exception as e:
                         exc_info = traceback.format_exc()
                         failing_scorers_and_exceptions.append((name, e, exc_info))
@@ -1582,6 +1665,8 @@ async def ensure_spans_flushed():
                         f"Found exceptions for the following scorers: {names}",
                         exceptions,
                     )
+                if classifications:
+                    root_span.log(classifications=classifications)
             except Exception as e:
                 exc_type, exc_value, tb = sys.exc_info()
                 root_span.log(error=stringify_exception(exc_type, exc_value, tb))
diff --git a/py/src/braintrust/test_framework.py b/py/src/braintrust/test_framework.py
index 9acf284b..da129f2c 100644
--- a/py/src/braintrust/test_framework.py
+++ b/py/src/braintrust/test_framework.py
@@ -371,6 +371,113 @@ def sometimes_none_scorer(input, output, expected):
     assert result.summary.scores["conditional"].score == 1.0  # Only the second score counts
 
 
+@pytest.mark.asyncio
+async def test_run_evaluator_classifier_scorer_logs_classifications(with_memory_logger, with_simulate_login):
+    def identity_task(input_value):
+        return input_value
+
+    def classifier_scorer(input_value, output, expected):
+        return "animal"
+
+    evaluator = Evaluator(
+        project_name="test-project",
+        eval_name="test-classifier-scorer",
+        data=[EvalCase(input="cat", expected="animal")],
+        task=identity_task,
+        scores=[classifier_scorer],
+        experiment_name="test-classifier-scorer",
+        metadata=None,
+    )
+
+    exp = init_test_exp("test-classifier-scorer", "test-project")
+    result = await run_evaluator(experiment=exp, evaluator=evaluator, position=None, filters=[])
+
+    assert len(result.results) == 1
+    assert result.results[0].scores == {}
+    assert result.summary.scores == {}
+
+    logs = with_memory_logger.pop()
+    scorer_spans = [log for log in logs if log.get("span_attributes", {}).get("type") == "score"]
+    assert len(scorer_spans) == 1
+    assert scorer_spans[0].get("classifications") == {
+        "classifier_scorer": [{"id": "animal", "label": "animal"}]
+    }
+
+    root_spans = [log for log in logs if not log["span_parents"]]
+    assert len(root_spans) == 1
+    assert root_spans[0].get("classifications") == {
+        "classifier_scorer": [{"id": "animal", "label": "animal"}]
+    }
+
+
+@pytest.mark.asyncio
+async def test_eval_mixed_score_and_classifier_scorers(with_memory_logger, with_simulate_login):
+    def identity_task(input_value):
+        return input_value
+
+    def numeric_scorer(input_value, output, expected):
+        return 1.0 if output == expected else 0.0
+
+    def classifier_scorer(input_value, output, expected):
+        return ["animal", "pet"]
+
+    evaluator = Evaluator(
+        project_name="test-project",
+        eval_name="test-mixed-scorers",
+        data=[EvalCase(input="cat", expected="cat")],
+        task=identity_task,
+        scores=[numeric_scorer, classifier_scorer],
+        experiment_name="test-mixed-scorers",
+        metadata=None,
+    )
+
+    exp = init_test_exp("test-mixed-scorers", "test-project")
+    result = await run_evaluator(experiment=exp, evaluator=evaluator, position=None, filters=[])
+
+    assert len(result.results) == 1
+    assert result.results[0].scores == {"numeric_scorer": 1.0}
+
+    logs = with_memory_logger.pop()
+    scorer_spans = [log for log in logs if log.get("span_attributes", {}).get("type") == "score"]
+    assert len(scorer_spans) == 2
+
+    numeric_spans = [log for log in scorer_spans if log.get("scores", {}).get("numeric_scorer") == 1.0]
+    assert len(numeric_spans) == 1
+
+    root_spans = [log for log in logs if not log["span_parents"]]
+    assert len(root_spans) == 1
+    assert root_spans[0].get("classifications") == {
+        "classifier_scorer": [{"id": "animal", "label": "animal"}, {"id": "pet", "label": "pet"}]
+    }
+
+
+@pytest.mark.asyncio
+async def test_eval_invalid_classifier_payload_returns_scorer_error():
+    def identity_task(input_value):
+        return input_value
+
+    def invalid_classifier(input_value, output, expected):
+        return {"label": "missing-id"}
+
+    evaluator = Evaluator(
+        project_name="test-project",
+        eval_name="test-invalid-classifier",
+        data=[EvalCase(input="cat", expected="animal")],
+        task=identity_task,
+        scores=[invalid_classifier],
+        experiment_name=None,
+        metadata=None,
+    )
+
+    result = await run_evaluator(experiment=None, evaluator=evaluator, position=None, filters=[])
+
+    assert len(result.results) == 1
+    assert result.results[0].scores == {}
+    assert "scorer_errors" in result.results[0].metadata
+    assert "invalid_classifier" in result.results[0].metadata["scorer_errors"]
+    assert "valid Score object or classification item" in result.results[0].metadata["scorer_errors"]["invalid_classifier"]
+
+
 @pytest.mark.asyncio
 async def test_hooks_tags_append(with_memory_logger, with_simulate_login, simple_scorer):
     """Test that hooks.tags can be appended to and logged."""
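
Based on the return shapes accepted by the new `_try_parse_classification_output` helper in this patch, a classifier scorer can return a bare string, a list of strings, or dicts carrying a required `id` plus optional `label`, `confidence`, and `metadata`. The sketch below is illustrative only; the `topic_classifier` scorer, its labels, and the project name are hypothetical and not part of the patch:

```python
from braintrust import Eval


def topic_classifier(input, output, expected):
    # All of these return shapes are normalized by _try_parse_classification_output
    # into [{"id": ..., "label": ...}, ...] and logged under the scorer's name.
    if "refund" in input.lower():
        # Dict form: "id" is required; "label", "confidence", "metadata" are optional.
        return [{"id": "refund", "label": "Refund request", "confidence": 0.9}]
    if "help" in input.lower():
        # List-of-strings form: each string becomes both id and label.
        return ["support", "triage"]
    # Bare-string form.
    return "other"


Eval(
    "Topic Classification",  # hypothetical project name
    data=[{"input": "Can I get a refund?", "expected": "refund"}],
    task=lambda input: input,
    scores=[topic_classifier],
)
```

With this change, such a scorer contributes no numeric score; its output is recorded on the scorer span and the root span under `classifications`, as exercised by the tests above.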