45 changes: 45 additions & 0 deletions py/examples/evals/intent_classification_eval.py
@@ -0,0 +1,45 @@
from braintrust import Eval
from openai import OpenAI

client = OpenAI()

DATASET = [
    {
        "input": "What's your return policy?",
        "expected": "policy_question",
    },
    {
        "input": "I need help with my order",
        "expected": "support_request",
    },
]


def task(input):
    response = client.responses.create(
        model="gpt-5-mini",
        input=[{"role": "user", "content": input}],
    )
    return response.output_text


def intent_classifier(input, output, expected, metadata):
    keywords = {
        "policy_question": ["policy", "return", "refund", "warranty"],
        "support_request": ["help", "issue", "problem", "support"],
        "product_inquiry": ["price", "feature", "available", "buy"],
    }

    for intent, words in keywords.items():
        if any(word in input.lower() for word in words):
            return intent

    return "other"


Eval(
    "Intent Classification",
    data=DATASET,
    task=task,
    scores=[intent_classifier],
)
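Note on the example above: `intent_classifier` returns a bare string, which the framework changes below treat as a classification label (logged under `classifications`) rather than a numeric score. Judging from `_try_parse_classification_output` in framework.py, a scorer can also return a list of labels or a dict with an `id` plus optional `label`, `confidence`, and `metadata`. A minimal sketch of that richer shape — the function name, matching heuristic, and confidence values here are illustrative, not part of the PR:

def intent_classifier_detailed(input, output, expected, metadata):
    # Hypothetical variant of the scorer above: return one classification item as a dict.
    # Only "id" must be a string; "label" defaults to "id" when omitted.
    intent = "policy_question" if "return" in input.lower() else "other"
    return {
        "id": intent,
        "label": intent.replace("_", " ").title(),
        "confidence": 0.9 if intent != "other" else 0.3,  # illustrative values
        "metadata": {"matched_on": "keyword"},
    }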
107 changes: 96 additions & 11 deletions py/src/braintrust/framework.py
@@ -9,7 +9,7 @@
 import traceback
 import warnings
 from collections import defaultdict
-from collections.abc import Awaitable, Callable, Coroutine, Iterable, Iterator, Sequence
+from collections.abc import Awaitable, Callable, Coroutine, Iterable, Iterator, Mapping, Sequence
 from concurrent.futures import ThreadPoolExecutor
 from contextlib import contextmanager
 from multiprocessing import cpu_count
@@ -232,7 +232,64 @@ class EvalScorerArgs(SerializableDataClass, Generic[Input, Output]):
     metadata: Metadata | None = None
 
 
-OneOrMoreScores = Union[float, int, bool, None, Score, list[Score]]
+OneOrMoreScores = Union[
+    float,
+    int,
+    bool,
+    None,
+    Score,
+    list[Score],
+    str,
+    list[str],
+    Mapping[str, Any],
+    list[Mapping[str, Any]],
+]
+
+
+def _normalize_classification_item(item: Mapping[str, Any]) -> dict[str, Any]:
+    classification_id = item.get("id")
+    if not isinstance(classification_id, str):
+        raise ValueError(f"Classification item must include string id. Got: {item}")
+
+    label = item.get("label")
+    if label is not None and not isinstance(label, str):
+        raise ValueError(f"Classification item label must be a string when specified. Got: {item}")
+
+    confidence = item.get("confidence")
+    if confidence is not None and not isinstance(confidence, (int, float)):
+        raise ValueError(f"Classification item confidence must be a number when specified. Got: {item}")
+
+    metadata = item.get("metadata")
+    if metadata is not None and not isinstance(metadata, Mapping):
+        raise ValueError(f"Classification item metadata must be an object when specified. Got: {item}")
+
+    result = dict(item)
+    if result.get("label") is None:
+        result["label"] = classification_id
+    return result
+
+
+def _try_parse_classification_output(value: Any) -> list[dict[str, Any]] | None:
+    if isinstance(value, str):
+        return [{"id": value, "label": value}]
+
+    if isinstance(value, Mapping):
+        return [_normalize_classification_item(value)]
+
+    if not isinstance(value, Iterable):
+        return None
+
+    values = list(value)
+    if len(values) == 0:
+        return []
+
+    if all(isinstance(item, str) for item in values):
+        return [{"id": item, "label": item} for item in values]
+
+    if all(isinstance(item, Mapping) for item in values):
+        return [_normalize_classification_item(item) for item in values]
+
+    return None
 
 
 # Synchronous scorer interface - implements callable
@@ -1358,16 +1415,40 @@ async def await_or_run_scorer(root_span, scorer, name, **kwargs):
         if isinstance(result, dict):
             try:
                 result = Score.from_dict(result)
-            except Exception as e:
-                raise ValueError(f"When returning a dict, it must be a valid Score object. Got: {result}") from e
+            except Exception as score_parse_error:
+                try:
+                    classification_items = _try_parse_classification_output(result)
+                except Exception as classification_parse_error:
+                    raise ValueError(
+                        f"When returning a dict, it must be a valid Score object or classification item. Got: {result}"
+                    ) from classification_parse_error
+
+                if classification_items is None:
+                    raise ValueError(
+                        f"When returning a dict, it must be a valid Score object or classification item. Got: {result}"
+                    ) from score_parse_error
+
+                span.log(output=classification_items, classifications={name: classification_items})
+                return {"scores": [], "classifications": {name: classification_items}}
+
+        if isinstance(result, str):
+            classification_items = _try_parse_classification_output(result)
+            span.log(output=classification_items, classifications={name: classification_items})
+            return {"scores": [], "classifications": {name: classification_items}}
 
         if isinstance(result, Iterable):
-            for s in result:
-                if not is_score(s):
+            result = list(result)
+            if all(is_score(s) for s in result):
+                result = list(result)
+            else:
+                classification_items = _try_parse_classification_output(result)
+                if classification_items is None:
                     raise ValueError(
-                        f"When returning an array of scores, each score must be a valid Score object. Got: {s}"
+                        "When returning an array, each item must be a valid Score object "
+                        f"or classification item. Got: {result}"
                     )
-            result = list(result)
+                span.log(output=classification_items, classifications={name: classification_items})
+                return {"scores": [], "classifications": {name: classification_items}}
         elif is_score(result):
             result = [result]
         else:
@@ -1383,7 +1464,7 @@ def get_other_fields(s):
 
         scores = {r.name: r.score for r in result}
         span.log(output=result_output, metadata=result_metadata, scores=scores)
-        return result
+        return {"scores": result, "classifications": {}}
 
     # First, resolve the scorers if they are classes
     scorers = [scorer() if inspect.isclass(scorer) and is_scorer(scorer) else scorer for scorer in evaluator.scores]
Expand All @@ -1399,6 +1480,7 @@ async def run_evaluator_task(datum, trial_index=0):
error = None
exc_info = None
scores = {}
classifications = {}
tags = datum.tags

event_dataset = (
@@ -1559,10 +1641,11 @@ async def ensure_spans_flushed():
            failing_scorers_and_exceptions = []
            for name, p in zip(scorer_names, score_promises):
                try:
-                    scorer_result = await p
-                    for score in score_results:
+                    scorer_result = await p
+                    for score in scorer_result["scores"]:
                        passing_scorers_and_results.append((score.name, score))
                        scores[score.name] = score.score
+                    classifications.update(scorer_result["classifications"])
                except Exception as e:
                    exc_info = traceback.format_exc()
                    failing_scorers_and_exceptions.append((name, e, exc_info))
@@ -1582,6 +1665,8 @@ async def ensure_spans_flushed():
                    f"Found exceptions for the following scorers: {names}",
                    exceptions,
                )
+            if classifications:
+                root_span.log(classifications=classifications)
        except Exception as e:
            exc_type, exc_value, tb = sys.exc_info()
            root_span.log(error=stringify_exception(exc_type, exc_value, tb))
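For reference, the return shapes the new helpers accept and what `_try_parse_classification_output` normalizes them to — a sketch derived from the code above, not an exhaustive spec:

# A bare string becomes a single item whose label defaults to the id.
_try_parse_classification_output("animal")
# -> [{"id": "animal", "label": "animal"}]

# A list of strings becomes one item per label.
_try_parse_classification_output(["animal", "pet"])
# -> [{"id": "animal", "label": "animal"}, {"id": "pet", "label": "pet"}]

# A mapping (or list of mappings) is validated and passed through, with label defaulted to id.
_try_parse_classification_output({"id": "animal", "confidence": 0.8})
# -> [{"id": "animal", "confidence": 0.8, "label": "animal"}]

# A mapping without a string "id" raises ValueError (see the invalid-payload test below).
# Anything non-iterable and non-mapping (a number, a Score) returns None,
# so the existing Score handling applies unchanged.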
107 changes: 107 additions & 0 deletions py/src/braintrust/test_framework.py
@@ -371,6 +371,113 @@ def sometimes_none_scorer(input, output, expected):
    assert result.summary.scores["conditional"].score == 1.0  # Only the second score counts


@pytest.mark.asyncio
async def test_run_evaluator_classifier_scorer_logs_classifications(with_memory_logger, with_simulate_login):
    def identity_task(input_value):
        return input_value

    def classifier_scorer(input_value, output, expected):
        return "animal"

    evaluator = Evaluator(
        project_name="test-project",
        eval_name="test-classifier-scorer",
        data=[EvalCase(input="cat", expected="animal")],
        task=identity_task,
        scores=[classifier_scorer],
        experiment_name="test-classifier-scorer",
        metadata=None,
    )

    exp = init_test_exp("test-classifier-scorer", "test-project")
    result = await run_evaluator(experiment=exp, evaluator=evaluator, position=None, filters=[])

    assert len(result.results) == 1
    assert result.results[0].scores == {}
    assert result.summary.scores == {}

    logs = with_memory_logger.pop()
    scorer_spans = [log for log in logs if log.get("span_attributes", {}).get("type") == "score"]
    assert len(scorer_spans) == 1
    assert scorer_spans[0].get("classifications") == {
        "classifier_scorer": [{"id": "animal", "label": "animal"}]
    }

    root_spans = [log for log in logs if not log["span_parents"]]
    assert len(root_spans) == 1
    assert root_spans[0].get("classifications") == {
        "classifier_scorer": [{"id": "animal", "label": "animal"}]
    }


@pytest.mark.asyncio
async def test_eval_mixed_score_and_classifier_scorers(with_memory_logger, with_simulate_login):
    def identity_task(input_value):
        return input_value

    def numeric_scorer(input_value, output, expected):
        return 1.0 if output == expected else 0.0

    def classifier_scorer(input_value, output, expected):
        return ["animal", "pet"]

    evaluator = Evaluator(
        project_name="test-project",
        eval_name="test-mixed-scorers",
        data=[EvalCase(input="cat", expected="cat")],
        task=identity_task,
        scores=[numeric_scorer, classifier_scorer],
        experiment_name="test-mixed-scorers",
        metadata=None,
    )

    exp = init_test_exp("test-mixed-scorers", "test-project")
    result = await run_evaluator(experiment=exp, evaluator=evaluator, position=None, filters=[])

    assert len(result.results) == 1
    assert result.results[0].scores == {"numeric_scorer": 1.0}

    logs = with_memory_logger.pop()
    scorer_spans = [log for log in logs if log.get("span_attributes", {}).get("type") == "score"]
    assert len(scorer_spans) == 2

    numeric_spans = [log for log in scorer_spans if log.get("scores", {}).get("numeric_scorer") == 1.0]
    assert len(numeric_spans) == 1

    root_spans = [log for log in logs if not log["span_parents"]]
    assert len(root_spans) == 1
    assert root_spans[0].get("classifications") == {
        "classifier_scorer": [{"id": "animal", "label": "animal"}, {"id": "pet", "label": "pet"}]
    }


@pytest.mark.asyncio
async def test_eval_invalid_classifier_payload_returns_scorer_error():
    def identity_task(input_value):
        return input_value

    def invalid_classifier(input_value, output, expected):
        return {"label": "missing-id"}

    evaluator = Evaluator(
        project_name="test-project",
        eval_name="test-invalid-classifier",
        data=[EvalCase(input="cat", expected="animal")],
        task=identity_task,
        scores=[invalid_classifier],
        experiment_name=None,
        metadata=None,
    )

    result = await run_evaluator(experiment=None, evaluator=evaluator, position=None, filters=[])

    assert len(result.results) == 1
    assert result.results[0].scores == {}
    assert "scorer_errors" in result.results[0].metadata
    assert "invalid_classifier" in result.results[0].metadata["scorer_errors"]
    assert "valid Score object or classification item" in result.results[0].metadata["scorer_errors"]["invalid_classifier"]


@pytest.mark.asyncio
async def test_hooks_tags_append(with_memory_logger, with_simulate_login, simple_scorer):
    """Test that hooks.tags can be appended to and logged."""
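Taken together, each scorer invocation now returns a dict of the form {"scores": [...], "classifications": {scorer_name: items}}; run_evaluator_task merges the classification maps across scorers and logs the combined map once on the root span. A small sketch of that merge, with illustrative scorer names and values:

# How run_evaluator_task appears to aggregate per-scorer results (values are illustrative).
classifications = {}
scorer_results = [
    {"scores": [], "classifications": {"intent_classifier": [{"id": "policy_question", "label": "policy_question"}]}},
    {"scores": [], "classifications": {"topic_classifier": [{"id": "returns", "label": "returns"}]}},
]
for scorer_result in scorer_results:
    classifications.update(scorer_result["classifications"])
# classifications now holds both scorers' items, keyed by scorer name,
# and is logged via root_span.log(classifications=classifications).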