Skip to content

Commit f37cfec

Browse files
authored
feat(framework): add classifier support to Eval (#282)
Add a new `classifiers` parameter to `Eval`/`EvalAsync`/`Evaluator` that runs classification functions alongside scorers. Classifier results are recorded under a dedicated `classifications` field on `EvalResult` and logged to classifier-typed spans. Based on JS PR: braintrustdata/braintrust-sdk-javascript#1553 and the spec: https://github.com/braintrustdata/braintrust-spec/blob/main/docs/telemetry/classifier.md
1 parent 1321215 commit f37cfec

10 files changed

Lines changed: 549 additions & 58 deletions

File tree

Lines changed: 76 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,76 @@
1+
interactions:
2+
- request:
3+
body: '{"id": "test-classifier-span"}'
4+
headers:
5+
Accept:
6+
- '*/*'
7+
Accept-Encoding:
8+
- gzip, deflate, br, zstd
9+
Connection:
10+
- keep-alive
11+
Content-Length:
12+
- '30'
13+
Content-Type:
14+
- application/json
15+
User-Agent:
16+
- python-requests/2.32.5
17+
method: POST
18+
uri: https://www.braintrust.dev/api/base_experiment/get_id
19+
response:
20+
body:
21+
string: "[\n {\n \"validation\": \"uuid\",\n \"code\": \"invalid_string\",\n
22+
\ \"message\": \"Invalid uuid\",\n \"path\": [\n \"id\"\n ]\n
23+
\ }\n] [user_email=___braintrust_anon_user___@braintrustdata.com] [timestamp=1776185951.284]
24+
[request_id=yul1::rlkb8-1776185950926-a394af342bfa]"
25+
headers:
26+
Cache-Control:
27+
- public, max-age=0, must-revalidate
28+
Content-Length:
29+
- '267'
30+
Content-Security-Policy:
31+
- 'script-src ''self'' ''unsafe-eval'' ''wasm-unsafe-eval'' ''strict-dynamic''
32+
''nonce-NGNlMGVjNTUtNmE2MC00ZDA3LWE3OGMtMDQ3NWExNWVkZGUz'' *.js.stripe.com
33+
js.stripe.com maps.googleapis.com ; style-src ''self'' ''unsafe-inline'' *.braintrust.dev
34+
btcm6qilbbhv4yi1.public.blob.vercel-storage.com fonts.googleapis.com www.gstatic.com
35+
d4tuoctqmanu0.cloudfront.net; font-src ''self'' data: fonts.gstatic.com btcm6qilbbhv4yi1.public.blob.vercel-storage.com
36+
cdn.jsdelivr.net d4tuoctqmanu0.cloudfront.net fonts.googleapis.com mintlify-assets.b-cdn.net
37+
fonts.cdnfonts.com; object-src ''none''; base-uri ''self''; form-action ''self'';
38+
frame-ancestors ''self''; worker-src ''self'' blob:; report-uri https://o4507221741076480.ingest.us.sentry.io/api/4507221754380288/security/?sentry_key=27fa5ac907cf7c6ce4a1ab2a03f805b4&sentry_environment=production&sentry_release=16;
39+
report-to csp-endpoint-0'
40+
Content-Type:
41+
- text/plain; charset=utf-8
42+
Date:
43+
- Tue, 14 Apr 2026 16:59:11 GMT
44+
Etag:
45+
- '"ox95rpt6sr7f"'
46+
Reporting-Endpoints:
47+
- csp-endpoint-0="https://o4507221741076480.ingest.us.sentry.io/api/4507221754380288/security/?sentry_key=27fa5ac907cf7c6ce4a1ab2a03f805b4&sentry_environment=production&sentry_release=16"
48+
Server:
49+
- Vercel
50+
Set-Cookie:
51+
- __Host-authjs.csrf-token=29800db0ea46edba4ca7714d00cba09854ccf08e98f28666bd7a8f816dc260ea%7C4f5ce689e487bc9972f79ed54f75a99deeb00baec720e82e1efa5e58f551d316;
52+
Path=/; HttpOnly; Secure; SameSite=Lax
53+
- __Secure-authjs.callback-url=https%3A%2F%2Fwww.braintrustdata.com; Path=/;
54+
HttpOnly; Secure; SameSite=Lax
55+
Strict-Transport-Security:
56+
- max-age=63072000
57+
X-Clerk-Auth-Reason:
58+
- session-token-and-uat-missing
59+
X-Clerk-Auth-Status:
60+
- signed-out
61+
X-Content-Type-Options:
62+
- nosniff
63+
X-Frame-Options:
64+
- SAMEORIGIN
65+
X-Matched-Path:
66+
- /api/base_experiment/get_id
67+
X-Nonce:
68+
- NGNlMGVjNTUtNmE2MC00ZDA3LWE3OGMtMDQ3NWExNWVkZGUz
69+
X-Vercel-Cache:
70+
- MISS
71+
X-Vercel-Id:
72+
- yul1::iad1::rlkb8-1776185950926-a394af342bfa
73+
status:
74+
code: 400
75+
message: Bad Request
76+
version: 1

py/src/braintrust/cli/push.py

Lines changed: 6 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,7 @@
1717

1818
import requests
1919
import slugify
20-
from braintrust.framework import _evals, _scorer_name, _set_lazy_load
20+
from braintrust.framework import _classifier_name, _evals, _scorer_name, _set_lazy_load
2121

2222
from .. import api_conn, login, org_id, proxy_conn
2323
from ..framework2 import ProjectIdCache, global_
@@ -303,8 +303,11 @@ def _collect_evaluator_defs(
303303
evaluator = eval_instance.evaluator
304304
project_id = project_ids.get_by_name(evaluator.project_name)
305305

306-
scores = [{"name": _scorer_name(scorer, i)} for i, scorer in enumerate(evaluator.scores)]
307-
evaluator_definition: dict[str, Any] = {"scores": scores}
306+
scores = [{"name": _scorer_name(scorer, i)} for i, scorer in enumerate(evaluator.scores or [])]
307+
classifiers = [
308+
{"name": _classifier_name(classifier, i)} for i, classifier in enumerate(evaluator.classifiers or [])
309+
]
310+
evaluator_definition: dict[str, Any] = {"scores": scores, "classifiers": classifiers}
308311
if evaluator.parameters is not None:
309312
evaluator_definition["parameters"] = serialize_remote_eval_parameters_container(evaluator.parameters)
310313

py/src/braintrust/cli/test_push_evaluator.py

Lines changed: 16 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -15,10 +15,11 @@ def _make_scorer(name):
1515
return scorer
1616

1717

18-
def _make_evaluator(project_name, scorer_names, parameters=None):
18+
def _make_evaluator(project_name, scorer_names, parameters=None, classifier_names=None):
1919
evaluator = MagicMock()
2020
evaluator.project_name = project_name
2121
evaluator.scores = [_make_scorer(n) for n in scorer_names]
22+
evaluator.classifiers = [_make_scorer(n) for n in (classifier_names or [])]
2223
evaluator.parameters = parameters
2324

2425
instance = MagicMock()
@@ -50,7 +51,10 @@ def test_basic_evaluator_def_structure(self, mock_project_ids):
5051
"sandbox_spec": {"provider": "lambda"},
5152
"entrypoints": ["evals/my_eval.py"],
5253
"eval_name": "my_eval",
53-
"evaluator_definition": {"scores": [{"name": "accuracy"}]},
54+
"evaluator_definition": {
55+
"scores": [{"name": "accuracy"}],
56+
"classifiers": [],
57+
},
5458
},
5559
"bundle_id": "bundle-abc",
5660
},
@@ -98,6 +102,16 @@ def test_evaluator_with_parameters(self, mock_project_ids):
98102
assert parameters["source"] is None
99103
assert parameters["schema"]["prompt"]["type"] == "prompt"
100104

105+
def test_evaluator_with_classifiers(self, mock_project_ids):
106+
evaluators = {"eval1": _make_evaluator("test-project", ["accuracy"], classifier_names=["category"])}
107+
108+
functions = []
109+
_collect_evaluator_defs(mock_project_ids, functions, "bundle-1", "replace", "eval.py", evaluators)
110+
111+
eval_def = functions[0]["function_data"]["data"]["location"]["evaluator_definition"]
112+
assert eval_def["scores"] == [{"name": "accuracy"}]
113+
assert eval_def["classifiers"] == [{"name": "category"}]
114+
101115
def test_slug_from_source_file(self, mock_project_ids):
102116
evaluators = {"Test Eval": _make_evaluator("test-project", ["accuracy"])}
103117

py/src/braintrust/devserver/server.py

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -32,6 +32,8 @@
3232
Evaluator,
3333
ExperimentSummary,
3434
SSEProgressEvent,
35+
_classifier_name,
36+
_scorer_name,
3537
)
3638
from ..generated_types import FunctionId
3739
from ..logger import BraintrustState, bt_iscoroutinefunction
@@ -123,7 +125,10 @@ async def list_evaluators(request: Request) -> JSONResponse:
123125
"parameters": (
124126
serialize_remote_eval_parameters_container(evaluator.parameters) if evaluator.parameters else None
125127
),
126-
"scores": [{"name": getattr(score, "name", f"score_{i}")} for i, score in enumerate(evaluator.scores)],
128+
"scores": [{"name": _scorer_name(score, i)} for i, score in enumerate(evaluator.scores or [])],
129+
"classifiers": [
130+
{"name": _classifier_name(classifier, i)} for i, classifier in enumerate(evaluator.classifiers or [])
131+
],
127132
}
128133

129134
return JSONResponse(evaluator_list)
@@ -227,7 +232,7 @@ def stream_fn(event: SSEProgressEvent):
227232
**{
228233
**eval_kwargs,
229234
"state": state,
230-
"scores": evaluator.scores
235+
"scores": (evaluator.scores or [])
231236
+ [
232237
make_scorer(state, score["name"], score["function_id"], ctx.project_id)
233238
for score in eval_data.get("scores", [])

py/src/braintrust/devserver/test_server_integration.py

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -49,6 +49,9 @@ def scorer(input: str, output: str, expected: str) -> float:
4949
"""Simple exact match scorer."""
5050
return 1.0 if output == expected else 0.0
5151

52+
def classifier(input: str, output: str, expected: str) -> dict[str, str]:
53+
return {"id": "correct" if output == expected else "incorrect", "name": "answer_type"}
54+
5255
evaluator = Evaluator(
5356
project_name="test-math-eval",
5457
eval_name="simple-math-eval",
@@ -59,6 +62,7 @@ def scorer(input: str, output: str, expected: str) -> float:
5962
],
6063
task=task,
6164
scores=[scorer],
65+
classifiers=[classifier],
6266
experiment_name=None,
6367
metadata=None,
6468
)
@@ -114,6 +118,8 @@ def test_devserver_list_evaluators(client, api_key, org_name):
114118
assert response.status_code == 200
115119
evaluators = response.json()
116120
assert "simple-math-eval" in evaluators
121+
assert evaluators["simple-math-eval"]["scores"] == [{"name": "scorer"}]
122+
assert evaluators["simple-math-eval"]["classifiers"] == [{"name": "classifier"}]
117123

118124

119125
def parse_sse_events(response_text: str) -> list[dict[str, Any]]:

0 commit comments

Comments (0)