From 69505ff007a16a2836f6e16d046de60bcc6c4d40 Mon Sep 17 00:00:00 2001 From: Will Frey Date: Wed, 29 Apr 2026 14:38:59 -0400 Subject: [PATCH 1/2] fix(devserver): honor evaluator.project_id when request omits it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The dev-server's run_eval built EvalAsync(...) kwargs with {**eval_kwargs, ..., "project_id": eval_data.get("project_id")} The trailing key always wins in dict-spread merging, so a request body that omits project_id silently overrode the registered evaluator's project_id to None. EvalAsync(name=..., project_id=None) then fell back to using the eval name as the project name (per Eval(... project_id) docs: "If specified, uses the given project ID instead of the evaluator's name to identify the project."), so experiments were routed into a per-evaluator-name auto-created project instead of the project the evaluator was registered against. Use evaluator.project_id as a fallback when the request omits it. An explicit project_id in the request still takes precedence. Tests: - test_eval_falls_back_to_evaluator_project_id_when_request_omits_it — registers an evaluator with a known project_id, posts /eval without project_id, asserts EvalAsync receives the registered id. - test_eval_request_project_id_overrides_evaluator — confirms an explicit request-level project_id still wins. 
--- py/src/braintrust/devserver/server.py | 10 +- .../devserver/test_server_integration.py | 123 ++++++++++++++++++ 2 files changed, 132 insertions(+), 1 deletion(-) diff --git a/py/src/braintrust/devserver/server.py b/py/src/braintrust/devserver/server.py index dec1196b..56f7e449 100644 --- a/py/src/braintrust/devserver/server.py +++ b/py/src/braintrust/devserver/server.py @@ -225,6 +225,14 @@ def stream_fn(event: SSEProgressEvent): if validated_parameters is not None and not isinstance(evaluator.parameters, RemoteEvalParameters): eval_kwargs["parameters"] = validated_parameters + # Honor an explicit project_id from the request when present; otherwise + # fall back to the registered evaluator's project_id. Without this + # fallback, requests that omit project_id silently route into a + # per-evaluator-name auto-created project (Eval(project_id=None) uses + # name as the project name) instead of the project the evaluator was + # registered against. + project_id = eval_data.get("project_id", evaluator.project_id) + try: eval_task = asyncio.create_task( EvalAsync( @@ -243,7 +251,7 @@ def stream_fn(event: SSEProgressEvent): "task": task, "experiment_name": eval_data.get("experiment_name"), "parent": parent, - "project_id": eval_data.get("project_id"), + "project_id": project_id, }, ) ) diff --git a/py/src/braintrust/devserver/test_server_integration.py b/py/src/braintrust/devserver/test_server_integration.py index b74f4821..5875dd93 100644 --- a/py/src/braintrust/devserver/test_server_integration.py +++ b/py/src/braintrust/devserver/test_server_integration.py @@ -298,3 +298,126 @@ async def fake_eval_async(*, task, data, parameters, **_kwargs): ) assert response.status_code == 200 + + +def test_eval_falls_back_to_evaluator_project_id_when_request_omits_it(api_key, org_name, monkeypatch): + """run_eval must honor the registered evaluator's project_id when the request omits it. 
+ + Regression: ``run_eval`` builds ``EvalAsync(...)`` kwargs with + ``{**eval_kwargs, ..., "project_id": eval_data.get("project_id")}``. + The trailing key always wins in dict-spread merging, so a request + that omits ``project_id`` clobbers the registered evaluator's + ``project_id`` to ``None``. ``EvalAsync`` then falls back to using + ``name`` as the project name (per ``framework.Eval`` docstring), + routing experiments into a per-evaluator-name auto-created project + instead of the project the evaluator was registered against. + """ + from braintrust import Evaluator + from braintrust.devserver import server as devserver_module + from braintrust.devserver.server import create_app + from braintrust.logger import BraintrustState + from starlette.testclient import TestClient + + evaluator = Evaluator( + project_name="ignored-project-name", + eval_name="project-id-fallback-eval", + data=lambda: [{"input": "ping", "expected": "pong"}], + task=lambda input, _hooks: "pong", + scores=[], + experiment_name=None, + metadata=None, + project_id="evaluator-registered-project-id", + ) + + captured: dict[str, Any] = {} + + async def fake_cached_login(**_kwargs): + return BraintrustState() + + class FakeSummary: + def as_dict(self): + return {"experiment_name": evaluator.eval_name, "project_name": "", "scores": {}} + + class FakeResult: + summary = FakeSummary() + + async def fake_eval_async(*, project_id, **_kwargs): + captured["project_id"] = project_id + return FakeResult() + + monkeypatch.setattr(devserver_module, "cached_login", fake_cached_login) + monkeypatch.setattr(devserver_module, "EvalAsync", fake_eval_async) + + response = TestClient(create_app([evaluator])).post( + "/eval", + headers={ + "x-bt-auth-token": api_key, + "x-bt-org-name": org_name, + "Content-Type": "application/json", + }, + json={ + "name": "project-id-fallback-eval", + "stream": False, + "data": [{"input": "ping", "expected": "pong"}], + }, + ) + + assert response.status_code == 200 + assert 
captured["project_id"] == "evaluator-registered-project-id" + + +def test_eval_request_project_id_overrides_evaluator(api_key, org_name, monkeypatch): + """An explicit ``project_id`` in the request body still takes precedence.""" + from braintrust import Evaluator + from braintrust.devserver import server as devserver_module + from braintrust.devserver.server import create_app + from braintrust.logger import BraintrustState + from starlette.testclient import TestClient + + evaluator = Evaluator( + project_name="ignored-project-name", + eval_name="project-id-override-eval", + data=lambda: [{"input": "ping", "expected": "pong"}], + task=lambda input, _hooks: "pong", + scores=[], + experiment_name=None, + metadata=None, + project_id="evaluator-registered-project-id", + ) + + captured: dict[str, Any] = {} + + async def fake_cached_login(**_kwargs): + return BraintrustState() + + class FakeSummary: + def as_dict(self): + return {"experiment_name": evaluator.eval_name, "project_name": "", "scores": {}} + + class FakeResult: + summary = FakeSummary() + + async def fake_eval_async(*, project_id, **_kwargs): + captured["project_id"] = project_id + return FakeResult() + + monkeypatch.setattr(devserver_module, "cached_login", fake_cached_login) + monkeypatch.setattr(devserver_module, "EvalAsync", fake_eval_async) + + response = TestClient(create_app([evaluator])).post( + "/eval", + headers={ + "x-bt-auth-token": api_key, + "x-bt-org-name": org_name, + "Content-Type": "application/json", + }, + json={ + "name": "project-id-override-eval", + "stream": False, + "data": [{"input": "ping", "expected": "pong"}], + "project_id": "request-explicit-project-id", + }, + ) + + assert response.status_code == 200 + assert captured["project_id"] == "request-explicit-project-id" From 2f81ac1f99e50b93df7f9c898ab8ac0e92eb8a39 Mon Sep 17 00:00:00 2001 From: Abhijeet Prasad Date: Fri, 1 May 2026 15:28:26 -0400 Subject: [PATCH 2/2] use or to match backend devserver logic --- 
py/src/braintrust/devserver/server.py | 2 +- .../devserver/test_server_integration.py | 21 ++++++++++++------- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/py/src/braintrust/devserver/server.py b/py/src/braintrust/devserver/server.py index 56f7e449..a141fc79 100644 --- a/py/src/braintrust/devserver/server.py +++ b/py/src/braintrust/devserver/server.py @@ -231,7 +231,7 @@ def stream_fn(event: SSEProgressEvent): # per-evaluator-name auto-created project (Eval(project_id=None) uses # name as the project name) instead of the project the evaluator was # registered against. - project_id = eval_data.get("project_id", evaluator.project_id) + project_id = eval_data.get("project_id") or evaluator.project_id try: eval_task = asyncio.create_task( diff --git a/py/src/braintrust/devserver/test_server_integration.py b/py/src/braintrust/devserver/test_server_integration.py index 5875dd93..4a013684 100644 --- a/py/src/braintrust/devserver/test_server_integration.py +++ b/py/src/braintrust/devserver/test_server_integration.py @@ -300,8 +300,11 @@ async def fake_eval_async(*, task, data, parameters, **_kwargs): assert response.status_code == 200 -def test_eval_falls_back_to_evaluator_project_id_when_request_omits_it(api_key, org_name, monkeypatch): - """run_eval must honor the registered evaluator's project_id when the request omits it. +@pytest.mark.parametrize("request_project_id", [pytest.param("", id="empty"), pytest.param("__omit__", id="omitted")]) +def test_eval_falls_back_to_evaluator_project_id_when_request_omits_or_empty_it( + api_key, org_name, monkeypatch, request_project_id +): + """run_eval must honor the registered evaluator's project_id when the request omits/empties it. Regression: ``run_eval`` builds ``EvalAsync(...)`` kwargs with ``{**eval_kwargs, ..., "project_id": eval_data.get("project_id")}``. 
@@ -348,6 +351,14 @@ async def fake_eval_async(*, project_id, **_kwargs): monkeypatch.setattr(devserver_module, "cached_login", fake_cached_login) monkeypatch.setattr(devserver_module, "EvalAsync", fake_eval_async) + eval_request = { + "name": "project-id-fallback-eval", + "stream": False, + "data": [{"input": "ping", "expected": "pong"}], + } + if request_project_id != "__omit__": + eval_request["project_id"] = request_project_id + response = TestClient(create_app([evaluator])).post( "/eval", headers={ @@ -355,11 +366,7 @@ async def fake_eval_async(*, project_id, **_kwargs): "x-bt-org-name": org_name, "Content-Type": "application/json", }, - json={ - "name": "project-id-fallback-eval", - "stream": False, - "data": [{"input": "ping", "expected": "pong"}], - }, + json=eval_request, ) assert response.status_code == 200