From 27dc963fee70edb0db3cc35368ef5c6043e7d59e Mon Sep 17 00:00:00 2001 From: Abhijeet Prasad Date: Wed, 25 Mar 2026 14:07:37 -0700 Subject: [PATCH 1/3] feat: Add agentscope integration --- py/noxfile.py | 12 + py/src/braintrust/auto.py | 12 +- py/src/braintrust/conftest.py | 1 + py/src/braintrust/integrations/__init__.py | 9 +- .../braintrust/integrations/adk/test_adk.py | 1 + .../integrations/agentscope/__init__.py | 20 + ...quential_pipeline_creates_parent_span.yaml | 545 ++++++++++++++++++ .../test_agentscope_simple_agent_run.yaml | 320 ++++++++++ ...agentscope_tool_use_creates_tool_span.yaml | 451 +++++++++++++++ .../cassettes/test_auto_agentscope.yaml | 122 ++++ .../integrations/agentscope/integration.py | 26 + .../integrations/agentscope/patchers.py | 103 ++++ .../agentscope/test_agentscope.py | 221 +++++++ .../integrations/agentscope/tracing.py | 269 +++++++++ .../auto_test_scripts/test_auto_agentscope.py | 73 +++ .../braintrust/wrappers/test_google_genai.py | 1 + 16 files changed, 2184 insertions(+), 2 deletions(-) create mode 100644 py/src/braintrust/integrations/agentscope/__init__.py create mode 100644 py/src/braintrust/integrations/agentscope/cassettes/test_agentscope_sequential_pipeline_creates_parent_span.yaml create mode 100644 py/src/braintrust/integrations/agentscope/cassettes/test_agentscope_simple_agent_run.yaml create mode 100644 py/src/braintrust/integrations/agentscope/cassettes/test_agentscope_tool_use_creates_tool_span.yaml create mode 100644 py/src/braintrust/integrations/agentscope/cassettes/test_auto_agentscope.yaml create mode 100644 py/src/braintrust/integrations/agentscope/integration.py create mode 100644 py/src/braintrust/integrations/agentscope/patchers.py create mode 100644 py/src/braintrust/integrations/agentscope/test_agentscope.py create mode 100644 py/src/braintrust/integrations/agentscope/tracing.py create mode 100644 py/src/braintrust/integrations/auto_test_scripts/test_auto_agentscope.py diff --git a/py/noxfile.py b/py/noxfile.py index 61ea0aec..0f50d440 100644 --- a/py/noxfile.py +++ b/py/noxfile.py @@ -62,6 +62,7 @@ def _pinned_python_version(): # validate things work with or without them. VENDOR_PACKAGES = ( "agno", + "agentscope", "anthropic", "dspy", "openai", @@ -89,6 +90,7 @@ def _pinned_python_version(): # Keep LATEST for newest API coverage, and pin 2.4.0 to cover the 2.4 -> 2.5 breaking change # to internals we leverage for instrumentation. AGNO_VERSIONS = (LATEST, "2.4.0", "2.1.0") +AGENTSCOPE_VERSIONS = (LATEST, "1.0.0") # pydantic_ai 1.x requires Python >= 3.10 # Two test suites with different version requirements: # 1. wrap_openai approach: works with older versions (0.1.9+) @@ -172,6 +174,16 @@ def test_agno(session, version): _run_core_tests(session) +@nox.session() +@nox.parametrize("version", AGENTSCOPE_VERSIONS, ids=AGENTSCOPE_VERSIONS) +def test_agentscope(session, version): + _install_test_deps(session) + _install(session, "agentscope", version) + _install(session, "openai") + _run_tests(session, f"{INTEGRATION_DIR}/agentscope/test_agentscope.py") + _run_core_tests(session) + + @nox.session() @nox.parametrize("version", ANTHROPIC_VERSIONS, ids=ANTHROPIC_VERSIONS) def test_anthropic(session, version): diff --git a/py/src/braintrust/auto.py b/py/src/braintrust/auto.py index 25dd436a..7ac407aa 100644 --- a/py/src/braintrust/auto.py +++ b/py/src/braintrust/auto.py @@ -7,7 +7,13 @@ import logging from contextlib import contextmanager -from braintrust.integrations import ADKIntegration, AgnoIntegration, AnthropicIntegration, ClaudeAgentSDKIntegration +from braintrust.integrations import ( + ADKIntegration, + AgentScopeIntegration, + AgnoIntegration, + AnthropicIntegration, + ClaudeAgentSDKIntegration, +) __all__ = ["auto_instrument"] @@ -34,6 +40,7 @@ def auto_instrument( pydantic_ai: bool = True, google_genai: bool = True, agno: bool = True, + agentscope: bool = True, claude_agent_sdk: bool = True, dspy: bool = True, adk: bool = True, @@ -54,6 +61,7 @@ def auto_instrument( pydantic_ai: Enable Pydantic AI instrumentation (default: True) google_genai: Enable Google GenAI instrumentation (default: True) agno: Enable Agno instrumentation (default: True) + agentscope: Enable AgentScope instrumentation (default: True) claude_agent_sdk: Enable Claude Agent SDK instrumentation (default: True) dspy: Enable DSPy instrumentation (default: True) adk: Enable Google ADK instrumentation (default: True) @@ -116,6 +124,8 @@ def auto_instrument( results["google_genai"] = _instrument_google_genai() if agno: results["agno"] = _instrument_integration(AgnoIntegration) + if agentscope: + results["agentscope"] = _instrument_integration(AgentScopeIntegration) if claude_agent_sdk: results["claude_agent_sdk"] = _instrument_integration(ClaudeAgentSDKIntegration) if dspy: diff --git a/py/src/braintrust/conftest.py b/py/src/braintrust/conftest.py index 0fbdf40b..2345b227 100644 --- a/py/src/braintrust/conftest.py +++ b/py/src/braintrust/conftest.py @@ -191,6 +191,7 @@ def get_vcr_config(): "decode_compressed_response": True, "filter_headers": [ "authorization", + "Authorization", "openai-organization", "x-api-key", "api-key", diff --git a/py/src/braintrust/integrations/__init__.py b/py/src/braintrust/integrations/__init__.py index 35324c1c..095f7f35 100644 --- a/py/src/braintrust/integrations/__init__.py +++ b/py/src/braintrust/integrations/__init__.py @@ -1,7 +1,14 @@ from .adk import ADKIntegration +from .agentscope import AgentScopeIntegration from .agno import AgnoIntegration from .anthropic import AnthropicIntegration from .claude_agent_sdk import ClaudeAgentSDKIntegration -__all__ = ["ADKIntegration", "AgnoIntegration", "AnthropicIntegration", "ClaudeAgentSDKIntegration"] +__all__ = [ + "ADKIntegration", + "AgentScopeIntegration", + "AgnoIntegration", + "AnthropicIntegration", + "ClaudeAgentSDKIntegration", +] diff --git a/py/src/braintrust/integrations/adk/test_adk.py b/py/src/braintrust/integrations/adk/test_adk.py index bed6f3e6..9d9be979 100644 --- a/py/src/braintrust/integrations/adk/test_adk.py +++ b/py/src/braintrust/integrations/adk/test_adk.py @@ -41,6 +41,7 @@ def before_record_request(request): "cassette_library_dir": str(Path(__file__).parent / "cassettes"), "filter_headers": [ "authorization", + "Authorization", "x-goog-api-key", ], "before_record_request": before_record_request, diff --git a/py/src/braintrust/integrations/agentscope/__init__.py b/py/src/braintrust/integrations/agentscope/__init__.py new file mode 100644 index 00000000..534f1db8 --- /dev/null +++ b/py/src/braintrust/integrations/agentscope/__init__.py @@ -0,0 +1,20 @@ +"""Braintrust integration for AgentScope.""" + +from braintrust.logger import NOOP_SPAN, current_span, init_logger + +from .integration import AgentScopeIntegration + + +__all__ = ["AgentScopeIntegration", "setup_agentscope"] + + +def setup_agentscope( + api_key: str | None = None, + project_id: str | None = None, + project_name: str | None = None, +) -> bool: + """Setup Braintrust integration with AgentScope.""" + if current_span() == NOOP_SPAN: + init_logger(project=project_name, api_key=api_key, project_id=project_id) + + return AgentScopeIntegration.setup() diff --git a/py/src/braintrust/integrations/agentscope/cassettes/test_agentscope_sequential_pipeline_creates_parent_span.yaml b/py/src/braintrust/integrations/agentscope/cassettes/test_agentscope_sequential_pipeline_creates_parent_span.yaml new file mode 100644 index 00000000..1ceea647 --- /dev/null +++ b/py/src/braintrust/integrations/agentscope/cassettes/test_agentscope_sequential_pipeline_creates_parent_span.yaml @@ -0,0 +1,545 @@ +interactions: +- request: + body: '{"messages":[{"role":"system","content":[{"type":"text","text":"You rewrite + the input as a short title."}]},{"role":"user","content":[{"type":"text","text":"# + Conversation History\nThe content between tags contains + your conversation history\n\nuser: Summarize why tests should use real + recorded traffic.\n"}]}],"model":"gpt-4o-mini","stream":false,"temperature":0}' + headers: + Accept: + - application/json + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + Content-Length: + - '403' + Content-Type: + - application/json + Host: + - api.openai.com + User-Agent: + - AsyncOpenAI/Python 2.29.0 + X-Stainless-Arch: + - arm64 + X-Stainless-Async: + - async:asyncio + X-Stainless-Lang: + - python + X-Stainless-OS: + - MacOS + X-Stainless-Package-Version: + - 2.29.0 + X-Stainless-Runtime: + - CPython + X-Stainless-Runtime-Version: + - 3.13.3 + x-stainless-read-timeout: + - '600' + x-stainless-retry-count: + - '0' + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAA/6rmUlBQykxRslJQSs5ILEnOLcjRdfELyCzzdwMAAAD//41SwW7bMAy95ysMneMi + yZwou/a2a5EehqIwFImytcmiIcpbhyH/PspOarfrgF0EmI/v+fGR9+0ed1V1+Hl4HI5f2oevzekU + BrHODDx/A51urDuNzIPkMEywjqASZNWtlFUld/KzHIEODfhMa/pUVlh2Lrhyt9lV5UaW2+OV3aLT + QNz2xJ9F8Xt8s89g4IXLm/Wt0gGRaoBrtyYuRvS5IhSRo6RCEusZ1BgShNH6PQSwLlGBtngkF5ri + AZTnR2M0YIpTVNY6XbhQnIASNyyFItiBVB4mDN4vABUCJpXDGEd4viKXV9Memz7imd5RheUwqK05 + O+Ig2SAl7MWIXvh9HsMZ3swrWKjrU53wO4y/28tJTswrmcHjFUvsz8/lw379gVhtICnnaZGt0Eq3 + YGbmvAg1GIcLYLUY+W8vH2lPY3PI/yM/A1pDz6dW9xGM02/nndsi5Hv9V9trxKNhQRB/8AHWyUHM + azBg1eCnKxL0ixJ0Ne+qgdhHN52S7WutPoE0cnu2YnVZ/QE7MxO8WAMAAA== + headers: + CF-Cache-Status: + - DYNAMIC + CF-Ray: + - 9e20e166fbae67b2-SJC + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Wed, 25 Mar 2026 21:06:38 GMT + Server: + - cloudflare + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - braintrust-data + openai-processing-ms: + - '664' + openai-project: + - proj_vsCSXafhhByzWOThMrJcZiw9 + openai-version: + - '2020-10-01' + set-cookie: + - __cf_bm=0vN_GTd.Taalah9PDsZg_Ru.1_PcZ_NBP9qkR2MCsFE-1774472797.273874-1.0.1.1-_HsrwKoaPPyMDTMtccbmEvGb.WDYukiKlNhKyLTp32aZR8vwDwqATyzmrwTg82HAg9bVn2GQnmrENihz.LTaMxGxvJCORGScpnet2yitftoFB0LwZa12LFWkWMzlprHK; + HttpOnly; Secure; Path=/; Domain=api.openai.com; Expires=Wed, 25 Mar 2026 + 21:36:38 GMT + x-openai-proxy-wasm: + - v0.1 + x-ratelimit-limit-requests: + - '30000' + x-ratelimit-limit-tokens: + - '150000000' + x-ratelimit-remaining-requests: + - '29999' + x-ratelimit-remaining-tokens: + - '149999940' + x-ratelimit-reset-requests: + - 2ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_bcc5d231b4f24f049b2e0a0ba5d880a9 + status: + code: 200 + message: OK +- request: + body: '{"messages":[{"role":"system","content":[{"type":"text","text":"You rewrite + the input as a short title."}]},{"role":"user","content":[{"type":"text","text":"# + Conversation History\nThe content between tags contains + your conversation history\n\nuser: Summarize why tests should use real + recorded traffic.\n"}]}],"model":"gpt-4o-mini","stream":false,"temperature":0}' + headers: + Accept: + - application/json + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + Content-Length: + - '403' + Content-Type: + - application/json + Host: + - api.openai.com + User-Agent: + - AsyncOpenAI/Python 2.29.0 + X-Stainless-Arch: + - arm64 + X-Stainless-Async: + - async:asyncio + X-Stainless-Lang: + - python + X-Stainless-OS: + - MacOS + X-Stainless-Package-Version: + - 2.29.0 + X-Stainless-Runtime: + - CPython + X-Stainless-Runtime-Version: + - 3.13.3 + x-stainless-read-timeout: + - '600' + x-stainless-retry-count: + - '0' + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAA/4xSwW7bMAy95ysEneMhcdI56G3FTus2DFt3GgpDkShHjSwJIj2sKPLvo+2kdrcO + 2EWA+fieHx/5tBBCOiOvhdQHRbpNvnj/+cvDR328uXUfNv7WfTuGlSrfVZ9Sh4+tXPaMuH8ATRfW + Gx2ZB+RiGGGdQRH0quuq2m6rclduBqCNBnxPaxIV21i0LriiXJXbYlUV692ZfYhOA3LbD/4U4ml4 + e5/BwC8ur5aXSguIqgGuXZq4mKPvK1IhOiQVSC4nUMdAEAbrNxDAOkIRrfiOLjTiKyjPj47ZgBF3 + WVnrtHBB3AESN8yFMtgOVT9M6LyfASqESKoPYxjh/oycnk372KQc9/gHVVoOAw81Z4ccJBtEikkO + 6Inf+yGc7sW8koXaRDXFIwy/u6pGOTmtZAJ3Z4zYn5/Kb6+Wr4jVBkg5j7NspVb6AGZiTotQnXFx + BixmI//t5TXtcWwO+X/kJ0BrSHxqdcpgnH4579SWob/Xf7U9RzwYlgj5Jx9gTQ5yvwYDVnV+vCKJ + j0jQ1ryrBnLKbjwlm2qtNlCZar23cnFa/AYAAP//AwB2CCbyWAMAAA== + headers: + CF-Cache-Status: + - DYNAMIC + CF-Ray: + - 9e20e208d9766142-SJC + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Wed, 25 Mar 2026 21:07:03 GMT + Server: + - cloudflare + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - braintrust-data + openai-processing-ms: + - '652' + openai-project: + - proj_vsCSXafhhByzWOThMrJcZiw9 + openai-version: + - '2020-10-01' + set-cookie: + - __cf_bm=nPA03jdSYpPIt1MCtq5c3dH__SdRu5fP0HUZaKsaPsk-1774472823.1741066-1.0.1.1-QeI7TdvczvDfZcpacbJMilgyA_s79AH1EgKxSLO_1Z_BUh6jqZue4vjSpv9Sr.ihxNrRzZkdyq7EzAFHuIxCU9THwR_hF6iuXTWOT0BzCGX_rHDteSnXa7BlEfiRqaBH; + HttpOnly; Secure; Path=/; Domain=api.openai.com; Expires=Wed, 25 Mar 2026 + 21:37:03 GMT + x-openai-proxy-wasm: + - v0.1 + x-ratelimit-limit-requests: + - '30000' + x-ratelimit-limit-tokens: + - '150000000' + x-ratelimit-remaining-requests: + - '29999' + x-ratelimit-remaining-tokens: + - '149999940' + x-ratelimit-reset-requests: + - 2ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_09e84f9d413d4a8d9495fef9e4133024 + status: + code: 200 + message: OK +- request: + body: '{"messages":[{"role":"system","content":[{"type":"text","text":"You answer + the previous message in one sentence."}]},{"role":"user","content":[{"type":"text","text":"# + Conversation History\nThe content between tags contains + your conversation history\n\nAlice: Benefits of Using Real Recorded + Traffic in Testing\n"}]}],"model":"gpt-4o-mini","stream":false,"temperature":0}' + headers: + Accept: + - application/json + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + Content-Length: + - '410' + Content-Type: + - application/json + Host: + - api.openai.com + User-Agent: + - AsyncOpenAI/Python 2.29.0 + X-Stainless-Arch: + - arm64 + X-Stainless-Async: + - async:asyncio + X-Stainless-Lang: + - python + X-Stainless-OS: + - MacOS + X-Stainless-Package-Version: + - 2.29.0 + X-Stainless-Runtime: + - CPython + X-Stainless-Runtime-Version: + - 3.13.3 + x-stainless-read-timeout: + - '600' + x-stainless-retry-count: + - '0' + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAA/6rmUlBQykxRslJQSs5ILEnOLcjRdfELyPKtcAYAAAD//4xTy27bMBC8+ysInqXA + dpQqPRfIMUBb9FQEAk2upG35ApcyYhT+9y6lOHLaFOiFgHZ2docz1OeTg+bT1y/mMH54GB7H5+ZW + pV5WhREOP0DnC+tGB+ZBxuAXWCdQGcrUXds2Tbu/3zcz4IIBW2hDzHUTaoce6/1239Tbtt7dv7DH + gBqI277zpxC/5rPo9AaeubytLhUHRGoArl2auJiCLRWpiJCy8llWK6iDz+Bn6d8I/SBYquVDh2TA + iJxU36MW6EUGyqUB/Kg86xFK6ykpfaoEupjCkUsRUh+SKzhTCIcxUyWUN2IEG0mg4V3Yn0QMZSvy + KiSamMkLlHAhwSyAdfJS8EdMwTvuvLnWnKCfSBXf/GTtFaC8D1kV32e3nl6Q86s/Ngys9EB/UGXP + vtPY8WrizNgLyiHKGT3z+TTnML2xVvIgF3OXw0+Y193dLePkmv4KLnkzmFmgXevtx+qdaZ2BrNDS + VY5SKz2CWZlr6GoyGK6AzdWd/xbz3uzl3hzu/4xfAa0h8rPuYgKD+u2F17YE5d/4V9urx7NgSZCO + /Ni7jJBKDgZ6NdnlxUo6UQbXcVgDpJhwebZ97LS6hda0u0MvN+fNbwAAAP//AwDs2pZ8xAMAAA== + headers: + CF-Cache-Status: + - DYNAMIC + CF-Ray: + - 9e20e20db9eddfce-SJC + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Wed, 25 Mar 2026 21:07:04 GMT + Server: + - cloudflare + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - braintrust-data + openai-processing-ms: + - '669' + openai-project: + - proj_vsCSXafhhByzWOThMrJcZiw9 + openai-version: + - '2020-10-01' + set-cookie: + - __cf_bm=HayDHNMCRsqHCSCg5YnFHqR8EGszTInfOwZSaeO5eyg-1774472823.957494-1.0.1.1-T1qVHFP.Enq2BHk4byqWQwdinEXG5pjJnQ6C55kaXNumTZcQVGLUbKoXNUrcL8OlUtndtdfReMfEtfZ5GDvmyZ8zIPnM28rC8KnQNIAYwe3oTr24xzHY4xyLaaAQfXw_; + HttpOnly; Secure; Path=/; Domain=api.openai.com; Expires=Wed, 25 Mar 2026 + 21:37:04 GMT + x-openai-proxy-wasm: + - v0.1 + x-ratelimit-limit-requests: + - '30000' + x-ratelimit-limit-tokens: + - '150000000' + x-ratelimit-remaining-requests: + - '29999' + x-ratelimit-remaining-tokens: + - '149999940' + x-ratelimit-reset-requests: + - 2ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_074bde6fdf2a4c4caf77d081e3a5af31 + status: + code: 200 + message: OK +- request: + body: '{"messages":[{"role":"system","content":[{"type":"text","text":"You rewrite + the input as a short title."}]},{"role":"user","content":[{"type":"text","text":"# + Conversation History\nThe content between tags contains + your conversation history\n\nuser: Summarize why tests should use real + recorded traffic.\n"}]}],"model":"gpt-4o-mini","stream":false,"temperature":0}' + headers: + Accept: + - application/json + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + Content-Length: + - '403' + Content-Type: + - application/json + Host: + - api.openai.com + User-Agent: + - AsyncOpenAI/Python 2.29.0 + X-Stainless-Arch: + - arm64 + X-Stainless-Async: + - async:asyncio + X-Stainless-Lang: + - python + X-Stainless-OS: + - MacOS + X-Stainless-Package-Version: + - 2.29.0 + X-Stainless-Runtime: + - CPython + X-Stainless-Runtime-Version: + - 3.13.3 + x-stainless-read-timeout: + - '600' + x-stainless-retry-count: + - '0' + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAA/6rmUlBQykxRslJQSs5ILEnOLcjRdfELyDb29QAAAAD//4xSTW/bMAy951cYOsdD + vlrl3AI99DJsaIGiQ2EoEuWolSVPpLsMQ/77KDup3a0DdhFgPr7nx0c+SL99PHxZP94crg/XPz6v + v6/tw62YZ0bcPYOmM+uTjswDcjEMsE6gCLLqUsrNRq62l7IHmmjAZ1rdUrmJZeOCK1eL1aZcyHK5 + PbH30WlAbvvGn0Xxq3+zz2DgwOXF/FxpAFHVwLVzExdT9LkiFKJDUoHEfAR1DASht34FAawjLKIt + 7tGFuvgKyvOjYzJgirukrHW6cKG4AyRumAolsB2qPEzovJ8AKoRIKofRj/B0Qo5vpn2s2xR3+AdV + WA4D9xVnhxwkG0SKrejRI79PfTjdu3kFCzUtVRRfoP/dhRzkxLiSEdyeMGJ/fixfXsw/EKsMkHIe + J9kKrfQezMgcF6E64+IEmE1G/tvLR9rD2Bzy/8iPgNbQ8qlVbQLj9Pt5x7YE+V7/1fYWcW9YIKRX + PsCKHKS8BgNWdX64IoE/kaCpeFc1pDa54ZRsW2m1BmnkcmfF7Dj7DQAA//8DANl/ZwFYAwAA + headers: + CF-Cache-Status: + - DYNAMIC + CF-Ray: + - 9e20e31aaae4cf8f-SJC + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Wed, 25 Mar 2026 21:07:47 GMT + Server: + - cloudflare + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - braintrust-data + openai-processing-ms: + - '724' + openai-project: + - proj_vsCSXafhhByzWOThMrJcZiw9 + openai-version: + - '2020-10-01' + set-cookie: + - __cf_bm=Mzctd7Vf4Pj8hLREk5uV3xfhGAJqHTBdS7Z68xdqJ3E-1774472866.9835687-1.0.1.1-9060awKZeUAS__HpRhOh3ZDNpHvhVrftE8gP2f5h5qVQ7SUhdLjp1QfnUBiOHckFloYERDcS4nSTfy3q7RPbo.rO9ak4DB2RBS7.iNAo4m7Yek2xS4nBLaBUTUBc7VfU; + HttpOnly; Secure; Path=/; Domain=api.openai.com; Expires=Wed, 25 Mar 2026 + 21:37:47 GMT + x-openai-proxy-wasm: + - v0.1 + x-ratelimit-limit-requests: + - '30000' + x-ratelimit-limit-tokens: + - '150000000' + x-ratelimit-remaining-requests: + - '29999' + x-ratelimit-remaining-tokens: + - '149999940' + x-ratelimit-reset-requests: + - 2ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_923040bc8f624770996dc8d06bd77fef + status: + code: 200 + message: OK +- request: + body: '{"messages":[{"role":"system","content":[{"type":"text","text":"You answer + the previous message in one sentence."}]},{"role":"user","content":[{"type":"text","text":"# + Conversation History\nThe content between tags contains + your conversation history\n\nAlice: Benefits of Using Real Recorded + Traffic in Testing\n"}]}],"model":"gpt-4o-mini","stream":false,"temperature":0}' + headers: + Accept: + - application/json + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + Content-Length: + - '410' + Content-Type: + - application/json + Host: + - api.openai.com + User-Agent: + - AsyncOpenAI/Python 2.29.0 + X-Stainless-Arch: + - arm64 + X-Stainless-Async: + - async:asyncio + X-Stainless-Lang: + - python + X-Stainless-OS: + - MacOS + X-Stainless-Package-Version: + - 2.29.0 + X-Stainless-Runtime: + - CPython + X-Stainless-Runtime-Version: + - 3.13.3 + x-stainless-read-timeout: + - '600' + x-stainless-retry-count: + - '0' + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAA/6rmUlBQykxRslJQSs5ILEnOLcjRdfELyDbOSAcAAAD//4xTwW7bMAy95ysEnZ0i + yZy6uw/tYehp6y5DYSgSbbOVJUGUswZD/n2UndTp2gG7CDAfH/n0nvz19un2W7i//rK6t9u777vD + rx/q4UUWmeF3T6DTmXWlPfMgoXcTrCOoBHnquqrKstrcXFcj0HsDNtPakJalX/bocLlZbcrlqlqu + b07szqMG4raf/CnE7/HMOp2BFy6vinOlByLVAtfOTVyM3uaKVERISbkkixnU3iVwo/QHQtcKlmr5 + 0D4aMCJF1TSoBTqRgFJuANcpx3qE0nqISh8KgX2Ifs+lALHxsc84UwjbLlEhlDOiAxtIoOFd2BxE + 8Hkr8iokGpjJC5TofYRRAOvkpeD2GL3rufPqUnOEZiCVfXODtReAcs4nlX0f3Xo8IcdXf6xvWemO + /qLKhn2nrubVxJmxF5R8kCN65PNxzGF4Y63kQX1IdfLPMK7bbqdxck5/BjflCUws0M716nPxwbTa + QFJo6SJHqZXuwMzMOXQ1GPQXwOLizu/FfDR7ujeH+z/jZ0BrCPys6xDBoH574bktQv43/tX26vEo + WBLEPT/2OiHEnIOBRg12erGSDpSgrzmsFmKIOD3bJtRafYLKVOtdIxfHxR8AAAD//wMAQ8dY3cQD + AAA= + headers: + CF-Cache-Status: + - DYNAMIC + CF-Ray: + - 9e20e31fec3dc132-SJC + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Wed, 25 Mar 2026 21:07:48 GMT + Server: + - cloudflare + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - braintrust-data + openai-processing-ms: + - '768' + openai-project: + - proj_vsCSXafhhByzWOThMrJcZiw9 + openai-version: + - '2020-10-01' + set-cookie: + - __cf_bm=JYxrn1QLzqghAM1PJW6V89UMbmiLatd1atmmQw6NPr0-1774472867.8239858-1.0.1.1-hwGgJY4XYCiJtbeBfm6fdTUACpklHSNTd64qYVFPn23d73s8.NBRfBlLx6nUV4d3.tfIInsBJq50FlZh7Wv9iTFtj7HY1hVkAYRHdnNzeq_4_piEn49lFlc_GkLVdmE0; + HttpOnly; Secure; Path=/; Domain=api.openai.com; Expires=Wed, 25 Mar 2026 + 21:37:48 GMT + x-openai-proxy-wasm: + - v0.1 + x-ratelimit-limit-requests: + - '30000' + x-ratelimit-limit-tokens: + - '150000000' + x-ratelimit-remaining-requests: + - '29999' + x-ratelimit-remaining-tokens: + - '149999940' + x-ratelimit-reset-requests: + - 2ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_13f99272933545f8bac789ebd64bb0f1 + status: + code: 200 + message: OK +version: 1 diff --git a/py/src/braintrust/integrations/agentscope/cassettes/test_agentscope_simple_agent_run.yaml b/py/src/braintrust/integrations/agentscope/cassettes/test_agentscope_simple_agent_run.yaml new file mode 100644 index 00000000..87f6358e --- /dev/null +++ b/py/src/braintrust/integrations/agentscope/cassettes/test_agentscope_simple_agent_run.yaml @@ -0,0 +1,320 @@ +interactions: +- request: + body: '{"messages":[{"role":"system","name":"system","content":[{"type":"text","text":"You + are a concise assistant. Answer in one sentence."}]},{"role":"user","name":"user","content":[{"type":"text","text":"Say + hello in exactly two words."}]}],"model":"gpt-4o-mini","stream":false,"temperature":0}' + headers: + Accept: + - application/json + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + Content-Length: + - '290' + Content-Type: + - application/json + Host: + - api.openai.com + User-Agent: + - AsyncOpenAI/Python 2.29.0 + X-Stainless-Arch: + - arm64 + X-Stainless-Async: + - async:asyncio + X-Stainless-Lang: + - python + X-Stainless-OS: + - MacOS + X-Stainless-Package-Version: + - 2.29.0 + X-Stainless-Runtime: + - CPython + X-Stainless-Runtime-Version: + - 3.13.3 + x-stainless-read-timeout: + - '600' + x-stainless-retry-count: + - '0' + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAA/6rmUlBQykxRslJQSs5ILEnOLcjRdfELyCzNtgAAAAD//4xSsU7DMBDd+xWV56Rq + 05TQGYbODCyoilz7khgc29hOFYT675ydtglQJBZLvnfv/N7zPRzNkb33u02/qXTPN48AT89bkgSG + PrwC8xfWgmnkgRdaDTCzQD2EqauiyPMiK7Z3EWg1BxlotfFprtNWKJFmyyxPl0W6uj+zGy0YOGx7 + wet8/hnPoFNx6LG8TC6VFpyjNWDt0oRFq2WoEOqccJ4qT5IRZFp5UFH6DqTUc9+AhcW0xULVORpk + qk7KCUCV0p4Gm1Hc/oycrnKkro3VB/eDSiq06ZoSU3EYET7tvDYkoic899F2980JwUGt8aXXbxCf + W2fDODKGPQHPmEd9clLeJDeGlRw8FdJNUiOMsgb4yBwjph0XegLMJpZ/a7k1e7AtVP2f8SPAGBhc + otJY4IJ99zu2WQib+FfbNeIomDiwR1yt0guw4Rs4VLSTw34Q9+E8tCX+VQ3WWDEsSWVKRtdQ8GJ1 + qMjsNPsCAAD//wMAQQCebTIDAAA= + headers: + CF-Cache-Status: + - DYNAMIC + CF-Ray: + - 9e20e1608aaf6142-SJC + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Wed, 25 Mar 2026 21:06:37 GMT + Server: + - cloudflare + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - braintrust-data + openai-processing-ms: + - '373' + openai-project: + - proj_vsCSXafhhByzWOThMrJcZiw9 + openai-version: + - '2020-10-01' + set-cookie: + - __cf_bm=0r2bm90d_zmUe.y8EYZpgoGyTVS4QQSDoJVx.yDpJng-1774472796.2480545-1.0.1.1-onYisUL_Bju9EhfGXQnZBZkwk3gdjG7tHXVdr34BVePUh3JL0OqfVWApVIaF_KDBKfw4HIiGBvzONzv_AS91kbK7eL.FzFDwILNg8_F1h3hsPpZO.pIoeUN1dp_.acW6; + HttpOnly; Secure; Path=/; Domain=api.openai.com; Expires=Wed, 25 Mar 2026 + 21:36:37 GMT + x-openai-proxy-wasm: + - v0.1 + x-ratelimit-limit-requests: + - '30000' + x-ratelimit-limit-tokens: + - '150000000' + x-ratelimit-remaining-requests: + - '29999' + x-ratelimit-remaining-tokens: + - '149999975' + x-ratelimit-reset-requests: + - 2ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_0b44866bb8cd459db8712e04e4248889 + status: + code: 200 + message: OK +- request: + body: '{"messages":[{"role":"system","name":"system","content":[{"type":"text","text":"You + are a concise assistant. Answer in one sentence."}]},{"role":"user","name":"user","content":[{"type":"text","text":"Say + hello in exactly two words."}]}],"model":"gpt-4o-mini","stream":false,"temperature":0}' + headers: + Accept: + - application/json + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + Content-Length: + - '290' + Content-Type: + - application/json + Host: + - api.openai.com + User-Agent: + - AsyncOpenAI/Python 2.29.0 + X-Stainless-Arch: + - arm64 + X-Stainless-Async: + - async:asyncio + X-Stainless-Lang: + - python + X-Stainless-OS: + - MacOS + X-Stainless-Package-Version: + - 2.29.0 + X-Stainless-Runtime: + - CPython + X-Stainless-Runtime-Version: + - 3.13.3 + x-stainless-read-timeout: + - '600' + x-stainless-retry-count: + - '0' + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAA/6rmUlBQykxRslJQSs5ILEnOLcjRdfELyPJOswAAAAD//4xS0WrCMBR99yskz61o + 7aiv7mX6MJgwtsGQEpPbNjNNQpKOjeG/76ZVWzcHewnknntuzjm5t/dsebeB9Wq/qZ7WD4/L55eU + piQKDL17A+ZPrAnTyAMvtOpgZoF6CFNnWZamWbJIkhaoNQcZaKXxcarjWigRJ9MkjadZPFsc2ZUW + DBy2veJ1PP5qz6BTcfjA8jQ6VWpwjpaAtVMTFq2WoUKoc8J5qjyJepBp5UG10lcgpR77CixMhi0W + isbRIFM1Ug4AqpT2NNhsxW2PyOEsR+rSWL1zP6ikQJuuyjEVhxHh085rQ1r0gOe2td1cOCE4qDY+ + 93oP7XPzpBtH+rAH4BHzqE8OyjfRlWE5B0+FdIPUCKOsAt4z+4hpw4UeAKOB5d9ars3ubAtV/md8 + DzAGBpcoNxa4YJd++zYLYRP/ajtH3AomDuw7rlbuBdjwDRwK2shuP4j7dB7qHP+qBGus6JakMDmj + c8h4NtsVZHQYfQMAAP//AwDajPLhMgMAAA== + headers: + CF-Cache-Status: + - DYNAMIC + CF-Ray: + - 9e20e202698a251d-SJC + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Wed, 25 Mar 2026 21:07:03 GMT + Server: + - cloudflare + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - braintrust-data + openai-processing-ms: + - '334' + openai-project: + - proj_vsCSXafhhByzWOThMrJcZiw9 + openai-version: + - '2020-10-01' + set-cookie: + - __cf_bm=GJt8W0lyxqfgtKXEJ4M8twcG5pNqc4RmQiPqQ_IukiU-1774472822.1494222-1.0.1.1-Jv3zKpnCjFAAQQaXBEt3RElEP.QjtEFbqrvr8BASrk5X7XSiOj1UBc4tUR3t9QbKmOM0VrcVW6R3HYLaYxbTHQqz4Dpjl7Z.Sz9BslefycjBprbfLjQ1aoOYxrSO7lkO; + HttpOnly; Secure; Path=/; Domain=api.openai.com; Expires=Wed, 25 Mar 2026 + 21:37:03 GMT + x-openai-proxy-wasm: + - v0.1 + x-ratelimit-limit-requests: + - '30000' + x-ratelimit-limit-tokens: + - '150000000' + x-ratelimit-remaining-requests: + - '29999' + x-ratelimit-remaining-tokens: + - '149999977' + x-ratelimit-reset-requests: + - 2ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_8c118a37a5da44069de28fba911081e0 + status: + code: 200 + message: OK +- request: + body: '{"messages":[{"role":"system","name":"system","content":[{"type":"text","text":"You + are a concise assistant. Answer in one sentence."}]},{"role":"user","name":"user","content":[{"type":"text","text":"Say + hello in exactly two words."}]}],"model":"gpt-4o-mini","stream":false,"temperature":0}' + headers: + Accept: + - application/json + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + Content-Length: + - '290' + Content-Type: + - application/json + Host: + - api.openai.com + User-Agent: + - AsyncOpenAI/Python 2.29.0 + X-Stainless-Arch: + - arm64 + X-Stainless-Async: + - async:asyncio + X-Stainless-Lang: + - python + X-Stainless-OS: + - MacOS + X-Stainless-Package-Version: + - 2.29.0 + X-Stainless-Runtime: + - CPython + X-Stainless-Runtime-Version: + - 3.13.3 + x-stainless-read-timeout: + - '600' + x-stainless-retry-count: + - '0' + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAA/6rmUlBQykxRslJQSs5ILEnOLcjRdfELyDbKMgIAAAD//4xSQU7DMBC89xWVz0mV + pKHhDALBDSFxQlXk2pvE4NiW7dAi1L+zTtomhSJxseSdnfXMeO+zuxuWbp9vt4/pU/WyonrHk45E + gaE3b8D8kbVgGnnghVYDzCxQD2FqWhR5XmTXq1UPtJqDDLTa+DjXcSuUiLMky+OkiNPrA7vRgoHD + tle8zudf/Rl0Kg47LCfRsdKCc7QGrB2bsGi1DBVCnRPOU+VJNIJMKw+ql/4AUuq5b8DCYtpioeoc + DTJVJ+UEoEppT4PNXtz6gOxPcqSujdUb94NKKrTpmhJTcRgRPu28NqRH93iue9vdmROCg1rjS6/f + oX9umQ3jyBj2BDxgHvXJSfkqujCs5OCpkG6SGmGUNcBH5hgx7bjQE2A2sfxby6XZg22h6v+MHwHG + wOASlcYCF+zc79hmIWziX22niHvBxIH9wNUqvQAbvoFDRTs57Adxn85DW+Jf1WCNFcOSVKZkdAkF + L9JNRWb72TcAAAD//wMAHqai9DIDAAA= + headers: + CF-Cache-Status: + - DYNAMIC + CF-Ray: + - 9e20e3168f481d99-SJC + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Wed, 25 Mar 2026 21:07:46 GMT + Server: + - cloudflare + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - braintrust-data + openai-processing-ms: + - '370' + openai-project: + - proj_vsCSXafhhByzWOThMrJcZiw9 + openai-version: + - '2020-10-01' + set-cookie: + - __cf_bm=QmJ_mn8jkQXEiES_f8MtNfFrG.TdUhMi9F3CsYesW.Y-1774472866.3273165-1.0.1.1-9jM9NCSYxoeVuM4GWc_rXbAFSxA2USepp4rz5niKNzmDK.TtF1V7PSKWLir8akfbpuyXvva5QQRFgDCcMT3P0xa_1Tzm9mW9uidzQIoQBbq5O6t0XUQzY8bQC4ddKj1i; + HttpOnly; Secure; Path=/; Domain=api.openai.com; Expires=Wed, 25 Mar 2026 + 21:37:46 GMT + x-openai-proxy-wasm: + - v0.1 + x-ratelimit-limit-requests: + - '30000' + x-ratelimit-limit-tokens: + - '150000000' + x-ratelimit-remaining-requests: + - '29999' + x-ratelimit-remaining-tokens: + - '149999977' + x-ratelimit-reset-requests: + - 2ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_2ac33c8b77a343ea881d0fe0fbeccc14 + status: + code: 200 + message: OK +version: 1 diff --git a/py/src/braintrust/integrations/agentscope/cassettes/test_agentscope_tool_use_creates_tool_span.yaml b/py/src/braintrust/integrations/agentscope/cassettes/test_agentscope_tool_use_creates_tool_span.yaml new file mode 100644 index 00000000..8b272df9 --- /dev/null +++ b/py/src/braintrust/integrations/agentscope/cassettes/test_agentscope_tool_use_creates_tool_span.yaml @@ -0,0 +1,451 @@ +interactions: +- request: + body: '{"messages":[{"role":"system","name":"system","content":[{"type":"text","text":"You + are a helpful assistant. Use tools when required and keep answers brief."}]},{"role":"user","name":"user","content":[{"type":"text","text":"Use + Python to compute 6 * 7 and return just the result."}]}],"model":"gpt-4o-mini","stream":false,"temperature":0,"tools":[{"type":"function","function":{"name":"execute_python_code","parameters":{"properties":{"code":{"description":"The + Python code to be executed.","type":"string"},"timeout":{"default":300,"description":"The + maximum time (in seconds) allowed for the code to run.","type":"number"}},"required":["code"],"type":"object"},"description":"Execute + the given python code in a temp file and capture the return\ncode, standard + output and error. Note you must `print` the output to get\nthe result, and the + tmp file will be removed right after the execution."}}]}' + headers: + Accept: + - application/json + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + Content-Length: + - '897' + Content-Type: + - application/json + Host: + - api.openai.com + User-Agent: + - AsyncOpenAI/Python 2.29.0 + X-Stainless-Arch: + - arm64 + X-Stainless-Async: + - async:asyncio + X-Stainless-Lang: + - python + X-Stainless-OS: + - MacOS + X-Stainless-Package-Version: + - 2.29.0 + X-Stainless-Runtime: + - CPython + X-Stainless-Runtime-Version: + - 3.13.3 + x-stainless-read-timeout: + - '600' + x-stainless-retry-count: + - '0' + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAA/6rmUlBQykxRslJQSs5ILEnOLcjRdfELyKxINAYAAAD//41TXW+bMBR951cgP21T + mICSklTawzZtUvuwTZ32sI7Kcs0NeDM2tU3TKMp/n20SIGkmjQcE99xzfD+ObxT9unj88PPmS5as + 85q+v9usSYVmjiEffgM1B9ZbKi0PDJOih6kCYsCpJnmeZXmaL5ceaGQJ3NGq1kSZjBomWJTGaRbF + eZQs9uxaMgrapv2yv2G49W9Xpyjh2Ybj2SHSgNakAhs7JNmgktxFENGaaUOEQbMRpFIYEK500XE+ + AYyUHFPC+Xhw/2wn3+OwbCI2Fx8vmutvn0hbPz3yz3fXvLldf/+xmJzXS29aX9CqE3QY0gQf4lcn + h1lMkMZz4RloZwC3G1NLgakd5ImMTSaq6hrbnWsBbQvksgp0VSAFuuMmfBdehm/CvChEq5gwr/rw + 6wLt0JHULjj3fT+ZloJVpwl/OUYihDTEdePneL9HdsPKuKxaJR/0CRWtrBV0ja1ztJ/EdCHBoRBf + AuqOdo6sXNMabOQf8Icm87RXRaMvRzSd70Fj6+QTVp7PzujhEgxh3hSDDymhNZQjdfQj6UomJ0Aw + 6f1lNee0+/6ZqP5HfgQohdbeONwqKBk97nhMU+Cu7b/Shin7gpEG9WTvITYMlNtHCSti3dLfUb3R + Bhpsl1aB8mbyBm9xHM/LyyRe5gsU7IK/crsEFF8EAAA= + headers: + CF-Cache-Status: + - DYNAMIC + CF-Ray: + - 9e20e16f0b9aaf0d-SJC + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Wed, 25 Mar 2026 21:06:40 GMT + Server: + - cloudflare + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - braintrust-data + openai-processing-ms: + - '1239' + openai-project: + - proj_vsCSXafhhByzWOThMrJcZiw9 + openai-version: + - '2020-10-01' + set-cookie: + - __cf_bm=ncH1zypNLZZs6ohCgRXU6EAQ0iJm0PqJ2cW8rY7HVI8-1774472798.5665667-1.0.1.1-k0DS.3SU2tAkF.r3ZjBzLnG4fwZB2Fu5meB0y3aUWpZtAKeGqa66nGagt.hDaY2vSKIEbmMrwf7bozk5KFrU29xqoOc0X32xTuik1N0lOr3jOPFP1_u5ceUvxEjzvGTM; + HttpOnly; Secure; Path=/; Domain=api.openai.com; Expires=Wed, 25 Mar 2026 + 21:36:40 GMT + x-openai-proxy-wasm: + - v0.1 + x-ratelimit-limit-requests: + - '30000' + x-ratelimit-limit-tokens: + - '150000000' + x-ratelimit-remaining-requests: + - '29999' + x-ratelimit-remaining-tokens: + - '149999962' + x-ratelimit-reset-requests: + - 2ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_02a17794efcb4607a4ef7c7281e6d187 + status: + code: 200 + message: OK +- request: + body: '{"messages":[{"role":"system","name":"system","content":[{"type":"text","text":"You + are a helpful assistant. Use tools when required and keep answers brief."}]},{"role":"user","name":"user","content":[{"type":"text","text":"Use + Python to compute 6 * 7 and return just the result."}]}],"model":"gpt-4o-mini","stream":false,"temperature":0,"tools":[{"type":"function","function":{"name":"execute_python_code","parameters":{"properties":{"code":{"description":"The + Python code to be executed.","type":"string"},"timeout":{"default":300,"description":"The + maximum time (in seconds) allowed for the code to run.","type":"number"}},"required":["code"],"type":"object"},"description":"Execute + the given python code in a temp file and capture the return\ncode, standard + output and error. Note you must `print` the output to get\nthe result, and the + tmp file will be removed right after the execution."}}]}' + headers: + Accept: + - application/json + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + Content-Length: + - '897' + Content-Type: + - application/json + Host: + - api.openai.com + User-Agent: + - AsyncOpenAI/Python 2.29.0 + X-Stainless-Arch: + - arm64 + X-Stainless-Async: + - async:asyncio + X-Stainless-Lang: + - python + X-Stainless-OS: + - MacOS + X-Stainless-Package-Version: + - 2.29.0 + X-Stainless-Runtime: + - CPython + X-Stainless-Runtime-Version: + - 3.13.3 + x-stainless-read-timeout: + - '600' + x-stainless-retry-count: + - '0' + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAA/4xTXWvbMBR9968wetpGPBzj1ElgD2NlHSsrhY21tC5ClW9sdbJkJDkkDfnvk+Qk + dtIM5gdj33PP0f042gRhiFiB5iGiFTG0bnh0eXP78uP2Cyw+x79urq713e9LU94v5fr+q6jRyDHk + 8wtQs2d9pNLywDApOpgqIAac6jjL0jRLpknqgVoWwB2tbEyUyqhmgkVJnKRRnEXj6Y5dSUZB27RH + +xuGG/92dYoCVjYcj/aRGrQmJdjYPskGleQugojWTBsiDBr1IJXCgHCli5bzAWCk5JgSzvuDu2cz + +O6HZRPx3Wv6IK+vxvKbjOPlagbVKjM/Z98H53XS68YXtGgFPQxpgB/i85PDLCZI7bmwAtoawM3a + VFJgagd5ImOTiSrb2nbnWkCbHLmsHM1zpEC33ISfwovwQ5jluWgUE+ZdF36foy06ktoG576fBtNS + sGg14W/HSISQhrhu/Byfdsj2sDIuy0bJZ31CRQtrBV1h6xztJzFcSLAvxJeA2qOdIytXNwYb+Qf8 + oeNJ0qmi3pc9mkx2oLF18gEry0Zn9HABhjBvioMPKaEVFD219yNpCyYHQDDo/W0157S7/pko/0e+ + ByiFxt443CgoGD3uuE9T4K7tv9IOU/YFIw1qae8hNgyU20cBC2Ld0t1RvdYGamyXVoLyZvIGb3Ac + T4qLcTzLpijYBn8BAAD//wMAHPLnAF8EAAA= + headers: + CF-Cache-Status: + - DYNAMIC + CF-Ray: + - 9e20e212f84f7e56-SJC + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Wed, 25 Mar 2026 21:07:05 GMT + Server: + - cloudflare + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - braintrust-data + openai-processing-ms: + - '760' + openai-project: + - proj_vsCSXafhhByzWOThMrJcZiw9 + openai-version: + - '2020-10-01' + set-cookie: + - __cf_bm=cEkxxhx5PI02Ywh8nf.Xs6jrFBlPOsqdtqnmu2dQoUs-1774472824.7966123-1.0.1.1-l8rKWW5YXcEa4qiqAmWX7gYkTwxcPBKKDdUiau.bKkThNBJdZMRBk8E6aelB55XL8mGvLIuOpvyEL1u_F6R5238q7PZg1iu7hVZPFcKKIeqC1cnWzWuetLrt0ahyQJcC; + HttpOnly; Secure; Path=/; Domain=api.openai.com; Expires=Wed, 25 Mar 2026 + 21:37:05 GMT + x-openai-proxy-wasm: + - v0.1 + x-ratelimit-limit-requests: + - '30000' + x-ratelimit-limit-tokens: + - '150000000' + x-ratelimit-remaining-requests: + - '29999' + x-ratelimit-remaining-tokens: + - '149999965' + x-ratelimit-reset-requests: + - 2ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_5e3d6b5e29b545dc85b5ddedeb732996 + status: + code: 200 + message: OK +- request: + body: '{"messages":[{"role":"system","name":"system","content":[{"type":"text","text":"You + are a helpful assistant. Use tools when required and keep answers brief."}]},{"role":"user","name":"user","content":[{"type":"text","text":"Use + Python to compute 6 * 7 and return just the result."}]}],"model":"gpt-4o-mini","stream":false,"temperature":0,"tools":[{"type":"function","function":{"name":"execute_python_code","parameters":{"properties":{"code":{"description":"The + Python code to be executed.","type":"string"},"timeout":{"default":300,"description":"The + maximum time (in seconds) allowed for the code to run.","type":"number"}},"required":["code"],"type":"object"},"description":"Execute + the given python code in a temp file and capture the return\ncode, standard + output and error. Note you must `print` the output to get\nthe result, and the + tmp file will be removed right after the execution."}}]}' + headers: + Accept: + - application/json + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + Content-Length: + - '897' + Content-Type: + - application/json + Host: + - api.openai.com + User-Agent: + - AsyncOpenAI/Python 2.29.0 + X-Stainless-Arch: + - arm64 + X-Stainless-Async: + - async:asyncio + X-Stainless-Lang: + - python + X-Stainless-OS: + - MacOS + X-Stainless-Package-Version: + - 2.29.0 + X-Stainless-Runtime: + - CPython + X-Stainless-Runtime-Version: + - 3.13.3 + x-stainless-read-timeout: + - '600' + x-stainless-retry-count: + - '0' + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAA/6rmUlBQykxRslJQSs5ILEnOLcjRdfELyDYtDwEAAAD//4xTXW+bMBR951cgP21T + mADR0FTaw6ZuL6m2h+WlHZXlmBvi1tjMNmmzKP99tkmApJk0HhDcc8/x/Tguk+dGP8hNJedPt2Jz + //B1/rKYo4ljyOUTUHNkfaTS8sAwKTqYKiAGnGqS51mWp9fTmQdqWQJ3tKoxUSajmgkWpXGaRXEe + JdcH9loyCtqm/bK/Ybjzb1enKOHVhuPJMVKD1qQCGzsm2aCS3EUQ0ZppQ4RBkwGkUhgQrnTRcj4C + jJQcU8L5cHD37Ebfw7BsIs7n2Uv14/Zukc9+t1/ufork23fx+c9idF4nvW18QatW0H5II7yP35wd + ZjFBas+FV6CtAdxszVoKTO0gz2RsMlFVW9vuXAtoVyCXVaCbAinQLTfhp3AafgjzohCNYsK868Lv + C7RHJ1L74NL342haClatJvztGIkQ0hDXjZ/j4wHZ9yvjsmqUXOozKlpZK+g1ts7RfhLjhQTHQnwJ + qD3ZObJydWOwkc/gD02u0k4VDb4c0PTqABpbJx+x8nxyQQ+XYAjzpuh9SAldQzlQBz+StmRyBASj + 3t9Wc0m765+J6n/kB4BSaOyNw42CktHTjoc0Be7a/iutn7IvGGlQG3sPsWGg3D5KWBHrlu6O6q02 + UGO7tAqUN5M3eIOzlKbTeJnHMxTsg78AAAD//wMAxy9FZF8EAAA= + headers: + CF-Cache-Status: + - DYNAMIC + CF-Ray: + - 9e20e3264eb7f005-SJC + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Wed, 25 Mar 2026 21:07:49 GMT + Server: + - cloudflare + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - braintrust-data + openai-processing-ms: + - '724' + openai-project: + - proj_vsCSXafhhByzWOThMrJcZiw9 + openai-version: + - '2020-10-01' + set-cookie: + - __cf_bm=0XpUc1fNMZh7X5an8o2lKE5Me2U0kr5yZMBg0xANHr0-1774472868.8462937-1.0.1.1-dprcxP4hyQ0IPDL_vk1NK7FsJ11DyabA3P8I942JNvTn7zKho4A0pRDev9WYlxrW1LwDshBeG3MLMUsQs5Y9hxWVr.Wu3JoHqUF6i2Ho7_hr0NaKJDRn8f.qyG5mv6hc; + HttpOnly; Secure; Path=/; Domain=api.openai.com; Expires=Wed, 25 Mar 2026 + 21:37:49 GMT + x-openai-proxy-wasm: + - v0.1 + x-ratelimit-limit-requests: + - '30000' + x-ratelimit-limit-tokens: + - '150000000' + x-ratelimit-remaining-requests: + - '29999' + x-ratelimit-remaining-tokens: + - '149999962' + x-ratelimit-reset-requests: + - 2ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_4413081d046145c59f7df6e62731b407 + status: + code: 200 + message: OK +- request: + body: '{"messages":[{"role":"system","name":"system","content":[{"type":"text","text":"You + are a helpful assistant. Use tools when required and keep answers brief."}]},{"role":"user","name":"user","content":[{"type":"text","text":"Use + Python to compute 6 * 7 and return just the result."}]},{"role":"assistant","name":"Jarvis","content":null,"tool_calls":[{"id":"call_7K4wgODLT79quBLSn1FNnAzT","type":"function","function":{"name":"execute_python_code","arguments":"{\"code\": + \"result = 6 * 7\\nprint(result)\"}"}}]},{"role":"tool","tool_call_id":"call_7K4wgODLT79quBLSn1FNnAzT","content":"042\n","name":"execute_python_code"}],"model":"gpt-4o-mini","stream":false,"temperature":0,"tools":[{"type":"function","function":{"name":"execute_python_code","parameters":{"properties":{"code":{"description":"The + Python code to be executed.","type":"string"},"timeout":{"default":300,"description":"The + maximum time (in seconds) allowed for the code to run.","type":"number"}},"required":["code"],"type":"object"},"description":"Execute + the given python code in a temp file and capture the return\ncode, standard + output and error. Note you must `print` the output to get\nthe result, and the + tmp file will be removed right after the execution."}}]}' + headers: + Accept: + - application/json + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + Content-Length: + - '1293' + Content-Type: + - application/json + Cookie: + - __cf_bm=0XpUc1fNMZh7X5an8o2lKE5Me2U0kr5yZMBg0xANHr0-1774472868.8462937-1.0.1.1-dprcxP4hyQ0IPDL_vk1NK7FsJ11DyabA3P8I942JNvTn7zKho4A0pRDev9WYlxrW1LwDshBeG3MLMUsQs5Y9hxWVr.Wu3JoHqUF6i2Ho7_hr0NaKJDRn8f.qyG5mv6hc + Host: + - api.openai.com + User-Agent: + - AsyncOpenAI/Python 2.29.0 + X-Stainless-Arch: + - arm64 + X-Stainless-Async: + - async:asyncio + X-Stainless-Lang: + - python + X-Stainless-OS: + - MacOS + X-Stainless-Package-Version: + - 2.29.0 + X-Stainless-Runtime: + - CPython + X-Stainless-Runtime-Version: + - 3.13.3 + x-stainless-read-timeout: + - '600' + x-stainless-retry-count: + - '0' + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAA/6rmUlBQykxRslJQSs5ILEnOLcjRdfELyDZzdwUAAAD//41SwW6cMBC971dYPqXS + EgGiC7lGinLLKbcmQo4ZwAnYyDO0Sav9947NJpB2K/Vi7c6b93jzZp4r3/rsZ3t9c3Orf3RV22d3 + RSH3geGenkHTO+tSO+YBGWcXWHtQBEE1K8uiKPOqTCMwugaGQOsmSgqXjMaaJE/zIknLJKtO7N4Z + Dcht3/ivEL/iG3zaBl65HLViZQRE1QHX3pu46N0QKlIhGiRlSe5XUDtLYKP1+x6EB5wHEq4VDw8X + 4sAvGRYVJf/6IgyKIr/c0j20M6owgp2HYQMoax2pEEE0/nhCjh9WB9dN3j3hH1TZcgTY15wYcnxs + C8lNMqJHfh9jJPOnKSULjRPV5F4gfi5PrxY9uW5iRbPyBBI7HDas/LA/o1c3QMoMuAlVaqV7aFbq + ugE1N8ZtgN1m6r/dnNNeJje2+x/5FdAaJr6xevLQGP154rXNQzjUf7V9pBwNSwT/nS+vJgM+bKKB + VvF1LFeJb0gw1ryuDvzkzXJD7VSn6dfmkKVXZSV3x91vPJeasFEDAAA= + headers: + CF-Cache-Status: + - DYNAMIC + CF-Ray: + - 9e20e32d6800f95b-SJC + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Wed, 25 Mar 2026 21:07:51 GMT + Server: + - cloudflare + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - braintrust-data + openai-processing-ms: + - '642' + openai-project: + - proj_vsCSXafhhByzWOThMrJcZiw9 + openai-version: + - '2020-10-01' + x-openai-proxy-wasm: + - v0.1 + x-ratelimit-limit-requests: + - '30000' + x-ratelimit-limit-tokens: + - '150000000' + x-ratelimit-remaining-requests: + - '29999' + x-ratelimit-remaining-tokens: + - '149999947' + x-ratelimit-reset-requests: + - 2ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_962b361c03444494ac60f59571e1d91c + status: + code: 200 + message: OK +version: 1 diff --git a/py/src/braintrust/integrations/agentscope/cassettes/test_auto_agentscope.yaml b/py/src/braintrust/integrations/agentscope/cassettes/test_auto_agentscope.yaml new file mode 100644 index 00000000..221dcabb --- /dev/null +++ b/py/src/braintrust/integrations/agentscope/cassettes/test_auto_agentscope.yaml @@ -0,0 +1,122 @@ +interactions: +- request: + body: '{"messages":[{"role":"system","name":"system","content":[{"type":"text","text":"You + are a helpful assistant. Be brief."}]},{"role":"user","name":"user","content":[{"type":"text","text":"Say + hi in two words."}]}],"model":"gpt-4o-mini","stream":true,"stream_options":{"include_usage":true},"temperature":0}' + headers: + Accept: + - application/json + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + Content-Length: + - '304' + Content-Type: + - application/json + Host: + - api.openai.com + User-Agent: + - AsyncOpenAI/Python 2.29.0 + X-Stainless-Arch: + - arm64 + X-Stainless-Async: + - async:asyncio + X-Stainless-Lang: + - python + X-Stainless-OS: + - MacOS + X-Stainless-Package-Version: + - 2.29.0 + X-Stainless-Raw-Response: + - 'true' + X-Stainless-Runtime: + - CPython + X-Stainless-Runtime-Version: + - 3.13.3 + x-stainless-read-timeout: + - '600' + x-stainless-retry-count: + - '0' + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: 'data: {"id":"chatcmpl-DNPizxiSOmMQFp69qki8RpeoxlQWv","object":"chat.completion.chunk","created":1774472801,"model":"gpt-4o-mini-2024-07-18","service_tier":"default","system_fingerprint":"fp_ca3e7d71bf","choices":[{"index":0,"delta":{"role":"assistant","content":"","refusal":null},"logprobs":null,"finish_reason":null}],"usage":null,"obfuscation":"bSxoCuDli"} + + + data: {"id":"chatcmpl-DNPizxiSOmMQFp69qki8RpeoxlQWv","object":"chat.completion.chunk","created":1774472801,"model":"gpt-4o-mini-2024-07-18","service_tier":"default","system_fingerprint":"fp_ca3e7d71bf","choices":[{"index":0,"delta":{"content":"Hello"},"logprobs":null,"finish_reason":null}],"usage":null,"obfuscation":"RR3fPq"} + + + data: {"id":"chatcmpl-DNPizxiSOmMQFp69qki8RpeoxlQWv","object":"chat.completion.chunk","created":1774472801,"model":"gpt-4o-mini-2024-07-18","service_tier":"default","system_fingerprint":"fp_ca3e7d71bf","choices":[{"index":0,"delta":{"content":" + there"},"logprobs":null,"finish_reason":null}],"usage":null,"obfuscation":"Ks8Wg"} + + + data: {"id":"chatcmpl-DNPizxiSOmMQFp69qki8RpeoxlQWv","object":"chat.completion.chunk","created":1774472801,"model":"gpt-4o-mini-2024-07-18","service_tier":"default","system_fingerprint":"fp_ca3e7d71bf","choices":[{"index":0,"delta":{"content":"!"},"logprobs":null,"finish_reason":null}],"usage":null,"obfuscation":"OTuTCk6Yhl"} + + + data: {"id":"chatcmpl-DNPizxiSOmMQFp69qki8RpeoxlQWv","object":"chat.completion.chunk","created":1774472801,"model":"gpt-4o-mini-2024-07-18","service_tier":"default","system_fingerprint":"fp_ca3e7d71bf","choices":[{"index":0,"delta":{},"logprobs":null,"finish_reason":"stop"}],"usage":null,"obfuscation":"Ky1jT"} + + + data: {"id":"chatcmpl-DNPizxiSOmMQFp69qki8RpeoxlQWv","object":"chat.completion.chunk","created":1774472801,"model":"gpt-4o-mini-2024-07-18","service_tier":"default","system_fingerprint":"fp_ca3e7d71bf","choices":[],"usage":{"prompt_tokens":29,"completion_tokens":3,"total_tokens":32,"prompt_tokens_details":{"cached_tokens":0,"audio_tokens":0},"completion_tokens_details":{"reasoning_tokens":0,"audio_tokens":0,"accepted_prediction_tokens":0,"rejected_prediction_tokens":0}},"obfuscation":"tz0KSnmcPPt"} + + + data: [DONE] + + + ' + headers: + CF-Cache-Status: + - DYNAMIC + CF-Ray: + - 9e20e1806b8d67a6-SJC + Connection: + - keep-alive + Content-Type: + - text/event-stream; charset=utf-8 + Date: + - Wed, 25 Mar 2026 21:06:43 GMT + Server: + - cloudflare + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - braintrust-data + openai-processing-ms: + - '1276' + openai-project: + - proj_vsCSXafhhByzWOThMrJcZiw9 + openai-version: + - '2020-10-01' + set-cookie: + - __cf_bm=26gJQ7Ja3taZFTBb7A3G23kqPcIKnacz5qIbjguQkYs-1774472801.3417854-1.0.1.1-9JqXxOO8Hh_qhFlPUB0VZAqRq_.bnwDdhOr_sD9UVAZbZxHIG013WyWO1wxnxzpoHB2eF6tlQndU7CKalttwp.wptRHYq2G6erRwpDHPPQiZU_8r.6r_TsmfH2ya11Un; + HttpOnly; Secure; Path=/; Domain=api.openai.com; Expires=Wed, 25 Mar 2026 + 21:36:43 GMT + x-openai-proxy-wasm: + - v0.1 + x-ratelimit-limit-requests: + - '30000' + x-ratelimit-limit-tokens: + - '150000000' + x-ratelimit-remaining-requests: + - '29999' + x-ratelimit-remaining-tokens: + - '149999982' + x-ratelimit-reset-requests: + - 2ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_369a94628d4547e69137ec894aa584f3 + status: + code: 200 + message: OK +version: 1 diff --git a/py/src/braintrust/integrations/agentscope/integration.py b/py/src/braintrust/integrations/agentscope/integration.py new file mode 100644 index 00000000..fac8fe8d --- /dev/null +++ b/py/src/braintrust/integrations/agentscope/integration.py @@ -0,0 +1,26 @@ +"""AgentScope integration orchestration.""" + +from braintrust.integrations.base import BaseIntegration + +from .patchers import ( + AgentCallPatcher, + ChatModelPatcher, + FanoutPipelinePatcher, + SequentialPipelinePatcher, + ToolkitCallToolFunctionPatcher, +) + + +class AgentScopeIntegration(BaseIntegration): + """Braintrust instrumentation for AgentScope. Requires AgentScope v1.0.0 or higher.""" + + name = "agentscope" + import_names = ("agentscope",) + min_version = "1.0.0" + patchers = ( + AgentCallPatcher, + SequentialPipelinePatcher, + FanoutPipelinePatcher, + ToolkitCallToolFunctionPatcher, + ChatModelPatcher, + ) diff --git a/py/src/braintrust/integrations/agentscope/patchers.py b/py/src/braintrust/integrations/agentscope/patchers.py new file mode 100644 index 00000000..a0a9ba21 --- /dev/null +++ b/py/src/braintrust/integrations/agentscope/patchers.py @@ -0,0 +1,103 @@ +"""AgentScope patchers.""" + +from braintrust.integrations.base import CompositeFunctionWrapperPatcher, FunctionWrapperPatcher + +from .tracing import ( + _agent_call_wrapper, + _fanout_pipeline_wrapper, + _model_call_wrapper, + _sequential_pipeline_wrapper, + _toolkit_call_tool_function_wrapper, +) + + +class AgentCallPatcher(FunctionWrapperPatcher): + """Patch AgentScope agent execution.""" + + name = "agentscope.agent.call" + target_module = "agentscope.agent" + target_path = "AgentBase.__call__" + wrapper = _agent_call_wrapper + + +class SequentialPipelinePatcher(FunctionWrapperPatcher): + """Patch AgentScope sequential pipeline execution.""" + + name = "agentscope.pipeline.sequential" + target_module = "agentscope.pipeline" + target_path = "sequential_pipeline" + wrapper = _sequential_pipeline_wrapper + + +class FanoutPipelinePatcher(FunctionWrapperPatcher): + """Patch AgentScope fanout pipeline execution.""" + + name = "agentscope.pipeline.fanout" + target_module = "agentscope.pipeline" + target_path = "fanout_pipeline" + wrapper = _fanout_pipeline_wrapper + + +class ToolkitCallToolFunctionPatcher(FunctionWrapperPatcher): + """Patch AgentScope toolkit execution.""" + + name = "agentscope.tool.call_tool_function" + target_module = "agentscope.tool" + target_path = "Toolkit.call_tool_function" + wrapper = _toolkit_call_tool_function_wrapper + + +class _OpenAIChatModelPatcher(FunctionWrapperPatcher): + name = "agentscope.model.openai" + target_module = "agentscope.model" + target_path = "OpenAIChatModel.__call__" + wrapper = _model_call_wrapper + + +class _DashScopeChatModelPatcher(FunctionWrapperPatcher): + name = "agentscope.model.dashscope" + target_module = "agentscope.model" + target_path = "DashScopeChatModel.__call__" + wrapper = _model_call_wrapper + + +class _AnthropicChatModelPatcher(FunctionWrapperPatcher): + name = "agentscope.model.anthropic" + target_module = "agentscope.model" + target_path = "AnthropicChatModel.__call__" + wrapper = _model_call_wrapper + + +class _OllamaChatModelPatcher(FunctionWrapperPatcher): + name = "agentscope.model.ollama" + target_module = "agentscope.model" + target_path = "OllamaChatModel.__call__" + wrapper = _model_call_wrapper + + +class _GeminiChatModelPatcher(FunctionWrapperPatcher): + name = "agentscope.model.gemini" + target_module = "agentscope.model" + target_path = "GeminiChatModel.__call__" + wrapper = _model_call_wrapper + + +class _TrinityChatModelPatcher(FunctionWrapperPatcher): + name = "agentscope.model.trinity" + target_module = "agentscope.model" + target_path = "TrinityChatModel.__call__" + wrapper = _model_call_wrapper + + +class ChatModelPatcher(CompositeFunctionWrapperPatcher): + """Patch the built-in AgentScope chat model implementations.""" + + name = "agentscope.model" + sub_patchers = ( + _OpenAIChatModelPatcher, + _DashScopeChatModelPatcher, + _AnthropicChatModelPatcher, + _OllamaChatModelPatcher, + _GeminiChatModelPatcher, + _TrinityChatModelPatcher, + ) diff --git a/py/src/braintrust/integrations/agentscope/test_agentscope.py b/py/src/braintrust/integrations/agentscope/test_agentscope.py new file mode 100644 index 00000000..ee09064a --- /dev/null +++ b/py/src/braintrust/integrations/agentscope/test_agentscope.py @@ -0,0 +1,221 @@ +from pathlib import Path + +import pytest +from braintrust import logger +from braintrust.integrations.agentscope import setup_agentscope +from braintrust.span_types import SpanTypeAttribute +from braintrust.test_helpers import init_test_logger +from braintrust.wrappers.test_utils import verify_autoinstrument_script + + +PROJECT_NAME = "test_agentscope" + +setup_agentscope(project_name=PROJECT_NAME) + + +@pytest.fixture(scope="module") +def vcr_config(): + return { + "cassette_library_dir": str(Path(__file__).parent / "cassettes"), + } + + +@pytest.fixture +def memory_logger(): + init_test_logger(PROJECT_NAME) + with logger._internal_with_memory_background_logger() as bgl: + yield bgl + + +def _span_type(span): + span_type = span["span_attributes"]["type"] + return span_type.value if hasattr(span_type, "value") else span_type + + +def _make_model(*, stream: bool = False): + from agentscope.model import OpenAIChatModel + + return OpenAIChatModel( + model_name="gpt-4o-mini", + stream=stream, + generate_kwargs={"temperature": 0}, + ) + + +def _make_agent(name: str, sys_prompt: str, *, toolkit=None, multi_agent: bool = False): + from agentscope.agent import ReActAgent + from agentscope.formatter import OpenAIChatFormatter, OpenAIMultiAgentFormatter + from agentscope.memory import InMemoryMemory + from agentscope.tool import Toolkit + + agent = ReActAgent( + name=name, + sys_prompt=sys_prompt, + model=_make_model(), + formatter=OpenAIMultiAgentFormatter() if multi_agent else OpenAIChatFormatter(), + toolkit=toolkit or Toolkit(), + memory=InMemoryMemory(), + ) + agent.set_console_output_enabled(False) + return agent + + +@pytest.mark.vcr +@pytest.mark.asyncio +async def test_agentscope_simple_agent_run(memory_logger): + from agentscope.message import Msg + + assert not memory_logger.pop() + + agent = _make_agent( + "Friday", + "You are a concise assistant. Answer in one sentence.", + ) + + response = await agent( + Msg( + name="user", + content="Say hello in exactly two words.", + role="user", + ) + ) + + assert response is not None + + spans = memory_logger.pop() + agent_span = next(span for span in spans if span["span_attributes"]["name"] == "Friday.reply") + llm_spans = [span for span in spans if _span_type(span) == SpanTypeAttribute.LLM] + + assert _span_type(agent_span) == "task" + assert llm_spans + assert llm_spans[0]["metadata"]["model"] == "gpt-4o-mini" + assert "args" not in llm_spans[0]["input"] + assert llm_spans[0]["input"]["messages"][0]["role"] == "system" + assert llm_spans[0]["input"]["messages"][1]["role"] == "user" + assert llm_spans[0]["input"]["messages"][1]["content"][0]["text"] == "Say hello in exactly two words." + assert llm_spans[0]["output"]["role"] == "assistant" + assert llm_spans[0]["output"]["content"][0]["text"] == "Hello there." + assert "usage" not in llm_spans[0]["output"] + assert agent_span["span_id"] in llm_spans[0]["span_parents"] + + +@pytest.mark.vcr +@pytest.mark.asyncio +async def test_agentscope_sequential_pipeline_creates_parent_span(memory_logger): + from agentscope.message import Msg + from agentscope.pipeline import sequential_pipeline + + assert not memory_logger.pop() + + agents = [ + _make_agent("Alice", "You rewrite the input as a short title.", multi_agent=True), + _make_agent("Bob", "You answer the previous message in one sentence.", multi_agent=True), + ] + + result = await sequential_pipeline( + agents=agents, + msg=Msg( + name="user", + content="Summarize why tests should use real recorded traffic.", + role="user", + ), + ) + + assert result is not None + + spans = memory_logger.pop() + pipeline_span = next(span for span in spans if span["span_attributes"]["name"] == "sequential_pipeline.run") + alice_span = next(span for span in spans if span["span_attributes"]["name"] == "Alice.reply") + bob_span = next(span for span in spans if span["span_attributes"]["name"] == "Bob.reply") + + assert _span_type(pipeline_span) == "task" + assert pipeline_span["span_id"] in alice_span["span_parents"] + assert pipeline_span["span_id"] in bob_span["span_parents"] + + +@pytest.mark.vcr +@pytest.mark.asyncio +async def test_agentscope_tool_use_creates_tool_span(memory_logger): + from agentscope.message import Msg + from agentscope.tool import Toolkit, execute_python_code + + assert not memory_logger.pop() + + toolkit = Toolkit() + toolkit.register_tool_function(execute_python_code) + agent = _make_agent( + "Jarvis", + "You are a helpful assistant. Use tools when required and keep answers brief.", + toolkit=toolkit, + ) + + response = await agent( + Msg( + name="user", + content="Use Python to compute 6 * 7 and return just the result.", + role="user", + ) + ) + + assert response is not None + + spans = memory_logger.pop() + tool_spans = [span for span in spans if _span_type(span) == "tool"] + + assert tool_spans + assert tool_spans[0]["span_attributes"]["name"] == "execute_python_code.execute" + assert tool_spans[0]["input"]["tool_name"] == "execute_python_code" + assert tool_spans[0]["output"]["content"] + + llm_spans = [span for span in spans if _span_type(span) == SpanTypeAttribute.LLM] + assert llm_spans + assert llm_spans[0]["output"]["role"] == "assistant" + assert llm_spans[0]["output"]["content"][0]["type"] == "tool_use" + assert "usage" not in llm_spans[0]["output"] + + +@pytest.mark.asyncio +async def test_model_call_wrapper_stream_logs_final_output_and_metrics(memory_logger): + from braintrust.integrations.agentscope.tracing import _model_call_wrapper + + assert not memory_logger.pop() + + class FakeOpenAIChatModel: + model_name = "gpt-4o-mini" + + async def wrapped(*_args, **_kwargs): + async def _stream(): + yield {"content": [{"type": "text", "text": "Hello"}]} + yield { + "content": [{"type": "text", "text": "Hello there!"}], + "usage": {"prompt_tokens": 29, "completion_tokens": 3, "total_tokens": 32}, + } + + return _stream() + + stream = await _model_call_wrapper( + wrapped, + FakeOpenAIChatModel(), + args=([{"role": "user", "content": "Say hi in two words."}],), + kwargs={}, + ) + + chunks = [chunk async for chunk in stream] + + assert chunks[-1]["content"][0]["text"] == "Hello there!" + + spans = memory_logger.pop() + assert len(spans) == 1 + llm_span = spans[0] + + assert _span_type(llm_span) == SpanTypeAttribute.LLM + assert llm_span["output"]["role"] == "assistant" + assert llm_span["output"]["content"][0]["text"] == "Hello there!" + assert llm_span["metrics"]["prompt_tokens"] == 29 + assert llm_span["metrics"]["completion_tokens"] == 3 + assert llm_span["metrics"]["tokens"] == 32 + + +class TestAutoInstrumentAgentScope: + def test_auto_instrument_agentscope(self): + verify_autoinstrument_script("test_auto_agentscope.py") diff --git a/py/src/braintrust/integrations/agentscope/tracing.py b/py/src/braintrust/integrations/agentscope/tracing.py new file mode 100644 index 00000000..efb4891d --- /dev/null +++ b/py/src/braintrust/integrations/agentscope/tracing.py @@ -0,0 +1,269 @@ +"""AgentScope-specific span creation and stream aggregation.""" + +from contextlib import aclosing +from typing import Any + +from braintrust.logger import start_span +from braintrust.span_types import SpanTypeAttribute + + +def _clean(mapping: dict[str, Any]) -> dict[str, Any]: + return {key: value for key, value in mapping.items() if value is not None} + + +def _args_kwargs_input(args: Any, kwargs: dict[str, Any]) -> dict[str, Any]: + return _clean( + { + "args": list(args) if args else None, + "kwargs": kwargs if kwargs else None, + } + ) + + +def _agent_name(instance: Any) -> str: + return getattr(instance, "name", None) or instance.__class__.__name__ + + +def _pipeline_metadata(args: Any, kwargs: dict[str, Any]) -> dict[str, Any]: + agents = kwargs.get("agents") + if agents is None and args: + agents = args[0] + + agent_names = None + if agents: + agent_names = [getattr(agent, "name", agent.__class__.__name__) for agent in agents] + + return _clean({"agent_names": agent_names}) + + +def _extract_metrics(*candidates: Any) -> dict[str, float] | None: + key_map = { + "prompt_tokens": "prompt_tokens", + "input_tokens": "prompt_tokens", + "completion_tokens": "completion_tokens", + "output_tokens": "completion_tokens", + "total_tokens": "tokens", + "tokens": "tokens", + } + + for candidate in candidates: + data = _field_value(candidate, "usage") or candidate + + metrics = {} + for source_key, target_key in key_map.items(): + value = _field_value(data, source_key) + if isinstance(value, (int, float)): + metrics[target_key] = float(value) + if metrics: + return metrics + + return None + + +def _model_provider_name(instance: Any) -> str: + class_name = instance.__class__.__name__ + if class_name.endswith("Model"): + return class_name[: -len("Model")] + return class_name + + +def _model_metadata(instance: Any) -> dict[str, Any]: + return _clean( + { + "model": getattr(instance, "model_name", None), + "provider": _model_provider_name(instance), + "model_class": instance.__class__.__name__, + } + ) + + +def _model_call_input(args: Any, kwargs: dict[str, Any]) -> dict[str, Any]: + messages = kwargs.get("messages") + if messages is None and args: + messages = args[0] + + tools = kwargs.get("tools") + if tools is None and len(args) > 1: + tools = args[1] + + tool_choice = kwargs.get("tool_choice") + if tool_choice is None and len(args) > 2: + tool_choice = args[2] + + structured_model = kwargs.get("structured_model") + if structured_model is None and len(args) > 3: + structured_model = args[3] + + return _clean( + { + "messages": messages, + "tools": tools, + "tool_choice": tool_choice, + "structured_model": structured_model, + } + ) + + +def _model_call_metadata(instance: Any, kwargs: dict[str, Any]) -> dict[str, Any]: + extra_kwargs = { + key: value + for key, value in kwargs.items() + if key not in {"messages", "tools", "tool_choice", "structured_model"} and value is not None + } + return {**_model_metadata(instance), **extra_kwargs} + + +def _model_call_output(result: Any) -> Any: + if isinstance(result, dict): + data = result + elif _field_value(result, "content") is not None or _field_value(result, "metadata") is not None: + data = { + "content": _field_value(result, "content"), + "metadata": _field_value(result, "metadata"), + } + else: + return result + + normalized = _clean( + { + "role": "assistant" if data.get("content") is not None else None, + "content": data.get("content"), + "metadata": data.get("metadata"), + } + ) + return normalized or data + + +def _field_value(data: Any, key: str) -> Any: + if isinstance(data, dict): + return data.get(key) + try: + return getattr(data, key, None) + except Exception: + return None + + +def _tool_name(tool_call: Any) -> str: + if isinstance(tool_call, dict): + return str(tool_call.get("name") or "unknown_tool") + return str(getattr(tool_call, "name", "unknown_tool")) + + +async def _agent_call_wrapper(wrapped: Any, instance: Any, args: Any, kwargs: dict[str, Any]) -> Any: + with start_span( + name=f"{_agent_name(instance)}.reply", + type=SpanTypeAttribute.TASK, + input=_args_kwargs_input(args, kwargs), + metadata=_clean({"agent_class": instance.__class__.__name__}), + ) as span: + try: + result = await wrapped(*args, **kwargs) + span.log(output=result) + return result + except Exception as exc: + span.log(error=str(exc)) + raise + + +async def _sequential_pipeline_wrapper(wrapped: Any, instance: Any, args: Any, kwargs: dict[str, Any]) -> Any: + with start_span( + name="sequential_pipeline.run", + type=SpanTypeAttribute.TASK, + input=_args_kwargs_input(args, kwargs), + metadata=_pipeline_metadata(args, kwargs), + ) as span: + try: + result = await wrapped(*args, **kwargs) + span.log(output=result) + return result + except Exception as exc: + span.log(error=str(exc)) + raise + + +async def _fanout_pipeline_wrapper(wrapped: Any, instance: Any, args: Any, kwargs: dict[str, Any]) -> Any: + with start_span( + name="fanout_pipeline.run", + type=SpanTypeAttribute.TASK, + input=_args_kwargs_input(args, kwargs), + metadata=_pipeline_metadata(args, kwargs), + ) as span: + try: + result = await wrapped(*args, **kwargs) + span.log(output=result) + return result + except Exception as exc: + span.log(error=str(exc)) + raise + + +async def _toolkit_call_tool_function_wrapper(wrapped: Any, instance: Any, args: Any, kwargs: dict[str, Any]) -> Any: + tool_call = args[0] if args else kwargs.get("tool_call") + tool_name = _tool_name(tool_call) + with start_span( + name=f"{tool_name}.execute", + type=SpanTypeAttribute.TOOL, + input=_clean( + { + "tool_name": tool_name, + "tool_call": tool_call, + } + ), + metadata=_clean({"toolkit_class": instance.__class__.__name__}), + ) as span: + try: + result = await wrapped(*args, **kwargs) + if _is_async_iterator(result): + + async def _trace(): + last_chunk = None + async with aclosing(result) as agen: + async for chunk in agen: + last_chunk = chunk + yield chunk + if last_chunk is not None: + span.log(output=last_chunk) + + return _trace() + + span.log(output=result) + return result + except Exception as exc: + span.log(error=str(exc)) + raise + + +def _is_async_iterator(value: Any) -> bool: + try: + return getattr(value, "__aiter__", None) is not None and getattr(value, "__anext__", None) is not None + except Exception: + return False + + +async def _model_call_wrapper(wrapped: Any, instance: Any, args: Any, kwargs: dict[str, Any]) -> Any: + with start_span( + name=f"{_model_provider_name(instance)}.call", + type=SpanTypeAttribute.LLM, + input=_model_call_input(args, kwargs), + metadata=_model_call_metadata(instance, kwargs), + ) as span: + try: + result = await wrapped(*args, **kwargs) + if _is_async_iterator(result): + + async def _trace(): + last_chunk = None + async with aclosing(result) as agen: + async for chunk in agen: + last_chunk = chunk + yield chunk + if last_chunk is not None: + span.log(output=_model_call_output(last_chunk), metrics=_extract_metrics(last_chunk)) + + return _trace() + + span.log(output=_model_call_output(result), metrics=_extract_metrics(result)) + return result + except Exception as exc: + span.log(error=str(exc)) + raise diff --git a/py/src/braintrust/integrations/auto_test_scripts/test_auto_agentscope.py b/py/src/braintrust/integrations/auto_test_scripts/test_auto_agentscope.py new file mode 100644 index 00000000..08efd409 --- /dev/null +++ b/py/src/braintrust/integrations/auto_test_scripts/test_auto_agentscope.py @@ -0,0 +1,73 @@ +"""Test auto_instrument for AgentScope.""" + +import os +from pathlib import Path + + +os.environ["BRAINTRUST_CASSETTES_DIR"] = str(Path(__file__).resolve().parent.parent / "agentscope" / "cassettes") + +from braintrust.auto import auto_instrument +from braintrust.wrappers.test_utils import autoinstrument_test_context + + +results = auto_instrument() +assert results.get("agentscope") == True, "auto_instrument should return True for agentscope" + +results2 = auto_instrument() +assert results2.get("agentscope") == True, "auto_instrument should still return True on second call" + +from agentscope.agent import AgentBase, ReActAgent +from agentscope.formatter import OpenAIChatFormatter +from agentscope.memory import InMemoryMemory +from agentscope.message import Msg +from agentscope.model import OpenAIChatModel +from agentscope.pipeline import fanout_pipeline, sequential_pipeline +from agentscope.tool import Toolkit + + +assert hasattr(AgentBase.__call__, "__wrapped__"), "AgentBase.__call__ should be wrapped" +assert hasattr(sequential_pipeline, "__wrapped__"), "sequential_pipeline should be wrapped" +assert hasattr(fanout_pipeline, "__wrapped__"), "fanout_pipeline should be wrapped" +assert hasattr(Toolkit.call_tool_function, "__wrapped__"), "Toolkit.call_tool_function should be wrapped" +assert hasattr(OpenAIChatModel.__call__, "__wrapped__"), "OpenAIChatModel.__call__ should be wrapped" + + +with autoinstrument_test_context("test_auto_agentscope") as memory_logger: + agent = ReActAgent( + name="Test Agent", + sys_prompt="You are a helpful assistant. Be brief.", + model=OpenAIChatModel( + model_name="gpt-4o-mini", + generate_kwargs={"temperature": 0}, + ), + formatter=OpenAIChatFormatter(), + toolkit=Toolkit(), + memory=InMemoryMemory(), + ) + agent.set_console_output_enabled(False) + + response = agent( + Msg( + name="user", + content="Say hi in two words.", + role="user", + ) + ) + + import asyncio + + result = asyncio.run(response) + assert result is not None + + spans = memory_logger.pop() + assert len(spans) >= 2, f"Expected at least 2 spans (agent + model), got {len(spans)}" + + agent_span = next(span for span in spans if span["span_attributes"]["name"] == "Test Agent.reply") + llm_spans = [span for span in spans if span["span_attributes"]["type"].value == "llm"] + + assert agent_span["span_attributes"]["type"].value == "task" + assert llm_spans, "Should have at least one LLM span" + assert llm_spans[0]["metadata"]["model"] == "gpt-4o-mini" + assert agent_span["span_id"] in llm_spans[0]["span_parents"] + +print("SUCCESS") diff --git a/py/src/braintrust/wrappers/test_google_genai.py b/py/src/braintrust/wrappers/test_google_genai.py index 73a31e71..7839ed0c 100644 --- a/py/src/braintrust/wrappers/test_google_genai.py +++ b/py/src/braintrust/wrappers/test_google_genai.py @@ -31,6 +31,7 @@ def before_record_request(request): "record_mode": record_mode, "filter_headers": [ "authorization", + "Authorization", "x-api-key", "x-goog-api-key", ], From fd6b14b9595ad85de0eff4df18cc2d5f381c0609 Mon Sep 17 00:00:00 2001 From: Abhijeet Prasad Date: Wed, 25 Mar 2026 15:39:14 -0700 Subject: [PATCH 2/3] test: fix AgentScope 1.0.0 compatibility --- .../integrations/agentscope/test_agentscope.py | 5 ++++- .../auto_test_scripts/test_auto_agentscope.py | 16 +++++++++++++--- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/py/src/braintrust/integrations/agentscope/test_agentscope.py b/py/src/braintrust/integrations/agentscope/test_agentscope.py index ee09064a..249a1b7e 100644 --- a/py/src/braintrust/integrations/agentscope/test_agentscope.py +++ b/py/src/braintrust/integrations/agentscope/test_agentscope.py @@ -56,7 +56,10 @@ def _make_agent(name: str, sys_prompt: str, *, toolkit=None, multi_agent: bool = toolkit=toolkit or Toolkit(), memory=InMemoryMemory(), ) - agent.set_console_output_enabled(False) + if hasattr(agent, "set_console_output_enabled"): + agent.set_console_output_enabled(False) + elif hasattr(agent, "disable_console_output"): + agent.disable_console_output() return agent diff --git a/py/src/braintrust/integrations/auto_test_scripts/test_auto_agentscope.py b/py/src/braintrust/integrations/auto_test_scripts/test_auto_agentscope.py index 08efd409..2cc545a9 100644 --- a/py/src/braintrust/integrations/auto_test_scripts/test_auto_agentscope.py +++ b/py/src/braintrust/integrations/auto_test_scripts/test_auto_agentscope.py @@ -21,13 +21,20 @@ from agentscope.memory import InMemoryMemory from agentscope.message import Msg from agentscope.model import OpenAIChatModel -from agentscope.pipeline import fanout_pipeline, sequential_pipeline +from agentscope.pipeline import sequential_pipeline from agentscope.tool import Toolkit +try: + from agentscope.pipeline import fanout_pipeline +except ImportError: + fanout_pipeline = None + + assert hasattr(AgentBase.__call__, "__wrapped__"), "AgentBase.__call__ should be wrapped" assert hasattr(sequential_pipeline, "__wrapped__"), "sequential_pipeline should be wrapped" -assert hasattr(fanout_pipeline, "__wrapped__"), "fanout_pipeline should be wrapped" +if fanout_pipeline is not None: + assert hasattr(fanout_pipeline, "__wrapped__"), "fanout_pipeline should be wrapped" assert hasattr(Toolkit.call_tool_function, "__wrapped__"), "Toolkit.call_tool_function should be wrapped" assert hasattr(OpenAIChatModel.__call__, "__wrapped__"), "OpenAIChatModel.__call__ should be wrapped" @@ -44,7 +51,10 @@ toolkit=Toolkit(), memory=InMemoryMemory(), ) - agent.set_console_output_enabled(False) + if hasattr(agent, "set_console_output_enabled"): + agent.set_console_output_enabled(False) + elif hasattr(agent, "disable_console_output"): + agent.disable_console_output() response = agent( Msg( From dba55b3df64dfe6ae78a8b267248691d909b6eec Mon Sep 17 00:00:00 2001 From: Abhijeet Prasad Date: Wed, 25 Mar 2026 17:30:10 -0700 Subject: [PATCH 3/3] feat: Add agentscope eval instrumentation --- .../integrations/agentscope/__init__.py | 43 ++- ..._general_evaluator_creates_eval_spans.yaml | 320 ++++++++++++++++ .../integrations/agentscope/integration.py | 25 ++ .../integrations/agentscope/patchers.py | 71 ++++ .../agentscope/test_agentscope.py | 252 ++++++++++++- .../integrations/agentscope/tracing.py | 344 +++++++++++++++++- py/src/braintrust/integrations/base.py | 3 +- 7 files changed, 1053 insertions(+), 5 deletions(-) create mode 100644 py/src/braintrust/integrations/agentscope/cassettes/test_agentscope_general_evaluator_creates_eval_spans.yaml diff --git a/py/src/braintrust/integrations/agentscope/__init__.py b/py/src/braintrust/integrations/agentscope/__init__.py index 534f1db8..b78c158e 100644 --- a/py/src/braintrust/integrations/agentscope/__init__.py +++ b/py/src/braintrust/integrations/agentscope/__init__.py @@ -1,20 +1,59 @@ """Braintrust integration for AgentScope.""" +from typing import Any + from braintrust.logger import NOOP_SPAN, current_span, init_logger from .integration import AgentScopeIntegration +from .patchers import ( + GeneralEvaluatorPatcher, + MetricCallPatcher, + RayEvaluatorRunPatcher, + TaskEvaluatePatcher, +) -__all__ = ["AgentScopeIntegration", "setup_agentscope"] +__all__ = ["AgentScopeIntegration", "setup_agentscope", "wrap_evaluator"] def setup_agentscope( api_key: str | None = None, project_id: str | None = None, project_name: str | None = None, + instrument_evals: bool = True, ) -> bool: """Setup Braintrust integration with AgentScope.""" if current_span() == NOOP_SPAN: init_logger(project=project_name, api_key=api_key, project_id=project_id) - return AgentScopeIntegration.setup() + return AgentScopeIntegration.setup(instrument_evals=instrument_evals) + + +def wrap_evaluator(Evaluator: Any) -> Any: + """Manually patch an AgentScope evaluator class for tracing. + + This helper patches the evaluator class itself and, when available, also + enables task and metric tracing from the exported ``agentscope.evaluate`` + module so ``GeneralEvaluator`` produces nested evaluation spans even when + global setup is not used. + """ + class_name = getattr(Evaluator, "__name__", "") + if class_name == "RayEvaluator": + RayEvaluatorRunPatcher.wrap_target(Evaluator) + else: + GeneralEvaluatorPatcher.wrap_target(Evaluator) + + try: + import agentscope.evaluate as agentscope_evaluate + except ImportError: + return Evaluator + + task_cls = getattr(agentscope_evaluate, "Task", None) + if task_cls is not None: + TaskEvaluatePatcher.wrap_target(task_cls) + + metric_cls = getattr(agentscope_evaluate, "MetricBase", None) + if metric_cls is not None: + MetricCallPatcher.wrap_target(metric_cls) + + return Evaluator diff --git a/py/src/braintrust/integrations/agentscope/cassettes/test_agentscope_general_evaluator_creates_eval_spans.yaml b/py/src/braintrust/integrations/agentscope/cassettes/test_agentscope_general_evaluator_creates_eval_spans.yaml new file mode 100644 index 00000000..87f6358e --- /dev/null +++ b/py/src/braintrust/integrations/agentscope/cassettes/test_agentscope_general_evaluator_creates_eval_spans.yaml @@ -0,0 +1,320 @@ +interactions: +- request: + body: '{"messages":[{"role":"system","name":"system","content":[{"type":"text","text":"You + are a concise assistant. Answer in one sentence."}]},{"role":"user","name":"user","content":[{"type":"text","text":"Say + hello in exactly two words."}]}],"model":"gpt-4o-mini","stream":false,"temperature":0}' + headers: + Accept: + - application/json + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + Content-Length: + - '290' + Content-Type: + - application/json + Host: + - api.openai.com + User-Agent: + - AsyncOpenAI/Python 2.29.0 + X-Stainless-Arch: + - arm64 + X-Stainless-Async: + - async:asyncio + X-Stainless-Lang: + - python + X-Stainless-OS: + - MacOS + X-Stainless-Package-Version: + - 2.29.0 + X-Stainless-Runtime: + - CPython + X-Stainless-Runtime-Version: + - 3.13.3 + x-stainless-read-timeout: + - '600' + x-stainless-retry-count: + - '0' + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAA/6rmUlBQykxRslJQSs5ILEnOLcjRdfELyCzNtgAAAAD//4xSsU7DMBDd+xWV56Rq + 05TQGYbODCyoilz7khgc29hOFYT675ydtglQJBZLvnfv/N7zPRzNkb33u02/qXTPN48AT89bkgSG + PrwC8xfWgmnkgRdaDTCzQD2EqauiyPMiK7Z3EWg1BxlotfFprtNWKJFmyyxPl0W6uj+zGy0YOGx7 + wet8/hnPoFNx6LG8TC6VFpyjNWDt0oRFq2WoEOqccJ4qT5IRZFp5UFH6DqTUc9+AhcW0xULVORpk + qk7KCUCV0p4Gm1Hc/oycrnKkro3VB/eDSiq06ZoSU3EYET7tvDYkoic899F2980JwUGt8aXXbxCf + W2fDODKGPQHPmEd9clLeJDeGlRw8FdJNUiOMsgb4yBwjph0XegLMJpZ/a7k1e7AtVP2f8SPAGBhc + otJY4IJ99zu2WQib+FfbNeIomDiwR1yt0guw4Rs4VLSTw34Q9+E8tCX+VQ3WWDEsSWVKRtdQ8GJ1 + qMjsNPsCAAD//wMAQQCebTIDAAA= + headers: + CF-Cache-Status: + - DYNAMIC + CF-Ray: + - 9e20e1608aaf6142-SJC + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Wed, 25 Mar 2026 21:06:37 GMT + Server: + - cloudflare + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - braintrust-data + openai-processing-ms: + - '373' + openai-project: + - proj_vsCSXafhhByzWOThMrJcZiw9 + openai-version: + - '2020-10-01' + set-cookie: + - __cf_bm=0r2bm90d_zmUe.y8EYZpgoGyTVS4QQSDoJVx.yDpJng-1774472796.2480545-1.0.1.1-onYisUL_Bju9EhfGXQnZBZkwk3gdjG7tHXVdr34BVePUh3JL0OqfVWApVIaF_KDBKfw4HIiGBvzONzv_AS91kbK7eL.FzFDwILNg8_F1h3hsPpZO.pIoeUN1dp_.acW6; + HttpOnly; Secure; Path=/; Domain=api.openai.com; Expires=Wed, 25 Mar 2026 + 21:36:37 GMT + x-openai-proxy-wasm: + - v0.1 + x-ratelimit-limit-requests: + - '30000' + x-ratelimit-limit-tokens: + - '150000000' + x-ratelimit-remaining-requests: + - '29999' + x-ratelimit-remaining-tokens: + - '149999975' + x-ratelimit-reset-requests: + - 2ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_0b44866bb8cd459db8712e04e4248889 + status: + code: 200 + message: OK +- request: + body: '{"messages":[{"role":"system","name":"system","content":[{"type":"text","text":"You + are a concise assistant. Answer in one sentence."}]},{"role":"user","name":"user","content":[{"type":"text","text":"Say + hello in exactly two words."}]}],"model":"gpt-4o-mini","stream":false,"temperature":0}' + headers: + Accept: + - application/json + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + Content-Length: + - '290' + Content-Type: + - application/json + Host: + - api.openai.com + User-Agent: + - AsyncOpenAI/Python 2.29.0 + X-Stainless-Arch: + - arm64 + X-Stainless-Async: + - async:asyncio + X-Stainless-Lang: + - python + X-Stainless-OS: + - MacOS + X-Stainless-Package-Version: + - 2.29.0 + X-Stainless-Runtime: + - CPython + X-Stainless-Runtime-Version: + - 3.13.3 + x-stainless-read-timeout: + - '600' + x-stainless-retry-count: + - '0' + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAA/6rmUlBQykxRslJQSs5ILEnOLcjRdfELyPJOswAAAAD//4xS0WrCMBR99yskz61o + 7aiv7mX6MJgwtsGQEpPbNjNNQpKOjeG/76ZVWzcHewnknntuzjm5t/dsebeB9Wq/qZ7WD4/L55eU + piQKDL17A+ZPrAnTyAMvtOpgZoF6CFNnWZamWbJIkhaoNQcZaKXxcarjWigRJ9MkjadZPFsc2ZUW + DBy2veJ1PP5qz6BTcfjA8jQ6VWpwjpaAtVMTFq2WoUKoc8J5qjyJepBp5UG10lcgpR77CixMhi0W + isbRIFM1Ug4AqpT2NNhsxW2PyOEsR+rSWL1zP6ikQJuuyjEVhxHh085rQ1r0gOe2td1cOCE4qDY+ + 93oP7XPzpBtH+rAH4BHzqE8OyjfRlWE5B0+FdIPUCKOsAt4z+4hpw4UeAKOB5d9ars3ubAtV/md8 + DzAGBpcoNxa4YJd++zYLYRP/ajtH3AomDuw7rlbuBdjwDRwK2shuP4j7dB7qHP+qBGus6JakMDmj + c8h4NtsVZHQYfQMAAP//AwDajPLhMgMAAA== + headers: + CF-Cache-Status: + - DYNAMIC + CF-Ray: + - 9e20e202698a251d-SJC + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Wed, 25 Mar 2026 21:07:03 GMT + Server: + - cloudflare + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - braintrust-data + openai-processing-ms: + - '334' + openai-project: + - proj_vsCSXafhhByzWOThMrJcZiw9 + openai-version: + - '2020-10-01' + set-cookie: + - __cf_bm=GJt8W0lyxqfgtKXEJ4M8twcG5pNqc4RmQiPqQ_IukiU-1774472822.1494222-1.0.1.1-Jv3zKpnCjFAAQQaXBEt3RElEP.QjtEFbqrvr8BASrk5X7XSiOj1UBc4tUR3t9QbKmOM0VrcVW6R3HYLaYxbTHQqz4Dpjl7Z.Sz9BslefycjBprbfLjQ1aoOYxrSO7lkO; + HttpOnly; Secure; Path=/; Domain=api.openai.com; Expires=Wed, 25 Mar 2026 + 21:37:03 GMT + x-openai-proxy-wasm: + - v0.1 + x-ratelimit-limit-requests: + - '30000' + x-ratelimit-limit-tokens: + - '150000000' + x-ratelimit-remaining-requests: + - '29999' + x-ratelimit-remaining-tokens: + - '149999977' + x-ratelimit-reset-requests: + - 2ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_8c118a37a5da44069de28fba911081e0 + status: + code: 200 + message: OK +- request: + body: '{"messages":[{"role":"system","name":"system","content":[{"type":"text","text":"You + are a concise assistant. Answer in one sentence."}]},{"role":"user","name":"user","content":[{"type":"text","text":"Say + hello in exactly two words."}]}],"model":"gpt-4o-mini","stream":false,"temperature":0}' + headers: + Accept: + - application/json + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + Content-Length: + - '290' + Content-Type: + - application/json + Host: + - api.openai.com + User-Agent: + - AsyncOpenAI/Python 2.29.0 + X-Stainless-Arch: + - arm64 + X-Stainless-Async: + - async:asyncio + X-Stainless-Lang: + - python + X-Stainless-OS: + - MacOS + X-Stainless-Package-Version: + - 2.29.0 + X-Stainless-Runtime: + - CPython + X-Stainless-Runtime-Version: + - 3.13.3 + x-stainless-read-timeout: + - '600' + x-stainless-retry-count: + - '0' + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAA/6rmUlBQykxRslJQSs5ILEnOLcjRdfELyDbKMgIAAAD//4xSQU7DMBC89xWVz0mV + pKHhDALBDSFxQlXk2pvE4NiW7dAi1L+zTtomhSJxseSdnfXMeO+zuxuWbp9vt4/pU/WyonrHk45E + gaE3b8D8kbVgGnnghVYDzCxQD2FqWhR5XmTXq1UPtJqDDLTa+DjXcSuUiLMky+OkiNPrA7vRgoHD + tle8zudf/Rl0Kg47LCfRsdKCc7QGrB2bsGi1DBVCnRPOU+VJNIJMKw+ql/4AUuq5b8DCYtpioeoc + DTJVJ+UEoEppT4PNXtz6gOxPcqSujdUb94NKKrTpmhJTcRgRPu28NqRH93iue9vdmROCg1rjS6/f + oX9umQ3jyBj2BDxgHvXJSfkqujCs5OCpkG6SGmGUNcBH5hgx7bjQE2A2sfxby6XZg22h6v+MHwHG + wOASlcYCF+zc79hmIWziX22niHvBxIH9wNUqvQAbvoFDRTs57Adxn85DW+Jf1WCNFcOSVKZkdAkF + L9JNRWb72TcAAAD//wMAHqai9DIDAAA= + headers: + CF-Cache-Status: + - DYNAMIC + CF-Ray: + - 9e20e3168f481d99-SJC + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Wed, 25 Mar 2026 21:07:46 GMT + Server: + - cloudflare + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - braintrust-data + openai-processing-ms: + - '370' + openai-project: + - proj_vsCSXafhhByzWOThMrJcZiw9 + openai-version: + - '2020-10-01' + set-cookie: + - __cf_bm=QmJ_mn8jkQXEiES_f8MtNfFrG.TdUhMi9F3CsYesW.Y-1774472866.3273165-1.0.1.1-9jM9NCSYxoeVuM4GWc_rXbAFSxA2USepp4rz5niKNzmDK.TtF1V7PSKWLir8akfbpuyXvva5QQRFgDCcMT3P0xa_1Tzm9mW9uidzQIoQBbq5O6t0XUQzY8bQC4ddKj1i; + HttpOnly; Secure; Path=/; Domain=api.openai.com; Expires=Wed, 25 Mar 2026 + 21:37:46 GMT + x-openai-proxy-wasm: + - v0.1 + x-ratelimit-limit-requests: + - '30000' + x-ratelimit-limit-tokens: + - '150000000' + x-ratelimit-remaining-requests: + - '29999' + x-ratelimit-remaining-tokens: + - '149999977' + x-ratelimit-reset-requests: + - 2ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_2ac33c8b77a343ea881d0fe0fbeccc14 + status: + code: 200 + message: OK +version: 1 diff --git a/py/src/braintrust/integrations/agentscope/integration.py b/py/src/braintrust/integrations/agentscope/integration.py index fac8fe8d..20b5f203 100644 --- a/py/src/braintrust/integrations/agentscope/integration.py +++ b/py/src/braintrust/integrations/agentscope/integration.py @@ -6,7 +6,11 @@ AgentCallPatcher, ChatModelPatcher, FanoutPipelinePatcher, + GeneralEvaluatorPatcher, + MetricCallPatcher, + RayEvaluatorRunPatcher, SequentialPipelinePatcher, + TaskEvaluatePatcher, ToolkitCallToolFunctionPatcher, ) @@ -23,4 +27,25 @@ class AgentScopeIntegration(BaseIntegration): FanoutPipelinePatcher, ToolkitCallToolFunctionPatcher, ChatModelPatcher, + GeneralEvaluatorPatcher, + RayEvaluatorRunPatcher, + TaskEvaluatePatcher, + MetricCallPatcher, ) + + eval_patchers = ( + GeneralEvaluatorPatcher, + RayEvaluatorRunPatcher, + TaskEvaluatePatcher, + MetricCallPatcher, + ) + + @classmethod + def setup( + cls, + *, + target=None, + instrument_evals: bool = True, + ) -> bool: + patchers = cls.patchers if instrument_evals else tuple(p for p in cls.patchers if p not in cls.eval_patchers) + return super().setup(target=target, patchers=patchers) diff --git a/py/src/braintrust/integrations/agentscope/patchers.py b/py/src/braintrust/integrations/agentscope/patchers.py index a0a9ba21..8d4eb314 100644 --- a/py/src/braintrust/integrations/agentscope/patchers.py +++ b/py/src/braintrust/integrations/agentscope/patchers.py @@ -5,8 +5,14 @@ from .tracing import ( _agent_call_wrapper, _fanout_pipeline_wrapper, + _general_evaluator_run_evaluation_wrapper, + _general_evaluator_run_solution_wrapper, + _general_evaluator_run_wrapper, + _metric_call_wrapper, _model_call_wrapper, + _ray_evaluator_run_wrapper, _sequential_pipeline_wrapper, + _task_evaluate_wrapper, _toolkit_call_tool_function_wrapper, ) @@ -101,3 +107,68 @@ class ChatModelPatcher(CompositeFunctionWrapperPatcher): _GeminiChatModelPatcher, _TrinityChatModelPatcher, ) + + +class _GeneralEvaluatorRunPatcher(FunctionWrapperPatcher): + """Patch AgentScope GeneralEvaluator root execution.""" + + name = "agentscope.evaluate.general.run" + target_module = "agentscope.evaluate" + target_path = "GeneralEvaluator.run" + wrapper = _general_evaluator_run_wrapper + + +class _GeneralEvaluatorRunSolutionPatcher(FunctionWrapperPatcher): + """Patch AgentScope GeneralEvaluator solution execution.""" + + name = "agentscope.evaluate.general.run_solution" + target_module = "agentscope.evaluate" + target_path = "GeneralEvaluator.run_solution" + wrapper = _general_evaluator_run_solution_wrapper + + +class _GeneralEvaluatorRunEvaluationPatcher(FunctionWrapperPatcher): + """Patch AgentScope GeneralEvaluator evaluation execution.""" + + name = "agentscope.evaluate.general.run_evaluation" + target_module = "agentscope.evaluate" + target_path = "GeneralEvaluator.run_evaluation" + wrapper = _general_evaluator_run_evaluation_wrapper + + +class GeneralEvaluatorPatcher(CompositeFunctionWrapperPatcher): + """Patch AgentScope GeneralEvaluator for Braintrust eval tracing.""" + + name = "agentscope.evaluate.general" + sub_patchers = ( + _GeneralEvaluatorRunPatcher, + _GeneralEvaluatorRunSolutionPatcher, + _GeneralEvaluatorRunEvaluationPatcher, + ) + + +class RayEvaluatorRunPatcher(FunctionWrapperPatcher): + """Patch AgentScope RayEvaluator root execution.""" + + name = "agentscope.evaluate.ray" + target_module = "agentscope.evaluate" + target_path = "RayEvaluator.run" + wrapper = _ray_evaluator_run_wrapper + + +class TaskEvaluatePatcher(FunctionWrapperPatcher): + """Patch AgentScope task evaluation.""" + + name = "agentscope.evaluate.task" + target_module = "agentscope.evaluate" + target_path = "Task.evaluate" + wrapper = _task_evaluate_wrapper + + +class MetricCallPatcher(FunctionWrapperPatcher): + """Patch AgentScope metric execution.""" + + name = "agentscope.evaluate.metric" + target_module = "agentscope.evaluate" + target_path = "MetricBase.__call__" + wrapper = _metric_call_wrapper diff --git a/py/src/braintrust/integrations/agentscope/test_agentscope.py b/py/src/braintrust/integrations/agentscope/test_agentscope.py index 249a1b7e..3688fcd7 100644 --- a/py/src/braintrust/integrations/agentscope/test_agentscope.py +++ b/py/src/braintrust/integrations/agentscope/test_agentscope.py @@ -1,8 +1,19 @@ +import sys +from dataclasses import dataclass from pathlib import Path +from types import ModuleType import pytest from braintrust import logger -from braintrust.integrations.agentscope import setup_agentscope +from braintrust.integrations.agentscope import setup_agentscope, wrap_evaluator +from braintrust.integrations.agentscope.patchers import ( + AgentCallPatcher, + MetricCallPatcher, + TaskEvaluatePatcher, + _GeneralEvaluatorRunEvaluationPatcher, + _GeneralEvaluatorRunPatcher, + _GeneralEvaluatorRunSolutionPatcher, +) from braintrust.span_types import SpanTypeAttribute from braintrust.test_helpers import init_test_logger from braintrust.wrappers.test_utils import verify_autoinstrument_script @@ -219,6 +230,245 @@ async def _stream(): assert llm_span["metrics"]["tokens"] == 32 +@pytest.mark.vcr +@pytest.mark.asyncio +async def test_agentscope_general_evaluator_creates_eval_spans(memory_logger, tmp_path): + from agentscope.evaluate import ( + BenchmarkBase, + FileEvaluatorStorage, + GeneralEvaluator, + MetricBase, + MetricResult, + MetricType, + SolutionOutput, + Task, + ) + from agentscope.message import Msg + + assert not memory_logger.pop() + + class ExactMatchMetric(MetricBase): + def __init__(self, ground_truth: str): + super().__init__( + name="exact_match", + metric_type=MetricType.NUMERICAL, + description="Check whether the model answer exactly matches the ground truth.", + categories=[], + ) + self.ground_truth = ground_truth + + async def __call__(self, solution: SolutionOutput) -> MetricResult: + is_match = solution.output == self.ground_truth + return MetricResult( + name=self.name, + result=1.0 if is_match else 0.0, + message="Correct" if is_match else "Incorrect", + ) + + class ToyBenchmark(BenchmarkBase): + def __init__(self, tasks): + super().__init__( + name="Toy benchmark", + description="A one-task benchmark for AgentScope eval instrumentation.", + ) + self.tasks = tasks + + def __iter__(self): + yield from self.tasks + + def __len__(self): + return len(self.tasks) + + def __getitem__(self, index): + return self.tasks[index] + + task = Task( + id="hello-task", + input="Say hello in exactly two words.", + ground_truth="Hello there.", + metrics=[ExactMatchMetric("Hello there.")], + tags={"difficulty": "easy", "category": "greeting"}, + metadata={"suite": "toy"}, + ) + evaluator = GeneralEvaluator( + name="Toy benchmark evaluation", + benchmark=ToyBenchmark([task]), + n_repeat=1, + storage=FileEvaluatorStorage(save_dir=str(tmp_path / "agentscope-eval")), + n_workers=1, + ) + + async def solution(eval_task: Task, pre_hook): + agent = _make_agent( + "Friday", + "You are a concise assistant. Answer in one sentence.", + ) + if hasattr(agent, "register_instance_hook"): + agent.register_instance_hook("pre_print", "save_logging", pre_hook) + + response = await agent( + Msg( + name="user", + content=eval_task.input, + role="user", + ) + ) + + content = response.content + if isinstance(content, list): + output = next( + (item["text"] for item in content if isinstance(item, dict) and item.get("type") == "text"), + None, + ) + trajectory = content + else: + output = content + trajectory = [content] + + return SolutionOutput( + success=True, + output=output, + trajectory=trajectory, + meta={"agent": "Friday"}, + ) + + await evaluator.run(solution) + + spans = memory_logger.pop() + root_span = next(span for span in spans if span["span_attributes"]["name"] == "agentscope.evaluate.run") + solution_span = next(span for span in spans if span["span_attributes"]["name"] == "hello-task.solution") + evaluation_span = next(span for span in spans if span["span_attributes"]["name"] == "hello-task.evaluate") + metric_span = next(span for span in spans if span["span_attributes"]["name"] == "exact_match") + agent_span = next(span for span in spans if span["span_attributes"]["name"] == "Friday.reply") + + assert _span_type(root_span) == "eval" + assert root_span["metadata"]["benchmark_name"] == "Toy benchmark" + assert root_span["metadata"]["task_count"] == 1 + assert root_span["output"]["status"] == "completed" + + assert _span_type(solution_span) == "task" + assert solution_span["input"] == "Say hello in exactly two words." + assert solution_span["expected"] == "Hello there." + assert solution_span["tags"] == ["category:greeting", "difficulty:easy"] + assert solution_span["metadata"]["repeat_id"] == "0" + assert solution_span["metadata"]["metric_names"] == ["exact_match"] + assert solution_span["metadata"]["task_tags"] == {"difficulty": "easy", "category": "greeting"} + assert solution_span["output"]["output"] == "Hello there." + assert solution_span["span_id"] in agent_span["span_parents"] + + assert _span_type(evaluation_span) == "eval" + assert evaluation_span["span_id"] in metric_span["span_parents"] + assert solution_span["span_id"] in evaluation_span["span_parents"] + assert root_span["span_id"] in solution_span["span_parents"] + assert evaluation_span["output"][0]["result"] == 1.0 + assert evaluation_span["output"][0]["message"] == "Correct" + + assert _span_type(metric_span) == "score" + assert metric_span["scores"]["exact_match"] == 1.0 + assert metric_span["output"]["result"] == 1.0 + assert metric_span["output"]["message"] == "Correct" + + +@dataclass +class _FakeAgentscopeModules: + AgentBase: type + GeneralEvaluator: type + MetricBase: type + Task: type + + +@pytest.fixture +def fake_agentscope_modules(monkeypatch): + agentscope_module = ModuleType("agentscope") + agentscope_module.__path__ = [] + agentscope_module.__version__ = "1.0.0" + + agent_module = ModuleType("agentscope.agent") + evaluate_module = ModuleType("agentscope.evaluate") + + class AgentBase: + async def __call__(self, *_args, **_kwargs): + return "ok" + + class Task: + async def evaluate(self, *_args, **_kwargs): + return [] + + class MetricBase: + async def __call__(self, *_args, **_kwargs): + return None + + class GeneralEvaluator: + async def run(self, *_args, **_kwargs): + return None + + async def run_solution(self, *_args, **_kwargs): + return None + + async def run_evaluation(self, *_args, **_kwargs): + return None + + agent_module.AgentBase = AgentBase + evaluate_module.GeneralEvaluator = GeneralEvaluator + evaluate_module.Task = Task + evaluate_module.MetricBase = MetricBase + + agentscope_module.agent = agent_module + agentscope_module.evaluate = evaluate_module + + monkeypatch.setitem(sys.modules, "agentscope", agentscope_module) + monkeypatch.setitem(sys.modules, "agentscope.agent", agent_module) + monkeypatch.setitem(sys.modules, "agentscope.evaluate", evaluate_module) + + return _FakeAgentscopeModules( + AgentBase=AgentBase, + GeneralEvaluator=GeneralEvaluator, + MetricBase=MetricBase, + Task=Task, + ) + + +def test_setup_agentscope_can_skip_eval_patchers(fake_agentscope_modules): + result = setup_agentscope(project_name=PROJECT_NAME, instrument_evals=False) + + assert result is True + assert getattr(fake_agentscope_modules.AgentBase.__call__, AgentCallPatcher.patch_marker_attr(), False) + assert not getattr( + fake_agentscope_modules.GeneralEvaluator, _GeneralEvaluatorRunPatcher.patch_marker_attr(), False + ) + assert not getattr( + fake_agentscope_modules.GeneralEvaluator, + _GeneralEvaluatorRunSolutionPatcher.patch_marker_attr(), + False, + ) + assert not getattr( + fake_agentscope_modules.GeneralEvaluator, + _GeneralEvaluatorRunEvaluationPatcher.patch_marker_attr(), + False, + ) + assert not getattr(fake_agentscope_modules.Task, TaskEvaluatePatcher.patch_marker_attr(), False) + assert not getattr(fake_agentscope_modules.MetricBase, MetricCallPatcher.patch_marker_attr(), False) + + +def test_wrap_evaluator_patches_evaluator_and_eval_types(fake_agentscope_modules): + wrapped = wrap_evaluator(fake_agentscope_modules.GeneralEvaluator) + wrapped_again = wrap_evaluator(fake_agentscope_modules.GeneralEvaluator) + + assert wrapped is fake_agentscope_modules.GeneralEvaluator + assert wrapped_again is fake_agentscope_modules.GeneralEvaluator + assert getattr(fake_agentscope_modules.GeneralEvaluator, _GeneralEvaluatorRunPatcher.patch_marker_attr(), False) + assert getattr( + fake_agentscope_modules.GeneralEvaluator, _GeneralEvaluatorRunSolutionPatcher.patch_marker_attr(), False + ) + assert getattr( + fake_agentscope_modules.GeneralEvaluator, + _GeneralEvaluatorRunEvaluationPatcher.patch_marker_attr(), + False, + ) + assert getattr(fake_agentscope_modules.Task, TaskEvaluatePatcher.patch_marker_attr(), False) + assert getattr(fake_agentscope_modules.MetricBase, MetricCallPatcher.patch_marker_attr(), False) + + class TestAutoInstrumentAgentScope: def test_auto_instrument_agentscope(self): verify_autoinstrument_script("test_auto_agentscope.py") diff --git a/py/src/braintrust/integrations/agentscope/tracing.py b/py/src/braintrust/integrations/agentscope/tracing.py index efb4891d..b5e1e4ad 100644 --- a/py/src/braintrust/integrations/agentscope/tracing.py +++ b/py/src/braintrust/integrations/agentscope/tracing.py @@ -1,10 +1,14 @@ """AgentScope-specific span creation and stream aggregation.""" from contextlib import aclosing +from contextvars import ContextVar from typing import Any from braintrust.logger import start_span -from braintrust.span_types import SpanTypeAttribute +from braintrust.span_types import SpanPurpose, SpanTypeAttribute + + +_SUPPRESS_TASK_EVALUATE_SPAN: ContextVar[bool] = ContextVar("_SUPPRESS_TASK_EVALUATE_SPAN", default=False) def _clean(mapping: dict[str, Any]) -> dict[str, Any]: @@ -149,6 +153,184 @@ def _tool_name(tool_call: Any) -> str: return str(getattr(tool_call, "name", "unknown_tool")) +def _call_arg(args: Any, kwargs: dict[str, Any], index: int, key: str) -> Any: + if key in kwargs: + return kwargs[key] + return args[index] if len(args) > index else None + + +def _maybe_awaitable_name(value: Any) -> str | None: + return getattr(value, "__name__", None) or getattr(value, "__qualname__", None) + + +def _metric_name(metric: Any) -> str: + return str(getattr(metric, "name", None) or metric.__class__.__name__) + + +def _task_id(task: Any) -> str: + return str(_field_value(task, "id") or _field_value(task, "name") or task.__class__.__name__) + + +def _task_input(task: Any) -> Any: + for key in ("input", "input_data", "question", "prompt"): + value = _field_value(task, key) + if value is not None: + return value + return None + + +def _task_expected(task: Any) -> Any: + for key in ("ground_truth", "expected", "reference", "answer"): + value = _field_value(task, key) + if value is not None: + return value + return None + + +def _task_tags(task: Any) -> Any: + tags = _field_value(task, "tags") + if isinstance(tags, dict): + return [f"{key}:{value}" for key, value in sorted(tags.items())] + return tags + + +def _task_metric_names(task: Any) -> list[str] | None: + metrics = _field_value(task, "metrics") + if not metrics: + return None + return [_metric_name(metric) for metric in metrics] + + +def _task_metadata(task: Any) -> dict[str, Any]: + metadata = _field_value(task, "metadata") + if isinstance(metadata, dict): + return metadata + return {} + + +def _solution_output_summary(solution_output: Any) -> Any: + if solution_output is None: + return None + if isinstance(solution_output, dict): + return solution_output + + summary = _clean( + { + "output": _field_value(solution_output, "output"), + "success": _field_value(solution_output, "success"), + "trajectory": _field_value(solution_output, "trajectory"), + "meta": _field_value(solution_output, "meta") or _field_value(solution_output, "metadata"), + "message": _field_value(solution_output, "message"), + } + ) + return summary or solution_output + + +def _metric_result_summary(result: Any) -> Any: + if result is None: + return None + if isinstance(result, dict): + return result + + summary = _clean( + { + "result": _field_value(result, "result"), + "message": _field_value(result, "message"), + "detail": _field_value(result, "detail"), + "metadata": _field_value(result, "metadata") or _field_value(result, "meta"), + } + ) + return summary or result + + +def _metric_score(metric: Any, result: Any) -> dict[str, float] | None: + value = _field_value(result, "result") + if isinstance(value, bool): + return {_metric_name(metric): 1.0 if value else 0.0} + if isinstance(value, (int, float)): + return {_metric_name(metric): float(value)} + return None + + +def _evaluator_metadata(instance: Any, solution: Any = None) -> dict[str, Any]: + benchmark = getattr(instance, "benchmark", None) + task_count = len(benchmark) if benchmark is not None and hasattr(benchmark, "__len__") else None + return _clean( + { + "evaluator_class": instance.__class__.__name__, + "evaluator_name": getattr(instance, "name", None), + "benchmark_name": _field_value(benchmark, "name"), + "benchmark_description": _field_value(benchmark, "description"), + "task_count": task_count, + "n_repeat": getattr(instance, "n_repeat", None), + "n_workers": getattr(instance, "n_workers", None), + "storage_class": getattr(getattr(instance, "storage", None), "__class__", type(None)).__name__, + "solution_name": _maybe_awaitable_name(solution), + } + ) + + +def _task_span_metadata(task: Any, repeat_id: str | None = None, **extra: Any) -> dict[str, Any]: + raw_tags = _field_value(task, "tags") + return _clean( + { + **_task_metadata(task), + "task_id": _task_id(task), + "repeat_id": repeat_id, + "metric_names": _task_metric_names(task), + "task_tags": raw_tags if isinstance(raw_tags, dict) else None, + **extra, + } + ) + + +def _storage_get(storage: Any, method_name: str, *args: Any) -> Any: + method = getattr(storage, method_name, None) + if method is None: + return None + try: + return method(*args) + except Exception: + return None + + +def _stored_solution_output(instance: Any, task: Any, repeat_id: str) -> Any: + storage = getattr(instance, "storage", None) + if storage is None: + return None + return _storage_get(storage, "get_solution_result", _task_id(task), repeat_id) + + +def _stored_evaluation_results(instance: Any, task: Any, repeat_id: str) -> list[Any] | None: + storage = getattr(instance, "storage", None) + metrics = _field_value(task, "metrics") or [] + if storage is None or not metrics: + return None + + results = [] + for metric in metrics: + result = _storage_get(storage, "get_evaluation_result", _task_id(task), repeat_id, _metric_name(metric)) + if result is None: + return None + results.append(result) + return results + + +def _log_metric_span(parent_span: Any, metric: Any, solution_output: Any, result: Any) -> None: + with parent_span.start_span( + name=_metric_name(metric), + type=SpanTypeAttribute.SCORE, + span_attributes={"purpose": SpanPurpose.SCORER.value}, + input=_solution_output_summary(solution_output), + metadata=_clean({"metric_class": metric.__class__.__name__}), + ) as metric_span: + metric_span.log( + output=_metric_result_summary(result), + metadata=_field_value(result, "metadata") or _field_value(result, "meta"), + scores=_metric_score(metric, result), + ) + + async def _agent_call_wrapper(wrapped: Any, instance: Any, args: Any, kwargs: dict[str, Any]) -> Any: with start_span( name=f"{_agent_name(instance)}.reply", @@ -165,6 +347,166 @@ async def _agent_call_wrapper(wrapped: Any, instance: Any, args: Any, kwargs: di raise +async def _general_evaluator_run_wrapper(wrapped: Any, instance: Any, args: Any, kwargs: dict[str, Any]) -> Any: + solution = _call_arg(args, kwargs, 0, "solution") + with start_span( + name="agentscope.evaluate.run", + type=SpanTypeAttribute.EVAL, + input=_clean( + { + "benchmark_name": _field_value(getattr(instance, "benchmark", None), "name"), + "n_repeat": getattr(instance, "n_repeat", None), + "n_workers": getattr(instance, "n_workers", None), + } + ), + metadata=_evaluator_metadata(instance, solution), + ) as span: + try: + result = await wrapped(*args, **kwargs) + span.log(output={"status": "completed"}) + return result + except Exception as exc: + span.log(error=str(exc)) + raise + + +async def _ray_evaluator_run_wrapper(wrapped: Any, instance: Any, args: Any, kwargs: dict[str, Any]) -> Any: + solution = _call_arg(args, kwargs, 0, "solution") + with start_span( + name="agentscope.evaluate.run", + type=SpanTypeAttribute.EVAL, + input=_clean( + { + "benchmark_name": _field_value(getattr(instance, "benchmark", None), "name"), + "n_repeat": getattr(instance, "n_repeat", None), + "n_workers": getattr(instance, "n_workers", None), + } + ), + metadata={**_evaluator_metadata(instance, solution), "distributed": True}, + ) as span: + try: + result = await wrapped(*args, **kwargs) + span.log(output={"status": "completed"}) + return result + except Exception as exc: + span.log(error=str(exc)) + raise + + +async def _general_evaluator_run_solution_wrapper( + wrapped: Any, + instance: Any, + args: Any, + kwargs: dict[str, Any], +) -> Any: + repeat_id = str(_call_arg(args, kwargs, 0, "repeat_id")) + task = _call_arg(args, kwargs, 1, "task") + storage = getattr(instance, "storage", None) + was_cached = False + if storage is not None and task is not None: + exists = getattr(storage, "solution_result_exists", None) + if exists is not None: + try: + was_cached = bool(exists(_task_id(task), repeat_id)) + except Exception: + was_cached = False + + with start_span( + name=f"{_task_id(task)}.solution", + type=SpanTypeAttribute.TASK, + input=_task_input(task), + expected=_task_expected(task), + tags=_task_tags(task), + metadata=_task_span_metadata(task, repeat_id, cached=was_cached), + ) as span: + try: + result = await wrapped(*args, **kwargs) + solution_output = _stored_solution_output(instance, task, repeat_id) + span.log(output=_solution_output_summary(solution_output)) + return result + except Exception as exc: + span.log(error=str(exc)) + raise + + +async def _general_evaluator_run_evaluation_wrapper( + wrapped: Any, + instance: Any, + args: Any, + kwargs: dict[str, Any], +) -> Any: + task = _call_arg(args, kwargs, 0, "task") + repeat_id = str(_call_arg(args, kwargs, 1, "repeat_id")) + solution_output = _call_arg(args, kwargs, 2, "solution_output") + + with start_span( + name=f"{_task_id(task)}.evaluate", + type=SpanTypeAttribute.EVAL, + input=_solution_output_summary(solution_output), + metadata=_task_span_metadata(task, repeat_id), + ) as span: + token = _SUPPRESS_TASK_EVALUATE_SPAN.set(True) + try: + result = await wrapped(*args, **kwargs) + evaluation_results = _stored_evaluation_results(instance, task, repeat_id) + if evaluation_results is not None: + metrics = _field_value(task, "metrics") or [] + for metric, evaluation_result in zip(metrics, evaluation_results): + _log_metric_span(span, metric, solution_output, evaluation_result) + span.log(output=[_metric_result_summary(item) for item in evaluation_results]) + return result + except Exception as exc: + span.log(error=str(exc)) + raise + finally: + _SUPPRESS_TASK_EVALUATE_SPAN.reset(token) + + +async def _task_evaluate_wrapper(wrapped: Any, instance: Any, args: Any, kwargs: dict[str, Any]) -> Any: + if _SUPPRESS_TASK_EVALUATE_SPAN.get(): + return await wrapped(*args, **kwargs) + + solution_output = _call_arg(args, kwargs, 0, "solution_output") + with start_span( + name=f"{_task_id(instance)}.evaluate", + type=SpanTypeAttribute.EVAL, + input=_solution_output_summary(solution_output), + metadata=_task_span_metadata(instance), + ) as span: + try: + result = await wrapped(*args, **kwargs) + span.log(output=[_metric_result_summary(item) for item in result] if result is not None else None) + return result + except Exception as exc: + span.log(error=str(exc)) + raise + + +async def _metric_call_wrapper(wrapped: Any, instance: Any, args: Any, kwargs: dict[str, Any]) -> Any: + if _SUPPRESS_TASK_EVALUATE_SPAN.get(): + return await wrapped(*args, **kwargs) + + solution_output = _call_arg(args, kwargs, 0, "solution_output") + with start_span( + name=_metric_name(instance), + type=SpanTypeAttribute.SCORE, + span_attributes={"purpose": SpanPurpose.SCORER.value}, + input=_solution_output_summary(solution_output), + metadata=_clean({"metric_class": instance.__class__.__name__}), + ) as span: + try: + result = await wrapped(*args, **kwargs) + span.log( + output=_metric_result_summary(result), + metadata=_field_value(result, "metadata") or _field_value(result, "meta"), + scores=_metric_score(instance, result), + ) + return result + except Exception as exc: + span.log(error=str(exc)) + raise + + async def _sequential_pipeline_wrapper(wrapped: Any, instance: Any, args: Any, kwargs: dict[str, Any]) -> Any: with start_span( name="sequential_pipeline.run", diff --git a/py/src/braintrust/integrations/base.py b/py/src/braintrust/integrations/base.py index 690e6c22..7d491b96 100644 --- a/py/src/braintrust/integrations/base.py +++ b/py/src/braintrust/integrations/base.py @@ -337,6 +337,7 @@ def setup( cls, *, target: Any | None = None, + patchers: tuple[type[BasePatcher], ...] | None = None, ) -> bool: """Apply all applicable patchers for this integration.""" module = _import_first_available(cls.import_names) @@ -347,7 +348,7 @@ def setup( return False success = False - selected_patchers = cls.resolve_patchers() + selected_patchers = cls.resolve_patchers() if patchers is None else patchers for patcher in sorted(selected_patchers, key=lambda patcher: patcher.priority): if not patcher.applies(module, version, target=target): continue