From bc8806605c8953cf9e02cbb15b3bfe4631729831 Mon Sep 17 00:00:00 2001 From: Abhijeet Prasad Date: Tue, 24 Mar 2026 15:17:18 -0700 Subject: [PATCH] chore: Add benchmarking setup to SDK --- .agents/skills/sdk-benchmarking/SKILL.md | 101 +++++++++ .../sdk-benchmarking/agents/openai.yaml | 4 + .../references/benchmark-patterns.md | 93 ++++++++ AGENTS.md | 20 ++ CONTRIBUTING.md | 81 +++++++ Makefile | 30 ++- py/Makefile | 14 +- py/benchmarks/__init__.py | 1 + py/benchmarks/__main__.py | 48 +++++ py/benchmarks/_utils.py | 19 ++ py/benchmarks/benches/__init__.py | 12 ++ py/benchmarks/benches/bench_bt_json.py | 29 +++ py/benchmarks/fixtures.py | 203 ++++++++++++++++++ py/benchmarks/perf.py | 35 --- py/requirements-dev.txt | 1 + 15 files changed, 644 insertions(+), 47 deletions(-) create mode 100644 .agents/skills/sdk-benchmarking/SKILL.md create mode 100644 .agents/skills/sdk-benchmarking/agents/openai.yaml create mode 100644 .agents/skills/sdk-benchmarking/references/benchmark-patterns.md create mode 100644 py/benchmarks/__init__.py create mode 100644 py/benchmarks/__main__.py create mode 100644 py/benchmarks/_utils.py create mode 100644 py/benchmarks/benches/__init__.py create mode 100644 py/benchmarks/benches/bench_bt_json.py create mode 100644 py/benchmarks/fixtures.py delete mode 100644 py/benchmarks/perf.py diff --git a/.agents/skills/sdk-benchmarking/SKILL.md b/.agents/skills/sdk-benchmarking/SKILL.md new file mode 100644 index 00000000..720dfc4e --- /dev/null +++ b/.agents/skills/sdk-benchmarking/SKILL.md @@ -0,0 +1,101 @@ +--- +name: sdk-benchmarking +description: Run, compare, and extend Braintrust Python SDK pyperf benchmarks. Use when touching hot-path code in `py/src/braintrust/` such as serialization, deep-copy, span creation, or logging; when adding or updating files under `py/benchmarks/`; or when you need baseline/branch performance measurements with `cd py && make bench` and `make bench-compare`. +--- + +# SDK Benchmarking + +Use this skill for benchmark work in the Braintrust Python SDK repository. + +Benchmark support already exists in `py/benchmarks/`. Use the current repo workflow, not commit archaeology, once you have identified the relevant benchmark surface. + +## Read First + +Always read: + +- `AGENTS.md` +- `CONTRIBUTING.md` +- `py/Makefile` +- `py/benchmarks/__main__.py` +- `py/benchmarks/_utils.py` +- `py/benchmarks/benches/__init__.py` + +Read when relevant: + +- `py/benchmarks/benches/bench_bt_json.py` for the module pattern +- `py/benchmarks/fixtures.py` for shared payload builders +- `py/setup.py` when benchmarking the optional `orjson` fast path +- `references/benchmark-patterns.md` in this skill for command and module templates + +## Workflow + +1. Identify the hot path or API surface that changed. +2. Find the nearest existing benchmark module under `py/benchmarks/benches/`. +3. Run the narrowest useful benchmark first. +4. Add or update a `bench_*.py` module only if the current suite does not cover the changed path. +5. Reuse or extend `py/benchmarks/fixtures.py` for realistic shared payloads instead of inlining bulky test data. +6. Save before/after results and compare them when the task is about regression detection or improvement claims. 
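Applied end to end, steps 3 through 6 usually come down to a handful of commands. The sketch below uses example output paths under `/tmp`; any writable location works:

```bash
cd py
# Step 3: run the narrowest relevant module first.
python -m benchmarks.benches.bench_bt_json
# Step 6: capture a baseline, re-run after the change, then compare.
make bench BENCH_ARGS="-o /tmp/before.json"
# ...make the code change under test...
make bench BENCH_ARGS="-o /tmp/after.json"
make bench-compare BENCH_BASE=/tmp/before.json BENCH_NEW=/tmp/after.json
```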
+ +## Commands + +Run benchmarks from `py/`: + +```bash +cd py +make bench +make bench BENCH_ARGS="--fast" +make bench BENCH_ARGS="-o /tmp/before.json" +make bench BENCH_ARGS="-o /tmp/after.json" +make bench-compare BENCH_BASE=/tmp/before.json BENCH_NEW=/tmp/after.json +python -m benchmarks.benches.bench_bt_json +``` + +Use `python -m benchmarks --help` for extra `pyperf` flags. + +If the benchmark should measure the optional `orjson` path, install the performance extra first: + +```bash +cd py +python -m uv pip install -e '.[performance]' +``` + +## Adding Benchmarks + +Put new modules in `py/benchmarks/benches/` and name them `bench_.py`. + +Every benchmark module must: + +- expose `main(runner: pyperf.Runner | None = None) -> None` +- create its own `pyperf.Runner()` only when `runner` is `None` +- call `disable_pyperf_psutil()` before creating that runner +- register benchmarks with stable, descriptive names via `runner.bench_func(...)` +- remain executable directly with `python -m benchmarks.benches.bench_` + +Do not add manual registration. `python -m benchmarks` auto-discovers every `bench_*.py` module in `py/benchmarks/benches/`. + +## Fixtures + +Keep reusable payload builders and synthetic objects in `py/benchmarks/fixtures.py`. + +Prefer fixture helpers when: + +- several benchmark cases share similar payloads +- the inputs are large enough to distract from the benchmark itself +- you need variants such as small, medium, large, circular, or non-string-key cases + +Keep fixture builders deterministic and focused on representative data shapes. + +## Validation + +- Run the narrowest affected benchmark first. +- Use `BENCH_ARGS="--fast"` for quick local sanity checks while iterating. +- Save JSON outputs and use `make bench-compare` for baseline versus branch comparisons. +- If you changed code paths that also have correctness tests, run the smallest relevant test target in addition to the benchmark. + +## Pitfalls + +- Measuring import/setup overhead instead of the hot function under test. +- Inlining ad hoc payload construction in each benchmark instead of reusing fixtures. +- Forgetting the standalone `main()` pattern, which breaks auto-discovery and direct execution symmetry. +- Claiming performance changes from a single unsaved local run instead of comparing saved results. +- Benchmarking the `orjson` fast path without explicitly installing `.[performance]`. diff --git a/.agents/skills/sdk-benchmarking/agents/openai.yaml b/.agents/skills/sdk-benchmarking/agents/openai.yaml new file mode 100644 index 00000000..6be21da5 --- /dev/null +++ b/.agents/skills/sdk-benchmarking/agents/openai.yaml @@ -0,0 +1,4 @@ +interface: + display_name: "SDK Benchmarking" + short_description: "Run and extend Braintrust SDK benchmarks" + default_prompt: "Use $sdk-benchmarking to run, compare, or add Braintrust Python SDK benchmarks." diff --git a/.agents/skills/sdk-benchmarking/references/benchmark-patterns.md b/.agents/skills/sdk-benchmarking/references/benchmark-patterns.md new file mode 100644 index 00000000..0d9fcf81 --- /dev/null +++ b/.agents/skills/sdk-benchmarking/references/benchmark-patterns.md @@ -0,0 +1,93 @@ +# Benchmark Patterns + +Use this reference when adding or updating SDK benchmarks. 
+ +## Command Cheatsheet + +```bash +cd py + +# Run everything +make bench + +# Faster local iteration +make bench BENCH_ARGS="--fast" + +# Save results for comparison +make bench BENCH_ARGS="-o /tmp/before.json" +make bench BENCH_ARGS="-o /tmp/after.json" +make bench-compare BENCH_BASE=/tmp/before.json BENCH_NEW=/tmp/after.json + +# Run one module directly +python -m benchmarks.benches.bench_bt_json + +# Inspect all forwarded pyperf flags +python -m benchmarks --help +``` + +## Module Skeleton + +```python +import pathlib +import sys + +import pyperf + + +if __package__ in (None, ""): + sys.path.insert(0, str(pathlib.Path(__file__).resolve().parents[2])) + +from benchmarks._utils import disable_pyperf_psutil + + +def target(value): + return value + + +def main(runner: pyperf.Runner | None = None) -> None: + if runner is None: + disable_pyperf_psutil() + runner = pyperf.Runner() + + runner.bench_func("example.target[case-name]", target, "value") + + +if __name__ == "__main__": + main() +``` + +Follow the existing `py/benchmarks/benches/bench_bt_json.py` pattern when importing repo code. The `sys.path` adjustment keeps direct module execution working from inside `py/`. + +## Fixture Guidance + +Put reusable builders in `py/benchmarks/fixtures.py` when: + +- several benchmark cases need the same payload shape +- the payload should model realistic nested SDK inputs +- the benchmark should cover edge cases such as circular references or non-string keys + +Current fixture patterns already cover: + +- small, medium, and large nested payloads +- circular structures +- non-string dictionary keys +- dataclass-like and pydantic-like values + +Extend those helpers before creating one-off payload factories in a new benchmark module. + +## Comparison Workflow + +For branch-to-branch comparisons: + +```bash +cd py +git checkout main +make bench BENCH_ARGS="-o /tmp/main.json" + +git checkout my-branch +make bench BENCH_ARGS="-o /tmp/branch.json" + +make bench-compare BENCH_BASE=/tmp/main.json BENCH_NEW=/tmp/branch.json +``` + +Use `--rigorous` only when you need lower-noise final numbers; use `--fast` while iterating. diff --git a/AGENTS.md b/AGENTS.md index 8acef533..bf9d2ee6 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -12,6 +12,7 @@ Guide for contributing to the Braintrust Python SDK repository. ## Repo Map - `py/`: main Python package, tests, examples, nox sessions, release build +- `py/benchmarks/`: pyperf performance benchmarks - `integrations/`: separate integration packages - `internal/golden/`: compatibility and golden projects - `docs/`: supporting docs @@ -122,6 +123,25 @@ BRAINTRUST_CLAUDE_AGENT_SDK_RECORD_MODE=all nox -s "test_claude_agent_sdk(latest Only re-record HTTP or subprocess cassettes when the behavior change is intentional. If in doubt, ask the user. +## Benchmarks + +Run `cd py && make bench` when touching hot-path code (serialization, deep-copy, span creation, logging). Not required for every change. + +Benchmarks use pyperf. All `bench_*.py` files in `py/benchmarks/benches/` are auto-discovered — no registration needed. + +Key commands: + +```bash +cd py +make bench # run all benchmarks +make bench BENCH_ARGS="--fast" # quick sanity check +make bench BENCH_ARGS="-o /tmp/before.json" # save baseline before a change +make bench BENCH_ARGS="-o /tmp/after.json" # save after a change +make bench-compare BENCH_BASE=/tmp/before.json BENCH_NEW=/tmp/after.json +``` + +New benchmark files go in `py/benchmarks/benches/bench_.py`. Each must expose `main(runner: pyperf.Runner | None = None)`. 
Shared payload builders go in `py/benchmarks/fixtures.py`. See existing `bench_bt_json.py` for the pattern. + ## Build Notes Build from `py/`: diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 695994c1..dc1a266a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -147,6 +147,87 @@ Common ones include: The `memory_logger` fixture from `braintrust.test_helpers` is useful for asserting on logged spans without a real Braintrust backend. +## Benchmarks + +The SDK includes local performance benchmarks powered by [pyperf](https://pyperf.readthedocs.io/), located in `py/benchmarks/`. These cover hot paths like serialization and deep-copy routines. + +### Running benchmarks + +```bash +cd py + +# Run all benchmarks +make bench + +# Quick sanity check (fewer iterations) +make bench BENCH_ARGS="--fast" + +# Save results for later comparison +make bench BENCH_ARGS="-o /tmp/results.json" + +# Run a single benchmark module directly +python -m benchmarks.benches.bench_bt_json +``` + +To benchmark with the optional `orjson` fast-path installed: + +```bash +cd py +python -m uv pip install -e '.[performance]' +make bench +``` + +### Comparing across branches + +```bash +cd py + +git checkout main +make bench BENCH_ARGS="-o /tmp/main.json" + +git checkout my-branch +make bench BENCH_ARGS="-o /tmp/branch.json" + +make bench-compare BENCH_BASE=/tmp/main.json BENCH_NEW=/tmp/branch.json +``` + +### Useful pyperf flags + +| Flag | Purpose | +| --------------- | ------------------------------------------------- | +| `--fast` | Fewer iterations — good for a quick sanity check | +| `--rigorous` | More iterations — reduces noise for final numbers | +| `-o FILE` | Write results to a JSON file for later comparison | +| `--append FILE` | Append to an existing results file | + +Run `python -m benchmarks --help` for the full list. + +### Adding a new benchmark + +Drop a new `bench_.py` file into `py/benchmarks/benches/`. It will be picked up automatically — no registration required. + +Your module needs to expose a `main()` function that accepts an optional `pyperf.Runner`: + +```python +import pyperf + +from benchmarks._utils import disable_pyperf_psutil + + +def main(runner: pyperf.Runner | None = None) -> None: + if runner is None: + disable_pyperf_psutil() + runner = pyperf.Runner() + + runner.bench_func("my_benchmark", my_func, my_arg) + + +if __name__ == "__main__": + main() +``` + +If your benchmark needs reusable test data, add builder functions to `py/benchmarks/fixtures.py`. + ## CI GitHub Actions workflows live in `.github/workflows/`. 
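Circling back to the fixtures note at the end of the benchmarks section above, here is a minimal sketch of a reusable builder and the case registration that would consume it. The builder name, payload contents, and case label are illustrative assumptions, not part of this patch:

```python
# Hypothetical addition to py/benchmarks/fixtures.py (illustrative only).
from typing import Any


def make_tool_call_payload() -> dict[str, Any]:
    """Deterministic, mid-sized payload modelling a single tool-call turn."""
    return {
        "input": {"tool": "lookup", "args": {"id": 7, "scope": "thread"}},
        "metadata": {"user_id": "user-123", "session_id": "sess-123"},
        "metrics": {"latency_ms": 42.0},
        "tags": ["benchmark", "tool-call"],
    }


# A bench_*.py module would then register a named case on the shared runner:
#     from benchmarks.fixtures import make_tool_call_payload
#     runner.bench_func("bt_json.bt_safe_deep_copy[tool-call]",
#                       bt_safe_deep_copy, make_tool_call_payload())
```

Keeping payload construction in `fixtures.py` keeps benchmark modules focused on registration and lets several cases share the same data.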
diff --git a/Makefile b/Makefile index ff5c5edd..4dba3a47 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ SHELL := /bin/bash -.PHONY: help develop install-dev install-deps fixup test test-core test-wheel lint pylint nox +.PHONY: help develop install-dev install-deps fixup test test-core test-wheel lint pylint nox bench bench-compare develop: install-dev mise exec -- pre-commit install @@ -30,19 +30,27 @@ lint: pylint: mise exec -- $(MAKE) -C py pylint +bench: + mise exec -- $(MAKE) -C py bench BENCH_ARGS="$(BENCH_ARGS)" + +bench-compare: + mise exec -- $(MAKE) -C py bench-compare BENCH_BASE="$(BENCH_BASE)" BENCH_NEW="$(BENCH_NEW)" + nox: test help: @echo "Available targets:" - @echo " develop - Install tools with mise, install py/ deps, and install pre-commit hooks" - @echo " fixup - Run pre-commit hooks across the repo" - @echo " install-deps - Install Python SDK dependencies via py/Makefile" - @echo " install-dev - Install pinned tools and create/update the repo env via mise" - @echo " lint - Run pre-commit hooks plus Python SDK pylint via py/Makefile" - @echo " pylint - Run Python SDK pylint only via py/Makefile" - @echo " nox - Alias for test" - @echo " test - Run the Python SDK nox matrix via py/Makefile" - @echo " test-core - Run Python SDK core tests via py/Makefile" - @echo " test-wheel - Run Python SDK wheel sanity tests via py/Makefile (requires a built wheel)" + @echo " bench - Run benchmarks via py/Makefile (pass extra flags via BENCH_ARGS=)" + @echo " bench-compare - Compare two benchmark results via py/Makefile (BENCH_BASE=... BENCH_NEW=...)" + @echo " develop - Install tools with mise, install py/ deps, and install pre-commit hooks" + @echo " fixup - Run pre-commit hooks across the repo" + @echo " install-deps - Install Python SDK dependencies via py/Makefile" + @echo " install-dev - Install pinned tools and create/update the repo env via mise" + @echo " lint - Run pre-commit hooks plus Python SDK pylint via py/Makefile" + @echo " pylint - Run Python SDK pylint only via py/Makefile" + @echo " nox - Alias for test" + @echo " test - Run the Python SDK nox matrix via py/Makefile" + @echo " test-core - Run Python SDK core tests via py/Makefile" + @echo " test-wheel - Run Python SDK wheel sanity tests via py/Makefile (requires a built wheel)" .DEFAULT_GOAL := help diff --git a/py/Makefile b/py/Makefile index a14a5530..56becdfe 100644 --- a/py/Makefile +++ b/py/Makefile @@ -2,7 +2,7 @@ PYTHON ?= python UV := $(PYTHON) -m uv UV_VERSION := $(shell awk '$$1=="uv" { print $$2 }' ../.tool-versions) -.PHONY: lint pylint test test-wheel _template-version clean fixup build verify-build verify help install-build-deps install-dev install-optional test-core _check-git-clean +.PHONY: lint pylint test test-wheel _template-version clean fixup build verify-build verify help install-build-deps install-dev install-optional test-core _check-git-clean bench bench-compare clean: rm -rf build dist @@ -32,6 +32,16 @@ test-wheel: test-core: nox -s test_core +bench: + $(PYTHON) -m benchmarks $(BENCH_ARGS) + +bench-compare: + @if [ -z "$(BENCH_BASE)" ] || [ -z "$(BENCH_NEW)" ]; then \ + echo "Usage: make bench-compare BENCH_BASE=/tmp/base.json BENCH_NEW=/tmp/new.json"; \ + exit 1; \ + fi + $(PYTHON) -m pyperf compare_to $(BENCH_BASE) $(BENCH_NEW) + _template-version: @$(PYTHON) scripts/template-version.py @@ -66,6 +76,8 @@ install-optional: install-dev .DEFAULT_GOAL := help help: @echo "Available targets:" + @echo " bench - Run benchmarks (pass extra flags via BENCH_ARGS=, e.g. 
--fast)" + @echo " bench-compare - Compare two benchmark results (BENCH_BASE=... BENCH_NEW=...)" @echo " build - Build Python package" @echo " clean - Remove build artifacts" @echo " help - Show this help message" diff --git a/py/benchmarks/__init__.py b/py/benchmarks/__init__.py new file mode 100644 index 00000000..f79c08c5 --- /dev/null +++ b/py/benchmarks/__init__.py @@ -0,0 +1 @@ +"""Local performance benchmarks for the Braintrust Python SDK.""" diff --git a/py/benchmarks/__main__.py b/py/benchmarks/__main__.py new file mode 100644 index 00000000..2288e42d --- /dev/null +++ b/py/benchmarks/__main__.py @@ -0,0 +1,48 @@ +"""Run every ``bench_*.py`` module inside ``benchmarks.benches``. + +Usage:: + + cd py + python -m benchmarks # run all benchmarks + python -m benchmarks --fast # pyperf flags are forwarded + python -m benchmarks -o /tmp/b.json # save results +""" + +import importlib +import pathlib +import pkgutil +import sys + + +# Ensure ``py/`` is on sys.path so pyperf worker subprocesses can resolve +# the ``benchmarks`` package regardless of their working directory. +_PY_DIR = str(pathlib.Path(__file__).resolve().parents[1]) +if _PY_DIR not in sys.path: + sys.path.insert(0, _PY_DIR) + +import pyperf + +import benchmarks.benches as _benches_pkg +from benchmarks._utils import disable_pyperf_psutil + + +def _discover_bench_modules(): + """Yield imported bench modules that expose a ``main()`` callable.""" + for info in pkgutil.iter_modules(_benches_pkg.__path__): + if not info.name.startswith("bench_"): + continue + mod = importlib.import_module(f"benchmarks.benches.{info.name}") + if callable(getattr(mod, "main", None)): + yield mod + + +def main() -> None: + disable_pyperf_psutil() + runner = pyperf.Runner() + + for mod in _discover_bench_modules(): + mod.main(runner) + + +if __name__ == "__main__": + main() diff --git a/py/benchmarks/_utils.py b/py/benchmarks/_utils.py new file mode 100644 index 00000000..2c28a90f --- /dev/null +++ b/py/benchmarks/_utils.py @@ -0,0 +1,19 @@ +"""Shared helpers for benchmark modules.""" + +import sys + + +def disable_pyperf_psutil() -> None: + """Disable pyperf's psutil-based metadata collection on macOS. + + pyperf's optional system metadata collection can hit sandboxed sysctl + paths on macOS. Disabling it keeps local runs portable. + """ + if sys.platform != "darwin": + return + + import pyperf._collect_metadata as collect_metadata + import pyperf._cpu_utils as cpu_utils + + collect_metadata.psutil = None + cpu_utils.psutil = None diff --git a/py/benchmarks/benches/__init__.py b/py/benchmarks/benches/__init__.py new file mode 100644 index 00000000..1123c815 --- /dev/null +++ b/py/benchmarks/benches/__init__.py @@ -0,0 +1,12 @@ +"""Individual benchmark modules. + +Every ``bench_*.py`` file in this package must expose a ``main()`` function +that accepts an optional ``pyperf.Runner`` and registers its benchmarks on it. + +Signature:: + + def main(runner: pyperf.Runner | None = None) -> None: ... + +When *runner* is ``None`` the module should create its own ``Runner`` so it +can still be executed standalone (``python -m benchmarks.benches.bench_foo``). 
+""" diff --git a/py/benchmarks/benches/bench_bt_json.py b/py/benchmarks/benches/bench_bt_json.py new file mode 100644 index 00000000..5bf4fe46 --- /dev/null +++ b/py/benchmarks/benches/bench_bt_json.py @@ -0,0 +1,29 @@ +import pathlib +import sys + +import pyperf + + +if __package__ in (None, ""): + sys.path.insert(0, str(pathlib.Path(__file__).resolve().parents[2])) + +from braintrust.bt_json import _to_bt_safe, bt_safe_deep_copy + +from benchmarks._utils import disable_pyperf_psutil +from benchmarks.fixtures import make_bt_safe_deep_copy_cases, make_to_bt_safe_cases + + +def main(runner: pyperf.Runner | None = None) -> None: + if runner is None: + disable_pyperf_psutil() + runner = pyperf.Runner() + + for case_name, value in make_to_bt_safe_cases(): + runner.bench_func(f"bt_json._to_bt_safe[{case_name}]", _to_bt_safe, value) + + for case_name, value in make_bt_safe_deep_copy_cases(): + runner.bench_func(f"bt_json.bt_safe_deep_copy[{case_name}]", bt_safe_deep_copy, value) + + +if __name__ == "__main__": + main() diff --git a/py/benchmarks/fixtures.py b/py/benchmarks/fixtures.py new file mode 100644 index 00000000..907983e1 --- /dev/null +++ b/py/benchmarks/fixtures.py @@ -0,0 +1,203 @@ +from dataclasses import dataclass +from typing import Any + +from braintrust.span_types import SpanTypeAttribute + + +@dataclass(frozen=True) +class BenchmarkDataclass: + model: str + temperature: float + tags: list[str] + metadata: dict[str, Any] + + +class PydanticLikeV2: + def __init__(self, payload: dict[str, Any]): + self._payload = payload + + def model_dump(self, exclude_none: bool = True) -> dict[str, Any]: + if not exclude_none: + return dict(self._payload) + return {k: v for k, v in self._payload.items() if v is not None} + + +class PydanticLikeV1: + def __init__(self, payload: dict[str, Any]): + self._payload = payload + + def dict(self, exclude_none: bool = True) -> dict[str, Any]: + if not exclude_none: + return dict(self._payload) + return {k: v for k, v in self._payload.items() if v is not None} + + +class StringifiableKey: + def __init__(self, label: str): + self.label = label + + def __str__(self) -> str: + return f"key:{self.label}" + + +class NonStringifiableKey: + def __str__(self) -> str: + raise RuntimeError("cannot stringify") + + +def make_small_payload() -> dict[str, Any]: + return { + "input": {"prompt": "Summarize this email", "attempt": 1}, + "metadata": {"user_id": "user-123", "session_id": "sess-123"}, + "scores": {"helpfulness": 0.9}, + "tags": ["support", "email"], + } + + +def make_medium_payload() -> dict[str, Any]: + messages = [ + {"role": "system", "content": "You are a concise assistant."}, + {"role": "user", "content": "Summarize the following issue thread."}, + ] + for idx in range(8): + messages.append( + { + "role": "assistant" if idx % 2 else "user", + "content": f"message-{idx}", + "metadata": { + "turn": idx, + "token_count": 64 + idx, + "tool_calls": [{"name": "lookup", "args": {"id": idx}}], + }, + } + ) + + return { + "input": {"messages": messages}, + "metadata": { + "user_id": "user-123", + "session_id": "sess-123", + "workspace_id": "workspace-456", + "feature_flags": {"structured_output": True, "tool_calling": True}, + }, + "metrics": {"prompt_tokens": 512, "completion_tokens": 128, "latency_ms": 183.4}, + "span_attributes": {"type": "llm", "model": "gpt-4.1", "provider": "openai"}, + "tags": ["support", "email", "benchmark"], + } + + +def make_large_payload() -> dict[str, Any]: + messages = [] + for idx in range(48): + messages.append( + { + "role": 
"assistant" if idx % 2 else "user", + "content": f"message-{idx}-" + ("x" * 80), + "metadata": { + "turn": idx, + "token_count": 256 + idx, + "tool_calls": [ + {"name": "lookup", "args": {"id": idx, "scope": "thread"}}, + {"name": "render", "args": {"template": "summary", "format": "markdown"}}, + ], + }, + } + ) + + docs = [] + for idx in range(20): + docs.append( + { + "id": f"doc-{idx}", + "title": f"Document {idx}", + "score": 0.95 - idx / 100, + "metadata": {"source": "kb", "lang": "en", "chunk": idx}, + } + ) + + return { + "input": {"messages": messages, "retrieved_documents": docs}, + "output": {"summary": "done", "citations": [doc["id"] for doc in docs]}, + "metadata": { + "user_id": "user-123", + "session_id": "sess-123", + "workspace_id": "workspace-456", + "feature_flags": { + "structured_output": True, + "tool_calling": True, + "reasoning_tokens": True, + }, + "routing": {"tier": "premium", "region": "us-west-2", "experiment": "bench-large"}, + }, + "metrics": {"prompt_tokens": 4096, "completion_tokens": 640, "latency_ms": 812.7}, + "span_attributes": {"type": "llm", "model": "gpt-4.1", "provider": "openai"}, + "tags": ["support", "email", "benchmark", "large"], + } + + +def make_circular_payload() -> dict[str, Any]: + payload = make_medium_payload() + payload["self"] = payload + payload["input"]["parent"] = payload["input"] + return payload + + +def make_non_string_key_payload() -> dict[Any, Any]: + return { + 1: "integer-key", + ("tuple", "key"): {"nested": True}, + StringifiableKey("custom"): [1, 2, 3], + NonStringifiableKey(): "fallback-key-path", + } + + +def make_dataclass_value() -> BenchmarkDataclass: + return BenchmarkDataclass( + model="gpt-4.1", + temperature=0.2, + tags=["support", "triage"], + metadata={"user_id": "user-123", "session_id": "sess-123"}, + ) + + +def make_pydantic_v2_like_value() -> PydanticLikeV2: + return PydanticLikeV2( + { + "model": "gpt-4.1", + "temperature": 0.2, + "user_id": "user-123", + "optional_field": None, + } + ) + + +def make_pydantic_v1_like_value() -> PydanticLikeV1: + return PydanticLikeV1( + { + "model": "gpt-4.1", + "temperature": 0.2, + "user_id": "user-123", + "optional_field": None, + } + ) + + +def make_to_bt_safe_cases() -> list[tuple[str, Any]]: + return [ + ("primitive-int", 42), + ("primitive-float-nan", float("nan")), + ("str-subclass-enum", SpanTypeAttribute.TOOL), + ("dataclass", make_dataclass_value()), + ("pydantic-v2-like", make_pydantic_v2_like_value()), + ("pydantic-v1-like", make_pydantic_v1_like_value()), + ] + + +def make_bt_safe_deep_copy_cases() -> list[tuple[str, Any]]: + return [ + ("small", make_small_payload()), + ("medium", make_medium_payload()), + ("large", make_large_payload()), + ("circular", make_circular_payload()), + ("non-string-keys", make_non_string_key_payload()), + ] diff --git a/py/benchmarks/perf.py b/py/benchmarks/perf.py deleted file mode 100644 index 128ae2f2..00000000 --- a/py/benchmarks/perf.py +++ /dev/null @@ -1,35 +0,0 @@ -import time - -import braintrust -from braintrust import traced - - -LOOPS = 2000 - -braintrust.init_logger(project="perf_test") - - -@traced -def root(input: int) -> int: - return input * 2 - - -@traced -def child(input: int) -> int: - with braintrust.start_span(name="child") as span: - span.log(metadata={"m1": "v1", "m2": "v2"}) - result = root(input) - span.log(metrics={"result": result}) - return result - - -def main(): - t = time.time() - for i in range(LOOPS): - child(i) - elapsed = time.time() - t - print(f"ran {LOOPS} in {elapsed:.3f}s") - - -if __name__ 
== "__main__": - main() diff --git a/py/requirements-dev.txt b/py/requirements-dev.txt index f8bc52e0..e79dff02 100644 --- a/py/requirements-dev.txt +++ b/py/requirements-dev.txt @@ -4,6 +4,7 @@ nox pre-commit pydoc-markdown pylint +pyperf pytest pytest-asyncio pytest-forked