From bc8806605c8953cf9e02cbb15b3bfe4631729831 Mon Sep 17 00:00:00 2001 From: Abhijeet Prasad Date: Tue, 24 Mar 2026 15:17:18 -0700 Subject: [PATCH] chore: Add benchmarking setup to SDK --- .agents/skills/sdk-benchmarking/SKILL.md | 101 +++++++++ .../sdk-benchmarking/agents/openai.yaml | 4 + .../references/benchmark-patterns.md | 93 ++++++++ AGENTS.md | 20 ++ CONTRIBUTING.md | 81 +++++++ Makefile | 30 ++- py/Makefile | 14 +- py/benchmarks/__init__.py | 1 + py/benchmarks/__main__.py | 48 +++++ py/benchmarks/_utils.py | 19 ++ py/benchmarks/benches/__init__.py | 12 ++ py/benchmarks/benches/bench_bt_json.py | 29 +++ py/benchmarks/fixtures.py | 203 ++++++++++++++++++ py/benchmarks/perf.py | 35 --- py/requirements-dev.txt | 1 + 15 files changed, 644 insertions(+), 47 deletions(-) create mode 100644 .agents/skills/sdk-benchmarking/SKILL.md create mode 100644 .agents/skills/sdk-benchmarking/agents/openai.yaml create mode 100644 .agents/skills/sdk-benchmarking/references/benchmark-patterns.md create mode 100644 py/benchmarks/__init__.py create mode 100644 py/benchmarks/__main__.py create mode 100644 py/benchmarks/_utils.py create mode 100644 py/benchmarks/benches/__init__.py create mode 100644 py/benchmarks/benches/bench_bt_json.py create mode 100644 py/benchmarks/fixtures.py delete mode 100644 py/benchmarks/perf.py diff --git a/.agents/skills/sdk-benchmarking/SKILL.md b/.agents/skills/sdk-benchmarking/SKILL.md new file mode 100644 index 00000000..720dfc4e --- /dev/null +++ b/.agents/skills/sdk-benchmarking/SKILL.md @@ -0,0 +1,101 @@ +--- +name: sdk-benchmarking +description: Run, compare, and extend Braintrust Python SDK pyperf benchmarks. Use when touching hot-path code in `py/src/braintrust/` such as serialization, deep-copy, span creation, or logging; when adding or updating files under `py/benchmarks/`; or when you need baseline/branch performance measurements with `cd py && make bench` and `make bench-compare`. +--- + +# SDK Benchmarking + +Use this skill for benchmark work in the Braintrust Python SDK repository. + +Benchmark support already exists in `py/benchmarks/`. Use the current repo workflow, not commit archaeology, once you have identified the relevant benchmark surface. + +## Read First + +Always read: + +- `AGENTS.md` +- `CONTRIBUTING.md` +- `py/Makefile` +- `py/benchmarks/__main__.py` +- `py/benchmarks/_utils.py` +- `py/benchmarks/benches/__init__.py` + +Read when relevant: + +- `py/benchmarks/benches/bench_bt_json.py` for the module pattern +- `py/benchmarks/fixtures.py` for shared payload builders +- `py/setup.py` when benchmarking the optional `orjson` fast path +- `references/benchmark-patterns.md` in this skill for command and module templates + +## Workflow + +1. Identify the hot path or API surface that changed. +2. Find the nearest existing benchmark module under `py/benchmarks/benches/`. +3. Run the narrowest useful benchmark first. +4. Add or update a `bench_*.py` module only if the current suite does not cover the changed path. +5. Reuse or extend `py/benchmarks/fixtures.py` for realistic shared payloads instead of inlining bulky test data. +6. Save before/after results and compare them when the task is about regression detection or improvement claims. 
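Applied end to end, steps 3 through 6 usually come down to a handful of commands. The sketch below uses example output paths under `/tmp`; any writable location works:

```bash
cd py
# Step 3: run the narrowest relevant module first.
python -m benchmarks.benches.bench_bt_json
# Step 6: capture a baseline, re-run after the change, then compare.
make bench BENCH_ARGS="-o /tmp/before.json"
# ...make the code change under test...
make bench BENCH_ARGS="-o /tmp/after.json"
make bench-compare BENCH_BASE=/tmp/before.json BENCH_NEW=/tmp/after.json
```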
+ +## Commands + +Run benchmarks from `py/`: + +```bash +cd py +make bench +make bench BENCH_ARGS="--fast" +make bench BENCH_ARGS="-o /tmp/before.json" +make bench BENCH_ARGS="-o /tmp/after.json" +make bench-compare BENCH_BASE=/tmp/before.json BENCH_NEW=/tmp/after.json +python -m benchmarks.benches.bench_bt_json +``` + +Use `python -m benchmarks --help` for extra `pyperf` flags. + +If the benchmark should measure the optional `orjson` path, install the performance extra first: + +```bash +cd py +python -m uv pip install -e '.[performance]' +``` + +## Adding Benchmarks + +Put new modules in `py/benchmarks/benches/` and name them `bench_.py`. + +Every benchmark module must: + +- expose `main(runner: pyperf.Runner | None = None) -> None` +- create its own `pyperf.Runner()` only when `runner` is `None` +- call `disable_pyperf_psutil()` before creating that runner +- register benchmarks with stable, descriptive names via `runner.bench_func(...)` +- remain executable directly with `python -m benchmarks.benches.bench_` + +Do not add manual registration. `python -m benchmarks` auto-discovers every `bench_*.py` module in `py/benchmarks/benches/`. + +## Fixtures + +Keep reusable payload builders and synthetic objects in `py/benchmarks/fixtures.py`. + +Prefer fixture helpers when: + +- several benchmark cases share similar payloads +- the inputs are large enough to distract from the benchmark itself +- you need variants such as small, medium, large, circular, or non-string-key cases + +Keep fixture builders deterministic and focused on representative data shapes. + +## Validation + +- Run the narrowest affected benchmark first. +- Use `BENCH_ARGS="--fast"` for quick local sanity checks while iterating. +- Save JSON outputs and use `make bench-compare` for baseline versus branch comparisons. +- If you changed code paths that also have correctness tests, run the smallest relevant test target in addition to the benchmark. + +## Pitfalls + +- Measuring import/setup overhead instead of the hot function under test. +- Inlining ad hoc payload construction in each benchmark instead of reusing fixtures. +- Forgetting the standalone `main()` pattern, which breaks auto-discovery and direct execution symmetry. +- Claiming performance changes from a single unsaved local run instead of comparing saved results. +- Benchmarking the `orjson` fast path without explicitly installing `.[performance]`. diff --git a/.agents/skills/sdk-benchmarking/agents/openai.yaml b/.agents/skills/sdk-benchmarking/agents/openai.yaml new file mode 100644 index 00000000..6be21da5 --- /dev/null +++ b/.agents/skills/sdk-benchmarking/agents/openai.yaml @@ -0,0 +1,4 @@ +interface: + display_name: "SDK Benchmarking" + short_description: "Run and extend Braintrust SDK benchmarks" + default_prompt: "Use $sdk-benchmarking to run, compare, or add Braintrust Python SDK benchmarks." diff --git a/.agents/skills/sdk-benchmarking/references/benchmark-patterns.md b/.agents/skills/sdk-benchmarking/references/benchmark-patterns.md new file mode 100644 index 00000000..0d9fcf81 --- /dev/null +++ b/.agents/skills/sdk-benchmarking/references/benchmark-patterns.md @@ -0,0 +1,93 @@ +# Benchmark Patterns + +Use this reference when adding or updating SDK benchmarks. 
+ +## Command Cheatsheet + +```bash +cd py + +# Run everything +make bench + +# Faster local iteration +make bench BENCH_ARGS="--fast" + +# Save results for comparison +make bench BENCH_ARGS="-o /tmp/before.json" +make bench BENCH_ARGS="-o /tmp/after.json" +make bench-compare BENCH_BASE=/tmp/before.json BENCH_NEW=/tmp/after.json + +# Run one module directly +python -m benchmarks.benches.bench_bt_json + +# Inspect all forwarded pyperf flags +python -m benchmarks --help +``` + +## Module Skeleton + +```python +import pathlib +import sys + +import pyperf + + +if __package__ in (None, ""): + sys.path.insert(0, str(pathlib.Path(__file__).resolve().parents[2])) + +from benchmarks._utils import disable_pyperf_psutil + + +def target(value): + return value + + +def main(runner: pyperf.Runner | None = None) -> None: + if runner is None: + disable_pyperf_psutil() + runner = pyperf.Runner() + + runner.bench_func("example.target[case-name]", target, "value") + + +if __name__ == "__main__": + main() +``` + +Follow the existing `py/benchmarks/benches/bench_bt_json.py` pattern when importing repo code. The `sys.path` adjustment keeps direct module execution working from inside `py/`. + +## Fixture Guidance + +Put reusable builders in `py/benchmarks/fixtures.py` when: + +- several benchmark cases need the same payload shape +- the payload should model realistic nested SDK inputs +- the benchmark should cover edge cases such as circular references or non-string keys + +Current fixture patterns already cover: + +- small, medium, and large nested payloads +- circular structures +- non-string dictionary keys +- dataclass-like and pydantic-like values + +Extend those helpers before creating one-off payload factories in a new benchmark module. + +## Comparison Workflow + +For branch-to-branch comparisons: + +```bash +cd py +git checkout main +make bench BENCH_ARGS="-o /tmp/main.json" + +git checkout my-branch +make bench BENCH_ARGS="-o /tmp/branch.json" + +make bench-compare BENCH_BASE=/tmp/main.json BENCH_NEW=/tmp/branch.json +``` + +Use `--rigorous` only when you need lower-noise final numbers; use `--fast` while iterating. diff --git a/AGENTS.md b/AGENTS.md index 8acef533..bf9d2ee6 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -12,6 +12,7 @@ Guide for contributing to the Braintrust Python SDK repository. ## Repo Map - `py/`: main Python package, tests, examples, nox sessions, release build +- `py/benchmarks/`: pyperf performance benchmarks - `integrations/`: separate integration packages - `internal/golden/`: compatibility and golden projects - `docs/`: supporting docs @@ -122,6 +123,25 @@ BRAINTRUST_CLAUDE_AGENT_SDK_RECORD_MODE=all nox -s "test_claude_agent_sdk(latest Only re-record HTTP or subprocess cassettes when the behavior change is intentional. If in doubt, ask the user. +## Benchmarks + +Run `cd py && make bench` when touching hot-path code (serialization, deep-copy, span creation, logging). Not required for every change. + +Benchmarks use pyperf. All `bench_*.py` files in `py/benchmarks/benches/` are auto-discovered — no registration needed. + +Key commands: + +```bash +cd py +make bench # run all benchmarks +make bench BENCH_ARGS="--fast" # quick sanity check +make bench BENCH_ARGS="-o /tmp/before.json" # save baseline before a change +make bench BENCH_ARGS="-o /tmp/after.json" # save after a change +make bench-compare BENCH_BASE=/tmp/before.json BENCH_NEW=/tmp/after.json +``` + +New benchmark files go in `py/benchmarks/benches/bench_.py`. Each must expose `main(runner: pyperf.Runner | None = None)`. 
Shared payload builders go in `py/benchmarks/fixtures.py`. See existing `bench_bt_json.py` for the pattern. + ## Build Notes Build from `py/`: diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 695994c1..dc1a266a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -147,6 +147,87 @@ Common ones include: The `memory_logger` fixture from `braintrust.test_helpers` is useful for asserting on logged spans without a real Braintrust backend. +## Benchmarks + +The SDK includes local performance benchmarks powered by [pyperf](https://pyperf.readthedocs.io/), located in `py/benchmarks/`. These cover hot paths like serialization and deep-copy routines. + +### Running benchmarks + +```bash +cd py + +# Run all benchmarks +make bench + +# Quick sanity check (fewer iterations) +make bench BENCH_ARGS="--fast" + +# Save results for later comparison +make bench BENCH_ARGS="-o /tmp/results.json" + +# Run a single benchmark module directly +python -m benchmarks.benches.bench_bt_json +``` + +To benchmark with the optional `orjson` fast-path installed: + +```bash +cd py +python -m uv pip install -e '.[performance]' +make bench +``` + +### Comparing across branches + +```bash +cd py + +git checkout main +make bench BENCH_ARGS="-o /tmp/main.json" + +git checkout my-branch +make bench BENCH_ARGS="-o /tmp/branch.json" + +make bench-compare BENCH_BASE=/tmp/main.json BENCH_NEW=/tmp/branch.json +``` + +### Useful pyperf flags + +| Flag | Purpose | +| --------------- | ------------------------------------------------- | +| `--fast` | Fewer iterations — good for a quick sanity check | +| `--rigorous` | More iterations — reduces noise for final numbers | +| `-o FILE` | Write results to a JSON file for later comparison | +| `--append FILE` | Append to an existing results file | + +Run `python -m benchmarks --help` for the full list. + +### Adding a new benchmark + +Drop a new `bench_.py` file into `py/benchmarks/benches/`. It will be picked up automatically — no registration required. + +Your module needs to expose a `main()` function that accepts an optional `pyperf.Runner`: + +```python +import pyperf + +from benchmarks._utils import disable_pyperf_psutil + + +def main(runner: pyperf.Runner | None = None) -> None: + if runner is None: + disable_pyperf_psutil() + runner = pyperf.Runner() + + runner.bench_func("my_benchmark", my_func, my_arg) + + +if __name__ == "__main__": + main() +``` + +If your benchmark needs reusable test data, add builder functions to `py/benchmarks/fixtures.py`. + ## CI GitHub Actions workflows live in `.github/workflows/`. 
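Circling back to the fixtures note at the end of the benchmarks section above, here is a minimal sketch of a reusable builder and the case registration that would consume it. The builder name, payload contents, and case label are illustrative assumptions, not part of this patch:

```python
# Hypothetical addition to py/benchmarks/fixtures.py (illustrative only).
from typing import Any


def make_tool_call_payload() -> dict[str, Any]:
    """Deterministic, mid-sized payload modelling a single tool-call turn."""
    return {
        "input": {"tool": "lookup", "args": {"id": 7, "scope": "thread"}},
        "metadata": {"user_id": "user-123", "session_id": "sess-123"},
        "metrics": {"latency_ms": 42.0},
        "tags": ["benchmark", "tool-call"],
    }


# A bench_*.py module would then register a named case on the shared runner:
#     from benchmarks.fixtures import make_tool_call_payload
#     runner.bench_func("bt_json.bt_safe_deep_copy[tool-call]",
#                       bt_safe_deep_copy, make_tool_call_payload())
```

Keeping payload construction in `fixtures.py` keeps benchmark modules focused on registration and lets several cases share the same data.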
diff --git a/Makefile b/Makefile index ff5c5edd..4dba3a47 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ SHELL := /bin/bash -.PHONY: help develop install-dev install-deps fixup test test-core test-wheel lint pylint nox +.PHONY: help develop install-dev install-deps fixup test test-core test-wheel lint pylint nox bench bench-compare develop: install-dev mise exec -- pre-commit install @@ -30,19 +30,27 @@ lint: pylint: mise exec -- $(MAKE) -C py pylint +bench: + mise exec -- $(MAKE) -C py bench BENCH_ARGS="$(BENCH_ARGS)" + +bench-compare: + mise exec -- $(MAKE) -C py bench-compare BENCH_BASE="$(BENCH_BASE)" BENCH_NEW="$(BENCH_NEW)" + nox: test help: @echo "Available targets:" - @echo " develop - Install tools with mise, install py/ deps, and install pre-commit hooks" - @echo " fixup - Run pre-commit hooks across the repo" - @echo " install-deps - Install Python SDK dependencies via py/Makefile" - @echo " install-dev - Install pinned tools and create/update the repo env via mise" - @echo " lint - Run pre-commit hooks plus Python SDK pylint via py/Makefile" - @echo " pylint - Run Python SDK pylint only via py/Makefile" - @echo " nox - Alias for test" - @echo " test - Run the Python SDK nox matrix via py/Makefile" - @echo " test-core - Run Python SDK core tests via py/Makefile" - @echo " test-wheel - Run Python SDK wheel sanity tests via py/Makefile (requires a built wheel)" + @echo " bench - Run benchmarks via py/Makefile (pass extra flags via BENCH_ARGS=)" + @echo " bench-compare - Compare two benchmark results via py/Makefile (BENCH_BASE=... BENCH_NEW=...)" + @echo " develop - Install tools with mise, install py/ deps, and install pre-commit hooks" + @echo " fixup - Run pre-commit hooks across the repo" + @echo " install-deps - Install Python SDK dependencies via py/Makefile" + @echo " install-dev - Install pinned tools and create/update the repo env via mise" + @echo " lint - Run pre-commit hooks plus Python SDK pylint via py/Makefile" + @echo " pylint - Run Python SDK pylint only via py/Makefile" + @echo " nox - Alias for test" + @echo " test - Run the Python SDK nox matrix via py/Makefile" + @echo " test-core - Run Python SDK core tests via py/Makefile" + @echo " test-wheel - Run Python SDK wheel sanity tests via py/Makefile (requires a built wheel)" .DEFAULT_GOAL := help diff --git a/py/Makefile b/py/Makefile index a14a5530..56becdfe 100644 --- a/py/Makefile +++ b/py/Makefile @@ -2,7 +2,7 @@ PYTHON ?= python UV := $(PYTHON) -m uv UV_VERSION := $(shell awk '$$1=="uv" { print $$2 }' ../.tool-versions) -.PHONY: lint pylint test test-wheel _template-version clean fixup build verify-build verify help install-build-deps install-dev install-optional test-core _check-git-clean +.PHONY: lint pylint test test-wheel _template-version clean fixup build verify-build verify help install-build-deps install-dev install-optional test-core _check-git-clean bench bench-compare clean: rm -rf build dist @@ -32,6 +32,16 @@ test-wheel: test-core: nox -s test_core +bench: + $(PYTHON) -m benchmarks $(BENCH_ARGS) + +bench-compare: + @if [ -z "$(BENCH_BASE)" ] || [ -z "$(BENCH_NEW)" ]; then \ + echo "Usage: make bench-compare BENCH_BASE=/tmp/base.json BENCH_NEW=/tmp/new.json"; \ + exit 1; \ + fi + $(PYTHON) -m pyperf compare_to $(BENCH_BASE) $(BENCH_NEW) + _template-version: @$(PYTHON) scripts/template-version.py @@ -66,6 +76,8 @@ install-optional: install-dev .DEFAULT_GOAL := help help: @echo "Available targets:" + @echo " bench - Run benchmarks (pass extra flags via BENCH_ARGS=, e.g. 
--fast)" + @echo " bench-compare - Compare two benchmark results (BENCH_BASE=... BENCH_NEW=...)" @echo " build - Build Python package" @echo " clean - Remove build artifacts" @echo " help - Show this help message" diff --git a/py/benchmarks/__init__.py b/py/benchmarks/__init__.py new file mode 100644 index 00000000..f79c08c5 --- /dev/null +++ b/py/benchmarks/__init__.py @@ -0,0 +1 @@ +"""Local performance benchmarks for the Braintrust Python SDK.""" diff --git a/py/benchmarks/__main__.py b/py/benchmarks/__main__.py new file mode 100644 index 00000000..2288e42d --- /dev/null +++ b/py/benchmarks/__main__.py @@ -0,0 +1,48 @@ +"""Run every ``bench_*.py`` module inside ``benchmarks.benches``. + +Usage:: + + cd py + python -m benchmarks # run all benchmarks + python -m benchmarks --fast # pyperf flags are forwarded + python -m benchmarks -o /tmp/b.json # save results +""" + +import importlib +import pathlib +import pkgutil +import sys + + +# Ensure ``py/`` is on sys.path so pyperf worker subprocesses can resolve +# the ``benchmarks`` package regardless of their working directory. +_PY_DIR = str(pathlib.Path(__file__).resolve().parents[1]) +if _PY_DIR not in sys.path: + sys.path.insert(0, _PY_DIR) + +import pyperf + +import benchmarks.benches as _benches_pkg +from benchmarks._utils import disable_pyperf_psutil + + +def _discover_bench_modules(): + """Yield imported bench modules that expose a ``main()`` callable.""" + for info in pkgutil.iter_modules(_benches_pkg.__path__): + if not info.name.startswith("bench_"): + continue + mod = importlib.import_module(f"benchmarks.benches.{info.name}") + if callable(getattr(mod, "main", None)): + yield mod + + +def main() -> None: + disable_pyperf_psutil() + runner = pyperf.Runner() + + for mod in _discover_bench_modules(): + mod.main(runner) + + +if __name__ == "__main__": + main() diff --git a/py/benchmarks/_utils.py b/py/benchmarks/_utils.py new file mode 100644 index 00000000..2c28a90f --- /dev/null +++ b/py/benchmarks/_utils.py @@ -0,0 +1,19 @@ +"""Shared helpers for benchmark modules.""" + +import sys + + +def disable_pyperf_psutil() -> None: + """Disable pyperf's psutil-based metadata collection on macOS. + + pyperf's optional system metadata collection can hit sandboxed sysctl + paths on macOS. Disabling it keeps local runs portable. + """ + if sys.platform != "darwin": + return + + import pyperf._collect_metadata as collect_metadata + import pyperf._cpu_utils as cpu_utils + + collect_metadata.psutil = None + cpu_utils.psutil = None diff --git a/py/benchmarks/benches/__init__.py b/py/benchmarks/benches/__init__.py new file mode 100644 index 00000000..1123c815 --- /dev/null +++ b/py/benchmarks/benches/__init__.py @@ -0,0 +1,12 @@ +"""Individual benchmark modules. + +Every ``bench_*.py`` file in this package must expose a ``main()`` function +that accepts an optional ``pyperf.Runner`` and registers its benchmarks on it. + +Signature:: + + def main(runner: pyperf.Runner | None = None) -> None: ... + +When *runner* is ``None`` the module should create its own ``Runner`` so it +can still be executed standalone (``python -m benchmarks.benches.bench_foo``). 
+""" diff --git a/py/benchmarks/benches/bench_bt_json.py b/py/benchmarks/benches/bench_bt_json.py new file mode 100644 index 00000000..5bf4fe46 --- /dev/null +++ b/py/benchmarks/benches/bench_bt_json.py @@ -0,0 +1,29 @@ +import pathlib +import sys + +import pyperf + + +if __package__ in (None, ""): + sys.path.insert(0, str(pathlib.Path(__file__).resolve().parents[2])) + +from braintrust.bt_json import _to_bt_safe, bt_safe_deep_copy + +from benchmarks._utils import disable_pyperf_psutil +from benchmarks.fixtures import make_bt_safe_deep_copy_cases, make_to_bt_safe_cases + + +def main(runner: pyperf.Runner | None = None) -> None: + if runner is None: + disable_pyperf_psutil() + runner = pyperf.Runner() + + for case_name, value in make_to_bt_safe_cases(): + runner.bench_func(f"bt_json._to_bt_safe[{case_name}]", _to_bt_safe, value) + + for case_name, value in make_bt_safe_deep_copy_cases(): + runner.bench_func(f"bt_json.bt_safe_deep_copy[{case_name}]", bt_safe_deep_copy, value) + + +if __name__ == "__main__": + main() diff --git a/py/benchmarks/fixtures.py b/py/benchmarks/fixtures.py new file mode 100644 index 00000000..907983e1 --- /dev/null +++ b/py/benchmarks/fixtures.py @@ -0,0 +1,203 @@ +from dataclasses import dataclass +from typing import Any + +from braintrust.span_types import SpanTypeAttribute + + +@dataclass(frozen=True) +class BenchmarkDataclass: + model: str + temperature: float + tags: list[str] + metadata: dict[str, Any] + + +class PydanticLikeV2: + def __init__(self, payload: dict[str, Any]): + self._payload = payload + + def model_dump(self, exclude_none: bool = True) -> dict[str, Any]: + if not exclude_none: + return dict(self._payload) + return {k: v for k, v in self._payload.items() if v is not None} + + +class PydanticLikeV1: + def __init__(self, payload: dict[str, Any]): + self._payload = payload + + def dict(self, exclude_none: bool = True) -> dict[str, Any]: + if not exclude_none: + return dict(self._payload) + return {k: v for k, v in self._payload.items() if v is not None} + + +class StringifiableKey: + def __init__(self, label: str): + self.label = label + + def __str__(self) -> str: + return f"key:{self.label}" + + +class NonStringifiableKey: + def __str__(self) -> str: + raise RuntimeError("cannot stringify") + + +def make_small_payload() -> dict[str, Any]: + return { + "input": {"prompt": "Summarize this email", "attempt": 1}, + "metadata": {"user_id": "user-123", "session_id": "sess-123"}, + "scores": {"helpfulness": 0.9}, + "tags": ["support", "email"], + } + + +def make_medium_payload() -> dict[str, Any]: + messages = [ + {"role": "system", "content": "You are a concise assistant."}, + {"role": "user", "content": "Summarize the following issue thread."}, + ] + for idx in range(8): + messages.append( + { + "role": "assistant" if idx % 2 else "user", + "content": f"message-{idx}", + "metadata": { + "turn": idx, + "token_count": 64 + idx, + "tool_calls": [{"name": "lookup", "args": {"id": idx}}], + }, + } + ) + + return { + "input": {"messages": messages}, + "metadata": { + "user_id": "user-123", + "session_id": "sess-123", + "workspace_id": "workspace-456", + "feature_flags": {"structured_output": True, "tool_calling": True}, + }, + "metrics": {"prompt_tokens": 512, "completion_tokens": 128, "latency_ms": 183.4}, + "span_attributes": {"type": "llm", "model": "gpt-4.1", "provider": "openai"}, + "tags": ["support", "email", "benchmark"], + } + + +def make_large_payload() -> dict[str, Any]: + messages = [] + for idx in range(48): + messages.append( + { + "role": 
"assistant" if idx % 2 else "user", + "content": f"message-{idx}-" + ("x" * 80), + "metadata": { + "turn": idx, + "token_count": 256 + idx, + "tool_calls": [ + {"name": "lookup", "args": {"id": idx, "scope": "thread"}}, + {"name": "render", "args": {"template": "summary", "format": "markdown"}}, + ], + }, + } + ) + + docs = [] + for idx in range(20): + docs.append( + { + "id": f"doc-{idx}", + "title": f"Document {idx}", + "score": 0.95 - idx / 100, + "metadata": {"source": "kb", "lang": "en", "chunk": idx}, + } + ) + + return { + "input": {"messages": messages, "retrieved_documents": docs}, + "output": {"summary": "done", "citations": [doc["id"] for doc in docs]}, + "metadata": { + "user_id": "user-123", + "session_id": "sess-123", + "workspace_id": "workspace-456", + "feature_flags": { + "structured_output": True, + "tool_calling": True, + "reasoning_tokens": True, + }, + "routing": {"tier": "premium", "region": "us-west-2", "experiment": "bench-large"}, + }, + "metrics": {"prompt_tokens": 4096, "completion_tokens": 640, "latency_ms": 812.7}, + "span_attributes": {"type": "llm", "model": "gpt-4.1", "provider": "openai"}, + "tags": ["support", "email", "benchmark", "large"], + } + + +def make_circular_payload() -> dict[str, Any]: + payload = make_medium_payload() + payload["self"] = payload + payload["input"]["parent"] = payload["input"] + return payload + + +def make_non_string_key_payload() -> dict[Any, Any]: + return { + 1: "integer-key", + ("tuple", "key"): {"nested": True}, + StringifiableKey("custom"): [1, 2, 3], + NonStringifiableKey(): "fallback-key-path", + } + + +def make_dataclass_value() -> BenchmarkDataclass: + return BenchmarkDataclass( + model="gpt-4.1", + temperature=0.2, + tags=["support", "triage"], + metadata={"user_id": "user-123", "session_id": "sess-123"}, + ) + + +def make_pydantic_v2_like_value() -> PydanticLikeV2: + return PydanticLikeV2( + { + "model": "gpt-4.1", + "temperature": 0.2, + "user_id": "user-123", + "optional_field": None, + } + ) + + +def make_pydantic_v1_like_value() -> PydanticLikeV1: + return PydanticLikeV1( + { + "model": "gpt-4.1", + "temperature": 0.2, + "user_id": "user-123", + "optional_field": None, + } + ) + + +def make_to_bt_safe_cases() -> list[tuple[str, Any]]: + return [ + ("primitive-int", 42), + ("primitive-float-nan", float("nan")), + ("str-subclass-enum", SpanTypeAttribute.TOOL), + ("dataclass", make_dataclass_value()), + ("pydantic-v2-like", make_pydantic_v2_like_value()), + ("pydantic-v1-like", make_pydantic_v1_like_value()), + ] + + +def make_bt_safe_deep_copy_cases() -> list[tuple[str, Any]]: + return [ + ("small", make_small_payload()), + ("medium", make_medium_payload()), + ("large", make_large_payload()), + ("circular", make_circular_payload()), + ("non-string-keys", make_non_string_key_payload()), + ] diff --git a/py/benchmarks/perf.py b/py/benchmarks/perf.py deleted file mode 100644 index 128ae2f2..00000000 --- a/py/benchmarks/perf.py +++ /dev/null @@ -1,35 +0,0 @@ -import time - -import braintrust -from braintrust import traced - - -LOOPS = 2000 - -braintrust.init_logger(project="perf_test") - - -@traced -def root(input: int) -> int: - return input * 2 - - -@traced -def child(input: int) -> int: - with braintrust.start_span(name="child") as span: - span.log(metadata={"m1": "v1", "m2": "v2"}) - result = root(input) - span.log(metrics={"result": result}) - return result - - -def main(): - t = time.time() - for i in range(LOOPS): - child(i) - elapsed = time.time() - t - print(f"ran {LOOPS} in {elapsed:.3f}s") - - -if __name__ 
== "__main__": - main() diff --git a/py/requirements-dev.txt b/py/requirements-dev.txt index f8bc52e0..e79dff02 100644 --- a/py/requirements-dev.txt +++ b/py/requirements-dev.txt @@ -4,6 +4,7 @@ nox pre-commit pydoc-markdown pylint +pyperf pytest pytest-asyncio pytest-forked