diff --git a/py/src/braintrust/framework.py b/py/src/braintrust/framework.py
index 1dc38f90..fd49512a 100644
--- a/py/src/braintrust/framework.py
+++ b/py/src/braintrust/framework.py
@@ -163,7 +163,7 @@ class EvalHooks(abc.ABC, Generic[Output]):
 
     @property
     @abc.abstractmethod
-    def metadata(self) -> Metadata:
+    def metadata(self) -> Metadata | None:
         """
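-        The metadata object for the current evaluation. You can mutate this object to add or remove metadata.
+        The metadata object for the current evaluation, or `None` if no metadata is set. When present, you can mutate this object to add or remove metadata.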
         """
@@ -1148,7 +1148,7 @@ def evaluate_filter(object, filter: Filter):
 class DictEvalHooks(dict[str, Any]):
     def __init__(
         self,
-        metadata: Any | None = None,
+        metadata: Metadata | None = None,
         expected: Any | None = None,
         trial_index: int = 0,
         tags: Sequence[str] | None = None,
@@ -1170,7 +1170,7 @@ def __init__(
         self._parameters = parameters
 
     @property
-    def metadata(self):
+    def metadata(self) -> Metadata | None:
         return self.get("metadata")
 
     @property
diff --git a/py/src/braintrust/framework2.py b/py/src/braintrust/framework2.py
index 7fe00fa0..4f6d96ae 100644
--- a/py/src/braintrust/framework2.py
+++ b/py/src/braintrust/framework2.py
@@ -16,6 +16,7 @@
     SavedFunctionId,
     ToolFunctionDefinition,
 )
+from .types import Metadata
 from .util import eprint
 
 
@@ -58,7 +59,7 @@ class CodeFunction:
     parameters: Any
     returns: Any
     if_exists: IfExists | None
-    metadata: dict[str, Any] | None = None
+    metadata: Metadata | None = None
     tags: Sequence[str] | None = None
 
 
@@ -75,7 +76,7 @@ class CodePrompt:
     function_type: str | None
     id: str | None
     if_exists: IfExists | None
-    metadata: dict[str, Any] | None = None
+    metadata: Metadata | None = None
     tags: Sequence[str] | None = None
 
     def to_function_definition(self, if_exists: IfExists | None, project_ids: ProjectIdCache) -> dict[str, Any]:
@@ -133,7 +134,7 @@ def create(
         parameters: Any = None,
         returns: Any = None,
         if_exists: IfExists | None = None,
-        metadata: dict[str, Any] | None = None,
+        metadata: Metadata | None = None,
         tags: Sequence[str] | None = None,
     ) -> CodeFunction:
         """Creates a tool.
@@ -198,7 +199,7 @@ def create(
         params: ModelParams | None = None,
         tools: list[CodeFunction | SavedFunctionId | ToolFunctionDefinition] | None = None,
         if_exists: IfExists | None = None,
-        metadata: dict[str, Any] | None = None,
+        metadata: Metadata | None = None,
         tags: Sequence[str] | None = None,
     ) -> CodePrompt: ...
 
@@ -215,7 +216,7 @@ def create(
         params: ModelParams | None = None,
         tools: list[CodeFunction | SavedFunctionId | ToolFunctionDefinition] | None = None,
         if_exists: IfExists | None = None,
-        metadata: dict[str, Any] | None = None,
+        metadata: Metadata | None = None,
         tags: Sequence[str] | None = None,
     ) -> CodePrompt: ...
 
@@ -232,7 +233,7 @@ def create(
         params: ModelParams | None = None,
         tools: list[CodeFunction | SavedFunctionId | ToolFunctionDefinition] | None = None,
         if_exists: IfExists | None = None,
-        metadata: dict[str, Any] | None = None,
+        metadata: Metadata | None = None,
         tags: Sequence[str] | None = None,
     ):
         """Creates a prompt.
@@ -321,7 +322,7 @@ def create(
         slug: str | None = None,
         description: str | None = None,
         if_exists: IfExists | None = None,
-        metadata: dict[str, Any] | None = None,
+        metadata: Metadata | None = None,
         tags: Sequence[str] | None = None,
         handler: Callable[..., Any],
         parameters: Any,
@@ -337,7 +338,7 @@ def create(
         slug: str | None = None,
         description: str | None = None,
         if_exists: IfExists | None = None,
-        metadata: dict[str, Any] | None = None,
+        metadata: Metadata | None = None,
         tags: Sequence[str] | None = None,
         prompt: str,
         model: str,
@@ -355,7 +356,7 @@ def create(
         slug: str | None = None,
         description: str | None = None,
         if_exists: IfExists | None = None,
-        metadata: dict[str, Any] | None = None,
+        metadata: Metadata | None = None,
         tags: Sequence[str] | None = None,
         messages: list[ChatCompletionMessageParam],
         model: str,
@@ -371,7 +372,7 @@ def create(
         slug: str | None = None,
         description: str | None = None,
         if_exists: IfExists | None = None,
-        metadata: dict[str, Any] | None = None,
+        metadata: Metadata | None = None,
         tags: Sequence[str] | None = None,
         # Code scorer params.
         handler: Callable[..., Any] | None = None,
diff --git a/py/src/braintrust/functions/invoke.py b/py/src/braintrust/functions/invoke.py
index b9597954..85e471fc 100644
--- a/py/src/braintrust/functions/invoke.py
+++ b/py/src/braintrust/functions/invoke.py
@@ -5,6 +5,7 @@
 from .._generated_types import FunctionTypeEnum
 from ..bt_json import bt_dumps
 from ..logger import Exportable, _internal_get_global_state, get_span_parent_object, login, proxy_conn
+from ..types import Metadata
 from ..util import response_raise_for_status
 from .constants import INVOKE_API_VERSION
 from .stream import BraintrustInvokeError, BraintrustStream
@@ -45,7 +46,7 @@ def invoke(
     # arguments to the function
     input: Any = None,
     messages: list[Any] | None = None,
-    metadata: dict[str, Any] | None = None,
+    metadata: Metadata | None = None,
     tags: list[str] | None = None,
     parent: Exportable | str | None = None,
     stream: Literal[False] | None = None,
@@ -73,7 +74,7 @@ def invoke(
     # arguments to the function
     input: Any = None,
     messages: list[Any] | None = None,
-    metadata: dict[str, Any] | None = None,
+    metadata: Metadata | None = None,
     tags: list[str] | None = None,
     parent: Exportable | str | None = None,
     stream: Literal[True] = True,
@@ -100,7 +101,7 @@ def invoke(
     # arguments to the function
     input: Any = None,
     messages: list[Any] | None = None,
-    metadata: dict[str, Any] | None = None,
+    metadata: Metadata | None = None,
     tags: list[str] | None = None,
     parent: Exportable | str | None = None,
     stream: bool = False,
diff --git a/py/src/braintrust/logger.py b/py/src/braintrust/logger.py
index ff08052d..1221b5c6 100644
--- a/py/src/braintrust/logger.py
+++ b/py/src/braintrust/logger.py
@@ -76,6 +76,7 @@
 from .span_identifier_v3 import SpanComponentsV3, SpanObjectTypeV3
 from .span_identifier_v4 import SpanComponentsV4
 from .span_types import SpanTypeAttribute
+from .types import Metadata
 from .util import (
     GLOBAL_PROJECT,
     AugmentedHTTPError,
@@ -92,15 +93,13 @@
     parse_env_var_float,
     response_raise_for_status,
 )
+from .xact_ids import prettify_xact
 
 
 # Fields that should be passed to the masking function
 # Note: "tags" field is intentionally excluded, but can be added if needed
 REDACTION_FIELDS = ["input", "output", "expected", "metadata", "context", "scores", "metrics"]
-from .xact_ids import prettify_xact
 
-
-Metadata = dict[str, Any]
 DATA_API_VERSION = 2
 LOGS3_OVERFLOW_REFERENCE_TYPE = "logs3_overflow"
 # 6 MB for the AWS lambda gateway (from our own testing).
@@ -3216,7 +3215,7 @@ def _log_feedback_impl(
     expected: Any | None = None,
     tags: Sequence[str] | None = None,
     comment: str | None = None,
-    metadata: Mapping[str, Any] | None = None,
+    metadata: Metadata | None = None,
     source: Literal["external", "app", "api", None] = None,
 ):
     if source is None:
@@ -3667,7 +3666,7 @@ def log(
         error: str | None = None,
         tags: Sequence[str] | None = None,
         scores: Mapping[str, int | float] | None = None,
-        metadata: Mapping[str, Any] | None = None,
+        metadata: Metadata | None = None,
         metrics: Mapping[str, int | float] | None = None,
         id: str | None = None,
         dataset_record_id: str | None = None,
@@ -3719,7 +3718,7 @@ def log_feedback(
         expected: Any | None = None,
         tags: Sequence[str] | None = None,
         comment: str | None = None,
-        metadata: Mapping[str, Any] | None = None,
+        metadata: Metadata | None = None,
         source: Literal["external", "app", "api", None] = None,
     ) -> None:
         """
@@ -4502,7 +4501,7 @@ def _get_state(self) -> BraintrustState:
 
     def _validate_event(
         self,
-        metadata: dict[str, Any] | None = None,
+        metadata: Metadata | None = None,
         expected: Any | None = None,
         output: Any | None = None,
         tags: Sequence[str] | None = None,
@@ -4555,7 +4554,7 @@ def insert(
         input: Any | None = None,
         expected: Any | None = None,
         tags: Sequence[str] | None = None,
-        metadata: dict[str, Any] | None = None,
+        metadata: Metadata | None = None,
         id: str | None = None,
         output: Any | None = None,
     ) -> str:
@@ -4599,7 +4598,7 @@ def update(
         input: Any | None = None,
         expected: Any | None = None,
         tags: Sequence[str] | None = None,
-        metadata: dict[str, Any] | None = None,
+        metadata: Metadata | None = None,
     ) -> str:
         """
         Update fields of a single record in the dataset. The updated fields will be batched and uploaded behind the scenes.
@@ -5100,7 +5099,7 @@ def log(
         error: str | None = None,
         tags: Sequence[str] | None = None,
         scores: Mapping[str, int | float] | None = None,
-        metadata: Mapping[str, Any] | None = None,
+        metadata: Metadata | None = None,
         metrics: Mapping[str, int | float] | None = None,
         id: str | None = None,
         allow_concurrent_with_spans: bool = False,
@@ -5151,7 +5150,7 @@ def log_feedback(
         expected: Any | None = None,
         tags: Sequence[str] | None = None,
         comment: str | None = None,
-        metadata: Mapping[str, Any] | None = None,
+        metadata: Metadata | None = None,
         source: Literal["external", "app", "api", None] = None,
     ) -> None:
         """
diff --git a/py/src/braintrust/score.py b/py/src/braintrust/score.py
index 62f9ee7e..d8dd0fdf 100644
--- a/py/src/braintrust/score.py
+++ b/py/src/braintrust/score.py
@@ -5,6 +5,7 @@
 from typing import Any
 
 from .serializable_data_class import SerializableDataClass
+from .types import Metadata
 
 
 # =========================================================================
@@ -26,7 +27,7 @@ class Score(SerializableDataClass):
     score: float | None = None
     """The score for the evaluation. This should be a float between 0 and 1. If the score is None, the evaluation is considered to be skipped."""
 
-    metadata: dict[str, Any] = dataclasses.field(default_factory=dict)
+    metadata: Metadata = dataclasses.field(default_factory=dict)
     """Metadata for the score. This can be used to store additional information about the score."""
 
     # DEPRECATION_NOTICE: this field is deprecated, as errors are propagated up to the caller.
diff --git a/py/src/braintrust/span_cache.py b/py/src/braintrust/span_cache.py
index 1f6bde9c..f3248d0c 100644
--- a/py/src/braintrust/span_cache.py
+++ b/py/src/braintrust/span_cache.py
@@ -13,6 +13,7 @@
 import uuid
 from typing import Any, Optional
 
+from braintrust.types import Metadata
 from braintrust.util import merge_dicts
 
 
@@ -29,7 +30,7 @@ def __init__(
         span_id: str,
         input: Optional[Any] = None,
         output: Optional[Any] = None,
-        metadata: Optional[dict[str, Any]] = None,
+        metadata: Optional[Metadata] = None,
         span_parents: Optional[list[str]] = None,
         span_attributes: Optional[dict[str, Any]] = None,
     ):
diff --git a/py/src/braintrust/trace.py b/py/src/braintrust/trace.py
index f07b9ef0..575ee2ca 100644
--- a/py/src/braintrust/trace.py
+++ b/py/src/braintrust/trace.py
@@ -10,6 +10,7 @@
 
 from braintrust.functions.invoke import invoke
 from braintrust.logger import BraintrustState, ObjectFetcher
+from braintrust.types import Metadata
 
 
 class SpanData:
@@ -19,7 +20,7 @@ def __init__(
         self,
         input: Optional[Any] = None,
         output: Optional[Any] = None,
-        metadata: Optional[dict[str, Any]] = None,
+        metadata: Optional[Metadata] = None,
         span_id: Optional[str] = None,
         span_parents: Optional[list[str]] = None,
         span_attributes: Optional[dict[str, Any]] = None,
diff --git a/py/src/braintrust/types.py b/py/src/braintrust/types.py
new file mode 100644
index 00000000..ce11d711
--- /dev/null
+++ b/py/src/braintrust/types.py
@@ -0,0 +1,8 @@
+"""Shared type aliases for the Braintrust SDK."""
+
+from typing import Any
+
+
+# String-keyed metadata dictionary accepted by the logging, dataset, scoring,
+# and function-definition APIs.
+Metadata = dict[str, Any]
diff --git a/py/src/braintrust/wrappers/agno/test_agno.py b/py/src/braintrust/wrappers/agno/test_agno.py
index 81f6110d..cc9a6d3b 100644
--- a/py/src/braintrust/wrappers/agno/test_agno.py
+++ b/py/src/braintrust/wrappers/agno/test_agno.py
@@ -94,7 +94,9 @@ def test_agno_simple_agent_execution(memory_logger):
     assert root_span["metrics"]["duration"] > 0
 
     llm_span = spans[1]
-    assert llm_span["span_attributes"]["name"] == "OpenAI.response"
+    llm_span_name = llm_span["span_attributes"]["name"]
+    assert "OpenAI" in llm_span_name
+    assert llm_span_name.endswith(".response")
     assert llm_span["span_attributes"]["type"].value == "llm"
     assert llm_span["span_parents"] == [root_span["span_id"]]
     assert llm_span["metadata"]["model"] == "gpt-4o-mini"