diff --git a/py/src/braintrust/framework.py b/py/src/braintrust/framework.py index 1dc38f90..fd49512a 100644 --- a/py/src/braintrust/framework.py +++ b/py/src/braintrust/framework.py @@ -163,7 +163,7 @@ class EvalHooks(abc.ABC, Generic[Output]): @property @abc.abstractmethod - def metadata(self) -> Metadata: + def metadata(self) -> Metadata | None: """ The metadata object for the current evaluation. You can mutate this object to add or remove metadata. """ @@ -1148,7 +1148,7 @@ def evaluate_filter(object, filter: Filter): class DictEvalHooks(dict[str, Any]): def __init__( self, - metadata: Any | None = None, + metadata: Metadata | None = None, expected: Any | None = None, trial_index: int = 0, tags: Sequence[str] | None = None, @@ -1170,7 +1170,7 @@ def __init__( self._parameters = parameters @property - def metadata(self): + def metadata(self) -> Metadata | None: return self.get("metadata") @property diff --git a/py/src/braintrust/framework2.py b/py/src/braintrust/framework2.py index 7fe00fa0..4f6d96ae 100644 --- a/py/src/braintrust/framework2.py +++ b/py/src/braintrust/framework2.py @@ -16,6 +16,7 @@ SavedFunctionId, ToolFunctionDefinition, ) +from .types import Metadata from .util import eprint @@ -58,7 +59,7 @@ class CodeFunction: parameters: Any returns: Any if_exists: IfExists | None - metadata: dict[str, Any] | None = None + metadata: Metadata | None = None tags: Sequence[str] | None = None @@ -75,7 +76,7 @@ class CodePrompt: function_type: str | None id: str | None if_exists: IfExists | None - metadata: dict[str, Any] | None = None + metadata: Metadata | None = None tags: Sequence[str] | None = None def to_function_definition(self, if_exists: IfExists | None, project_ids: ProjectIdCache) -> dict[str, Any]: @@ -133,7 +134,7 @@ def create( parameters: Any = None, returns: Any = None, if_exists: IfExists | None = None, - metadata: dict[str, Any] | None = None, + metadata: Metadata | None = None, tags: Sequence[str] | None = None, ) -> CodeFunction: """Creates a tool. @@ -198,7 +199,7 @@ def create( params: ModelParams | None = None, tools: list[CodeFunction | SavedFunctionId | ToolFunctionDefinition] | None = None, if_exists: IfExists | None = None, - metadata: dict[str, Any] | None = None, + metadata: Metadata | None = None, tags: Sequence[str] | None = None, ) -> CodePrompt: ... @@ -215,7 +216,7 @@ def create( params: ModelParams | None = None, tools: list[CodeFunction | SavedFunctionId | ToolFunctionDefinition] | None = None, if_exists: IfExists | None = None, - metadata: dict[str, Any] | None = None, + metadata: Metadata | None = None, tags: Sequence[str] | None = None, ) -> CodePrompt: ... @@ -232,7 +233,7 @@ def create( params: ModelParams | None = None, tools: list[CodeFunction | SavedFunctionId | ToolFunctionDefinition] | None = None, if_exists: IfExists | None = None, - metadata: dict[str, Any] | None = None, + metadata: Metadata | None = None, tags: Sequence[str] | None = None, ): """Creates a prompt. @@ -321,7 +322,7 @@ def create( slug: str | None = None, description: str | None = None, if_exists: IfExists | None = None, - metadata: dict[str, Any] | None = None, + metadata: Metadata | None = None, tags: Sequence[str] | None = None, handler: Callable[..., Any], parameters: Any, @@ -337,7 +338,7 @@ def create( slug: str | None = None, description: str | None = None, if_exists: IfExists | None = None, - metadata: dict[str, Any] | None = None, + metadata: Metadata | None = None, tags: Sequence[str] | None = None, prompt: str, model: str, @@ -355,7 +356,7 @@ def create( slug: str | None = None, description: str | None = None, if_exists: IfExists | None = None, - metadata: dict[str, Any] | None = None, + metadata: Metadata | None = None, tags: Sequence[str] | None = None, messages: list[ChatCompletionMessageParam], model: str, @@ -371,7 +372,7 @@ def create( slug: str | None = None, description: str | None = None, if_exists: IfExists | None = None, - metadata: dict[str, Any] | None = None, + metadata: Metadata | None = None, tags: Sequence[str] | None = None, # Code scorer params. handler: Callable[..., Any] | None = None, diff --git a/py/src/braintrust/functions/invoke.py b/py/src/braintrust/functions/invoke.py index b9597954..85e471fc 100644 --- a/py/src/braintrust/functions/invoke.py +++ b/py/src/braintrust/functions/invoke.py @@ -5,6 +5,7 @@ from .._generated_types import FunctionTypeEnum from ..bt_json import bt_dumps from ..logger import Exportable, _internal_get_global_state, get_span_parent_object, login, proxy_conn +from ..types import Metadata from ..util import response_raise_for_status from .constants import INVOKE_API_VERSION from .stream import BraintrustInvokeError, BraintrustStream @@ -45,7 +46,7 @@ def invoke( # arguments to the function input: Any = None, messages: list[Any] | None = None, - metadata: dict[str, Any] | None = None, + metadata: Metadata | None = None, tags: list[str] | None = None, parent: Exportable | str | None = None, stream: Literal[False] | None = None, @@ -73,7 +74,7 @@ def invoke( # arguments to the function input: Any = None, messages: list[Any] | None = None, - metadata: dict[str, Any] | None = None, + metadata: Metadata | None = None, tags: list[str] | None = None, parent: Exportable | str | None = None, stream: Literal[True] = True, @@ -100,7 +101,7 @@ def invoke( # arguments to the function input: Any = None, messages: list[Any] | None = None, - metadata: dict[str, Any] | None = None, + metadata: Metadata | None = None, tags: list[str] | None = None, parent: Exportable | str | None = None, stream: bool = False, diff --git a/py/src/braintrust/logger.py b/py/src/braintrust/logger.py index ff08052d..1221b5c6 100644 --- a/py/src/braintrust/logger.py +++ b/py/src/braintrust/logger.py @@ -76,6 +76,7 @@ from .span_identifier_v3 import SpanComponentsV3, SpanObjectTypeV3 from .span_identifier_v4 import SpanComponentsV4 from .span_types import SpanTypeAttribute +from .types import Metadata from .util import ( GLOBAL_PROJECT, AugmentedHTTPError, @@ -92,15 +93,13 @@ parse_env_var_float, response_raise_for_status, ) +from .xact_ids import prettify_xact # Fields that should be passed to the masking function # Note: "tags" field is intentionally excluded, but can be added if needed REDACTION_FIELDS = ["input", "output", "expected", "metadata", "context", "scores", "metrics"] -from .xact_ids import prettify_xact - -Metadata = dict[str, Any] DATA_API_VERSION = 2 LOGS3_OVERFLOW_REFERENCE_TYPE = "logs3_overflow" # 6 MB for the AWS lambda gateway (from our own testing). @@ -3216,7 +3215,7 @@ def _log_feedback_impl( expected: Any | None = None, tags: Sequence[str] | None = None, comment: str | None = None, - metadata: Mapping[str, Any] | None = None, + metadata: Metadata | None = None, source: Literal["external", "app", "api", None] = None, ): if source is None: @@ -3667,7 +3666,7 @@ def log( error: str | None = None, tags: Sequence[str] | None = None, scores: Mapping[str, int | float] | None = None, - metadata: Mapping[str, Any] | None = None, + metadata: Metadata | None = None, metrics: Mapping[str, int | float] | None = None, id: str | None = None, dataset_record_id: str | None = None, @@ -3719,7 +3718,7 @@ def log_feedback( expected: Any | None = None, tags: Sequence[str] | None = None, comment: str | None = None, - metadata: Mapping[str, Any] | None = None, + metadata: Metadata | None = None, source: Literal["external", "app", "api", None] = None, ) -> None: """ @@ -4502,7 +4501,7 @@ def _get_state(self) -> BraintrustState: def _validate_event( self, - metadata: dict[str, Any] | None = None, + metadata: Metadata | None = None, expected: Any | None = None, output: Any | None = None, tags: Sequence[str] | None = None, @@ -4555,7 +4554,7 @@ def insert( input: Any | None = None, expected: Any | None = None, tags: Sequence[str] | None = None, - metadata: dict[str, Any] | None = None, + metadata: Metadata | None = None, id: str | None = None, output: Any | None = None, ) -> str: @@ -4599,7 +4598,7 @@ def update( input: Any | None = None, expected: Any | None = None, tags: Sequence[str] | None = None, - metadata: dict[str, Any] | None = None, + metadata: Metadata | None = None, ) -> str: """ Update fields of a single record in the dataset. The updated fields will be batched and uploaded behind the scenes. @@ -5100,7 +5099,7 @@ def log( error: str | None = None, tags: Sequence[str] | None = None, scores: Mapping[str, int | float] | None = None, - metadata: Mapping[str, Any] | None = None, + metadata: Metadata | None = None, metrics: Mapping[str, int | float] | None = None, id: str | None = None, allow_concurrent_with_spans: bool = False, @@ -5151,7 +5150,7 @@ def log_feedback( expected: Any | None = None, tags: Sequence[str] | None = None, comment: str | None = None, - metadata: Mapping[str, Any] | None = None, + metadata: Metadata | None = None, source: Literal["external", "app", "api", None] = None, ) -> None: """ diff --git a/py/src/braintrust/score.py b/py/src/braintrust/score.py index 62f9ee7e..d8dd0fdf 100644 --- a/py/src/braintrust/score.py +++ b/py/src/braintrust/score.py @@ -5,6 +5,7 @@ from typing import Any from .serializable_data_class import SerializableDataClass +from .types import Metadata # ========================================================================= @@ -26,7 +27,7 @@ class Score(SerializableDataClass): score: float | None = None """The score for the evaluation. This should be a float between 0 and 1. If the score is None, the evaluation is considered to be skipped.""" - metadata: dict[str, Any] = dataclasses.field(default_factory=dict) + metadata: Metadata = dataclasses.field(default_factory=dict) """Metadata for the score. This can be used to store additional information about the score.""" # DEPRECATION_NOTICE: this field is deprecated, as errors are propagated up to the caller. diff --git a/py/src/braintrust/span_cache.py b/py/src/braintrust/span_cache.py index 1f6bde9c..f3248d0c 100644 --- a/py/src/braintrust/span_cache.py +++ b/py/src/braintrust/span_cache.py @@ -13,6 +13,7 @@ import uuid from typing import Any, Optional +from braintrust.types import Metadata from braintrust.util import merge_dicts @@ -29,7 +30,7 @@ def __init__( span_id: str, input: Optional[Any] = None, output: Optional[Any] = None, - metadata: Optional[dict[str, Any]] = None, + metadata: Metadata | None = None, span_parents: Optional[list[str]] = None, span_attributes: Optional[dict[str, Any]] = None, ): diff --git a/py/src/braintrust/trace.py b/py/src/braintrust/trace.py index f07b9ef0..575ee2ca 100644 --- a/py/src/braintrust/trace.py +++ b/py/src/braintrust/trace.py @@ -10,6 +10,7 @@ from braintrust.functions.invoke import invoke from braintrust.logger import BraintrustState, ObjectFetcher +from braintrust.types import Metadata class SpanData: @@ -19,7 +20,7 @@ def __init__( self, input: Optional[Any] = None, output: Optional[Any] = None, - metadata: Optional[dict[str, Any]] = None, + metadata: Metadata | None = None, span_id: Optional[str] = None, span_parents: Optional[list[str]] = None, span_attributes: Optional[dict[str, Any]] = None, diff --git a/py/src/braintrust/types.py b/py/src/braintrust/types.py new file mode 100644 index 00000000..ce11d711 --- /dev/null +++ b/py/src/braintrust/types.py @@ -0,0 +1,4 @@ +from typing import Any + + +Metadata = dict[str, Any] diff --git a/py/src/braintrust/wrappers/agno/test_agno.py b/py/src/braintrust/wrappers/agno/test_agno.py index 81f6110d..cc9a6d3b 100644 --- a/py/src/braintrust/wrappers/agno/test_agno.py +++ b/py/src/braintrust/wrappers/agno/test_agno.py @@ -94,7 +94,9 @@ def test_agno_simple_agent_execution(memory_logger): assert root_span["metrics"]["duration"] > 0 llm_span = spans[1] - assert llm_span["span_attributes"]["name"] == "OpenAI.response" + llm_span_name = llm_span["span_attributes"]["name"] + assert "OpenAI" in llm_span_name + assert llm_span_name.endswith(".response") assert llm_span["span_attributes"]["type"].value == "llm" assert llm_span["span_parents"] == [root_span["span_id"]] assert llm_span["metadata"]["model"] == "gpt-4o-mini"