Skip to content

Commit f2f05be

Browse files
authored
fix: resolve reportUnknownMemberType under pyright strict mode (#242)
Create a `types/` package to house internal type definitions:

- `types/__init__.py` — `Metadata` alias (was `types.py`)
- `types/_eval.py` — eval TypedDicts (were inline in framework/logger)

The `_eval` module name keeps these out of the docs, while the class names (`EvalCaseDict`, `EvalCaseDictNoOutput`, `ExperimentDatasetEvent`) are not underscore-prefixed, so pyright strict mode doesn't flag them.

Also explicitly parameterize `ErrorScoreHandler[Input, Output]` so pyright can bind the free TypeVars.

Fixes #239
1 parent 40d612f commit f2f05be

4 files changed

Lines changed: 64 additions & 50 deletions

File tree

py/src/braintrust/framework.py

Lines changed: 9 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424

2525
from tqdm.asyncio import tqdm as async_tqdm
2626
from tqdm.auto import tqdm as std_tqdm
27-
from typing_extensions import NotRequired, Protocol, TypedDict
27+
from typing_extensions import Protocol, TypedDict
2828

2929
from .generated_types import FunctionFormat, FunctionOutputType, ObjectReference
3030
from .git_fields import GitMetadataSettings, RepoInfo
@@ -36,7 +36,6 @@
3636
Metadata,
3737
ScoreSummary,
3838
Span,
39-
_ExperimentDatasetEvent,
4039
parent_context,
4140
start_span,
4241
stringify_exception,
@@ -53,6 +52,7 @@
5352
from .score import Score, is_score, is_scorer
5453
from .serializable_data_class import SerializableDataClass
5554
from .span_types import SpanTypeAttribute
55+
from .types._eval import EvalCaseDict, EvalCaseDictNoOutput, ExperimentDatasetEvent
5656
from .util import bt_iscoroutinefunction, eprint, merge_dicts
5757

5858

@@ -91,30 +91,6 @@ class EvalCase(SerializableDataClass, Generic[Input, Output]):
9191
created: str | None = None
9292

9393

94-
class _EvalCaseDictNoOutput(Generic[Input], TypedDict):
95-
"""
96-
Workaround for the Pyright type checker handling of generics. Specifically,
97-
the type checker doesn't know that a dict which is missing the key
98-
"expected" can be used to satisfy `_EvalCaseDict[Input, Output]` for any
99-
`Output` type.
100-
"""
101-
102-
input: Input
103-
metadata: NotRequired[Metadata | None]
104-
tags: NotRequired[Sequence[str] | None]
105-
106-
id: NotRequired[str | None]
107-
_xact_id: NotRequired[str | None]
108-
109-
110-
class _EvalCaseDict(Generic[Input, Output], _EvalCaseDictNoOutput[Input]):
111-
"""
112-
Mirrors EvalCase for callers who pass a dict instead of dataclass.
113-
"""
114-
115-
expected: NotRequired[Output | None]
116-
117-
11894
# Inheritance doesn't quite work for dataclasses, so we redefine the fields
11995
# from EvalCase here.
12096
@dataclasses.dataclass
@@ -292,9 +268,9 @@ class BaseExperiment:
292268

293269
_AnyEvalCase = Union[
294270
EvalCase[Input, Output],
295-
_EvalCaseDict[Input, Output],
296-
_EvalCaseDictNoOutput[Input],
297-
_ExperimentDatasetEvent,
271+
EvalCaseDict[Input, Output],
272+
EvalCaseDictNoOutput[Input],
273+
ExperimentDatasetEvent,
298274
]
299275

300276
_EvalDataObject = Union[
@@ -429,7 +405,7 @@ class Evaluator(Generic[Input, Output]):
429405
takes precedence over `git_metadata_settings` if specified.
430406
"""
431407

432-
error_score_handler: ErrorScoreHandler | None = None
408+
error_score_handler: ErrorScoreHandler[Input, Output] | None = None
433409
"""
434410
Optionally supply a custom function to specifically handle score values when tasks or scoring functions have errored.
435411
A default implementation is exported as `default_error_score_handler` which will log a 0 score to the root span for any scorer that was not run.
@@ -682,7 +658,7 @@ def _EvalCommon(
682658
description: str | None,
683659
summarize_scores: bool,
684660
no_send_logs: bool,
685-
error_score_handler: ErrorScoreHandler | None = None,
661+
error_score_handler: ErrorScoreHandler[Input, Output] | None = None,
686662
parameters: EvalParameters | RemoteEvalParameters | None = None,
687663
on_start: Callable[[ExperimentSummary], None] | None = None,
688664
stream: Callable[[SSEProgressEvent], None] | None = None,
@@ -815,7 +791,7 @@ async def EvalAsync(
815791
base_experiment_id: str | None = None,
816792
git_metadata_settings: GitMetadataSettings | None = None,
817793
repo_info: RepoInfo | None = None,
818-
error_score_handler: ErrorScoreHandler | None = None,
794+
error_score_handler: ErrorScoreHandler[Input, Output] | None = None,
819795
description: str | None = None,
820796
summarize_scores: bool = True,
821797
no_send_logs: bool = False,
@@ -942,7 +918,7 @@ def Eval(
942918
base_experiment_id: str | None = None,
943919
git_metadata_settings: GitMetadataSettings | None = None,
944920
repo_info: RepoInfo | None = None,
945-
error_score_handler: ErrorScoreHandler | None = None,
921+
error_score_handler: ErrorScoreHandler[Input, Output] | None = None,
946922
description: str | None = None,
947923
summarize_scores: bool = True,
948924
no_send_logs: bool = False,

py/src/braintrust/logger.py

Lines changed: 4 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@
7979
from .span_identifier_v4 import SpanComponentsV4
8080
from .span_types import SpanTypeAttribute
8181
from .types import Metadata
82+
from .types._eval import ExperimentDatasetEvent
8283
from .util import (
8384
GLOBAL_PROJECT,
8485
AugmentedHTTPError,
@@ -3716,35 +3717,21 @@ class ExperimentIdentifier:
37163717
name: str
37173718

37183719

3719-
class _ExperimentDatasetEvent(TypedDict):
3720-
"""
3721-
TODO: This could be unified with `framework._EvalCaseDict` like we do in the
3722-
TypeScript SDK, or generated from OpenAPI spec. For now, marking as internal
3723-
to exclude it from the docs.
3724-
"""
3725-
3726-
id: str
3727-
_xact_id: str
3728-
input: Any | None
3729-
expected: Any | None
3730-
tags: Sequence[str] | None
3731-
3732-
37333720
class ExperimentDatasetIterator:
37343721
def __init__(self, iterator: Iterator[ExperimentEvent]):
37353722
self.iterator = iterator
37363723

37373724
def __iter__(self):
37383725
return self
37393726

3740-
def __next__(self) -> _ExperimentDatasetEvent:
3727+
def __next__(self) -> ExperimentDatasetEvent:
37413728
while True:
37423729
value = next(self.iterator)
37433730
if value["root_span_id"] != value["span_id"]:
37443731
continue
37453732

37463733
output, expected = value.get("output"), value.get("expected")
3747-
ret: _ExperimentDatasetEvent = {
3734+
ret: ExperimentDatasetEvent = {
37483735
"input": value.get("input"),
37493736
"expected": expected if expected is not None else output,
37503737
"tags": value.get("tags"),
@@ -4133,7 +4120,7 @@ def _get_state(self) -> BraintrustState:
41334120
self._lazy_metadata.get()
41344121
return self.state
41354122

4136-
def as_dataset(self, batch_size: int | None = None) -> Iterator[_ExperimentDatasetEvent]:
4123+
def as_dataset(self, batch_size: int | None = None) -> Iterator[ExperimentDatasetEvent]:
41374124
"""
41384125
Return the experiment's data as a dataset iterator.
41394126

py/src/braintrust/types/_eval.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
"""Internal TypedDict types used in Eval/EvalAsync signatures.
2+
3+
These live in an underscore-prefixed module so they don't appear in
4+
generated documentation, while the class names themselves are *not*
5+
underscore-prefixed so pyright strict mode doesn't flag them as private.
6+
"""
7+
8+
from typing import Any, Generic, Sequence, TypeVar
9+
10+
from typing_extensions import NotRequired, TypedDict
11+
12+
13+
Input = TypeVar("Input")
14+
Output = TypeVar("Output")
15+
16+
17+
class EvalCaseDictNoOutput(Generic[Input], TypedDict):
18+
"""
19+
Workaround for the Pyright type checker handling of generics. Specifically,
20+
the type checker doesn't know that a dict which is missing the key
21+
"expected" can be used to satisfy ``EvalCaseDict[Input, Output]`` for any
22+
``Output`` type.
23+
"""
24+
25+
input: Input
26+
metadata: NotRequired[dict[str, Any] | None]
27+
tags: NotRequired[Sequence[str] | None]
28+
29+
id: NotRequired[str | None]
30+
_xact_id: NotRequired[str | None]
31+
32+
33+
class EvalCaseDict(Generic[Input, Output], EvalCaseDictNoOutput[Input]):
34+
"""
35+
Mirrors EvalCase for callers who pass a dict instead of dataclass.
36+
"""
37+
38+
expected: NotRequired[Output | None]
39+
40+
41+
class ExperimentDatasetEvent(TypedDict):
42+
"""
43+
TODO: This could be unified with ``EvalCaseDict`` like we do in the
44+
TypeScript SDK, or generated from OpenAPI spec.
45+
"""
46+
47+
id: str
48+
_xact_id: str
49+
input: Any | None
50+
expected: Any | None
51+
tags: Sequence[str] | None

0 commit comments

Comments
 (0)