fix: resolve reportUnknownMemberType under pyright strict mode (#242)

AbhiPrasad · web-flow · commit f2f05be2b59d · 2026-04-09T18:11:38.000-04:00
Create a `types/` package to house internal type definitions:

- `types/__init__.py` — `Metadata` alias (was `types.py`)
- `types/_eval.py` — eval TypedDicts (was inline in framework/logger)

The `_eval` module name keeps these out of docs, while the class names (`EvalCaseDict`, `EvalCaseDictNoOutput`, `ExperimentDatasetEvent`) are not underscore-prefixed so pyright strict mode doesn't flag them.

Also explicitly parameterize `ErrorScoreHandler[Input, Output]` so pyright can bind the free TypeVars.

fixes #239
diff --git a/py/src/braintrust/framework.py b/py/src/braintrust/framework.py
@@ -24,7 +24,7 @@
 
 from tqdm.asyncio import tqdm as async_tqdm
 from tqdm.auto import tqdm as std_tqdm
-from typing_extensions import NotRequired, Protocol, TypedDict
+from typing_extensions import Protocol, TypedDict
 
 from .generated_types import FunctionFormat, FunctionOutputType, ObjectReference
 from .git_fields import GitMetadataSettings, RepoInfo
@@ -36,7 +36,6 @@
     Metadata,
     ScoreSummary,
     Span,
-    _ExperimentDatasetEvent,
     parent_context,
     start_span,
     stringify_exception,
@@ -53,6 +52,7 @@
 from .score import Score, is_score, is_scorer
 from .serializable_data_class import SerializableDataClass
 from .span_types import SpanTypeAttribute
+from .types._eval import EvalCaseDict, EvalCaseDictNoOutput, ExperimentDatasetEvent
 from .util import bt_iscoroutinefunction, eprint, merge_dicts
 
 
@@ -91,30 +91,6 @@ class EvalCase(SerializableDataClass, Generic[Input, Output]):
     created: str | None = None
 
 
-class _EvalCaseDictNoOutput(Generic[Input], TypedDict):
-    """
-    Workaround for the Pyright type checker handling of generics. Specifically,
-    the type checker doesn't know that a dict which is missing the key
-    "expected" can be used to satisfy `_EvalCaseDict[Input, Output]` for any
-    `Output` type.
-    """
-
-    input: Input
-    metadata: NotRequired[Metadata | None]
-    tags: NotRequired[Sequence[str] | None]
-
-    id: NotRequired[str | None]
-    _xact_id: NotRequired[str | None]
-
-
-class _EvalCaseDict(Generic[Input, Output], _EvalCaseDictNoOutput[Input]):
-    """
-    Mirrors EvalCase for callers who pass a dict instead of dataclass.
-    """
-
-    expected: NotRequired[Output | None]
-
-
 # Inheritance doesn't quite work for dataclasses, so we redefine the fields
 # from EvalCase here.
 @dataclasses.dataclass
@@ -292,9 +268,9 @@ class BaseExperiment:
 
 _AnyEvalCase = Union[
     EvalCase[Input, Output],
-    _EvalCaseDict[Input, Output],
-    _EvalCaseDictNoOutput[Input],
-    _ExperimentDatasetEvent,
+    EvalCaseDict[Input, Output],
+    EvalCaseDictNoOutput[Input],
+    ExperimentDatasetEvent,
 ]
 
 _EvalDataObject = Union[
@@ -429,7 +405,7 @@ class Evaluator(Generic[Input, Output]):
     takes precedence over `git_metadata_settings` if specified.
     """
 
-    error_score_handler: ErrorScoreHandler | None = None
+    error_score_handler: ErrorScoreHandler[Input, Output] | None = None
     """
     Optionally supply a custom function to specifically handle score values when tasks or scoring functions have errored.
     A default implementation is exported as `default_error_score_handler` which will log a 0 score to the root span for any scorer that was not run.
@@ -682,7 +658,7 @@ def _EvalCommon(
     description: str | None,
     summarize_scores: bool,
     no_send_logs: bool,
-    error_score_handler: ErrorScoreHandler | None = None,
+    error_score_handler: ErrorScoreHandler[Input, Output] | None = None,
     parameters: EvalParameters | RemoteEvalParameters | None = None,
     on_start: Callable[[ExperimentSummary], None] | None = None,
     stream: Callable[[SSEProgressEvent], None] | None = None,
@@ -815,7 +791,7 @@ async def EvalAsync(
     base_experiment_id: str | None = None,
     git_metadata_settings: GitMetadataSettings | None = None,
     repo_info: RepoInfo | None = None,
-    error_score_handler: ErrorScoreHandler | None = None,
+    error_score_handler: ErrorScoreHandler[Input, Output] | None = None,
     description: str | None = None,
     summarize_scores: bool = True,
     no_send_logs: bool = False,
@@ -942,7 +918,7 @@ def Eval(
     base_experiment_id: str | None = None,
     git_metadata_settings: GitMetadataSettings | None = None,
     repo_info: RepoInfo | None = None,
-    error_score_handler: ErrorScoreHandler | None = None,
+    error_score_handler: ErrorScoreHandler[Input, Output] | None = None,
     description: str | None = None,
     summarize_scores: bool = True,
     no_send_logs: bool = False,
diff --git a/py/src/braintrust/logger.py b/py/src/braintrust/logger.py
@@ -79,6 +79,7 @@
 from .span_identifier_v4 import SpanComponentsV4
 from .span_types import SpanTypeAttribute
 from .types import Metadata
+from .types._eval import ExperimentDatasetEvent
 from .util import (
     GLOBAL_PROJECT,
     AugmentedHTTPError,
@@ -3716,35 +3717,21 @@ class ExperimentIdentifier:
     name: str
 
 
-class _ExperimentDatasetEvent(TypedDict):
-    """
-    TODO: This could be unified with `framework._EvalCaseDict` like we do in the
-    TypeScript SDK, or generated from OpenAPI spec. For now, marking as internal
-    to exclude it from the docs.
-    """
-
-    id: str
-    _xact_id: str
-    input: Any | None
-    expected: Any | None
-    tags: Sequence[str] | None
-
-
 class ExperimentDatasetIterator:
     def __init__(self, iterator: Iterator[ExperimentEvent]):
         self.iterator = iterator
 
     def __iter__(self):
         return self
 
-    def __next__(self) -> _ExperimentDatasetEvent:
+    def __next__(self) -> ExperimentDatasetEvent:
         while True:
             value = next(self.iterator)
             if value["root_span_id"] != value["span_id"]:
                 continue
 
             output, expected = value.get("output"), value.get("expected")
-            ret: _ExperimentDatasetEvent = {
+            ret: ExperimentDatasetEvent = {
                 "input": value.get("input"),
                 "expected": expected if expected is not None else output,
                 "tags": value.get("tags"),
@@ -4133,7 +4120,7 @@ def _get_state(self) -> BraintrustState:
         self._lazy_metadata.get()
         return self.state
 
-    def as_dataset(self, batch_size: int | None = None) -> Iterator[_ExperimentDatasetEvent]:
+    def as_dataset(self, batch_size: int | None = None) -> Iterator[ExperimentDatasetEvent]:
         """
         Return the experiment's data as a dataset iterator.
 
diff --git a/py/src/braintrust/types/__init__.py b/py/src/braintrust/types/__init__.py
diff --git a/py/src/braintrust/types/_eval.py b/py/src/braintrust/types/_eval.py
@@ -0,0 +1,51 @@
+"""Internal TypedDict types used in Eval/EvalAsync signatures.
+
+These live in an underscore-prefixed module so they don't appear in
+generated documentation, while the class names themselves are *not*
+underscore-prefixed so pyright strict mode doesn't flag them as private.
+"""
+
+from typing import Any, Generic, Sequence, TypeVar
+
+from typing_extensions import NotRequired, TypedDict
+
+
+Input = TypeVar("Input")
+Output = TypeVar("Output")
+
+
+class EvalCaseDictNoOutput(Generic[Input], TypedDict):
+    """
+    Workaround for the Pyright type checker handling of generics. Specifically,
+    the type checker doesn't know that a dict which is missing the key
+    "expected" can be used to satisfy ``EvalCaseDict[Input, Output]`` for any
+    ``Output`` type.
+    """
+
+    input: Input
+    metadata: NotRequired[dict[str, Any] | None]
+    tags: NotRequired[Sequence[str] | None]
+
+    id: NotRequired[str | None]
+    _xact_id: NotRequired[str | None]
+
+
+class EvalCaseDict(Generic[Input, Output], EvalCaseDictNoOutput[Input]):
+    """
+    Mirrors EvalCase for callers who pass a dict instead of dataclass.
+    """
+
+    expected: NotRequired[Output | None]
+
+
+class ExperimentDatasetEvent(TypedDict):
+    """
+    TODO: This could be unified with ``EvalCaseDict`` like we do in the
+    TypeScript SDK, or generated from OpenAPI spec.
+    """
+
+    id: str
+    _xact_id: str
+    input: Any | None
+    expected: Any | None
+    tags: Sequence[str] | None