Make StringMethods generic over its underlying Series or Index type

TrevorBergeron · TrevorBergeron · commit 7fb3cdc69295 · 2025-10-17T20:37:00.000Z
diff --git a/bigframes/operations/strings.py b/bigframes/operations/strings.py
@@ -15,20 +15,18 @@
 from __future__ import annotations
 
 import re
-from typing import Literal, Optional, TYPE_CHECKING, Union
+from typing import Generic, Literal, Optional, TypeVar, Union
 
 import bigframes_vendored.constants as constants
 import bigframes_vendored.pandas.core.strings.accessor as vendorstr
 
 from bigframes.core import log_adapter
+import bigframes.core.indexes.base as indices
 import bigframes.dataframe as df
 import bigframes.operations as ops
 from bigframes.operations._op_converters import convert_index, convert_slice
 import bigframes.operations.aggregations as agg_ops
-
-if TYPE_CHECKING:
-    import bigframes.core.indexes.base as indices
-    import bigframes.series as series
+import bigframes.series as series
 
 # Maps from python to re2
 REGEXP_FLAGS = {
@@ -37,15 +35,17 @@
     re.DOTALL: "s",
 }
 
+T = TypeVar("T", series.Series, indices.Index)
+
 
 @log_adapter.class_logger
-class StringMethods(vendorstr.StringMethods):
+class StringMethods(vendorstr.StringMethods, Generic[T]):
     __doc__ = vendorstr.StringMethods.__doc__
 
-    def __init__(self, data: Union[series.Series, indices.Index]):
-        self._data = data
+    def __init__(self, data: T):
+        self._data: T = data
 
-    def __getitem__(self, key: Union[int, slice]) -> series.Series:
+    def __getitem__(self, key: Union[int, slice]) -> T:
         if isinstance(key, int):
             return self._data._apply_unary_op(convert_index(key))
         elif isinstance(key, slice):
@@ -58,18 +58,18 @@ def find(
         sub: str,
         start: Optional[int] = None,
         end: Optional[int] = None,
-    ) -> series.Series:
+    ) -> T:
         return self._data._apply_unary_op(
             ops.StrFindOp(substr=sub, start=start, end=end)
         )
 
-    def len(self) -> series.Series:
+    def len(self) -> T:
         return self._data._apply_unary_op(ops.len_op)
 
-    def lower(self) -> series.Series:
+    def lower(self) -> T:
         return self._data._apply_unary_op(ops.lower_op)
 
-    def reverse(self) -> series.Series:
+    def reverse(self) -> T:
         """Reverse strings in the Series.
 
         **Examples:**
@@ -94,103 +94,103 @@ def slice(
         self,
         start: Optional[int] = None,
         stop: Optional[int] = None,
-    ) -> series.Series:
+    ) -> T:
         return self._data._apply_unary_op(ops.StrSliceOp(start=start, end=stop))
 
-    def strip(self, to_strip: Optional[str] = None) -> series.Series:
+    def strip(self, to_strip: Optional[str] = None) -> T:
         return self._data._apply_unary_op(
             ops.StrStripOp(to_strip=" \n\t" if to_strip is None else to_strip)
         )
 
-    def upper(self) -> series.Series:
+    def upper(self) -> T:
         return self._data._apply_unary_op(ops.upper_op)
 
-    def isnumeric(self) -> series.Series:
+    def isnumeric(self) -> T:
         return self._data._apply_unary_op(ops.isnumeric_op)
 
     def isalpha(
         self,
-    ) -> series.Series:
+    ) -> T:
         return self._data._apply_unary_op(ops.isalpha_op)
 
     def isdigit(
         self,
-    ) -> series.Series:
+    ) -> T:
         return self._data._apply_unary_op(ops.isdigit_op)
 
     def isdecimal(
         self,
-    ) -> series.Series:
+    ) -> T:
         return self._data._apply_unary_op(ops.isdecimal_op)
 
     def isalnum(
         self,
-    ) -> series.Series:
+    ) -> T:
         return self._data._apply_unary_op(ops.isalnum_op)
 
     def isspace(
         self,
-    ) -> series.Series:
+    ) -> T:
         return self._data._apply_unary_op(ops.isspace_op)
 
     def islower(
         self,
-    ) -> series.Series:
+    ) -> T:
         return self._data._apply_unary_op(ops.islower_op)
 
     def isupper(
         self,
-    ) -> series.Series:
+    ) -> T:
         return self._data._apply_unary_op(ops.isupper_op)
 
-    def rstrip(self, to_strip: Optional[str] = None) -> series.Series:
+    def rstrip(self, to_strip: Optional[str] = None) -> T:
         return self._data._apply_unary_op(
             ops.StrRstripOp(to_strip=" \n\t" if to_strip is None else to_strip)
         )
 
-    def lstrip(self, to_strip: Optional[str] = None) -> series.Series:
+    def lstrip(self, to_strip: Optional[str] = None) -> T:
         return self._data._apply_unary_op(
             ops.StrLstripOp(to_strip=" \n\t" if to_strip is None else to_strip)
         )
 
-    def repeat(self, repeats: int) -> series.Series:
+    def repeat(self, repeats: int) -> T:
         return self._data._apply_unary_op(ops.StrRepeatOp(repeats=repeats))
 
-    def capitalize(self) -> series.Series:
+    def capitalize(self) -> T:
         return self._data._apply_unary_op(ops.capitalize_op)
 
-    def match(self, pat, case=True, flags=0) -> series.Series:
+    def match(self, pat, case=True, flags=0) -> T:
         # \A anchors start of entire string rather than start of any line in multiline mode
         adj_pat = rf"\A{pat}"
         return self.contains(pat=adj_pat, case=case, flags=flags)
 
-    def fullmatch(self, pat, case=True, flags=0) -> series.Series:
+    def fullmatch(self, pat, case=True, flags=0) -> T:
         # \A anchors start of entire string rather than start of any line in multiline mode
         # \z likewise anchors to the end of the entire multiline string
         adj_pat = rf"\A{pat}\z"
         return self.contains(pat=adj_pat, case=case, flags=flags)
 
-    def get(self, i: int) -> series.Series:
+    def get(self, i: int) -> T:
         return self._data._apply_unary_op(ops.StrGetOp(i=i))
 
-    def pad(self, width, side="left", fillchar=" ") -> series.Series:
+    def pad(self, width, side="left", fillchar=" ") -> T:
         return self._data._apply_unary_op(
             ops.StrPadOp(length=width, fillchar=fillchar, side=side)
         )
 
-    def ljust(self, width, fillchar=" ") -> series.Series:
+    def ljust(self, width, fillchar=" ") -> T:
         return self._data._apply_unary_op(
             ops.StrPadOp(length=width, fillchar=fillchar, side="right")
         )
 
-    def rjust(self, width, fillchar=" ") -> series.Series:
+    def rjust(self, width, fillchar=" ") -> T:
         return self._data._apply_unary_op(
             ops.StrPadOp(length=width, fillchar=fillchar, side="left")
         )
 
     def contains(
         self, pat, case: bool = True, flags: int = 0, *, regex: bool = True
-    ) -> series.Series:
+    ) -> T:
         if not case:
             return self.contains(pat=pat, flags=flags | re.IGNORECASE, regex=True)
         if regex:
@@ -235,7 +235,7 @@ def replace(
         case: Optional[bool] = None,
         flags: int = 0,
         regex: bool = False,
-    ) -> series.Series:
+    ) -> T:
         if isinstance(pat, re.Pattern):
             assert isinstance(pat.pattern, str)
             pat_str = pat.pattern
@@ -262,15 +262,15 @@ def replace(
     def startswith(
         self,
         pat: Union[str, tuple[str, ...]],
-    ) -> series.Series:
+    ) -> T:
         if not isinstance(pat, tuple):
             pat = (pat,)
         return self._data._apply_unary_op(ops.StartsWithOp(pat=pat))
 
     def endswith(
         self,
         pat: Union[str, tuple[str, ...]],
-    ) -> series.Series:
+    ) -> T:
         if not isinstance(pat, tuple):
             pat = (pat,)
         return self._data._apply_unary_op(ops.EndsWithOp(pat=pat))
@@ -279,7 +279,7 @@ def split(
         self,
         pat: str = " ",
         regex: Union[bool, None] = None,
-    ) -> series.Series:
+    ) -> T:
         if regex is True or (regex is None and len(pat) > 1):
             raise NotImplementedError(
                 "Regular expressions aren't currently supported. Please set "
@@ -297,18 +297,18 @@ def center(self, width: int, fillchar: str = " ") -> series.Series:
 
     def cat(
         self,
-        others: Union[str, series.Series],
+        others: Union[str, indices.Index, series.Series],
         *,
         join: Literal["outer", "left"] = "left",
-    ) -> series.Series:
+    ) -> T:
         return self._data._apply_binary_op(others, ops.strconcat_op, alignment=join)
 
-    def join(self, sep: str) -> series.Series:
+    def join(self, sep: str) -> T:
         return self._data._apply_unary_op(
             ops.ArrayReduceOp(aggregation=agg_ops.StringAggOp(sep=sep))
         )
 
-    def to_blob(self, connection: Optional[str] = None) -> series.Series:
+    def to_blob(self, connection: Optional[str] = None) -> T:
         """Create a BigFrames Blob series from a series of URIs.
 
         .. note::
diff --git a/bigframes/series.py b/bigframes/series.py
@@ -74,12 +74,13 @@
 import bigframes.operations.datetimes as dt
 import bigframes.operations.lists as lists
 import bigframes.operations.plotting as plotting
-import bigframes.operations.strings as strings
 import bigframes.operations.structs as structs
 import bigframes.session
 
 if typing.TYPE_CHECKING:
     import bigframes.geopandas.geoseries
+    import bigframes.operations.strings as strings
+
 
 LevelType = typing.Union[str, int]
 LevelsType = typing.Union[LevelType, typing.Sequence[LevelType]]
@@ -2649,6 +2650,8 @@ def _cached(self, *, force: bool = True, session_aware: bool = True) -> Series:
     # confusing type checker by overriding str
     @property
     def str(self) -> strings.StringMethods:
+        import bigframes.operations.strings as strings
+
         return strings.StringMethods(self)
 
     @property
diff --git a/tests/system/small/test_index.py b/tests/system/small/test_index.py
@@ -698,6 +698,8 @@ def test_index_str_accessor_unary(scalars_df_index, scalars_pandas_df_index):
 
 
 def test_index_str_accessor_binary(scalars_df_index, scalars_pandas_df_index):
+    if pd.__version__.startswith("1."):
+        pytest.skip("doesn't work in pandas 1.x.")
     bf_index = scalars_df_index.set_index("string_col").index
     pd_index = scalars_pandas_df_index.set_index("string_col").index