feat: Add Groupby.describe()

TrevorBergeron · TrevorBergeron · commit 21a5b840a22d · 2025-09-16T19:27:10.000Z
diff --git a/bigframes/core/compile/api.py b/bigframes/core/compile/api.py
@@ -15,19 +15,18 @@
 
 from typing import TYPE_CHECKING
 
-from bigframes.core import rewrite
-from bigframes.core.compile.ibis_compiler import ibis_compiler
-
 if TYPE_CHECKING:
     import bigframes.core.nodes
 
 
 def test_only_ibis_inferred_schema(node: bigframes.core.nodes.BigFrameNode):
     """Use only for testing paths to ensure ibis inferred schema does not diverge from bigframes inferred schema."""
+    from bigframes.core.compile.ibis_compiler import ibis_compiler
+    import bigframes.core.rewrite
     import bigframes.core.schema
 
     node = ibis_compiler._replace_unsupported_ops(node)
-    node = rewrite.bake_order(node)
+    node = bigframes.core.rewrite.bake_order(node)
     ir = ibis_compiler.compile_node(node)
     items = tuple(
         bigframes.core.schema.SchemaItem(name, ir.get_column_type(ibis_id))
diff --git a/bigframes/core/groupby/dataframe_group_by.py b/bigframes/core/groupby/dataframe_group_by.py
@@ -149,6 +149,20 @@ def head(self, n: int = 5) -> df.DataFrame:
             )
         )
 
+    def describe(self, include: None | Literal["all"] = None):
+        """Return describe() statistics for the selected columns as a DataFrame."""
+        from bigframes.pandas.core.methods import describe
+
+        described_block = describe._describe(
+            self._block,
+            self._selected_cols,
+            include,
+            as_index=self._as_index,
+            by_col_ids=self._by_col_ids,
+            dropna=self._dropna,
+        )
+        return df.DataFrame(described_block)
+
     def size(self) -> typing.Union[df.DataFrame, series.Series]:
         agg_block, _ = self._block.aggregate_size(
             by_column_ids=self._by_col_ids,
diff --git a/bigframes/core/groupby/series_group_by.py b/bigframes/core/groupby/series_group_by.py
@@ -75,6 +75,20 @@ def head(self, n: int = 5) -> series.Series:
             )
         )
 
+    def describe(self, include: None | Literal["all"] = None):
+        """Return describe() statistics for the value column as a DataFrame."""
+        from bigframes.pandas.core.methods import describe
+
+        described_block = describe._describe(
+            self._block.select_column(self._value_column),
+            columns=[self._value_column],
+            include=include,
+            as_index=True,
+            by_col_ids=self._by_col_ids,
+            dropna=self._dropna,
+        )
+        return df.DataFrame(described_block).droplevel(level=0, axis=1)
+
     def all(self) -> series.Series:
         return self._aggregate(agg_ops.all_op)
 
diff --git a/bigframes/core/rewrite/implicit_align.py b/bigframes/core/rewrite/implicit_align.py
@@ -18,12 +18,8 @@
 from typing import cast, Optional, Sequence, Set, Tuple
 
 import bigframes.core.expression
-import bigframes.core.guid
 import bigframes.core.identifiers
-import bigframes.core.join_def
 import bigframes.core.nodes
-import bigframes.core.window_spec
-import bigframes.operations.aggregations
 
 # Combination of selects and additive nodes can be merged as an explicit keyless "row join"
 ALIGNABLE_NODES = (
diff --git a/bigframes/pandas/core/methods/describe.py b/bigframes/pandas/core/methods/describe.py
@@ -16,8 +16,16 @@
 
 import typing
 
+import pandas as pd
+
 from bigframes import dataframe, dtypes, series
+from bigframes.core import agg_expressions, blocks
 from bigframes.core.reshape import api as rs
+from bigframes.operations import aggregations
+
+_DEFAULT_DTYPES = (  # dtypes described when ``include`` is not "all"
+    dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE + dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES
+)
 
 
 def describe(
@@ -33,8 +41,7 @@ def describe(
     if include is None:
         numeric_df = _select_dtypes(
             input,
-            dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE
-            + dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES,
+            _DEFAULT_DTYPES,
         )
         if len(numeric_df.columns) == 0:
             # Describe eligible non-numeric columns
@@ -61,6 +68,62 @@ def describe(
         raise ValueError(f"Unsupported include type: {include}")
 
 
+def _describe(
+    block: blocks.Block,
+    columns: typing.Sequence[str],
+    include: None | typing.Literal["all"] = None,
+    *,
+    as_index: bool = True,
+    by_col_ids: typing.Sequence[str] = (),  # immutable empty default
+    dropna: bool = False,
+) -> blocks.Block:
+    """Aggregate per-column describe() statistics, grouped by ``by_col_ids``."""
+    stats: list[agg_expressions.Aggregation] = []
+    column_labels: list[typing.Hashable] = []
+    for col_id in columns:
+        label = block.col_id_to_label[col_id]
+        dtype = block.expr.get_column_type(col_id)
+        if include != "all" and dtype not in _DEFAULT_DTYPES:
+            continue  # unless include="all", only the default dtypes are described
+        agg_ops = _get_aggs_for_dtype(dtype)
+        stats.extend(op.as_expr(col_id) for op in agg_ops)
+        label_tuple = (label,) if block.column_labels.nlevels == 1 else label
+        column_labels.extend((*label_tuple, op.name) for op in agg_ops)  # type: ignore
+    agg_block, _ = block.aggregate(
+        by_column_ids=by_col_ids,
+        aggregations=stats,
+        dropna=dropna,
+        # Column levels: the input's column levels plus one unnamed stat level.
+        column_labels=pd.Index(column_labels, name=(*block.column_labels.names, None)),
+    )
+    return agg_block if as_index else agg_block.reset_index(drop=False)
+
+
+def _get_aggs_for_dtype(dtype) -> list[aggregations.UnaryAggregateOp]:
+    """Return the aggregate ops describe() applies to ``dtype`` (may be empty)."""
+    if dtype in dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE:
+        # Numeric columns get the full summary, quartiles between min and max.
+        quartile_ops = [aggregations.ApproxQuartilesOp(q) for q in (1, 2, 3)]
+        return [
+            aggregations.count_op,
+            aggregations.mean_op,
+            aggregations.std_op,
+            aggregations.min_op,
+            *quartile_ops,
+            aggregations.max_op,
+        ]
+    if dtype in dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES:
+        return [aggregations.count_op]
+    if dtype in (
+        dtypes.STRING_DTYPE,
+        dtypes.BOOL_DTYPE,
+        dtypes.BYTES_DTYPE,
+        dtypes.TIME_DTYPE,
+    ):
+        return [aggregations.count_op, aggregations.nunique_op]
+    return []
+
+
 def _describe_numeric(df: dataframe.DataFrame) -> dataframe.DataFrame:
     number_df_result = typing.cast(
         dataframe.DataFrame,
@@ -91,8 +154,7 @@ def _describe_numeric(df: dataframe.DataFrame) -> dataframe.DataFrame:
 
         original_columns = _select_dtypes(
             df,
-            dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE
-            + dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES,
+            _DEFAULT_DTYPES,
         ).columns
 
         # Use reindex after join to preserve the original column order.
diff --git a/tests/system/small/pandas/test_describe.py b/tests/system/small/pandas/test_describe.py
@@ -230,3 +230,125 @@ def test_series_describe_temporal(scalars_dfs):
         check_dtype=False,
         check_index_type=False,
     )
+
+
+def test_df_groupby_describe(scalars_dfs):
+    """DataFrameGroupBy.describe(include="all") should agree with pandas."""
+    # TODO: supply a reason why this isn't compatible with pandas 1.x
+    pytest.importorskip("pandas", minversion="2.0.0")
+    scalars_df, scalars_pandas_df = scalars_dfs
+
+    numeric_columns = [
+        "int64_col",
+        "float64_col",
+    ]
+    non_numeric_columns = ["string_col"]
+    supported_columns = numeric_columns + non_numeric_columns
+
+    bf_full_result = (
+        scalars_df.groupby("bool_col")[supported_columns]
+        .describe(include="all")
+        .to_pandas()
+    )
+
+    pd_full_result = scalars_pandas_df.groupby("bool_col")[supported_columns].describe(
+        include="all"
+    )
+
+    for col in supported_columns:
+        pd_result = pd_full_result[col]
+        bf_result = bf_full_result[col]
+
+        if col in numeric_columns:
+            # Quartiles are approximate, so set them aside and only
+            # sanity-check their ordering below.
+            bf_min = bf_result["min"]
+            bf_p25 = bf_result["25%"]
+            bf_p50 = bf_result["50%"]
+            bf_p75 = bf_result["75%"]
+            bf_max = bf_result["max"]
+
+            # Reindex results with the specified keys and their order, because
+            # the relative order is not important.
+            exact_stats = ["count", "mean", "std", "min", "max"]
+            bf_result = bf_result.reindex(columns=exact_stats)
+            pd_result = pd_result.reindex(columns=exact_stats)
+
+            # Double-check that quantiles are at least plausible: they must be
+            # non-decreasing from min through max.
+            assert (
+                (bf_min <= bf_p25)
+                & (bf_p25 <= bf_p50)
+                & (bf_p50 <= bf_p75)
+                & (bf_p75 <= bf_max)
+            ).all()
+        else:
+            # Reindex results with the specified keys and their order, because
+            # the relative order is not important.
+            bf_result = bf_result.reindex(columns=["count", "nunique"])
+            pd_result = pd_result.reindex(columns=["count", "unique"])
+        pandas.testing.assert_frame_equal(
+            # BF counterpart of "unique" is called "nunique"
+            pd_result.astype("Float64").rename(columns={"unique": "nunique"}),
+            bf_result,
+            check_dtype=False,
+            check_index_type=False,
+        )
+
+
+def test_series_groupby_describe(scalars_dfs):
+    """SeriesGroupBy.describe(include="all") should agree with pandas."""
+    # TODO: supply a reason why this isn't compatible with pandas 1.x
+    pytest.importorskip("pandas", minversion="2.0.0")
+    scalars_df, scalars_pandas_df = scalars_dfs
+
+    numeric_columns = [
+        "int64_col",
+        "float64_col",
+    ]
+    non_numeric_columns = ["string_col"]
+    supported_columns = numeric_columns + non_numeric_columns
+
+    bf_df = scalars_df.groupby("bool_col")
+
+    pd_df = scalars_pandas_df.groupby("bool_col")
+
+    for col in supported_columns:
+        pd_result = pd_df[col].describe(include="all")
+        bf_result = bf_df[col].describe(include="all").to_pandas()
+
+        if col in numeric_columns:
+            # Quartiles are approximate, so set them aside and only
+            # sanity-check their ordering below.
+            bf_min = bf_result["min"]
+            bf_p25 = bf_result["25%"]
+            bf_p50 = bf_result["50%"]
+            bf_p75 = bf_result["75%"]
+            bf_max = bf_result["max"]
+
+            # Reindex results with the specified keys and their order, because
+            # the relative order is not important.
+            exact_stats = ["count", "mean", "std", "min", "max"]
+            bf_result = bf_result.reindex(columns=exact_stats)
+            pd_result = pd_result.reindex(columns=exact_stats)
+
+            # Double-check that quantiles are at least plausible: they must be
+            # non-decreasing from min through max.
+            assert (
+                (bf_min <= bf_p25)
+                & (bf_p25 <= bf_p50)
+                & (bf_p50 <= bf_p75)
+                & (bf_p75 <= bf_max)
+            ).all()
+        else:
+            # Reindex results with the specified keys and their order, because
+            # the relative order is not important.
+            bf_result = bf_result.reindex(columns=["count", "nunique"])
+            pd_result = pd_result.reindex(columns=["count", "unique"])
+        pandas.testing.assert_frame_equal(
+            # BF counterpart of "unique" is called "nunique"
+            pd_result.astype("Float64").rename(columns={"unique": "nunique"}),
+            bf_result,
+            check_dtype=False,
+            check_index_type=False,
+        )