Skip to content
This repository was archived by the owner on May 7, 2026. It is now read-only.

Commit 21a5b84

Browse files
feat: Add Groupby.describe()
1 parent 5ce5d63 commit 21a5b84

6 files changed

Lines changed: 219 additions & 12 deletions

File tree

bigframes/core/compile/api.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,19 +15,18 @@
1515

1616
from typing import TYPE_CHECKING
1717

18-
from bigframes.core import rewrite
19-
from bigframes.core.compile.ibis_compiler import ibis_compiler
20-
2118
if TYPE_CHECKING:
2219
import bigframes.core.nodes
2320

2421

2522
def test_only_ibis_inferred_schema(node: bigframes.core.nodes.BigFrameNode):
2623
"""Use only for testing paths to ensure ibis inferred schema does not diverge from bigframes inferred schema."""
24+
from bigframes.core.compile.ibis_compiler import ibis_compiler
25+
import bigframes.core.rewrite
2726
import bigframes.core.schema
2827

2928
node = ibis_compiler._replace_unsupported_ops(node)
30-
node = rewrite.bake_order(node)
29+
node = bigframes.core.rewrite.bake_order(node)
3130
ir = ibis_compiler.compile_node(node)
3231
items = tuple(
3332
bigframes.core.schema.SchemaItem(name, ir.get_column_type(ibis_id))

bigframes/core/groupby/dataframe_group_by.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,20 @@ def head(self, n: int = 5) -> df.DataFrame:
149149
)
150150
)
151151

152+
def describe(self, include: None | Literal["all"] = None):
    """Compute per-group descriptive statistics for the selected columns.

    Args:
        include: ``None`` or ``"all"``; forwarded unchanged to the shared
            ``_describe`` helper, which decides which columns are summarized.

    Returns:
        A DataFrame wrapping the block returned by ``_describe``.
    """
    # Function-scope import kept from the original; presumably it avoids a
    # circular import between the groupby and describe modules -- confirm.
    from bigframes.pandas.core.methods import describe as describe_impl

    described_block = describe_impl._describe(
        self._block,
        self._selected_cols,
        include,
        as_index=self._as_index,
        by_col_ids=self._by_col_ids,
        dropna=self._dropna,
    )
    return df.DataFrame(described_block)
165+
152166
def size(self) -> typing.Union[df.DataFrame, series.Series]:
153167
agg_block, _ = self._block.aggregate_size(
154168
by_column_ids=self._by_col_ids,

bigframes/core/groupby/series_group_by.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,20 @@ def head(self, n: int = 5) -> series.Series:
7575
)
7676
)
7777

78+
def describe(self, include: None | Literal["all"] = None):
    """Compute per-group descriptive statistics for this series' value column.

    Args:
        include: ``None`` or ``"all"``; forwarded unchanged to the shared
            ``_describe`` helper.

    Returns:
        A DataFrame built from the block returned by ``_describe``, with the
        outermost column level (level 0) dropped.
    """
    # Function-scope import kept from the original; presumably it avoids a
    # circular import between the groupby and describe modules -- confirm.
    from bigframes.pandas.core.methods import describe as describe_impl

    value_col = self._value_column
    described_block = describe_impl._describe(
        self._block.select_column(value_col),
        columns=[value_col],
        include=include,
        as_index=True,
        by_col_ids=self._by_col_ids,
        dropna=self._dropna,
    )
    described = df.DataFrame(described_block)
    # Only one value column is described, so the outer column level is dropped
    # and just the statistic names remain as column labels.
    return described.droplevel(level=0, axis=1)
91+
7892
def all(self) -> series.Series:
7993
return self._aggregate(agg_ops.all_op)
8094

bigframes/core/rewrite/implicit_align.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,8 @@
1818
from typing import cast, Optional, Sequence, Set, Tuple
1919

2020
import bigframes.core.expression
21-
import bigframes.core.guid
2221
import bigframes.core.identifiers
23-
import bigframes.core.join_def
2422
import bigframes.core.nodes
25-
import bigframes.core.window_spec
26-
import bigframes.operations.aggregations
2723

2824
# Combination of selects and additive nodes can be merged as an explicit keyless "row join"
2925
ALIGNABLE_NODES = (

bigframes/pandas/core/methods/describe.py

Lines changed: 66 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,16 @@
1616

1717
import typing
1818

19+
import pandas as pd
20+
1921
from bigframes import dataframe, dtypes, series
22+
from bigframes.core import agg_expressions, blocks
2023
from bigframes.core.reshape import api as rs
24+
from bigframes.operations import aggregations
25+
26+
_DEFAULT_DTYPES = (
27+
dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE + dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES
28+
)
2129

2230

2331
def describe(
@@ -33,8 +41,7 @@ def describe(
3341
if include is None:
3442
numeric_df = _select_dtypes(
3543
input,
36-
dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE
37-
+ dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES,
44+
_DEFAULT_DTYPES,
3845
)
3946
if len(numeric_df.columns) == 0:
4047
# Describe eligible non-numeric columns
@@ -61,6 +68,62 @@ def describe(
6168
raise ValueError(f"Unsupported include type: {include}")
6269

6370

71+
def _describe(
    block: blocks.Block,
    columns: typing.Sequence[str],
    include: None | typing.Literal["all"] = None,
    *,
    as_index: bool = True,
    by_col_ids: typing.Sequence[str] = (),
    dropna: bool = False,
) -> blocks.Block:
    """Aggregate descriptive statistics for ``columns`` of ``block``.

    Args:
        block: The block holding the data to summarize.
        columns: Ids of the columns to consider.
        include: When ``"all"``, every listed column is considered; otherwise
            only columns whose dtype is in ``_DEFAULT_DTYPES`` are summarized.
        as_index: When false, the aggregated block's index is reset into
            regular columns (``reset_index(drop=False)``).
        by_col_ids: Ids of the grouping columns passed to ``block.aggregate``.
        dropna: Forwarded to ``block.aggregate``.

    Returns:
        The aggregated block. Its column labels are the original column label
        extended by one extra level holding each aggregation's ``name``.
    """
    stats: list[agg_expressions.Aggregation] = []
    column_labels: list[typing.Hashable] = []

    # Loop-invariant: single-level labels are scalars and must be wrapped in a
    # tuple before the statistic name is appended.
    single_level_labels = block.column_labels.nlevels == 1

    for col_id in columns:
        label = block.col_id_to_label[col_id]
        dtype = block.expr.get_column_type(col_id)
        if include != "all" and dtype not in _DEFAULT_DTYPES:
            continue
        agg_ops = _get_aggs_for_dtype(dtype)
        stats.extend(op.as_expr(col_id) for op in agg_ops)
        label_tuple = (label,) if single_level_labels else label
        column_labels.extend((*label_tuple, op.name) for op in agg_ops)  # type: ignore

    # The level names must describe the *column* levels plus the new, unnamed
    # statistic level. Using the row index names here would only have the right
    # length when the row index depth happens to equal the column label depth.
    result_level_names = (*block.column_labels.names, None)

    agg_block, _ = block.aggregate(
        by_column_ids=by_col_ids,
        aggregations=stats,
        dropna=dropna,
        column_labels=pd.Index(column_labels, name=result_level_names),
    )
    return agg_block if as_index else agg_block.reset_index(drop=False)
100+
101+
102+
def _get_aggs_for_dtype(dtype) -> list[aggregations.UnaryAggregateOp]:
    """Return the aggregation ops used to describe a column of ``dtype``.

    Restrictive-numeric dtypes get count, mean, std, min, the three approximate
    quartiles and max (in that order); temporal-numeric dtypes get count only;
    string, bool, bytes and time dtypes get count and nunique; any other dtype
    gets an empty list.
    """
    if dtype in dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE:
        # Fresh quartile op instances are built on every call, as before.
        quartile_ops = [aggregations.ApproxQuartilesOp(q) for q in (1, 2, 3)]
        return [
            aggregations.count_op,
            aggregations.mean_op,
            aggregations.std_op,
            aggregations.min_op,
            *quartile_ops,
            aggregations.max_op,
        ]

    if dtype in dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES:
        return [aggregations.count_op]

    count_and_nunique_dtypes = [
        dtypes.STRING_DTYPE,
        dtypes.BOOL_DTYPE,
        dtypes.BYTES_DTYPE,
        dtypes.TIME_DTYPE,
    ]
    if dtype in count_and_nunique_dtypes:
        return [aggregations.count_op, aggregations.nunique_op]

    return []
125+
126+
64127
def _describe_numeric(df: dataframe.DataFrame) -> dataframe.DataFrame:
65128
number_df_result = typing.cast(
66129
dataframe.DataFrame,
@@ -91,8 +154,7 @@ def _describe_numeric(df: dataframe.DataFrame) -> dataframe.DataFrame:
91154

92155
original_columns = _select_dtypes(
93156
df,
94-
dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE
95-
+ dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES,
157+
_DEFAULT_DTYPES,
96158
).columns
97159

98160
# Use reindex after join to preserve the original column order.

tests/system/small/pandas/test_describe.py

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -230,3 +230,125 @@ def test_series_describe_temporal(scalars_dfs):
230230
check_dtype=False,
231231
check_index_type=False,
232232
)
233+
234+
235+
def test_df_groupby_describe(scalars_dfs):
    """Compare DataFrameGroupBy.describe(include="all") against pandas.

    Exact statistics are compared with pandas; the approximate quartiles are
    only checked for monotonic ordering (min <= 25% <= 50% <= 75% <= max).
    """
    # TODO: supply a reason why this isn't compatible with pandas 1.x
    pytest.importorskip("pandas", minversion="2.0.0")
    scalars_df, scalars_pandas_df = scalars_dfs

    numeric_columns = [
        "int64_col",
        "float64_col",
    ]
    non_numeric_columns = ["string_col"]
    supported_columns = numeric_columns + non_numeric_columns

    bf_full_result = (
        scalars_df.groupby("bool_col")[supported_columns]
        .describe(include="all")
        .to_pandas()
    )

    pd_full_result = scalars_pandas_df.groupby("bool_col")[supported_columns].describe(
        include="all"
    )

    for col in supported_columns:
        pd_result = pd_full_result[col]
        bf_result = bf_full_result[col]

        if col in numeric_columns:
            # Pull the quartiles out before dropping them from the exact
            # comparison, as they are approximate.
            bf_min = bf_result["min"]
            bf_p25 = bf_result["25%"]
            bf_p50 = bf_result["50%"]
            bf_p75 = bf_result["75%"]
            bf_max = bf_result["max"]

            # Reindex results with the specified keys and their order, because
            # the relative order is not important.
            bf_result = bf_result.reindex(
                columns=["count", "mean", "std", "min", "max"]
            )
            pd_result = pd_result.reindex(
                columns=["count", "mean", "std", "min", "max"]
            )

            # Double-check that quantiles are at least plausible: each one must
            # be no smaller than the previous one.
            assert (
                (bf_min <= bf_p25)
                & (bf_p25 <= bf_p50)
                & (bf_p50 <= bf_p75)
                & (bf_p75 <= bf_max)
            ).all()
        else:
            # Reindex results with the specified keys and their order, because
            # the relative order is not important.
            bf_result = bf_result.reindex(columns=["count", "nunique"])
            pd_result = pd_result.reindex(columns=["count", "unique"])
        pandas.testing.assert_frame_equal(
            # The BigFrames counterpart of "unique" is called "nunique"
            pd_result.astype("Float64").rename(columns={"unique": "nunique"}),
            bf_result,
            check_dtype=False,
            check_index_type=False,
        )
297+
298+
299+
def test_series_groupby_describe(scalars_dfs):
    """Compare SeriesGroupBy.describe(include="all") against pandas.

    Exact statistics are compared with pandas; the approximate quartiles are
    only checked for monotonic ordering (min <= 25% <= 50% <= 75% <= max).
    """
    # TODO: supply a reason why this isn't compatible with pandas 1.x
    pytest.importorskip("pandas", minversion="2.0.0")
    scalars_df, scalars_pandas_df = scalars_dfs

    numeric_columns = [
        "int64_col",
        "float64_col",
    ]
    non_numeric_columns = ["string_col"]
    supported_columns = numeric_columns + non_numeric_columns

    bf_df = scalars_df.groupby("bool_col")

    pd_df = scalars_pandas_df.groupby("bool_col")

    for col in supported_columns:
        pd_result = pd_df[col].describe(include="all")
        bf_result = bf_df[col].describe(include="all").to_pandas()

        if col in numeric_columns:
            # Pull the quartiles out before dropping them from the exact
            # comparison, as they are approximate.
            bf_min = bf_result["min"]
            bf_p25 = bf_result["25%"]
            bf_p50 = bf_result["50%"]
            bf_p75 = bf_result["75%"]
            bf_max = bf_result["max"]

            # Reindex results with the specified keys and their order, because
            # the relative order is not important.
            bf_result = bf_result.reindex(
                columns=["count", "mean", "std", "min", "max"]
            )
            pd_result = pd_result.reindex(
                columns=["count", "mean", "std", "min", "max"]
            )

            # Double-check that quantiles are at least plausible: each one must
            # be no smaller than the previous one.
            assert (
                (bf_min <= bf_p25)
                & (bf_p25 <= bf_p50)
                & (bf_p50 <= bf_p75)
                & (bf_p75 <= bf_max)
            ).all()
        else:
            # Reindex results with the specified keys and their order, because
            # the relative order is not important.
            bf_result = bf_result.reindex(columns=["count", "nunique"])
            pd_result = pd_result.reindex(columns=["count", "unique"])
        pandas.testing.assert_frame_equal(
            # The BigFrames counterpart of "unique" is called "nunique"
            pd_result.astype("Float64").rename(columns={"unique": "nunique"}),
            bf_result,
            check_dtype=False,
            check_index_type=False,
        )

0 commit comments

Comments
 (0)