diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index c9ac275cc4ea76..73ae643e71a7a0 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -6,6 +6,7 @@ from pandas import ( Categorical, + CategoricalDtype, DataFrame, MultiIndex, Series, @@ -473,6 +474,7 @@ def time_sum(self): class Categories: + # benchmark grouping by categoricals def setup(self): N = 10 ** 5 arr = np.random.random(N) @@ -510,6 +512,33 @@ def time_groupby_extra_cat_nosort(self): self.df_extra_cat.groupby("a", sort=False)["b"].count() +class CategoricalFrame: + # benchmark grouping with operations on categorical values (GH #32976) + param_names = ["groupby_type", "value_type", "agg_method"] + params = [(int,), (int, str), ("last", "head", "count")] + + def setup(self, groupby_type, value_type, agg_method): + SIZE = 100000 + GROUPS = 1000 + CARDINALITY = 10 + CAT = CategoricalDtype([value_type(i) for i in range(CARDINALITY)]) + df = DataFrame( + { + "group": [ + groupby_type(np.random.randint(0, GROUPS)) for i in range(SIZE) + ], + "cat": [np.random.choice(CAT.categories) for i in range(SIZE)], + } + ) + self.df_cat_values = df.astype({"cat": CAT}) + + def time_groupby(self, groupby_type, value_type, agg_method): + getattr(self.df_cat_values.groupby("group"), agg_method)() + + def time_groupby_ordered(self, groupby_type, value_type, agg_method): + getattr(self.df_cat_values.groupby("group", sort=True), agg_method)() + + class Datelike: # GH 14338 params = ["period_range", "date_range", "date_range_tz"] diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 95cb4ccbbb7961..1860d4ddd2a59d 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -239,7 +239,7 @@ Other API changes - ``loc`` lookups with an object-dtype :class:`Index` and an integer key will now raise ``KeyError`` instead of ``TypeError`` when key is missing (:issue:`31905`) - Using a 
:func:`pandas.api.indexers.BaseIndexer` with ``count``, ``min``, ``max``, ``median``, ``skew``, ``cov``, ``corr`` will now return correct results for any monotonic :func:`pandas.api.indexers.BaseIndexer` descendant (:issue:`32865`) - Added a :func:`pandas.api.indexers.FixedForwardWindowIndexer` class to support forward-looking windows during ``rolling`` operations. -- +- :meth:`DataFrame.groupby` aggregations of categorical series will now return a :class:`Categorical` while preserving the codes and categories of the original series (:issue:`32976`) Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -525,6 +525,7 @@ Performance improvements :issue:`32825`, :issue:`32826`, :issue:`32856`, :issue:`32858`). - Performance improvement in :func:`factorize` for nullable (integer and boolean) dtypes (:issue:`33064`). - Performance improvement in reductions (sum, prod, min, max) for nullable (integer and boolean) dtypes (:issue:`30982`, :issue:`33261`, :issue:`33442`). +- Performance improvement in :meth:`DataFrame.groupby` when aggregating categorical data (:issue:`32976`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 43e6b02e9dc53f..674bf4b2257350 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -39,6 +39,7 @@ from pandas.core.dtypes.missing import _maybe_fill, isna import pandas.core.algorithms as algorithms +from pandas.core.arrays.categorical import Categorical from pandas.core.base import SelectionMixin import pandas.core.common as com from pandas.core.frame import DataFrame @@ -354,6 +355,29 @@ def get_group_levels(self) -> List[Index]: _name_functions = {"ohlc": ["open", "high", "low", "close"]} + _cat_method_blacklist = ( + "add", + "median", + "prod", + "sem", + "cumsum", + "sum", + "cummin", + "mean", + "max", + "skew", + "cumprod", + "cummax", + "rank", + "pct_change", + "min", + "var", + "mad", + "describe", + "std", + "quantile", + ) + def _is_builtin_func(self, arg): """ if we define a builtin function for this argument, return it, @@ -458,7 +482,7 @@ def _cython_operation( # categoricals are only 1d, so we # are not setup for dim transforming - if is_categorical_dtype(values.dtype) or is_sparse(values.dtype): + if is_sparse(values.dtype): raise NotImplementedError(f"{values.dtype} dtype not supported") elif is_datetime64_any_dtype(values): if how in ["add", "prod", "cumsum", "cumprod"]: @@ -479,6 +503,7 @@ def _cython_operation( is_datetimelike = needs_i8_conversion(values.dtype) is_numeric = is_numeric_dtype(values.dtype) + is_categorical = is_categorical_dtype(values) if is_datetimelike: values = values.view("int64") @@ -494,6 +519,17 @@ def _cython_operation( values = ensure_int_or_float(values) elif is_numeric and not is_complex_dtype(values): values = ensure_float64(values) + elif is_categorical: + if how in self._cat_method_blacklist: + raise NotImplementedError( + f"{values.dtype} dtype not supported for `how` argument {how}" + ) + values, categories, ordered = ( + 
values.codes.astype(np.int64), + values.categories, + values.ordered, + ) + is_numeric = True else: values = values.astype(object) @@ -581,6 +617,11 @@ def _cython_operation( result = type(orig_values)(result.astype(np.int64), dtype=orig_values.dtype) elif is_datetimelike and kind == "aggregate": result = result.astype(orig_values.dtype) + elif is_categorical: + # re-create categories + result = Categorical.from_codes( + result, categories=categories, ordered=ordered, + ) return result, names diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index d4b061594c3645..b4f00fa7631752 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -466,7 +466,7 @@ def test_agg_cython_category_not_implemented_fallback(): result = df.groupby("col_num").col_cat.first() expected = pd.Series( [1, 2, 3], index=pd.Index([1, 2, 3], name="col_num"), name="col_cat" - ) + ).astype("category") tm.assert_series_equal(result, expected) result = df.groupby("col_num").agg({"col_cat": "first"})