From 964ac84b4157302f67b74e6a5c2de9d6732360f9 Mon Sep 17 00:00:00 2001
From: Ryan Lee
Date: Tue, 21 Apr 2020 23:32:05 -0700
Subject: [PATCH] PERF: fix #32976 slow group by for categorical columns

Aggregate categorical codes with fast cython aggregation for select
`how` operations.

Categorical values are no longer rejected up front in
`_cython_operation`. Their codes are cast to int64 and aggregated, and
the result is turned back into a Categorical with the original
categories and `ordered` flag via `Categorical.from_codes`. Operations
named in `_cat_method_blacklist` still raise NotImplementedError.

Also adds the `CategoricalFrame` asv benchmark and whatsnew entries.

8/1/20: rebase and move release note to 1.2
---
 asv_bench/benchmarks/groupby.py | 29 ++++++++++++++++++++++
 doc/source/whatsnew/v1.2.0.rst  |  2 ++
 pandas/core/groupby/ops.py      | 43 ++++++++++++++++++++++++++++++++-
 3 files changed, 73 insertions(+), 1 deletion(-)

diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
index 5ffda03fad80f9..7c127b90c181c5 100644
--- a/asv_bench/benchmarks/groupby.py
+++ b/asv_bench/benchmarks/groupby.py
@@ -6,6 +6,7 @@
 
 from pandas import (
     Categorical,
+    CategoricalDtype,
     DataFrame,
     MultiIndex,
     Series,
@@ -473,6 +474,7 @@ def time_sum(self):
 
 
 class Categories:
+    # benchmark grouping by categoricals
     def setup(self):
         N = 10 ** 5
         arr = np.random.random(N)
@@ -510,6 +512,33 @@ def time_groupby_extra_cat_nosort(self):
         self.df_extra_cat.groupby("a", sort=False)["b"].count()
 
 
+class CategoricalFrame:
+    # benchmark grouping with operations on categorical values (GH #32976)
+    param_names = ["groupby_type", "value_type", "agg_method"]
+    params = [(int,), (int, str), ("last", "head", "count")]
+
+    def setup(self, groupby_type, value_type, agg_method):
+        SIZE = 100000
+        GROUPS = 1000
+        CARDINALITY = 10
+        CAT = CategoricalDtype([value_type(i) for i in range(CARDINALITY)])
+        df = DataFrame(
+            {
+                "group": [
+                    groupby_type(np.random.randint(0, GROUPS)) for _ in range(SIZE)
+                ],
+                "cat": [np.random.choice(CAT.categories) for _ in range(SIZE)],
+            }
+        )
+        self.df_cat_values = df.astype({"cat": CAT})
+
+    def time_groupby(self, groupby_type, value_type, agg_method):
+        getattr(self.df_cat_values.groupby("group"), agg_method)()
+
+    def time_groupby_ordered(self, groupby_type, value_type, agg_method):
+        getattr(self.df_cat_values.groupby("group", sort=True), agg_method)()
+
+
 class Datelike:
     # GH 14338
     params = ["period_range", "date_range", "date_range_tz"]
diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index b16ca0a80c5b4d..401b8d3af2f00f 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -40,6 +40,7 @@ Deprecations
 Performance improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
 
+- Performance improvement in :meth:`DataFrame.groupby` when aggregating categorical data (:issue:`32976`)
 -
 -
 
@@ -132,6 +133,7 @@ Plotting
 Groupby/resample/rolling
 ^^^^^^^^^^^^^^^^^^^^^^^^
 
+- :meth:`DataFrame.groupby` aggregations of categorical series will now return a :class:`Categorical` while preserving the codes and categories of the original series (:issue:`32976`)
 -
 -
 
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 3aaeef3b637609..d2e8f2023e7bcd 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -40,6 +40,7 @@
 from pandas.core.dtypes.missing import _maybe_fill, isna
 
 import pandas.core.algorithms as algorithms
+from pandas.core.arrays.categorical import Categorical
 from pandas.core.base import SelectionMixin
 import pandas.core.common as com
 from pandas.core.frame import DataFrame
@@ -356,6 +357,29 @@ def get_group_levels(self) -> List[Index]:
 
     _name_functions = {"ohlc": ["open", "high", "low", "close"]}
 
+    _cat_method_blacklist = (
+        "add",
+        "median",
+        "prod",
+        "sem",
+        "cumsum",
+        "sum",
+        "cummin",
+        "mean",
+        "max",
+        "skew",
+        "cumprod",
+        "cummax",
+        "rank",
+        "pct_change",
+        "min",
+        "var",
+        "mad",
+        "describe",
+        "std",
+        "quantile",
+    )
+
     def _is_builtin_func(self, arg):
         """
         if we define a builtin function for this argument, return it,
@@ -460,7 +484,7 @@ def _cython_operation(
 
         # categoricals are only 1d, so we
         # are not setup for dim transforming
-        if is_categorical_dtype(values.dtype) or is_sparse(values.dtype):
+        if is_sparse(values.dtype):
             raise NotImplementedError(f"{values.dtype} dtype not supported")
         elif is_datetime64_any_dtype(values.dtype):
             if how in ["add", "prod", "cumsum", "cumprod"]:
@@ -481,6 +505,7 @@ def _cython_operation(
 
         is_datetimelike = needs_i8_conversion(values.dtype)
         is_numeric = is_numeric_dtype(values.dtype)
+        is_categorical = is_categorical_dtype(values.dtype)
 
         if is_datetimelike:
             values = values.view("int64")
@@ -496,6 +521,17 @@ def _cython_operation(
                 values = ensure_int_or_float(values)
         elif is_numeric and not is_complex_dtype(values):
             values = ensure_float64(values)
+        elif is_categorical:
+            if how in self._cat_method_blacklist:
+                raise NotImplementedError(
+                    f"{values.dtype} dtype not supported for `how` argument {how}"
+                )
+            values, categories, ordered = (
+                values.codes.astype(np.int64),
+                values.categories,
+                values.ordered,
+            )
+            is_numeric = True
         else:
             values = values.astype(object)
 
@@ -572,6 +608,11 @@ def _cython_operation(
             result = type(orig_values)(result.astype(np.int64), dtype=orig_values.dtype)
         elif is_datetimelike and kind == "aggregate":
             result = result.astype(orig_values.dtype)
+        elif is_categorical:
+            # re-create categories
+            result = Categorical.from_codes(
+                result, categories=categories, ordered=ordered,
+            )
 
         if is_extension_array_dtype(orig_values.dtype):
             result = maybe_cast_result(result=result, obj=orig_values, how=how)