Skip to content

Commit

Permalink
PERF: fix #32976 slow group by for categorical columns
Browse files Browse the repository at this point in the history
Aggregate categorical codes with fast cython aggregation for select
`how` operations.
  • Loading branch information
rtlee9 committed May 9, 2020
1 parent 9f746a7 commit 7570425
Show file tree
Hide file tree
Showing 4 changed files with 74 additions and 3 deletions.
29 changes: 29 additions & 0 deletions asv_bench/benchmarks/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from pandas import (
Categorical,
CategoricalDtype,
DataFrame,
MultiIndex,
Series,
Expand Down Expand Up @@ -473,6 +474,7 @@ def time_sum(self):


class Categories:
# benchmark grouping by categoricals
def setup(self):
N = 10 ** 5
arr = np.random.random(N)
Expand Down Expand Up @@ -510,6 +512,33 @@ def time_groupby_extra_cat_nosort(self):
self.df_extra_cat.groupby("a", sort=False)["b"].count()


class CategoricalFrame:
    # benchmark grouping with operations on categorical values (GH #32976)
    param_names = ["groupby_type", "value_type", "agg_method"]
    params = [(int,), (int, str), ("last", "head", "count")]

    def setup(self, groupby_type, value_type, agg_method):
        """Build a frame with a ``group`` key column and a categorical
        ``cat`` value column (CARDINALITY distinct categories)."""
        SIZE = 100000
        GROUPS = 1000
        CARDINALITY = 10
        CAT = CategoricalDtype([value_type(i) for i in range(CARDINALITY)])
        # vectorized draws (randint/choice over the whole SIZE at once)
        # instead of SIZE Python-level np.random calls keep setup cheap
        df = DataFrame(
            {
                "group": [
                    groupby_type(g) for g in np.random.randint(0, GROUPS, SIZE)
                ],
                "cat": np.random.choice(CAT.categories, SIZE),
            }
        )
        self.df_cat_values = df.astype({"cat": CAT})

    def time_groupby(self, groupby_type, value_type, agg_method):
        # sort=False here: groupby sorts by default, so without it this
        # benchmark would be identical to time_groupby_ordered below
        getattr(self.df_cat_values.groupby("group", sort=False), agg_method)()

    def time_groupby_ordered(self, groupby_type, value_type, agg_method):
        getattr(self.df_cat_values.groupby("group", sort=True), agg_method)()


class Datelike:
# GH 14338
params = ["period_range", "date_range", "date_range_tz"]
Expand Down
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,7 @@ Other API changes
- ``loc`` lookups with an object-dtype :class:`Index` and an integer key will now raise ``KeyError`` instead of ``TypeError`` when key is missing (:issue:`31905`)
- Using a :func:`pandas.api.indexers.BaseIndexer` with ``count``, ``min``, ``max``, ``median``, ``skew``, ``cov``, ``corr`` will now return correct results for any monotonic :func:`pandas.api.indexers.BaseIndexer` descendant (:issue:`32865`)
- Added a :func:`pandas.api.indexers.FixedForwardWindowIndexer` class to support forward-looking windows during ``rolling`` operations.
-
- :meth:`DataFrame.groupby` aggregations of categorical series will now return a :class:`Categorical` while preserving the codes and categories of the original series (:issue:`32976`)

Backwards incompatible API changes
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Expand Down Expand Up @@ -525,6 +525,7 @@ Performance improvements
:issue:`32825`, :issue:`32826`, :issue:`32856`, :issue:`32858`).
- Performance improvement in :func:`factorize` for nullable (integer and boolean) dtypes (:issue:`33064`).
- Performance improvement in reductions (sum, prod, min, max) for nullable (integer and boolean) dtypes (:issue:`30982`, :issue:`33261`, :issue:`33442`).
- Performance improvement in :meth:`DataFrame.groupby` when aggregating categorical data (:issue:`32976`)


.. ---------------------------------------------------------------------------
Expand Down
43 changes: 42 additions & 1 deletion pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
from pandas.core.dtypes.missing import _maybe_fill, isna

import pandas.core.algorithms as algorithms
from pandas.core.arrays.categorical import Categorical
from pandas.core.base import SelectionMixin
import pandas.core.common as com
from pandas.core.frame import DataFrame
Expand Down Expand Up @@ -354,6 +355,29 @@ def get_group_levels(self) -> List[Index]:

_name_functions = {"ohlc": ["open", "high", "low", "close"]}

# ``how`` operations that must not run on raw categorical codes: they
# interpret values numerically (sums, means, variances, ranks, ...),
# which is meaningless for category codes.
# NOTE(review): "add" appears instead of a plain "sum" alias — presumably
# the internal cython name for sum; confirm against _cython_operation.
_cat_method_blacklist = (
    "add",
    "median",
    "prod",
    "sem",
    "cumsum",
    "sum",
    "cummin",
    "mean",
    "max",
    "skew",
    "cumprod",
    "cummax",
    "rank",
    "pct_change",
    "min",
    "var",
    "mad",
    "describe",
    "std",
    "quantile",
)

def _is_builtin_func(self, arg):
"""
if we define a builtin function for this argument, return it,
Expand Down Expand Up @@ -458,7 +482,7 @@ def _cython_operation(

# categoricals are only 1d, so we
# are not setup for dim transforming
if is_categorical_dtype(values.dtype) or is_sparse(values.dtype):
if is_sparse(values.dtype):
raise NotImplementedError(f"{values.dtype} dtype not supported")
elif is_datetime64_any_dtype(values):
if how in ["add", "prod", "cumsum", "cumprod"]:
Expand All @@ -479,6 +503,7 @@ def _cython_operation(

is_datetimelike = needs_i8_conversion(values.dtype)
is_numeric = is_numeric_dtype(values.dtype)
is_categorical = is_categorical_dtype(values)

if is_datetimelike:
values = values.view("int64")
Expand All @@ -494,6 +519,17 @@ def _cython_operation(
values = ensure_int_or_float(values)
elif is_numeric and not is_complex_dtype(values):
values = ensure_float64(values)
elif is_categorical:
if how in self._cat_method_blacklist:
raise NotImplementedError(
f"{values.dtype} dtype not supported for `how` argument {how}"
)
values, categories, ordered = (
values.codes.astype(np.int64),
values.categories,
values.ordered,
)
is_numeric = True
else:
values = values.astype(object)

Expand Down Expand Up @@ -581,6 +617,11 @@ def _cython_operation(
result = type(orig_values)(result.astype(np.int64), dtype=orig_values.dtype)
elif is_datetimelike and kind == "aggregate":
result = result.astype(orig_values.dtype)
elif is_categorical:
# re-create categories
result = Categorical.from_codes(
result, categories=categories, ordered=ordered,
)

return result, names

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/groupby/aggregate/test_aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -466,7 +466,7 @@ def test_agg_cython_category_not_implemented_fallback():
result = df.groupby("col_num").col_cat.first()
expected = pd.Series(
[1, 2, 3], index=pd.Index([1, 2, 3], name="col_num"), name="col_cat"
)
).astype("category")
tm.assert_series_equal(result, expected)

result = df.groupby("col_num").agg({"col_cat": "first"})
Expand Down

0 comments on commit 7570425

Please sign in to comment.