diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 54e26c155595b..147f07e36efb8 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -568,6 +568,29 @@ For a grouped ``DataFrame``, you can rename in a similar manner: 'mean': 'bar', 'std': 'baz'})) +.. note:: + + In general, the output column names should be unique. You can't apply + the same function (or two functions with the same name) to the same + column. + + .. ipython:: python + :okexcept: + + grouped['C'].agg(['sum', 'sum']) + + + Pandas *does* allow you to provide multiple lambdas. In this case, pandas + will mangle the name of the (nameless) lambda functions, appending ``_<i>`` + to each subsequent lambda. + + .. ipython:: python + + grouped['C'].agg([lambda x: x.max() - x.min(), + lambda x: x.median() - x.mean()]) + + + .. _groupby.aggregate.named: Named aggregation diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 1980e00f1073d..7074ab5dd3a0e 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -79,6 +79,26 @@ a dict to a Series groupby aggregation (:ref:`whatsnew_0200.api_breaking.depreca See :ref:`groupby.aggregate.named` for more. +.. _whatsnew_0250.enhancements.multiple_lambdas: + +Groupby Aggregation with multiple lambdas +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +You can now provide multiple lambda functions to a list-like aggregation in +:meth:`pandas.core.groupby.GroupBy.agg` (:issue:`26430`). + +.. ipython:: python + + animals.groupby('kind').height.agg([ + lambda x: x.iloc[0], lambda x: x.iloc[-1] + ]) + + animals.groupby('kind').agg([ + lambda x: x.iloc[0] - x.iloc[1], + lambda x: x.iloc[0] + x.iloc[1] + ]) + +Previously, these raised a ``SpecificationError``. .. 
_whatsnew_0250.enhancements.multi_index_repr: diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index a10920b7a5afb..aec2373bce42d 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -5,13 +5,13 @@ These are user facing as the result of the ``df.groupby(...)`` operations, which here returns a DataFrameGroupBy object. """ - from collections import OrderedDict, abc, namedtuple import copy +import functools from functools import partial from textwrap import dedent import typing -from typing import Any, Callable, FrozenSet, Iterator, List, Type, Union +from typing import Any, Callable, FrozenSet, Iterator, Sequence, Type, Union import warnings import numpy as np @@ -24,9 +24,9 @@ from pandas.core.dtypes.cast import ( maybe_convert_objects, maybe_downcast_to_dtype) from pandas.core.dtypes.common import ( - ensure_int64, ensure_platform_int, is_bool, is_datetimelike, - is_integer_dtype, is_interval_dtype, is_numeric_dtype, is_object_dtype, - is_scalar) + ensure_int64, ensure_platform_int, is_bool, is_datetimelike, is_dict_like, + is_integer_dtype, is_interval_dtype, is_list_like, is_numeric_dtype, + is_object_dtype, is_scalar) from pandas.core.dtypes.missing import isna, notna from pandas._typing import FrameOrSeries @@ -49,6 +49,10 @@ NamedAgg = namedtuple("NamedAgg", ["column", "aggfunc"]) # TODO(typing) the return value on this callable should be any *scalar*. 
AggScalar = Union[str, Callable[..., Any]] +# TODO: validate types on ScalarResult and move to _typing +# Blocked from using by https://github.com/python/mypy/issues/1484 +# See note at _mangle_lambda_list +ScalarResult = typing.TypeVar("ScalarResult") def whitelist_method_generator(base_class: Type[GroupBy], @@ -210,6 +214,8 @@ def aggregate(self, func, *args, **kwargs): raise TypeError("Must provide 'func' or tuples of " "'(column, aggfunc).") + func = _maybe_mangle_lambdas(func) + result, how = self._aggregate(func, _level=_level, *args, **kwargs) if how is None: return result @@ -823,6 +829,7 @@ def aggregate(self, func_or_funcs=None, *args, **kwargs): if isinstance(func_or_funcs, abc.Iterable): # Catch instances of lists / tuples # but not the class list / tuple itself. + func_or_funcs = _maybe_mangle_lambdas(func_or_funcs) ret = self._aggregate_multiple_funcs(func_or_funcs, (_level or 0) + 1) if relabeling: @@ -1691,7 +1698,10 @@ def _normalize_keyword_aggregation(kwargs): # process normally, then fixup the names. # TODO(Py35): When we drop python 3.5, change this to # defaultdict(list) - aggspec = OrderedDict() # type: typing.OrderedDict[str, List[AggScalar]] + # TODO: aggspec type: typing.OrderedDict[str, List[AggScalar]] + # May be hitting https://github.com/python/mypy/issues/5958 + # saying it doesn't have an attribute __name__ + aggspec = OrderedDict() order = [] columns, pairs = list(zip(*kwargs.items())) @@ -1705,6 +1715,90 @@ def _normalize_keyword_aggregation(kwargs): return aggspec, columns, order +# TODO: Can't use, because mypy doesn't like us setting __name__ +# error: "partial[Any]" has no attribute "__name__" +# the type is: +# typing.Sequence[Callable[..., ScalarResult]] +# -> typing.Sequence[Callable[..., ScalarResult]]: + +def _managle_lambda_list(aggfuncs: Sequence[Any]) -> Sequence[Any]: + """ + Possibly mangle a list of aggfuncs. 
+ + Parameters + ---------- + aggfuncs : Sequence + + Returns + ------- + mangled: list-like + A new AggSpec sequence, where lambdas have been converted + to have unique names. + + Notes + ----- + If just one aggfunc is passed, the name will not be mangled. + """ + if len(aggfuncs) <= 1: + # don't mangle for .agg([lambda x: .]) + return aggfuncs + i = 0 + mangled_aggfuncs = [] + for aggfunc in aggfuncs: + if com.get_callable_name(aggfunc) == "<lambda>": + aggfunc = functools.partial(aggfunc) + aggfunc.__name__ = '<lambda_{}>'.format(i) + i += 1 + mangled_aggfuncs.append(aggfunc) + + return mangled_aggfuncs + + +def _maybe_mangle_lambdas(agg_spec: Any) -> Any: + """ + Make new lambdas with unique names. + + Parameters + ---------- + agg_spec : Any + An argument to NDFrameGroupBy.agg. + An `agg_spec` that is neither dict-like nor list-like is passed + through as is. For dict-like or list-like `agg_spec` a new spec + is returned with name-mangled lambdas. + + Returns + ------- + mangled : Any + Same type as the input. + + Examples + -------- + >>> _maybe_mangle_lambdas('sum') + 'sum' + + >>> _maybe_mangle_lambdas([lambda: 1, lambda: 2]) # doctest: +SKIP + [<function __main__.<lambda_0>, + <function pandas...._make_lambda.<locals>.f(*args, **kwargs)>] + """ + is_dict = is_dict_like(agg_spec) + if not (is_dict or is_list_like(agg_spec)): + return agg_spec + mangled_aggspec = type(agg_spec)() # dict or OrderedDict + + if is_dict: + for key, aggfuncs in agg_spec.items(): + if is_list_like(aggfuncs) and not is_dict_like(aggfuncs): + mangled_aggfuncs = _managle_lambda_list(aggfuncs) + else: + mangled_aggfuncs = aggfuncs + + mangled_aggspec[key] = mangled_aggfuncs + else: + mangled_aggspec = _managle_lambda_list(agg_spec) + + return mangled_aggspec + + def _recast_datetimelike_result(result: DataFrame) -> DataFrame: """ If we have date/time like in the original, then coerce dates diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 801b99fed5ce6..ea59cde54f17b 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ 
b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -10,6 +10,7 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, compat, concat from pandas.core.base import SpecificationError +from pandas.core.groupby.generic import _maybe_mangle_lambdas from pandas.core.groupby.grouper import Grouping import pandas.util.testing as tm @@ -210,15 +211,6 @@ def test_multiple_functions_tuples_and_non_tuples(df): tm.assert_frame_equal(result, expected) -def test_agg_multiple_functions_too_many_lambdas(df): - grouped = df.groupby('A') - funcs = ['mean', lambda x: x.mean(), lambda x: x.std()] - - msg = 'Function names must be unique, found multiple named <lambda>' - with pytest.raises(SpecificationError, match=msg): - grouped.agg(funcs) - - def test_more_flexible_frame_multi_function(df): grouped = df.groupby('A') @@ -362,6 +354,12 @@ def test_series_named_agg_duplicates_raises(self): with pytest.raises(SpecificationError): gr.agg(a='sum', b='sum') + def test_mangled(self): + gr = pd.Series([1, 2, 3]).groupby([0, 0, 1]) + result = gr.agg(a=lambda x: 0, b=lambda x: 1) + expected = pd.DataFrame({'a': [0, 0], 'b': [1, 1]}) + tm.assert_frame_equal(result, expected) + class TestNamedAggregationDataFrame: def test_agg_relabel(self): @@ -458,3 +456,84 @@ def test_agg_namedtuple(self): expected = df.groupby("A").agg(b=("B", "sum"), c=("B", "count")) tm.assert_frame_equal(result, expected) + + def test_mangled(self): + df = pd.DataFrame({"A": [0, 1], "B": [1, 2], "C": [3, 4]}) + result = df.groupby("A").agg( + b=("B", lambda x: 0), + c=("C", lambda x: 1) + ) + expected = pd.DataFrame({"b": [0, 0], "c": [1, 1]}, + index=pd.Index([0, 1], name='A')) + tm.assert_frame_equal(result, expected) + + +class TestLambdaMangling: + + def test_maybe_mangle_lambdas_passthrough(self): + assert _maybe_mangle_lambdas('mean') == 'mean' + assert _maybe_mangle_lambdas(lambda x: x).__name__ == '<lambda>' + # don't mangle single lambda. 
+ assert _maybe_mangle_lambdas([lambda x: x])[0].__name__ == '<lambda>' + + def test_maybe_mangle_lambdas_listlike(self): + aggfuncs = [lambda x: 1, lambda x: 2] + result = _maybe_mangle_lambdas(aggfuncs) + assert result[0].__name__ == '<lambda_0>' + assert result[1].__name__ == '<lambda_1>' + assert aggfuncs[0](None) == result[0](None) + assert aggfuncs[1](None) == result[1](None) + + def test_maybe_mangle_lambdas(self): + func = { + 'A': [lambda x: 0, lambda x: 1] + } + result = _maybe_mangle_lambdas(func) + assert result['A'][0].__name__ == '<lambda_0>' + assert result['A'][1].__name__ == '<lambda_1>' + + def test_maybe_mangle_lambdas_args(self): + func = { + 'A': [lambda x, a, b=1: (0, a, b), lambda x: 1] + } + result = _maybe_mangle_lambdas(func) + assert result['A'][0].__name__ == '<lambda_0>' + assert result['A'][1].__name__ == '<lambda_1>' + + assert func['A'][0](0, 1) == (0, 1, 1) + assert func['A'][0](0, 1, 2) == (0, 1, 2) + assert func['A'][0](0, 2, b=3) == (0, 2, 3) + + def test_maybe_mangle_lambdas_named(self): + func = OrderedDict([('C', np.mean), + ('D', OrderedDict([('foo', np.mean), + ('bar', np.mean)]))]) + result = _maybe_mangle_lambdas(func) + assert result == func + + def test_basic(self): + df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]}) + result = df.groupby("A").agg({"B": [lambda x: 0, lambda x: 1]}) + + expected = pd.DataFrame({("B", "<lambda_0>"): [0, 0], + ("B", "<lambda_1>"): [1, 1]}, + index=pd.Index([0, 1], name='A')) + tm.assert_frame_equal(result, expected) + + def test_mangle_series_groupby(self): + gr = pd.Series([1, 2, 3, 4]).groupby([0, 0, 1, 1]) + result = gr.agg([lambda x: 0, lambda x: 1]) + expected = pd.DataFrame({'<lambda_0>': [0, 0], '<lambda_1>': [1, 1]}) + tm.assert_frame_equal(result, expected) + + @pytest.mark.xfail(reason="GH-26611. 
kwargs for multi-agg.") + def test_with_kwargs(self): + f1 = lambda x, y, b=1: x.sum() + y + b + f2 = lambda x, y, b=2: x.sum() + y * b + result = pd.Series([1, 2]).groupby([0, 0]).agg([f1, f2], 0) + expected = pd.DataFrame({'<lambda_0>': [4], '<lambda_1>': [6]}) + tm.assert_frame_equal(result, expected) + + result = pd.Series([1, 2]).groupby([0, 0]).agg([f1, f2], 0, b=10) + expected = pd.DataFrame({'<lambda_0>': [13], '<lambda_1>': [30]}) + tm.assert_frame_equal(result, expected)