Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow multiple lambdas in Groupby.aggregate #26905

Merged
merged 12 commits into from
Jun 27, 2019
23 changes: 23 additions & 0 deletions doc/source/user_guide/groupby.rst
Original file line number Diff line number Diff line change
Expand Up @@ -568,6 +568,29 @@ For a grouped ``DataFrame``, you can rename in a similar manner:
'mean': 'bar',
'std': 'baz'}))

.. note::

In general, the output column names should be unique. You can't apply
the same function (or two functions with the same name) to the same
column.

.. ipython:: python
:okexcept:

grouped['C'].agg(['sum', 'sum'])


Pandas *does* allow you to provide multiple lambdas. In this case, pandas
will mangle the name of the (nameless) lambda functions, appending ``_<i>``
to each subsequent lambda.

.. ipython:: python

grouped['C'].agg([lambda x: x.max() - x.min(),
lambda x: x.median() - x.mean()])



.. _groupby.aggregate.named:

Named Aggregation
Expand Down
20 changes: 20 additions & 0 deletions doc/source/whatsnew/v0.25.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,26 @@ a dict to a Series groupby aggregation (:ref:`whatsnew_0200.api_breaking.depreca

See :ref:`groupby.aggregate.named` for more.

.. _whatsnew_0250.enhancements.multiple_lambdas:

Groupby Aggregation with multiple lambdas
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

You can now provide multiple lambda functions to a list-like aggregation in
:meth:`pandas.core.groupby.GroupBy.agg` (:issue:`26430`).

.. ipython:: python

animals.groupby('kind').height.agg([
lambda x: x.iloc[0], lambda x: x.iloc[-1]
])

animals.groupby('kind').agg([
lambda x: x.iloc[0] - x.iloc[1],
lambda x: x.iloc[0] + x.iloc[1]
])

Previously, these raised a ``SpecificationError``.

.. _whatsnew_0250.enhancements.multi_index_repr:

Expand Down
105 changes: 100 additions & 5 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,13 @@
These are user facing as the result of the ``df.groupby(...)`` operations,
which here returns a DataFrameGroupBy object.
"""

from collections import OrderedDict, abc, namedtuple
import copy
import functools
from functools import partial
from textwrap import dedent
import typing
from typing import Any, Callable, FrozenSet, Iterator, List, Type, Union
from typing import Any, Callable, FrozenSet, Iterator, Type, Union
import warnings

import numpy as np
Expand All @@ -23,8 +23,9 @@

from pandas.core.dtypes.cast import maybe_downcast_to_dtype
from pandas.core.dtypes.common import (
ensure_int64, ensure_platform_int, is_bool, is_datetimelike,
is_integer_dtype, is_interval_dtype, is_numeric_dtype, is_scalar)
ensure_int64, ensure_platform_int, is_bool, is_datetimelike, is_dict_like,
is_integer_dtype, is_interval_dtype, is_list_like, is_numeric_dtype,
is_scalar)
from pandas.core.dtypes.missing import isna, notna

from pandas._typing import FrameOrSeries
Expand All @@ -47,6 +48,10 @@
NamedAgg = namedtuple("NamedAgg", ["column", "aggfunc"])
# TODO(typing) the return value on this callable should be any *scalar*.
AggScalar = Union[str, Callable[..., Any]]
# TODO: validate types on ScalarResult and move to _typing
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you make an issue for this and point to this code after we merge

# Blocked from using by https://github.com/python/mypy/issues/1484
# See note at _managle_lambda_list
ScalarResult = typing.TypeVar("ScalarResult")


def whitelist_method_generator(base_class: Type[GroupBy],
Expand Down Expand Up @@ -208,6 +213,8 @@ def aggregate(self, func, *args, **kwargs):
raise TypeError("Must provide 'func' or tuples of "
"'(column, aggfunc).")

func = _maybe_mangle_lambdas(func)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIUC, the one on L810 is SeriesGroupBy.aggregate. I think it's entirely separate from NDFrameGroupBy.aggregate.


result, how = self._aggregate(func, _level=_level, *args, **kwargs)
if how is None:
return result
Expand Down Expand Up @@ -830,6 +837,7 @@ def aggregate(self, func_or_funcs=None, *args, **kwargs):
if isinstance(func_or_funcs, abc.Iterable):
# Catch instances of lists / tuples
# but not the class list / tuple itself.
func_or_funcs = _maybe_mangle_lambdas(func_or_funcs)
ret = self._aggregate_multiple_funcs(func_or_funcs,
(_level or 0) + 1)
if relabeling:
Expand Down Expand Up @@ -1698,7 +1706,10 @@ def _normalize_keyword_aggregation(kwargs):
# process normally, then fixup the names.
# TODO(Py35): When we drop python 3.5, change this to
# defaultdict(list)
aggspec = OrderedDict() # type: typing.OrderedDict[str, List[AggScalar]]
# TODO: aggspec type: typing.OrderedDict[str, List[AggScalar]]
# May be hitting https://github.com/python/mypy/issues/5958
# saying it doesn't have an attribute __name__
aggspec = OrderedDict()
order = []
columns, pairs = list(zip(*kwargs.items()))

Expand All @@ -1710,3 +1721,87 @@ def _normalize_keyword_aggregation(kwargs):
order.append((column,
com.get_callable_name(aggfunc) or aggfunc))
return aggspec, columns, order


# TODO: Can't add the type annotation below, because mypy doesn't like us
# setting __name__ on a functools.partial:
# error: "partial[Any]" has no attribute "__name__"
# The intended signature is:
# typing.Sequence[Callable[..., ScalarResult]]
# -> typing.Sequence[Callable[..., ScalarResult]]:

def _managle_lambda_list(aggfuncs):
    """
    Possibly mangle a list of aggfuncs.

    Parameters
    ----------
    aggfuncs : Sequence or other iterable
        The aggregation functions (callables or function names) to inspect.
        Iterables that do not support ``len`` (e.g. generators) are
        materialized into a list before being inspected.

    Returns
    -------
    mangled : list-like
        A new AggSpec sequence, where lambdas have been converted
        to have unique names.

    Notes
    -----
    If just one aggfunc is passed, the name will not be mangled.
    """
    try:
        n_funcs = len(aggfuncs)
    except TypeError:
        # One-shot iterables have no length; consume them exactly once so
        # they can be both counted and iterated over below.
        aggfuncs = list(aggfuncs)
        n_funcs = len(aggfuncs)

    if n_funcs <= 1:
        # don't mangle for .agg([lambda x: .])
        return aggfuncs

    i = 0
    mangled_aggfuncs = []
    for aggfunc in aggfuncs:
        if com.get_callable_name(aggfunc) == "<lambda>":
            # Wrap in a partial so the new name is set on a fresh object
            # rather than on the lambda the caller passed in.
            aggfunc = functools.partial(aggfunc)
            aggfunc.__name__ = '<lambda_{}>'.format(i)
            i += 1
        mangled_aggfuncs.append(aggfunc)

    return mangled_aggfuncs


def _maybe_mangle_lambdas(agg_spec):
    """
    Make new lambdas with unique names.

    Parameters
    ----------
    agg_spec : Any
        An argument to NDFrameGroupBy.agg.
        An `agg_spec` that is neither dict-like nor list-like is passed
        through as is.
        For a list-like `agg_spec` the lambdas in it are name-mangled.
        For a dict-like `agg_spec` a new spec is returned in which every
        list-like (and not dict-like) value has name-mangled lambdas;
        all other values are kept unchanged.

    Returns
    -------
    mangled : Any
        The input itself when it is passed through. For a dict-like
        input, a new container of the same type as the input. For a
        list-like input, the result of mangling that list.

    Examples
    --------
    >>> _maybe_mangle_lambdas('sum')
    'sum'

    >>> _maybe_mangle_lambdas([lambda: 1, lambda: 2])  # doctest: +SKIP
    [functools.partial(<function <lambda> at 0x...>),
     functools.partial(<function <lambda> at 0x...>)]
    """
    is_dict = is_dict_like(agg_spec)
    if not (is_dict or is_list_like(agg_spec)):
        return agg_spec

    if not is_dict:
        return _managle_lambda_list(agg_spec)

    # Build the empty container only for dict-likes: calling the type of an
    # arbitrary list-like with no arguments is not guaranteed to work, and
    # its result would be discarded anyway.
    mangled_aggspec = type(agg_spec)()  # dict or OrderedDict
    for key, aggfuncs in agg_spec.items():
        if is_list_like(aggfuncs) and not is_dict_like(aggfuncs):
            mangled_aggfuncs = _managle_lambda_list(aggfuncs)
        else:
            mangled_aggfuncs = aggfuncs

        mangled_aggspec[key] = mangled_aggfuncs

    return mangled_aggspec
97 changes: 88 additions & 9 deletions pandas/tests/groupby/aggregate/test_aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import pandas as pd
from pandas import DataFrame, Index, MultiIndex, Series, compat, concat
from pandas.core.base import SpecificationError
from pandas.core.groupby.generic import _maybe_mangle_lambdas
from pandas.core.groupby.grouper import Grouping
import pandas.util.testing as tm

Expand Down Expand Up @@ -210,15 +211,6 @@ def test_multiple_functions_tuples_and_non_tuples(df):
tm.assert_frame_equal(result, expected)


def test_agg_multiple_functions_too_many_lambdas(df):
grouped = df.groupby('A')
funcs = ['mean', lambda x: x.mean(), lambda x: x.std()]

msg = 'Function names must be unique, found multiple named <lambda>'
with pytest.raises(SpecificationError, match=msg):
grouped.agg(funcs)


def test_more_flexible_frame_multi_function(df):
grouped = df.groupby('A')

Expand Down Expand Up @@ -362,6 +354,12 @@ def test_series_named_agg_duplicates_raises(self):
with pytest.raises(SpecificationError):
gr.agg(a='sum', b='sum')

def test_mangled(self):
    """Named aggregation on a Series accepts one lambda per output name."""
    grouped = pd.Series([1, 2, 3]).groupby([0, 0, 1])
    expected = pd.DataFrame({'a': [0, 0], 'b': [1, 1]})

    result = grouped.agg(a=lambda x: 0, b=lambda x: 1)

    tm.assert_frame_equal(result, expected)


class TestNamedAggregationDataFrame:
def test_agg_relabel(self):
Expand Down Expand Up @@ -458,3 +456,84 @@ def test_agg_namedtuple(self):
expected = df.groupby("A").agg(b=("B", "sum"),
c=("B", "count"))
tm.assert_frame_equal(result, expected)

def test_mangled(self):
    """Named aggregation on a frame accepts one lambda per output column."""
    frame = pd.DataFrame({"A": [0, 1], "B": [1, 2], "C": [3, 4]})
    grouped = frame.groupby("A")

    result = grouped.agg(b=("B", lambda x: 0), c=("C", lambda x: 1))

    group_index = pd.Index([0, 1], name='A')
    expected = pd.DataFrame({"b": [0, 0], "c": [1, 1]}, index=group_index)
    tm.assert_frame_equal(result, expected)


class TestLambdaMangling:
    """Tests for the lambda name-mangling done by ``_maybe_mangle_lambdas``."""

    def test_maybe_mangle_lambdas_passthrough(self):
        assert _maybe_mangle_lambdas('mean') == 'mean'
        assert _maybe_mangle_lambdas(lambda x: x).__name__ == '<lambda>'
        # don't mangle single lambda.
        assert _maybe_mangle_lambdas([lambda x: x])[0].__name__ == '<lambda>'

    def test_maybe_mangle_lambdas_listlike(self):
        aggfuncs = [lambda x: 1, lambda x: 2]
        result = _maybe_mangle_lambdas(aggfuncs)
        assert result[0].__name__ == '<lambda_0>'
        assert result[1].__name__ == '<lambda_1>'
        assert aggfuncs[0](None) == result[0](None)
        assert aggfuncs[1](None) == result[1](None)

    def test_maybe_mangle_lambdas(self):
        func = {
            'A': [lambda x: 0, lambda x: 1]
        }
        result = _maybe_mangle_lambdas(func)
        assert result['A'][0].__name__ == '<lambda_0>'
        assert result['A'][1].__name__ == '<lambda_1>'

    def test_maybe_mangle_lambdas_args(self):
        func = {
            'A': [lambda x, a, b=1: (0, a, b), lambda x: 1]
        }
        result = _maybe_mangle_lambdas(func)
        assert result['A'][0].__name__ == '<lambda_0>'
        assert result['A'][1].__name__ == '<lambda_1>'

        # Call the *mangled* function: the point of this test is that
        # positional and keyword arguments are forwarded by the wrapper.
        assert result['A'][0](0, 1) == (0, 1, 1)
        assert result['A'][0](0, 1, 2) == (0, 1, 2)
        assert result['A'][0](0, 2, b=3) == (0, 2, 3)

    def test_maybe_mangle_lambdas_named(self):
        func = OrderedDict([('C', np.mean),
                            ('D', OrderedDict([('foo', np.mean),
                                               ('bar', np.mean)]))])
        result = _maybe_mangle_lambdas(func)
        assert result == func

    def test_basic(self):
        df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]})
        result = df.groupby("A").agg({"B": [lambda x: 0, lambda x: 1]})

        expected = pd.DataFrame({("B", "<lambda_0>"): [0, 0],
                                 ("B", "<lambda_1>"): [1, 1]},
                                index=pd.Index([0, 1], name='A'))
        tm.assert_frame_equal(result, expected)

    def test_mangle_series_groupby(self):
        gr = pd.Series([1, 2, 3, 4]).groupby([0, 0, 1, 1])
        result = gr.agg([lambda x: 0, lambda x: 1])
        expected = pd.DataFrame({'<lambda_0>': [0, 0], '<lambda_1>': [1, 1]})
        tm.assert_frame_equal(result, expected)

    @pytest.mark.xfail(reason="GH-26611. kwargs for multi-agg.")
    def test_with_kwargs(self):
        f1 = lambda x, y, b=1: x.sum() + y + b
        f2 = lambda x, y, b=2: x.sum() + y * b
        result = pd.Series([1, 2]).groupby([0, 0]).agg([f1, f2], 0)
        expected = pd.DataFrame({'<lambda_0>': [4], '<lambda_1>': [6]})
        tm.assert_frame_equal(result, expected)

        result = pd.Series([1, 2]).groupby([0, 0]).agg([f1, f2], 0, b=10)
        expected = pd.DataFrame({'<lambda_0>': [13], '<lambda_1>': [30]})
        tm.assert_frame_equal(result, expected)