Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow multiple lambdas in Groupby.aggregate #26905

Merged
merged 12 commits into from
Jun 27, 2019
23 changes: 23 additions & 0 deletions doc/source/user_guide/groupby.rst
Original file line number Diff line number Diff line change
Expand Up @@ -568,6 +568,29 @@ For a grouped ``DataFrame``, you can rename in a similar manner:
'mean': 'bar',
'std': 'baz'}))

.. note::

In general, the output column names should be unique. You can't apply
the same function (or two functions with the same name) to the same
column.

.. ipython:: python
:okexcept:

grouped['C'].agg(['sum', 'sum'])


Pandas *does* allow you to provide multiple lambdas. In this case, pandas
will mangle the name of the (nameless) lambda functions, appending ``_<i>``
to each subsequent lambda.

.. ipython:: python

grouped['C'].agg([lambda x: x.max() - x.min(),
lambda x: x.median() - x.mean()])



.. _groupby.aggregate.named:

Named Aggregation
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.25.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ Other Enhancements
- :class:`datetime.timezone` objects are now supported as arguments to timezone methods and constructors (:issue:`25065`)
- :meth:`DataFrame.query` and :meth:`DataFrame.eval` now supports quoting column names with backticks to refer to names with spaces (:issue:`6508`)
- :func:`merge_asof` now gives a more clear error message when merge keys are categoricals that are not equal (:issue:`26136`)
- Added support for multiple lambdas in the same aggregation for :meth:`GroupBy.aggregate` (:issue:`26430`).
TomAugspurger marked this conversation as resolved.
Show resolved Hide resolved
TomAugspurger marked this conversation as resolved.
Show resolved Hide resolved
- :meth:`pandas.core.window.Rolling` supports exponential (or Poisson) window type (:issue:`21303`)
- Error message for missing required imports now includes the original import error's text (:issue:`23868`)
- :class:`DatetimeIndex` and :class:`TimedeltaIndex` now have a ``mean`` method (:issue:`24757`)
Expand Down
70 changes: 70 additions & 0 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from pandas.core.dtypes.common import (
ensure_int64, ensure_platform_int, is_bool, is_datetimelike,
is_integer_dtype, is_interval_dtype, is_numeric_dtype, is_scalar)
from pandas.core.dtypes.inference import is_dict_like, is_list_like
TomAugspurger marked this conversation as resolved.
Show resolved Hide resolved
from pandas.core.dtypes.missing import isna, notna

from pandas._typing import FrameOrSeries
Expand Down Expand Up @@ -208,6 +209,8 @@ def aggregate(self, func, *args, **kwargs):
raise TypeError("Must provide 'func' or tuples of "
"'(column, aggfunc).")

func = _maybe_mangle_lambdas(func)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIUC, the one on L810 is SeriesGroupBy.aggregate. I think it's entirely separate from NDFrameGroupBy.aggregate.


result, how = self._aggregate(func, _level=_level, *args, **kwargs)
if how is None:
return result
Expand Down Expand Up @@ -830,6 +833,7 @@ def aggregate(self, func_or_funcs=None, *args, **kwargs):
if isinstance(func_or_funcs, abc.Iterable):
# Catch instances of lists / tuples
# but not the class list / tuple itself.
func_or_funcs = _maybe_mangle_lambdas(func_or_funcs)
ret = self._aggregate_multiple_funcs(func_or_funcs,
(_level or 0) + 1)
if relabeling:
Expand Down Expand Up @@ -1710,3 +1714,69 @@ def _normalize_keyword_aggregation(kwargs):
order.append((column,
com.get_callable_name(aggfunc) or aggfunc))
return aggspec, columns, order


def _make_lambda(func, i):
    """
    Wrap ``func`` in a new function whose name is ``"<lambda_{i}>"``.

    Parameters
    ----------
    func : callable
        The callable to wrap. It is not modified.
    i : object
        Value formatted into the wrapper's ``__name__``.

    Returns
    -------
    callable
        A new function that forwards all positional and keyword
        arguments to ``func`` and returns its result.
    """
    def f(*args, **kwargs):
        return func(*args, **kwargs)

    # Only the wrapper is renamed; ``func.__name__`` is left untouched.
    f.__name__ = "<lambda_{}>".format(i)
    return f


def _managle_lambda_list(aggfuncs):
    """
    Return a list of ``aggfuncs`` in which repeated lambdas get unique names.

    The first element whose callable name is ``"<lambda>"`` is kept as is.
    Each later one is replaced by a wrapper from ``_make_lambda`` named
    ``"<lambda_1>"``, ``"<lambda_2>"``, ... in order of appearance.
    Every other element is copied over unchanged.

    Parameters
    ----------
    aggfuncs : iterable
        The aggregation functions (or other specs, e.g. strings) to inspect.

    Returns
    -------
    list
        A new list with the same length and order as ``aggfuncs``.
    """
    i = 0  # number of lambdas seen so far
    aggfuncs2 = []
    for aggfunc in aggfuncs:
        if com.get_callable_name(aggfunc) == "<lambda>":
            if i > 0:
                # Only the second and later lambdas need a new name.
                aggfunc = _make_lambda(aggfunc, i)
            i += 1
        aggfuncs2.append(aggfunc)

    return aggfuncs2


def _maybe_mangle_lambdas(agg_spec):
    """
    Make new lambdas with unique names.

    Parameters
    ----------
    agg_spec : Any
        An argument to NDFrameGroupBy.agg.
        Objects that are neither dict-like nor list-like are passed
        through as is.
        For list-like `agg_spec` a new list is returned with
        name-mangled lambdas.
        For dict-like `agg_spec` a new spec is returned
        with name-mangled lambdas.

    Returns
    -------
    mangled : Any
        The input itself when it is neither dict-like nor list-like,
        a new list for list-like input, and a new object of the same
        type as the input for dict-like input.

    Examples
    --------
    >>> _maybe_mangle_lambdas('sum')
    'sum'

    >>> _maybe_mangle_lambdas([lambda: 1, lambda: 2])  # doctest: +SKIP
    [<function __main__.<lambda>()>,
     <function pandas...._make_lambda.<locals>.f(*args, **kwargs)>]
    """
    is_dict = is_dict_like(agg_spec)
    if not (is_dict or is_list_like(agg_spec)):
        return agg_spec

    if not is_dict:
        # Plain list-likes are mangled directly. No empty container of the
        # input's type is built here, since not every list-like type can be
        # instantiated without arguments (a generator, for instance).
        return _managle_lambda_list(agg_spec)

    agg_spec2 = type(agg_spec)()  # dict or OrderedDict
    for key in agg_spec:
        aggfuncs = agg_spec[key]
        if is_list_like(aggfuncs) and not is_dict_like(aggfuncs):
            aggfuncs2 = _managle_lambda_list(aggfuncs)
        else:
            # Scalars and nested dict-likes are kept untouched.
            aggfuncs2 = aggfuncs

        # Fall back to the original value when the mangled one is falsy
        # (e.g. an empty list).
        agg_spec2[key] = aggfuncs2 or aggfuncs

    return agg_spec2
81 changes: 72 additions & 9 deletions pandas/tests/groupby/aggregate/test_aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import pandas as pd
from pandas import DataFrame, Index, MultiIndex, Series, compat, concat
from pandas.core.base import SpecificationError
from pandas.core.groupby.generic import _maybe_mangle_lambdas
from pandas.core.groupby.grouper import Grouping
import pandas.util.testing as tm

Expand Down Expand Up @@ -210,15 +211,6 @@ def test_multiple_functions_tuples_and_non_tuples(df):
tm.assert_frame_equal(result, expected)


def test_agg_multiple_functions_too_many_lambdas(df):
    # Aggregating with two anonymous functions is expected to be rejected
    # with a SpecificationError about duplicate function names.
    # NOTE(review): this looks like the pre-mangling behavior; with lambda
    # name mangling in place this expectation presumably no longer holds
    # -- confirm whether this test should still exist.
    aggfuncs = ['mean', lambda x: x.mean(), lambda x: x.std()]
    by_a = df.groupby('A')

    expected_msg = ('Function names must be unique, '
                    'found multiple named <lambda>')
    with pytest.raises(SpecificationError, match=expected_msg):
        by_a.agg(aggfuncs)


def test_more_flexible_frame_multi_function(df):
grouped = df.groupby('A')

Expand Down Expand Up @@ -458,3 +450,74 @@ def test_agg_namedtuple(self):
expected = df.groupby("A").agg(b=("B", "sum"),
c=("B", "count"))
tm.assert_frame_equal(result, expected)


class TestLambdaMangling:
    """Tests for lambda name mangling in groupby aggregation specs."""

    def test_maybe_mangle_lambdas_passthrough(self):
        # Scalars, a bare lambda and a single-lambda list keep their names.
        assert _maybe_mangle_lambdas('mean') == 'mean'
        assert _maybe_mangle_lambdas(lambda x: x).__name__ == '<lambda>'
        assert [x.__name__ for x in _maybe_mangle_lambdas([lambda x: x])
                ] == ['<lambda>']

    def test_maybe_mangle_lambdas_listlike(self):
        # In a plain list only the second lambda is renamed, and both
        # results still compute the same values as the originals.
        aggfuncs = [lambda x: 1, lambda x: 2]
        result = _maybe_mangle_lambdas(aggfuncs)
        assert result[0].__name__ == '<lambda>'
        assert result[1].__name__ == '<lambda_1>'
        assert aggfuncs[0](None) == result[0](None)
        assert aggfuncs[1](None) == result[1](None)

    def test_maybe_mangle_lambdas(self):
        # Lists stored as dict values are mangled per key.
        func = {
            'A': [lambda x: 0, lambda x: 1]
        }
        result = _maybe_mangle_lambdas(func)
        assert result['A'][0].__name__ == '<lambda>'
        assert result['A'][1].__name__ == '<lambda_1>'

    def test_maybe_mangle_lambdas_args(self):
        # The argument-taking lambda is placed second so that it is the one
        # that gets wrapped; calling it through ``result`` checks that
        # positional and keyword arguments are forwarded by the wrapper.
        func = {
            'A': [lambda x: 1, lambda x, a, b=1: (0, a, b)]
        }
        result = _maybe_mangle_lambdas(func)
        assert result['A'][0].__name__ == '<lambda>'
        assert result['A'][1].__name__ == '<lambda_1>'

        wrapped = result['A'][1]
        assert wrapped(0, 1) == (0, 1, 1)
        assert wrapped(0, 1, 2) == (0, 1, 2)
        assert wrapped(0, 2, b=3) == (0, 2, 3)

    def test_maybe_mangle_lambdas_named(self):
        # Specs without list values (including nested dicts) compare equal
        # to the input after mangling.
        func = OrderedDict([('C', np.mean),
                            ('D', OrderedDict([('foo', np.mean),
                                               ('bar', np.mean)]))])
        result = _maybe_mangle_lambdas(func)
        assert result == func

    def test_basic(self):
        # End-to-end: DataFrame groupby with two lambdas on one column.
        df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]})
        result = df.groupby("A").agg({"B": [lambda x: 0, lambda x: 1]})

        expected = pd.DataFrame({("B", "<lambda>"): [0, 0],
                                 ("B", "<lambda_1>"): [1, 1]},
                                index=pd.Index([0, 1], name='A'))
        tm.assert_frame_equal(result, expected)

    def test_mangle_series_groupby(self):
        # End-to-end: Series groupby with a list of two lambdas.
        gr = pd.Series([1, 2, 3, 4]).groupby([0, 0, 1, 1])
        result = gr.agg([lambda x: 0, lambda x: 1])
        expected = pd.DataFrame({'<lambda>': [0, 0], '<lambda_1>': [1, 1]})
        tm.assert_frame_equal(result, expected)

    @pytest.mark.xfail(reason="GH-26611. kwargs for multi-agg.")
    def test_with_kwargs(self):
        f1 = lambda x, y, b=1: x.sum() + y + b
        f2 = lambda x, y, b=2: x.sum() + y * b
        result = pd.Series([1, 2]).groupby([0, 0]).agg([f1, f2], 0)
        expected = pd.DataFrame({'<lambda>': [4], '<lambda_1>': [6]})
        tm.assert_frame_equal(result, expected)

        result = pd.Series([1, 2]).groupby([0, 0]).agg([f1, f2], 0, b=10)
        expected = pd.DataFrame({'<lambda>': [13], '<lambda_1>': [30]})
        tm.assert_frame_equal(result, expected)