From b6222ec976a71b4ba0c643411606e2376e744c4d Mon Sep 17 00:00:00 2001
From: Marco Gorelli
Date: Tue, 14 Jul 2020 21:34:55 +0100
Subject: [PATCH] BUG: aggregations were getting overwritten if they had the same name (#30858)

* :bug: aggregations were getting overwritten if they had the same name
---
 doc/source/whatsnew/v1.1.0.rst                |  1 +
 pandas/core/groupby/generic.py                | 15 +++--
 .../tests/groupby/aggregate/test_aggregate.py | 58 +++++++++++++++++++
 3 files changed, 68 insertions(+), 6 deletions(-)

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index cfac916157649..3faca9c8868ca 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -1093,6 +1093,7 @@ Reshaping
 - Bug in :func:`crosstab` when inputs are two Series and have tuple names, the output will keep dummy MultiIndex as columns. (:issue:`18321`)
 - :meth:`DataFrame.pivot` can now take lists for ``index`` and ``columns`` arguments (:issue:`21425`)
 - Bug in :func:`concat` where the resulting indices are not copied when ``copy=True`` (:issue:`29879`)
+- Bug in :meth:`SeriesGroupBy.aggregate` was resulting in aggregations being overwritten when they shared the same name (:issue:`30880`)
 - Bug where :meth:`Index.astype` would lose the name attribute when converting from ``Float64Index`` to ``Int64Index``, or when casting to an ``ExtensionArray`` dtype (:issue:`32013`)
 - :meth:`Series.append` will now raise a ``TypeError`` when passed a DataFrame or a sequence containing Dataframe (:issue:`31413`)
 - :meth:`DataFrame.replace` and :meth:`Series.replace` will raise a ``TypeError`` if ``to_replace`` is not an expected type. Previously the ``replace`` would fail silently (:issue:`18634`)
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 093e1d4ab3942..94dc216c82f55 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -278,7 +278,7 @@ def aggregate(
         if isinstance(ret, dict):
             from pandas import concat
 
-            ret = concat(ret, axis=1)
+            ret = concat(ret.values(), axis=1, keys=[key.label for key in ret.keys()])
         return ret
 
     agg = aggregate
@@ -307,8 +307,8 @@ def _aggregate_multiple_funcs(self, arg):
 
             arg = zip(columns, arg)
 
-        results = {}
-        for name, func in arg:
+        results: Dict[base.OutputKey, Union[Series, DataFrame]] = {}
+        for idx, (name, func) in enumerate(arg):
             obj = self
 
             # reset the cache so that we
@@ -317,13 +317,14 @@ def _aggregate_multiple_funcs(self, arg):
                 obj = copy.copy(obj)
                 obj._reset_cache()
                 obj._selection = name
-            results[name] = obj.aggregate(func)
+            results[base.OutputKey(label=name, position=idx)] = obj.aggregate(func)
 
         if any(isinstance(x, DataFrame) for x in results.values()):
             # let higher level handle
             return results
 
-        return self.obj._constructor_expanddim(results, columns=columns)
+        output = self._wrap_aggregated_output(results)
+        return self.obj._constructor_expanddim(output, columns=columns)
 
     def _wrap_series_output(
         self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]], index: Index,
@@ -354,10 +355,12 @@ def _wrap_series_output(
         if len(output) > 1:
             result = self.obj._constructor_expanddim(indexed_output, index=index)
             result.columns = columns
-        else:
+        elif not columns.empty:
             result = self.obj._constructor(
                 indexed_output[0], index=index, name=columns[0]
            )
+        else:
+            result = self.obj._constructor_expanddim()
 
         return result
 
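The substance of the generic.py change above is the dict key used in _aggregate_multiple_funcs: intermediate results are now stored under base.OutputKey (resolved label plus position) rather than under the bare name, so two aggregations whose callables resolve to the same name no longer overwrite each other, and the callers recover the display labels from key.label when assembling the output. Below is a minimal sketch of that keying idea only, assuming nothing about pandas internals: the local OutputKey namedtuple and the sample aggregations are illustrative stand-ins, not the pandas classes themselves.

from collections import namedtuple
from functools import partial

import numpy as np

# Stand-in for pandas.core.groupby.base.OutputKey (assumed to carry a label and a position).
OutputKey = namedtuple("OutputKey", ["label", "position"])

# Two different aggregations that both resolve to the display name "quantile".
named = [
    ("quantile", partial(np.quantile, q=0.9999)),
    ("quantile", partial(np.quantile, q=0.1111)),
]

by_name = {name: func for name, func in named}
assert len(by_name) == 1  # the bug: the second "quantile" silently overwrites the first

by_key = {
    OutputKey(label=name, position=idx): func
    for idx, (name, func) in enumerate(named)
}
assert len(by_key) == 2  # the fix: the position field keeps duplicate labels distinct
assert [key.label for key in by_key] == ["quantile", "quantile"]  # labels recoverable for columns
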
diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py
index dbd713a0af4cf..bf465635c0085 100644
--- a/pandas/tests/groupby/aggregate/test_aggregate.py
+++ b/pandas/tests/groupby/aggregate/test_aggregate.py
@@ -2,10 +2,13 @@
 test .agg behavior / note that .apply is tested generally in test_groupby.py
 """
 import functools
+from functools import partial
 
 import numpy as np
 import pytest
 
+from pandas.errors import PerformanceWarning
+
 from pandas.core.dtypes.common import is_integer_dtype
 
 import pandas as pd
@@ -252,6 +255,61 @@ def test_agg_multiple_functions_maintain_order(df):
     tm.assert_index_equal(result.columns, exp_cols)
 
 
+def test_agg_multiple_functions_same_name():
+    # GH 30880
+    df = pd.DataFrame(
+        np.random.randn(1000, 3),
+        index=pd.date_range("1/1/2012", freq="S", periods=1000),
+        columns=["A", "B", "C"],
+    )
+    result = df.resample("3T").agg(
+        {"A": [partial(np.quantile, q=0.9999), partial(np.quantile, q=0.1111)]}
+    )
+    expected_index = pd.date_range("1/1/2012", freq="3T", periods=6)
+    expected_columns = MultiIndex.from_tuples([("A", "quantile"), ("A", "quantile")])
+    expected_values = np.array(
+        [df.resample("3T").A.quantile(q=q).values for q in [0.9999, 0.1111]]
+    ).T
+    expected = pd.DataFrame(
+        expected_values, columns=expected_columns, index=expected_index
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_agg_multiple_functions_same_name_with_ohlc_present():
+    # GH 30880
+    # ohlc expands dimensions, so different test to the above is required.
+    df = pd.DataFrame(
+        np.random.randn(1000, 3),
+        index=pd.date_range("1/1/2012", freq="S", periods=1000),
+        columns=["A", "B", "C"],
+    )
+    result = df.resample("3T").agg(
+        {"A": ["ohlc", partial(np.quantile, q=0.9999), partial(np.quantile, q=0.1111)]}
+    )
+    expected_index = pd.date_range("1/1/2012", freq="3T", periods=6)
+    expected_columns = pd.MultiIndex.from_tuples(
+        [
+            ("A", "ohlc", "open"),
+            ("A", "ohlc", "high"),
+            ("A", "ohlc", "low"),
+            ("A", "ohlc", "close"),
+            ("A", "quantile", "A"),
+            ("A", "quantile", "A"),
+        ]
+    )
+    non_ohlc_expected_values = np.array(
+        [df.resample("3T").A.quantile(q=q).values for q in [0.9999, 0.1111]]
+    ).T
+    expected_values = np.hstack([df.resample("3T").A.ohlc(), non_ohlc_expected_values])
+    expected = pd.DataFrame(
+        expected_values, columns=expected_columns, index=expected_index
+    )
+    # PerformanceWarning is thrown by `assert col in right` in assert_frame_equal
+    with tm.assert_produces_warning(PerformanceWarning):
+        tm.assert_frame_equal(result, expected)
+
+
 def test_multiple_functions_tuples_and_non_tuples(df):
     # #1359
     funcs = [("foo", "mean"), "std"]
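For context, a rough end-to-end reproduction of GH 30880 that mirrors the first new test above; this snippet is not part of the patch, and the asserted shape and columns simply restate the expected frame that test constructs.

from functools import partial

import numpy as np
import pandas as pd

df = pd.DataFrame(
    np.random.randn(1000, 3),
    index=pd.date_range("1/1/2012", freq="S", periods=1000),
    columns=["A", "B", "C"],
)
# Both partials resolve to the display name "quantile".
result = df.resample("3T").agg(
    {"A": [partial(np.quantile, q=0.9999), partial(np.quantile, q=0.1111)]}
)
# 1000 seconds resampled into 3-minute bins gives 6 rows; with the fix both
# aggregations are kept, yielding two ("A", "quantile") columns.
assert result.shape == (6, 2)
assert list(result.columns) == [("A", "quantile"), ("A", "quantile")]

Before this fix, the second aggregation overwrote the first in the intermediate results dict (the behaviour described in the whatsnew entry), so only a single quantile column survived.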