From 1c4dacb4464fa0139216130b1835e5f4d4b73342 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 13 Apr 2017 10:18:04 +0000 Subject: [PATCH] DEPR: deprecate relableling dicts in groupby.agg (#15931) * DEPR: deprecate relabling dictionarys in groupby.agg --- doc/source/computation.rst | 8 -- doc/source/groupby.rst | 32 ++++-- doc/source/timeseries.rst | 8 -- doc/source/whatsnew/v0.20.0.txt | 82 +++++++++++++ pandas/core/base.py | 152 +++++++++++++++++++++---- pandas/core/groupby.py | 52 +++++---- pandas/tests/groupby/test_aggregate.py | 83 +++++++++++--- pandas/tests/groupby/test_groupby.py | 14 ++- pandas/tests/groupby/test_whitelist.py | 2 +- pandas/tests/test_window.py | 22 ++-- pandas/tests/tseries/test_resample.py | 67 ++++++----- pandas/types/cast.py | 17 +++ 12 files changed, 418 insertions(+), 121 deletions(-) diff --git a/doc/source/computation.rst b/doc/source/computation.rst index a37cbc96b2d8c..f46a00826a8d9 100644 --- a/doc/source/computation.rst +++ b/doc/source/computation.rst @@ -610,14 +610,6 @@ aggregation with, outputting a DataFrame: r['A'].agg([np.sum, np.mean, np.std]) -If a dict is passed, the keys will be used to name the columns. Otherwise the -function's name (stored in the function object) will be used. - -.. ipython:: python - - r['A'].agg({'result1' : np.sum, - 'result2' : np.mean}) - On a widowed DataFrame, you can pass a list of functions to apply to each column, which produces an aggregated result with a hierarchical index: diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index cbe3588104439..03ee5e0d67913 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -502,7 +502,7 @@ index are the group names and whose values are the sizes of each group. 
Applying multiple functions at once ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -With grouped Series you can also pass a list or dict of functions to do +With grouped ``Series`` you can also pass a list or dict of functions to do aggregation with, outputting a DataFrame: .. ipython:: python @@ -510,23 +510,35 @@ aggregation with, outputting a DataFrame: grouped = df.groupby('A') grouped['C'].agg([np.sum, np.mean, np.std]) -If a dict is passed, the keys will be used to name the columns. Otherwise the -function's name (stored in the function object) will be used. +On a grouped ``DataFrame``, you can pass a list of functions to apply to each +column, which produces an aggregated result with a hierarchical index: .. ipython:: python - grouped['D'].agg({'result1' : np.sum, - 'result2' : np.mean}) + grouped.agg([np.sum, np.mean, np.std]) -On a grouped DataFrame, you can pass a list of functions to apply to each -column, which produces an aggregated result with a hierarchical index: + +The resulting aggregations are named for the functions themselves. If you +need to rename, then you can add in a chained operation for a ``Series`` like this: .. ipython:: python - grouped.agg([np.sum, np.mean, np.std]) + (grouped['C'].agg([np.sum, np.mean, np.std]) + .rename(columns={'sum': 'foo', + 'mean': 'bar', + 'std': 'baz'}) + ) + +For a grouped ``DataFrame``, you can rename in a similar manner: + +.. ipython:: python + + (grouped.agg([np.sum, np.mean, np.std]) + .rename(columns={'sum': 'foo', + 'mean': 'bar', + 'std': 'baz'}) + ) -Passing a dict of functions has different behavior by default, see the next -section. 
Applying different functions to DataFrame columns ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index 61812684e7648..0a957772d785e 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -1549,14 +1549,6 @@ You can pass a list or dict of functions to do aggregation with, outputting a Da r['A'].agg([np.sum, np.mean, np.std]) -If a dict is passed, the keys will be used to name the columns. Otherwise the -function's name (stored in the function object) will be used. - -.. ipython:: python - - r['A'].agg({'result1' : np.sum, - 'result2' : np.mean}) - On a resampled DataFrame, you can pass a list of functions to apply to each column, which produces an aggregated result with a hierarchical index: diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index defabee3cef8c..c243e4ef81b38 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -456,6 +456,88 @@ Convert to an xarray DataArray p.to_xarray() +.. _whatsnew_0200.api_breaking.deprecate_group_agg_dict: + +Deprecate groupby.agg() with a dictionary when renaming +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The ``.groupby(..).agg(..)``, ``.rolling(..).agg(..)``, and ``.resample(..).agg(..)`` syntax can accept a variety of inputs, including scalars, +lists, and a dict of column names to scalars or lists. This provides a useful syntax for constructing multiple +(potentially different) aggregations. + +However, ``.agg(..)`` can *also* accept a dict that allows 'renaming' of the result columns. This is a complicated and confusing syntax, as well as not consistent +between ``Series`` and ``DataFrame``. We are deprecating this 'renaming' functionality. + +1) We are deprecating passing a dict to a grouped/rolled/resampled ``Series``. 
This allowed +one to ``rename`` the resulting aggregation, but this had a completely different +meaning than passing a dictionary to a grouped ``DataFrame``, which accepts column-to-aggregations. +2) We are deprecating passing a dict-of-dicts to a grouped/rolled/resampled ``DataFrame`` in a similar manner. + +This is an illustrative example: + +.. ipython:: python + + df = pd.DataFrame({'A': [1, 1, 1, 2, 2], + 'B': range(5), + 'C': range(5)}) + df + +Here is a typical useful syntax for computing different aggregations for different columns. This +is a natural (and useful) syntax. We aggregate from the dict-to-list by taking the specified +columns and applying the list of functions. This returns a ``MultiIndex`` for the columns. + +.. ipython:: python + + df.groupby('A').agg({'B': 'sum', 'C': 'min'}) + +Here's an example of the first deprecation (1), passing a dict to a grouped ``Series``. This +is a combination aggregation & renaming: + +.. code-block:: ipython + + In [6]: df.groupby('A').B.agg({'foo': 'count'}) + FutureWarning: using a dict on a Series for aggregation + is deprecated and will be removed in a future version + + Out[6]: + foo + A + 1 3 + 2 2 + +You can accomplish the same operation, more idiomatically by: + +.. ipython:: python + + df.groupby('A').B.agg(['count']).rename(columns={'count': 'foo'}) + + +Here's an example of the second deprecation (2), passing a dict-of-dict to a grouped ``DataFrame``: + +.. code-block:: python + + In [23]: (df.groupby('A') + .agg({'B': {'foo': 'sum'}, 'C': {'bar': 'min'}}) + ) + FutureWarning: using a dict with renaming is deprecated and will be removed in a future version + + Out[23]: + B C + foo bar + A + 1 3 0 + 2 7 3 + + +You can accomplish nearly the same by: + +.. ipython:: python + + (df.groupby('A') + .agg({'B': 'sum', 'C': 'min'}) + .rename(columns={'B': 'foo', 'C': 'bar'}) + ) + .. 
_whatsnew.api_breaking.io_compat: Possible incompat for HDF5 formats for pandas < 0.13.0 diff --git a/pandas/core/base.py b/pandas/core/base.py index bdbfb7b949986..6566ee38c1ade 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1,6 +1,7 @@ """ Base and utility classes for pandas objects. """ +import warnings from pandas import compat from pandas.compat import builtins import numpy as np @@ -290,7 +291,12 @@ class SelectionMixin(object): } @property - def name(self): + def _selection_name(self): + """ + return a name for myself; this would ideally be called + the 'name' property, but we cannot conflict with the + Series.name property which can be set + """ if self._selection is None: return None # 'result' else: @@ -405,6 +411,26 @@ def aggregate(self, func, *args, **kwargs): agg = aggregate + def _try_aggregate_string_function(self, arg, *args, **kwargs): + """ + if arg is a string, then try to operate on it: + - try to find a function on ourselves + - try to find a numpy function + - raise + + """ + assert isinstance(arg, compat.string_types) + + f = getattr(self, arg, None) + if f is not None: + return f(*args, **kwargs) + + f = getattr(np, arg, None) + if f is not None: + return f(self, *args, **kwargs) + + raise ValueError("{} is an unknown string function".format(arg)) + def _aggregate(self, arg, *args, **kwargs): """ provide an implementation for the aggregators @@ -424,18 +450,22 @@ def _aggregate(self, arg, *args, **kwargs): how can be a string describe the required post-processing, or None if not required """ - is_aggregator = lambda x: isinstance(x, (list, tuple, dict)) is_nested_renamer = False + _axis = kwargs.pop('_axis', None) + if _axis is None: + _axis = getattr(self, 'axis', 0) _level = kwargs.pop('_level', None) + if isinstance(arg, compat.string_types): - return getattr(self, arg)(*args, **kwargs), None + return self._try_aggregate_string_function(arg, *args, + **kwargs), None if isinstance(arg, dict): # aggregate based on the 
passed dict - if self.axis != 0: # pragma: no cover + if _axis != 0: # pragma: no cover raise ValueError('Can only pass dict with axis=0') obj = self._selected_obj @@ -454,7 +484,7 @@ def _aggregate(self, arg, *args, **kwargs): # the keys must be in the columns # for ndim=2, or renamers for ndim=1 - # ok + # ok for now, but deprecated # {'A': { 'ra': 'mean' }} # {'A': { 'ra': ['mean'] }} # {'ra': ['mean']} @@ -469,8 +499,28 @@ def _aggregate(self, arg, *args, **kwargs): 'for {0} with a nested ' 'dictionary'.format(k)) + # deprecation of nested renaming + # GH 15931 + warnings.warn( + ("using a dict with renaming " + "is deprecated and will be removed in a future " + "version"), + FutureWarning, stacklevel=4) + arg = new_arg + else: + # deprecation of renaming keys + # GH 15931 + keys = list(compat.iterkeys(arg)) + if (isinstance(obj, ABCDataFrame) and + len(obj.columns.intersection(keys)) != len(keys)): + warnings.warn( + ("using a dict with renaming " + "is deprecated and will be removed in a future " + "version"), + FutureWarning, stacklevel=4) + from pandas.tools.concat import concat def _agg_1dim(name, how, subset=None): @@ -534,7 +584,7 @@ def _agg(arg, func): agg_how: _agg_1dim(self._selection, agg_how)) # we are selecting the same set as we are aggregating - elif not len(sl - set(compat.iterkeys(arg))): + elif not len(sl - set(keys)): result = _agg(arg, _agg_1dim) @@ -555,32 +605,74 @@ def _agg(arg, func): result = _agg(arg, _agg_2dim) # combine results + + def is_any_series(): + # return a boolean if we have *any* nested series + return any([isinstance(r, ABCSeries) + for r in compat.itervalues(result)]) + + def is_any_frame(): + # return a boolean if we have *any* nested series + return any([isinstance(r, ABCDataFrame) + for r in compat.itervalues(result)]) + if isinstance(result, list): - result = concat(result, keys=keys, axis=1) - elif isinstance(list(compat.itervalues(result))[0], - ABCDataFrame): - result = concat([result[k] for k in keys], keys=keys, 
axis=1) - else: - from pandas import DataFrame + return concat(result, keys=keys, axis=1), True + + elif is_any_frame(): + # we have a dict of DataFrames + # return a MI DataFrame + + return concat([result[k] for k in keys], + keys=keys, axis=1), True + + elif isinstance(self, ABCSeries) and is_any_series(): + + # we have a dict of Series + # return a MI Series + try: + result = concat(result) + except TypeError: + # we want to give a nice error here if + # we have non-same sized objects, so + # we don't automatically broadcast + + raise ValueError("cannot perform both aggregation " + "and transformation operations " + "simultaneously") + + return result, True + + # fall thru + from pandas import DataFrame, Series + try: result = DataFrame(result) + except ValueError: + + # we have a dict of scalars + result = Series(result, + name=getattr(self, 'name', None)) return result, True - elif hasattr(arg, '__iter__'): - return self._aggregate_multiple_funcs(arg, _level=_level), None + elif is_list_like(arg) and arg not in compat.string_types: + # we require a list, but not an 'str' + return self._aggregate_multiple_funcs(arg, + _level=_level, + _axis=_axis), None else: result = None - cy_func = self._is_cython_func(arg) - if cy_func and not args and not kwargs: - return getattr(self, cy_func)(), None + f = self._is_cython_func(arg) + if f and not args and not kwargs: + return getattr(self, f)(), None # caller can react return result, True - def _aggregate_multiple_funcs(self, arg, _level): + def _aggregate_multiple_funcs(self, arg, _level, _axis): from pandas.tools.concat import concat - if self.axis != 0: + if _axis != 0: raise NotImplementedError("axis other than 0 is not supported") if self._selected_obj.ndim == 1: @@ -615,10 +707,30 @@ def _aggregate_multiple_funcs(self, arg, _level): keys.append(col) except (TypeError, DataError): pass + except ValueError: + # cannot aggregate + continue except SpecificationError: raise - return concat(results, keys=keys, axis=1) + 
# if we are empty + if not len(results): + raise ValueError("no results") + + try: + return concat(results, keys=keys, axis=1) + except TypeError: + + # we are concatting non-NDFrame objects, + # e.g. a list of scalars + + from pandas.types.cast import is_nested_object + from pandas import Series + result = Series(results, index=keys, name=self.name) + if is_nested_object(result): + raise ValueError("cannot combine transform and " + "aggregation operations") + return result def _shallow_copy(self, obj=None, obj_type=None, **kwargs): """ return a new object with the replacement attributes """ diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index add2987b8f452..5e55196803c22 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -722,7 +722,7 @@ def _python_apply_general(self, f): not_indexed_same=mutated or self.mutated) def _iterate_slices(self): - yield self.name, self._selected_obj + yield self._selection_name, self._selected_obj def transform(self, func, *args, **kwargs): raise AbstractMethodError(self) @@ -921,9 +921,9 @@ def reset_identity(values): result = concat(values, axis=self.axis) if (isinstance(result, Series) and - getattr(self, 'name', None) is not None): + getattr(self, '_selection_name', None) is not None): - result.name = self.name + result.name = self._selection_name return result @@ -1123,7 +1123,7 @@ def size(self): result = self.grouper.size() if isinstance(self.obj, Series): - result.name = getattr(self, 'name', None) + result.name = getattr(self.obj, 'name', None) return result @classmethod @@ -2736,7 +2736,7 @@ class SeriesGroupBy(GroupBy): exec(_def_str) @property - def name(self): + def _selection_name(self): """ since we are a series, we by definition only have a single name, but may be the result of a selection or @@ -2834,6 +2834,17 @@ def aggregate(self, func_or_funcs, *args, **kwargs): def _aggregate_multiple_funcs(self, arg, _level): if isinstance(arg, dict): + + # show the deprecation, but only if we + # 
have not shown a higher level one + # GH 15931 + if isinstance(self._selected_obj, Series) and _level <= 1: + warnings.warn( + ("using a dict on a Series for aggregation\n" + "is deprecated and will be removed in a future " + "version"), + FutureWarning, stacklevel=4) + columns = list(arg.keys()) arg = list(arg.items()) elif any(isinstance(x, (tuple, list)) for x in arg): @@ -2879,12 +2890,12 @@ def _aggregate_multiple_funcs(self, arg, _level): def _wrap_output(self, output, index, names=None): """ common agg/transform wrapping logic """ - output = output[self.name] + output = output[self._selection_name] if names is not None: return DataFrame(output, index=index, columns=names) else: - name = self.name + name = self._selection_name if name is None: name = self._selected_obj.name return Series(output, index=index, name=name) @@ -2902,7 +2913,7 @@ def _wrap_transformed_output(self, output, names=None): def _wrap_applied_output(self, keys, values, not_indexed_same=False): if len(keys) == 0: # GH #6265 - return Series([], name=self.name, index=keys) + return Series([], name=self._selection_name, index=keys) def _get_index(): if self.grouper.nkeys > 1: @@ -2915,7 +2926,7 @@ def _get_index(): # GH #823 index = _get_index() result = DataFrame(values, index=index).stack() - result.name = self.name + result.name = self._selection_name return result if isinstance(values[0], (Series, dict)): @@ -2927,7 +2938,8 @@ def _get_index(): not_indexed_same=not_indexed_same) else: # GH #6265 - return Series(values, index=_get_index(), name=self.name) + return Series(values, index=_get_index(), + name=self._selection_name) def _aggregate_named(self, func, *args, **kwargs): result = {} @@ -3098,7 +3110,7 @@ def nunique(self, dropna=True): return Series(res, index=ri, - name=self.name) + name=self._selection_name) @Appender(Series.describe.__doc__) def describe(self, **kwargs): @@ -3156,7 +3168,7 @@ def value_counts(self, normalize=False, sort=True, ascending=False, # multi-index 
components labels = list(map(rep, self.grouper.recons_labels)) + [lab[inc]] levels = [ping.group_index for ping in self.grouper.groupings] + [lev] - names = self.grouper.names + [self.name] + names = self.grouper.names + [self._selection_name] if dropna: mask = labels[-1] != -1 @@ -3191,7 +3203,7 @@ def value_counts(self, normalize=False, sort=True, ascending=False, if is_integer_dtype(out): out = _ensure_int64(out) - return Series(out, index=mi, name=self.name) + return Series(out, index=mi, name=self._selection_name) # for compat. with libgroupby.value_counts need to ensure every # bin is present at every index level, null filled with zeros @@ -3222,7 +3234,7 @@ def value_counts(self, normalize=False, sort=True, ascending=False, if is_integer_dtype(out): out = _ensure_int64(out) - return Series(out, index=mi, name=self.name) + return Series(out, index=mi, name=self._selection_name) def count(self): """ Compute count of group, excluding missing values """ @@ -3235,7 +3247,7 @@ def count(self): return Series(out, index=self.grouper.result_index, - name=self.name, + name=self._selection_name, dtype='int64') def _apply_to_column_groupbys(self, func): @@ -3391,7 +3403,7 @@ def aggregate(self, arg, *args, **kwargs): try: assert not args and not kwargs result = self._aggregate_multiple_funcs( - [arg], _level=_level) + [arg], _level=_level, _axis=self.axis) result.columns = Index( result.columns.levels[0], name=self._selected_obj.columns.name) @@ -3623,7 +3635,8 @@ def first_non_None_value(values): except (ValueError, AttributeError): # GH1738: values is list of arrays of unequal lengths fall # through to the outer else caluse - return Series(values, index=key_index, name=self.name) + return Series(values, index=key_index, + name=self._selection_name) # if we have date/time like in the original, then coerce dates # as we are stacking can easily have object dtypes here @@ -3647,8 +3660,9 @@ def first_non_None_value(values): # only coerce dates if we find at least 1 
datetime coerce = True if any([isinstance(x, Timestamp) for x in values]) else False - # self.name not passed through to Series as the result - # should not take the name of original selection of columns + # self._selection_name not passed through to Series as the + # result should not take the name of original selection + # of columns return (Series(values, index=key_index) ._convert(datetime=True, coerce=coerce)) diff --git a/pandas/tests/groupby/test_aggregate.py b/pandas/tests/groupby/test_aggregate.py index 52b35048b6762..c2d6422c50d02 100644 --- a/pandas/tests/groupby/test_aggregate.py +++ b/pandas/tests/groupby/test_aggregate.py @@ -14,7 +14,7 @@ import pandas as pd from pandas import (date_range, MultiIndex, DataFrame, - Series, Index, bdate_range) + Series, Index, bdate_range, concat) from pandas.util.testing import assert_frame_equal, assert_series_equal from pandas.core.groupby import SpecificationError, DataError from pandas.compat import OrderedDict @@ -291,8 +291,10 @@ def test_aggregate_api_consistency(self): expected.columns = MultiIndex.from_product([['C', 'D'], ['mean', 'sum']]) - result = grouped[['D', 'C']].agg({'r': np.sum, - 'r2': np.mean}) + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + result = grouped[['D', 'C']].agg({'r': np.sum, + 'r2': np.mean}) expected = pd.concat([d_sum, c_sum, d_mean, @@ -302,6 +304,28 @@ def test_aggregate_api_consistency(self): ['D', 'C']]) assert_frame_equal(result, expected, check_like=True) + def test_agg_dict_renaming_deprecation(self): + # 15931 + df = pd.DataFrame({'A': [1, 1, 1, 2, 2], + 'B': range(5), + 'C': range(5)}) + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False) as w: + df.groupby('A').agg({'B': {'foo': ['sum', 'max']}, + 'C': {'bar': ['count', 'min']}}) + assert "using a dict with renaming" in str(w[0].message) + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + df.groupby('A')[['B', 'C']].agg({'ma': 'max'}) + + with 
tm.assert_produces_warning(FutureWarning, + check_stacklevel=False) as w: + df.groupby('A').B.agg({'foo': 'count'}) + assert "using a dict on a Series for aggregation" in str( + w[0].message) + def test_agg_compat(self): # GH 12334 @@ -320,14 +344,19 @@ def test_agg_compat(self): axis=1) expected.columns = MultiIndex.from_tuples([('C', 'sum'), ('C', 'std')]) - result = g['D'].agg({'C': ['sum', 'std']}) + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + result = g['D'].agg({'C': ['sum', 'std']}) assert_frame_equal(result, expected, check_like=True) expected = pd.concat([g['D'].sum(), g['D'].std()], axis=1) expected.columns = ['C', 'D'] - result = g['D'].agg({'C': 'sum', 'D': 'std'}) + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + result = g['D'].agg({'C': 'sum', 'D': 'std'}) assert_frame_equal(result, expected, check_like=True) def test_agg_nested_dicts(self): @@ -348,8 +377,10 @@ def f(): self.assertRaises(SpecificationError, f) - result = g.agg({'C': {'ra': ['mean', 'std']}, - 'D': {'rb': ['mean', 'std']}}) + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + result = g.agg({'C': {'ra': ['mean', 'std']}, + 'D': {'rb': ['mean', 'std']}}) expected = pd.concat([g['C'].mean(), g['C'].std(), g['D'].mean(), g['D'].std()], axis=1) expected.columns = pd.MultiIndex.from_tuples([('ra', 'mean'), ( @@ -358,9 +389,14 @@ def f(): # same name as the original column # GH9052 - expected = g['D'].agg({'result1': np.sum, 'result2': np.mean}) + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + expected = g['D'].agg({'result1': np.sum, 'result2': np.mean}) expected = expected.rename(columns={'result1': 'D'}) - result = g['D'].agg({'D': np.sum, 'result2': np.mean}) + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + result = g['D'].agg({'D': np.sum, 'result2': np.mean}) assert_frame_equal(result, expected, check_like=True) def 
test_agg_python_multiindex(self): @@ -627,7 +663,6 @@ def test_agg_multiple_functions_too_many_lambdas(self): self.assertRaises(SpecificationError, grouped.agg, funcs) def test_more_flexible_frame_multi_function(self): - from pandas import concat grouped = self.df.groupby('A') @@ -655,9 +690,12 @@ def foo(x): def bar(x): return np.std(x, ddof=1) - d = OrderedDict([['C', np.mean], ['D', OrderedDict( - [['foo', np.mean], ['bar', np.std]])]]) - result = grouped.aggregate(d) + # this uses column selection & renaming + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + d = OrderedDict([['C', np.mean], ['D', OrderedDict( + [['foo', np.mean], ['bar', np.std]])]]) + result = grouped.aggregate(d) d = OrderedDict([['C', [np.mean]], ['D', [foo, bar]]]) expected = grouped.aggregate(d) @@ -671,16 +709,29 @@ def test_multi_function_flexible_mix(self): d = OrderedDict([['C', OrderedDict([['foo', 'mean'], [ 'bar', 'std' ]])], ['D', 'sum']]) - result = grouped.aggregate(d) + + # this uses column selection & renaming + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + result = grouped.aggregate(d) + d2 = OrderedDict([['C', OrderedDict([['foo', 'mean'], [ 'bar', 'std' ]])], ['D', ['sum']]]) - result2 = grouped.aggregate(d2) + + # this uses column selection & renaming + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + result2 = grouped.aggregate(d2) d3 = OrderedDict([['C', OrderedDict([['foo', 'mean'], [ 'bar', 'std' ]])], ['D', {'sum': 'sum'}]]) - expected = grouped.aggregate(d3) + + # this uses column selection & renaming + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + expected = grouped.aggregate(d3) assert_frame_equal(result, expected) assert_frame_equal(result2, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 68955c954206e..8f3d8e2307f45 100644 --- a/pandas/tests/groupby/test_groupby.py +++ 
b/pandas/tests/groupby/test_groupby.py @@ -59,7 +59,10 @@ def checkit(dtype): # complex agg agged = grouped.aggregate([np.mean, np.std]) - agged = grouped.aggregate({'one': np.mean, 'two': np.std}) + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + agged = grouped.aggregate({'one': np.mean, 'two': np.std}) group_constants = {0: 10, 1: 20, 2: 30} agged = grouped.agg(lambda x: group_constants[x.name] + x.mean()) @@ -1262,7 +1265,9 @@ def test_frame_set_name_single(self): result = grouped['C'].agg([np.mean, np.std]) self.assertEqual(result.index.name, 'A') - result = grouped['C'].agg({'foo': np.mean, 'bar': np.std}) + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + result = grouped['C'].agg({'foo': np.mean, 'bar': np.std}) self.assertEqual(result.index.name, 'A') def test_multi_iter(self): @@ -1438,7 +1443,10 @@ def test_groupby_as_index_agg(self): grouped = self.df.groupby('A', as_index=True) expected3 = grouped['C'].sum() expected3 = DataFrame(expected3).rename(columns={'C': 'Q'}) - result3 = grouped['C'].agg({'Q': np.sum}) + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + result3 = grouped['C'].agg({'Q': np.sum}) assert_frame_equal(result3, expected3) # multi-key diff --git a/pandas/tests/groupby/test_whitelist.py b/pandas/tests/groupby/test_whitelist.py index d566f34b7eae8..5a4f282789eeb 100644 --- a/pandas/tests/groupby/test_whitelist.py +++ b/pandas/tests/groupby/test_whitelist.py @@ -233,7 +233,7 @@ def test_tab_completion(mframe): expected = set( ['A', 'B', 'C', 'agg', 'aggregate', 'apply', 'boxplot', 'filter', 'first', 'get_group', 'groups', 'hist', 'indices', 'last', 'max', - 'mean', 'median', 'min', 'name', 'ngroups', 'nth', 'ohlc', 'plot', + 'mean', 'median', 'min', 'ngroups', 'nth', 'ohlc', 'plot', 'prod', 'size', 'std', 'sum', 'transform', 'var', 'sem', 'count', 'nunique', 'head', 'describe', 'cummax', 'quantile', 'rank', 'cumprod', 'tail', 'resample', 'cummin', 
'fillna', diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index 5fc31e9321f31..9cd3b8b839a9b 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -134,16 +134,18 @@ def test_agg(self): expected.columns = ['mean', 'sum'] tm.assert_frame_equal(result, expected) - result = r.aggregate({'A': {'mean': 'mean', 'sum': 'sum'}}) + with catch_warnings(record=True): + result = r.aggregate({'A': {'mean': 'mean', 'sum': 'sum'}}) expected = pd.concat([a_mean, a_sum], axis=1) expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'), ('A', 'sum')]) tm.assert_frame_equal(result, expected, check_like=True) - result = r.aggregate({'A': {'mean': 'mean', - 'sum': 'sum'}, - 'B': {'mean2': 'mean', - 'sum2': 'sum'}}) + with catch_warnings(record=True): + result = r.aggregate({'A': {'mean': 'mean', + 'sum': 'sum'}, + 'B': {'mean2': 'mean', + 'sum2': 'sum'}}) expected = pd.concat([a_mean, a_sum, b_mean, b_sum], axis=1) exp_cols = [('A', 'mean'), ('A', 'sum'), ('B', 'mean2'), ('B', 'sum2')] expected.columns = pd.MultiIndex.from_tuples(exp_cols) @@ -195,12 +197,14 @@ def f(): r['B'].std()], axis=1) expected.columns = pd.MultiIndex.from_tuples([('ra', 'mean'), ( 'ra', 'std'), ('rb', 'mean'), ('rb', 'std')]) - result = r[['A', 'B']].agg({'A': {'ra': ['mean', 'std']}, - 'B': {'rb': ['mean', 'std']}}) + with catch_warnings(record=True): + result = r[['A', 'B']].agg({'A': {'ra': ['mean', 'std']}, + 'B': {'rb': ['mean', 'std']}}) tm.assert_frame_equal(result, expected, check_like=True) - result = r.agg({'A': {'ra': ['mean', 'std']}, - 'B': {'rb': ['mean', 'std']}}) + with catch_warnings(record=True): + result = r.agg({'A': {'ra': ['mean', 'std']}, + 'B': {'rb': ['mean', 'std']}}) expected.columns = pd.MultiIndex.from_tuples([('A', 'ra', 'mean'), ( 'A', 'ra', 'std'), ('B', 'rb', 'mean'), ('B', 'rb', 'std')]) tm.assert_frame_equal(result, expected, check_like=True) diff --git a/pandas/tests/tseries/test_resample.py 
b/pandas/tests/tseries/test_resample.py index 9c66cae292c4e..98664c1ec118c 100755 --- a/pandas/tests/tseries/test_resample.py +++ b/pandas/tests/tseries/test_resample.py @@ -394,8 +394,10 @@ def test_agg_consistency(self): r = df.resample('3T') - expected = r[['A', 'B', 'C']].agg({'r1': 'mean', 'r2': 'sum'}) - result = r.agg({'r1': 'mean', 'r2': 'sum'}) + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + expected = r[['A', 'B', 'C']].agg({'r1': 'mean', 'r2': 'sum'}) + result = r.agg({'r1': 'mean', 'r2': 'sum'}) assert_frame_equal(result, expected) # TODO: once GH 14008 is fixed, move these tests into @@ -459,7 +461,9 @@ def test_agg(self): expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'), ('A', 'sum')]) for t in cases: - result = t.aggregate({'A': {'mean': 'mean', 'sum': 'sum'}}) + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + result = t.aggregate({'A': {'mean': 'mean', 'sum': 'sum'}}) assert_frame_equal(result, expected, check_like=True) expected = pd.concat([a_mean, a_sum, b_mean, b_sum], axis=1) @@ -468,8 +472,10 @@ def test_agg(self): ('B', 'mean2'), ('B', 'sum2')]) for t in cases: - result = t.aggregate({'A': {'mean': 'mean', 'sum': 'sum'}, - 'B': {'mean2': 'mean', 'sum2': 'sum'}}) + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + result = t.aggregate({'A': {'mean': 'mean', 'sum': 'sum'}, + 'B': {'mean2': 'mean', 'sum2': 'sum'}}) assert_frame_equal(result, expected, check_like=True) expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1) @@ -529,9 +535,12 @@ def test_agg_misc(self): ('result1', 'B'), ('result2', 'A'), ('result2', 'B')]) + for t in cases: - result = t[['A', 'B']].agg(OrderedDict([('result1', np.sum), - ('result2', np.mean)])) + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + result = t[['A', 'B']].agg(OrderedDict([('result1', np.sum), + ('result2', np.mean)])) assert_frame_equal(result, expected, check_like=True) # agg 
with different hows @@ -557,7 +566,9 @@ def test_agg_misc(self): # series like aggs for t in cases: - result = t['A'].agg({'A': ['sum', 'std']}) + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + result = t['A'].agg({'A': ['sum', 'std']}) expected = pd.concat([t['A'].sum(), t['A'].std()], axis=1) @@ -572,15 +583,20 @@ def test_agg_misc(self): ('A', 'std'), ('B', 'mean'), ('B', 'std')]) - result = t['A'].agg({'A': ['sum', 'std'], 'B': ['mean', 'std']}) + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + result = t['A'].agg({'A': ['sum', 'std'], + 'B': ['mean', 'std']}) assert_frame_equal(result, expected, check_like=True) # errors # invalid names in the agg specification for t in cases: def f(): - t[['A']].agg({'A': ['sum', 'std'], - 'B': ['mean', 'std']}) + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + t[['A']].agg({'A': ['sum', 'std'], + 'B': ['mean', 'std']}) self.assertRaises(SpecificationError, f) @@ -617,12 +633,16 @@ def f(): expected.columns = pd.MultiIndex.from_tuples([('ra', 'mean'), ( 'ra', 'std'), ('rb', 'mean'), ('rb', 'std')]) - result = t[['A', 'B']].agg({'A': {'ra': ['mean', 'std']}, - 'B': {'rb': ['mean', 'std']}}) + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + result = t[['A', 'B']].agg({'A': {'ra': ['mean', 'std']}, + 'B': {'rb': ['mean', 'std']}}) assert_frame_equal(result, expected, check_like=True) - result = t.agg({'A': {'ra': ['mean', 'std']}, - 'B': {'rb': ['mean', 'std']}}) + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + result = t.agg({'A': {'ra': ['mean', 'std']}, + 'B': {'rb': ['mean', 'std']}}) assert_frame_equal(result, expected, check_like=True) def test_selection_api_validation(self): @@ -752,16 +772,7 @@ def test_resample_empty_series(self): expected.index = s.index._shallow_copy(freq=freq) assert_index_equal(result.index, expected.index) self.assertEqual(result.index.freq, 
expected.index.freq) - - if (method == 'size' and - isinstance(result.index, PeriodIndex) and - freq in ['M', 'D']): - # GH12871 - TODO: name should propagate, but currently - # doesn't on lower / same frequency with PeriodIndex - assert_series_equal(result, expected, check_dtype=False) - - else: - assert_series_equal(result, expected, check_dtype=False) + assert_series_equal(result, expected, check_dtype=False) def test_resample_empty_dataframe(self): # GH13212 @@ -1846,10 +1857,12 @@ def test_how_lambda_functions(self): tm.assert_series_equal(result['foo'], foo_exp) tm.assert_series_equal(result['bar'], bar_exp) + # this is a MI Series, so comparing the names of the results + # doesn't make sense result = ts.resample('M').aggregate({'foo': lambda x: x.mean(), 'bar': lambda x: x.std(ddof=1)}) - tm.assert_series_equal(result['foo'], foo_exp) - tm.assert_series_equal(result['bar'], bar_exp) + tm.assert_series_equal(result['foo'], foo_exp, check_names=False) + tm.assert_series_equal(result['bar'], bar_exp, check_names=False) def test_resample_unequal_times(self): # #1772 diff --git a/pandas/types/cast.py b/pandas/types/cast.py index 580ce12de3333..85053dba0c18b 100644 --- a/pandas/types/cast.py +++ b/pandas/types/cast.py @@ -45,6 +45,23 @@ def maybe_convert_platform(values): return values +def is_nested_object(obj): + """ + return a boolean if we have a nested object, e.g. a Series with 1 or + more Series elements + + This may not be necessarily be performant. + + """ + + if isinstance(obj, ABCSeries) and is_object_dtype(obj): + + if any(isinstance(v, ABCSeries) for v in obj.values): + return True + + return False + + def maybe_downcast_to_dtype(result, dtype): """ try to cast to the specified dtype (e.g. convert back to bool/int or could be an astype of float64->float32