
Commit

ENH: add Series & DataFrame .agg/.aggregate to provide convenient
function application that mimics the groupby(..).agg/.aggregate
interface

.apply is now a synonym for .agg, and will accept dict/list-likes
for aggregations

CLN: rename .name attr -> ._selection_name on SeriesGroupBy for compat (it didn't exist on DataFrameGroupBy)
resolves conflicts w.r.t. setting .name on a groupby object

closes pandas-dev#1623
closes pandas-dev#14464

custom .describe
closes pandas-dev#14483
closes pandas-dev#7014
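
A rough sketch of the intended interface (hypothetical data; exact result shapes
depend on the functions passed):

    >>> import pandas as pd
    >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4.0, 5.0, 6.0]})
    >>> df.agg('sum')                      # single function -> reduced Series
    >>> df.agg(['sum', 'min'])             # list of functions -> DataFrame indexed by function
    >>> df.agg({'A': 'sum', 'B': 'min'})   # per-column dict of functions
    >>> pd.Series([1, 2, 3]).agg(['sum', 'mean'])
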
jreback committed Nov 16, 2016
1 parent 3f523f3 commit dec6354
Showing 9 changed files with 442 additions and 47 deletions.
68 changes: 55 additions & 13 deletions pandas/core/base.py
@@ -289,7 +289,9 @@ class SelectionMixin(object):
}

@property
def name(self):
def _selection_name(self):
""" return a name for myself; this would ideally be the 'name' property, but
we cannot conflict with the Series.name property which can be set """
if self._selection is None:
return None # 'result'
else:
@@ -404,6 +406,26 @@ def aggregate(self, func, *args, **kwargs):

agg = aggregate

def _try_aggregate_string_function(self, arg, *args, **kwargs):
"""
if arg is a string, then try to operate on it:
- try to find a function on ourselves
- try to find a numpy function
- raise
"""
assert isinstance(arg, compat.string_types)

f = getattr(self, arg, None)
if f is not None:
return f(*args, **kwargs)

f = getattr(np, arg, None)
if f is not None:
return f(self, *args, **kwargs)

raise ValueError("{} is an unknown string function".format(arg))

def _aggregate(self, arg, *args, **kwargs):
"""
provide an implementation for the aggregators
@@ -427,14 +449,19 @@ def _aggregate(self, arg, *args, **kwargs):
is_aggregator = lambda x: isinstance(x, (list, tuple, dict))
is_nested_renamer = False

_axis = kwargs.pop('_axis', None)
if _axis is None:
_axis = getattr(self, 'axis', 0)
_level = kwargs.pop('_level', None)

if isinstance(arg, compat.string_types):
return getattr(self, arg)(*args, **kwargs), None
return self._try_aggregate_string_function(arg, *args,
**kwargs), None

if isinstance(arg, dict):

# aggregate based on the passed dict
if self.axis != 0: # pragma: no cover
if _axis != 0: # pragma: no cover
raise ValueError('Can only pass dict with axis=0')

obj = self._selected_obj
@@ -560,26 +587,33 @@ def _agg(arg, func):
ABCDataFrame):
result = concat([result[k] for k in keys], keys=keys, axis=1)
else:
from pandas import DataFrame
result = DataFrame(result)
from pandas import DataFrame, Series
try:
result = DataFrame(result)
except ValueError:
# we have a dict of scalars
result = Series(result, name=self.name)

return result, True
elif hasattr(arg, '__iter__'):
return self._aggregate_multiple_funcs(arg, _level=_level), None
elif is_list_like(arg) and arg not in compat.string_types:
# we require a list, but not an 'str'
return self._aggregate_multiple_funcs(arg,
_level=_level,
_axis=_axis), None
else:
result = None

cy_func = self._is_cython_func(arg)
if cy_func and not args and not kwargs:
return getattr(self, cy_func)(), None
f = self._is_cython_func(arg)
if f and not args and not kwargs:
return getattr(self, f)(), None

# caller can react
return result, True

def _aggregate_multiple_funcs(self, arg, _level):
def _aggregate_multiple_funcs(self, arg, _level, _axis):
from pandas.tools.merge import concat

if self.axis != 0:
if _axis != 0:
raise NotImplementedError("axis other than 0 is not supported")

if self._selected_obj.ndim == 1:
@@ -617,7 +651,15 @@ def _aggregate_multiple_funcs(self, arg, _level):
except SpecificationError:
raise

return concat(results, keys=keys, axis=1)
try:
return concat(results, keys=keys, axis=1)
except TypeError:
# shape change
from pandas.types.cast import _maybe_convert_nested_object
from pandas import Series
result = Series(results, index=keys, name=self.name)
result = _maybe_convert_nested_object(result)
return result

def _shallow_copy(self, obj=None, obj_type=None, **kwargs):
""" return a new object with the replacement attributes """
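
The string dispatch added above can be summarized with a standalone sketch
(illustrative only, not the pandas internals verbatim): a string argument is
resolved first as a method on the object, then as a numpy function, otherwise
a ValueError is raised.

    import numpy as np

    def resolve_string_function(obj, name, *args, **kwargs):
        # 1) prefer a method defined on the object itself (e.g. obj.sum())
        f = getattr(obj, name, None)
        if f is not None:
            return f(*args, **kwargs)
        # 2) fall back to a numpy function applied to the object (e.g. np.sqrt(obj))
        f = getattr(np, name, None)
        if f is not None:
            return f(obj, *args, **kwargs)
        # 3) unknown name
        raise ValueError("{} is an unknown string function".format(name))
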
44 changes: 40 additions & 4 deletions pandas/core/frame.py
@@ -4073,6 +4073,38 @@ def diff(self, periods=1, axis=0):
# ----------------------------------------------------------------------
# Function application

def _gotitem(self, key, ndim, subset=None):
"""
sub-classes to define
return a sliced object
Parameters
----------
key : string / list of selections
ndim : 1,2
requested ndim of result
subset : object, default None
subset to act on
"""
if subset is None:
subset = self

# TODO: _shallow_copy(subset)?
return self[key]

def aggregate(self, func, axis=0, *args, **kwargs):
axis = self._get_axis_number(axis)

# TODO: flipped axis
result = None
if axis == 0:
result, how = self._aggregate(func, axis=0, *args, **kwargs)
if result is None:
return self.apply(func, axis=axis, args=args, **kwargs)
return result

agg = aggregate

def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None,
args=(), **kwds):
"""
@@ -4134,16 +4166,20 @@ def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None,
applied : Series or DataFrame
"""
axis = self._get_axis_number(axis)
if kwds or args and not isinstance(func, np.ufunc):

# dispatch to agg
if axis == 0 and isinstance(func, (list, dict)):
return self.aggregate(func, axis=axis, *args, **kwds)

if len(self.columns) == 0 and len(self.index) == 0:
return self._apply_empty_result(func, axis, reduce, *args, **kwds)

if kwds or args and not isinstance(func, np.ufunc):
def f(x):
return func(x, *args, **kwds)
else:
f = func

if len(self.columns) == 0 and len(self.index) == 0:
return self._apply_empty_result(func, axis, reduce, *args, **kwds)

if isinstance(f, np.ufunc):
with np.errstate(all='ignore'):
results = f(self.values)
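
With the change above, DataFrame.apply on axis=0 dispatches list- and dict-like
arguments to .aggregate, while plain callables behave as before. A minimal
sketch (hypothetical frame; displayed output omitted):

    >>> import pandas as pd
    >>> df = pd.DataFrame({'x': [1.0, 2.0], 'y': [3.0, 4.0]})
    >>> df.apply(['sum', 'min'])         # equivalent to df.agg(['sum', 'min'])
    >>> df.apply({'x': 'sum'})           # dict-likes also route through .agg
    >>> df.apply(lambda col: col.max())  # callables take the existing path
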
14 changes: 12 additions & 2 deletions pandas/core/generic.py
@@ -33,7 +33,7 @@
SettingWithCopyError, SettingWithCopyWarning,
AbstractMethodError)

from pandas.core.base import PandasObject
from pandas.core.base import PandasObject, SelectionMixin
from pandas.core.index import (Index, MultiIndex, _ensure_index,
InvalidIndexError)
import pandas.core.indexing as indexing
@@ -91,7 +91,7 @@ def _single_replace(self, to_replace, method, inplace, limit):
return result


class NDFrame(PandasObject):
class NDFrame(PandasObject, SelectionMixin):
"""
N-dimensional analogue of DataFrame. Store multi-dimensional in a
size-mutable, labeled data structure
@@ -428,6 +428,16 @@ def size(self):
"""number of elements in the NDFrame"""
return np.prod(self.shape)

@property
def _selected_obj(self):
""" internal compat with SelectionMixin """
return self

@property
def _obj_with_exclusions(self):
""" internal compat with SelectionMixin """
return self

def _expand_axes(self, key):
new_axes = []
for k, ax in zip(key, self.axes):
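
NDFrame now mixes in SelectionMixin so the shared _aggregate machinery can run
directly on a Series or DataFrame; a plain frame has no column selection, so
both hooks simply return the object itself. A toy sketch of the pattern
(hypothetical names, not the pandas classes):

    class Aggregator(object):
        # shared machinery always works on `_selected_obj`
        def agg_all(self, names):
            obj = self._selected_obj
            return {n: getattr(obj, n)() for n in names}

    class Frame(Aggregator):
        def __init__(self, data):
            self.data = data

        def sum(self):
            return sum(self.data)

        def mean(self):
            return sum(self.data) / float(len(self.data))

        @property
        def _selected_obj(self):
            # no selection to apply -> operate on ourselves
            return self

    # e.g. Frame([1, 2, 3]).agg_all(['sum', 'mean']) -> {'sum': 6, 'mean': 2.0}
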
39 changes: 21 additions & 18 deletions pandas/core/groupby.py
@@ -703,7 +703,7 @@ def _python_apply_general(self, f):
not_indexed_same=mutated or self.mutated)

def _iterate_slices(self):
yield self.name, self._selected_obj
yield self._selection_name, self._selected_obj

def transform(self, func, *args, **kwargs):
raise AbstractMethodError(self)
@@ -886,9 +886,9 @@ def reset_identity(values):
result = concat(values, axis=self.axis)

if (isinstance(result, Series) and
getattr(self, 'name', None) is not None):
getattr(self, '_selection_name', None) is not None):

result.name = self.name
result.name = self._selection_name

return result

@@ -2575,7 +2575,7 @@ class SeriesGroupBy(GroupBy):
exec(_def_str)

@property
def name(self):
def _selection_name(self):
"""
since we are a series, we by definition only have
a single name, but may be the result of a selection or
@@ -2718,12 +2718,12 @@ def _aggregate_multiple_funcs(self, arg, _level):

def _wrap_output(self, output, index, names=None):
""" common agg/transform wrapping logic """
output = output[self.name]
output = output[self._selection_name]

if names is not None:
return DataFrame(output, index=index, columns=names)
else:
name = self.name
name = self._selection_name
if name is None:
name = self._selected_obj.name
return Series(output, index=index, name=name)
@@ -2741,7 +2741,7 @@ def _wrap_transformed_output(self, output, names=None):
def _wrap_applied_output(self, keys, values, not_indexed_same=False):
if len(keys) == 0:
# GH #6265
return Series([], name=self.name, index=keys)
return Series([], name=self._selection_name, index=keys)

def _get_index():
if self.grouper.nkeys > 1:
@@ -2754,7 +2754,7 @@ def _get_index():
# GH #823
index = _get_index()
result = DataFrame(values, index=index).stack()
result.name = self.name
result.name = self._selection_name
return result

if isinstance(values[0], (Series, dict)):
@@ -2766,7 +2766,8 @@ def _get_index():
not_indexed_same=not_indexed_same)
else:
# GH #6265
return Series(values, index=_get_index(), name=self.name)
return Series(values, index=_get_index(),
name=self._selection_name)

def _aggregate_named(self, func, *args, **kwargs):
result = {}
@@ -2938,7 +2939,7 @@ def nunique(self, dropna=True):

return Series(res,
index=ri,
name=self.name)
name=self._selection_name)

@deprecate_kwarg('take_last', 'keep',
mapping={True: 'last', False: 'first'})
@@ -3002,7 +3003,7 @@ def value_counts(self, normalize=False, sort=True, ascending=False,
# multi-index components
labels = list(map(rep, self.grouper.recons_labels)) + [lab[inc]]
levels = [ping.group_index for ping in self.grouper.groupings] + [lev]
names = self.grouper.names + [self.name]
names = self.grouper.names + [self._selection_name]

if dropna:
mask = labels[-1] != -1
@@ -3037,7 +3038,7 @@ def value_counts(self, normalize=False, sort=True, ascending=False,

if is_integer_dtype(out):
out = _ensure_int64(out)
return Series(out, index=mi, name=self.name)
return Series(out, index=mi, name=self._selection_name)

# for compat. with algos.value_counts need to ensure every
# bin is present at every index level, null filled with zeros
@@ -3068,7 +3069,7 @@ def value_counts(self, normalize=False, sort=True, ascending=False,

if is_integer_dtype(out):
out = _ensure_int64(out)
return Series(out, index=mi, name=self.name)
return Series(out, index=mi, name=self._selection_name)

def count(self):
""" Compute count of group, excluding missing values """
@@ -3081,7 +3082,7 @@ def count(self):

return Series(out,
index=self.grouper.result_index,
name=self.name,
name=self._selection_name,
dtype='int64')

def _apply_to_column_groupbys(self, func):
@@ -3191,7 +3192,7 @@ def aggregate(self, arg, *args, **kwargs):
try:
assert not args and not kwargs
result = self._aggregate_multiple_funcs(
[arg], _level=_level)
[arg], _level=_level, _axis=self.axis)
result.columns = Index(
result.columns.levels[0],
name=self._selected_obj.columns.name)
@@ -3422,7 +3423,7 @@ def first_non_None_value(values):
except (ValueError, AttributeError):
# GH1738: values is list of arrays of unequal lengths fall
# through to the outer else clause
return Series(values, index=key_index, name=self.name)
return Series(values, index=key_index,
name=self._selection_name)

# if we have date/time like in the original, then coerce dates
# as we are stacking can easily have object dtypes here
@@ -3445,8 +3447,9 @@ def first_non_None_value(values):
# only coerce dates if we find at least 1 datetime
coerce = True if any([isinstance(x, Timestamp)
for x in values]) else False
# self.name not passed through to Series as the result
# should not take the name of original selection of columns
# self._selection_name not passed through to Series as the
# result should not take the name of original selection
# of columns
return (Series(values, index=key_index)
._convert(datetime=True,
coerce=coerce))
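
The rename to _selection_name means an aggregated result still carries the name
of the selected column, while GroupBy objects no longer conflict with a
user-set .name attribute. For illustration (hypothetical data; output omitted):

    >>> import pandas as pd
    >>> df = pd.DataFrame({'g': ['a', 'a', 'b'], 'v': [1, 2, 3]})
    >>> out = df.groupby('g')['v'].agg('sum')
    >>> out.name   # 'v' -- taken from the selection, tracked internally as _selection_name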