
Commit

ENH: add Series & DataFrame .agg/.aggregate to provide convenient
function application that mimics the groupby(..).agg/.aggregate
interface

.apply is now a synonym for .agg, and will accept dict/list-likes
for aggregations

CLN: rename .name attr -> ._selection_name on SeriesGroupBy for compat (it didn't exist on DataFrameGroupBy)
resolves conflicts w.r.t. setting .name on a groupby object

closes pandas-dev#1623
closes pandas-dev#14464

custom .describe
closes pandas-dev#14483
closes pandas-dev#7014
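
A rough sketch of the intended interface (hypothetical data; exact result shapes
depend on the functions passed):

    >>> import pandas as pd
    >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4.0, 5.0, 6.0]})
    >>> df.agg('sum')                      # single function -> reduced Series
    >>> df.agg(['sum', 'min'])             # list of functions -> DataFrame indexed by function
    >>> df.agg({'A': 'sum', 'B': 'min'})   # per-column dict of functions
    >>> pd.Series([1, 2, 3]).agg(['sum', 'mean'])
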
jreback committed Nov 16, 2016
1 parent 3f523f3 commit dec6354
Showing 9 changed files with 442 additions and 47 deletions.
68 changes: 55 additions & 13 deletions pandas/core/base.py
@@ -289,7 +289,9 @@ class SelectionMixin(object):
}

@property
def name(self):
def _selection_name(self):
""" return a name for myself; this would ideally be the 'name' property, but
we cannot conflict with the Series.name property which can be set """
if self._selection is None:
return None # 'result'
else:
@@ -404,6 +406,26 @@ def aggregate(self, func, *args, **kwargs):

agg = aggregate

def _try_aggregate_string_function(self, arg, *args, **kwargs):
"""
if arg is a string, then try to operate on it:
- try to find a function on ourselves
- try to find a numpy function
- raise
"""
assert isinstance(arg, compat.string_types)

f = getattr(self, arg, None)
if f is not None:
return f(*args, **kwargs)

f = getattr(np, arg, None)
if f is not None:
return f(self, *args, **kwargs)

raise ValueError("{} is an unknown string function".format(arg))

def _aggregate(self, arg, *args, **kwargs):
"""
provide an implementation for the aggregators
@@ -427,14 +449,19 @@ def _aggregate(self, arg, *args, **kwargs):
is_aggregator = lambda x: isinstance(x, (list, tuple, dict))
is_nested_renamer = False

_axis = kwargs.pop('_axis', None)
if _axis is None:
_axis = getattr(self, 'axis', 0)
_level = kwargs.pop('_level', None)

if isinstance(arg, compat.string_types):
return getattr(self, arg)(*args, **kwargs), None
return self._try_aggregate_string_function(arg, *args,
**kwargs), None

if isinstance(arg, dict):

# aggregate based on the passed dict
if self.axis != 0: # pragma: no cover
if _axis != 0: # pragma: no cover
raise ValueError('Can only pass dict with axis=0')

obj = self._selected_obj
@@ -560,26 +587,33 @@ def _agg(arg, func):
ABCDataFrame):
result = concat([result[k] for k in keys], keys=keys, axis=1)
else:
from pandas import DataFrame
result = DataFrame(result)
from pandas import DataFrame, Series
try:
result = DataFrame(result)
except ValueError:
# we have a dict of scalars
result = Series(result, name=self.name)

return result, True
elif hasattr(arg, '__iter__'):
return self._aggregate_multiple_funcs(arg, _level=_level), None
elif is_list_like(arg) and arg not in compat.string_types:
# we require a list, but not an 'str'
return self._aggregate_multiple_funcs(arg,
_level=_level,
_axis=_axis), None
else:
result = None

cy_func = self._is_cython_func(arg)
if cy_func and not args and not kwargs:
return getattr(self, cy_func)(), None
f = self._is_cython_func(arg)
if f and not args and not kwargs:
return getattr(self, f)(), None

# caller can react
return result, True

def _aggregate_multiple_funcs(self, arg, _level):
def _aggregate_multiple_funcs(self, arg, _level, _axis):
from pandas.tools.merge import concat

if self.axis != 0:
if _axis != 0:
raise NotImplementedError("axis other than 0 is not supported")

if self._selected_obj.ndim == 1:
@@ -617,7 +651,15 @@ def _aggregate_multiple_funcs(self, arg, _level):
except SpecificationError:
raise

return concat(results, keys=keys, axis=1)
try:
return concat(results, keys=keys, axis=1)
except TypeError:
# shape change
from pandas.types.cast import _maybe_convert_nested_object
from pandas import Series
result = Series(results, index=keys, name=self.name)
result = _maybe_convert_nested_object(result)
return result

def _shallow_copy(self, obj=None, obj_type=None, **kwargs):
""" return a new object with the replacement attributes """
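
The string dispatch added above can be summarized with a standalone sketch
(illustrative only, not the pandas internals verbatim): a string argument is
resolved first as a method on the object, then as a numpy function, otherwise
a ValueError is raised.

    import numpy as np

    def resolve_string_function(obj, name, *args, **kwargs):
        # 1) prefer a method defined on the object itself (e.g. obj.sum())
        f = getattr(obj, name, None)
        if f is not None:
            return f(*args, **kwargs)
        # 2) fall back to a numpy function applied to the object (e.g. np.sqrt(obj))
        f = getattr(np, name, None)
        if f is not None:
            return f(obj, *args, **kwargs)
        # 3) unknown name
        raise ValueError("{} is an unknown string function".format(name))
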
44 changes: 40 additions & 4 deletions pandas/core/frame.py
@@ -4073,6 +4073,38 @@ def diff(self, periods=1, axis=0):
# ----------------------------------------------------------------------
# Function application

def _gotitem(self, key, ndim, subset=None):
"""
sub-classes to define
return a sliced object
Parameters
----------
key : string / list of selections
ndim : 1,2
requested ndim of result
subset : object, default None
subset to act on
"""
if subset is None:
subset = self

# TODO: _shallow_copy(subset)?
return self[key]

def aggregate(self, func, axis=0, *args, **kwargs):
axis = self._get_axis_number(axis)

# TODO: flipped axis
result = None
if axis == 0:
result, how = self._aggregate(func, axis=0, *args, **kwargs)
if result is None:
return self.apply(func, axis=axis, args=args, **kwargs)
return result

agg = aggregate

def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None,
args=(), **kwds):
"""
@@ -4134,16 +4166,20 @@ def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None,
applied : Series or DataFrame
"""
axis = self._get_axis_number(axis)
if kwds or args and not isinstance(func, np.ufunc):

# dispatch to agg
if axis == 0 and isinstance(func, (list, dict)):
return self.aggregate(func, axis=axis, *args, **kwds)

if len(self.columns) == 0 and len(self.index) == 0:
return self._apply_empty_result(func, axis, reduce, *args, **kwds)

if kwds or args and not isinstance(func, np.ufunc):
def f(x):
return func(x, *args, **kwds)
else:
f = func

if len(self.columns) == 0 and len(self.index) == 0:
return self._apply_empty_result(func, axis, reduce, *args, **kwds)

if isinstance(f, np.ufunc):
with np.errstate(all='ignore'):
results = f(self.values)
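
With the change above, DataFrame.apply on axis=0 dispatches list- and dict-like
arguments to .aggregate, while plain callables behave as before. A minimal
sketch (hypothetical frame; displayed output omitted):

    >>> import pandas as pd
    >>> df = pd.DataFrame({'x': [1.0, 2.0], 'y': [3.0, 4.0]})
    >>> df.apply(['sum', 'min'])         # equivalent to df.agg(['sum', 'min'])
    >>> df.apply({'x': 'sum'})           # dict-likes also route through .agg
    >>> df.apply(lambda col: col.max())  # callables take the existing path
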
14 changes: 12 additions & 2 deletions pandas/core/generic.py
@@ -33,7 +33,7 @@
SettingWithCopyError, SettingWithCopyWarning,
AbstractMethodError)

from pandas.core.base import PandasObject
from pandas.core.base import PandasObject, SelectionMixin
from pandas.core.index import (Index, MultiIndex, _ensure_index,
InvalidIndexError)
import pandas.core.indexing as indexing
@@ -91,7 +91,7 @@ def _single_replace(self, to_replace, method, inplace, limit):
return result


class NDFrame(PandasObject):
class NDFrame(PandasObject, SelectionMixin):
"""
N-dimensional analogue of DataFrame. Store multi-dimensional in a
size-mutable, labeled data structure
@@ -428,6 +428,16 @@ def size(self):
"""number of elements in the NDFrame"""
return np.prod(self.shape)

@property
def _selected_obj(self):
""" internal compat with SelectionMixin """
return self

@property
def _obj_with_exclusions(self):
""" internal compat with SelectionMixin """
return self

def _expand_axes(self, key):
new_axes = []
for k, ax in zip(key, self.axes):
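
NDFrame now mixes in SelectionMixin so the shared _aggregate machinery can run
directly on a Series or DataFrame; a plain frame has no column selection, so
both hooks simply return the object itself. A toy sketch of the pattern
(hypothetical names, not the pandas classes):

    class Aggregator(object):
        # shared machinery always works on `_selected_obj`
        def agg_all(self, names):
            obj = self._selected_obj
            return {n: getattr(obj, n)() for n in names}

    class Frame(Aggregator):
        def __init__(self, data):
            self.data = data

        def sum(self):
            return sum(self.data)

        def mean(self):
            return sum(self.data) / float(len(self.data))

        @property
        def _selected_obj(self):
            # no selection to apply -> operate on ourselves
            return self

    # e.g. Frame([1, 2, 3]).agg_all(['sum', 'mean']) -> {'sum': 6, 'mean': 2.0}
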
39 changes: 21 additions & 18 deletions pandas/core/groupby.py
@@ -703,7 +703,7 @@ def _python_apply_general(self, f):
not_indexed_same=mutated or self.mutated)

def _iterate_slices(self):
yield self.name, self._selected_obj
yield self._selection_name, self._selected_obj

def transform(self, func, *args, **kwargs):
raise AbstractMethodError(self)
@@ -886,9 +886,9 @@ def reset_identity(values):
result = concat(values, axis=self.axis)

if (isinstance(result, Series) and
getattr(self, 'name', None) is not None):
getattr(self, '_selection_name', None) is not None):

result.name = self.name
result.name = self._selection_name

return result

@@ -2575,7 +2575,7 @@ class SeriesGroupBy(GroupBy):
exec(_def_str)

@property
def name(self):
def _selection_name(self):
"""
since we are a series, we by definition only have
a single name, but may be the result of a selection or
@@ -2718,12 +2718,12 @@ def _aggregate_multiple_funcs(self, arg, _level):

def _wrap_output(self, output, index, names=None):
""" common agg/transform wrapping logic """
output = output[self.name]
output = output[self._selection_name]

if names is not None:
return DataFrame(output, index=index, columns=names)
else:
name = self.name
name = self._selection_name
if name is None:
name = self._selected_obj.name
return Series(output, index=index, name=name)
@@ -2741,7 +2741,7 @@ def _wrap_transformed_output(self, output, names=None):
def _wrap_applied_output(self, keys, values, not_indexed_same=False):
if len(keys) == 0:
# GH #6265
return Series([], name=self.name, index=keys)
return Series([], name=self._selection_name, index=keys)

def _get_index():
if self.grouper.nkeys > 1:
@@ -2754,7 +2754,7 @@ def _get_index():
# GH #823
index = _get_index()
result = DataFrame(values, index=index).stack()
result.name = self.name
result.name = self._selection_name
return result

if isinstance(values[0], (Series, dict)):
@@ -2766,7 +2766,8 @@ def _get_index():
not_indexed_same=not_indexed_same)
else:
# GH #6265
return Series(values, index=_get_index(), name=self.name)
return Series(values, index=_get_index(),
name=self._selection_name)

def _aggregate_named(self, func, *args, **kwargs):
result = {}
@@ -2938,7 +2939,7 @@ def nunique(self, dropna=True):

return Series(res,
index=ri,
name=self.name)
name=self._selection_name)

@deprecate_kwarg('take_last', 'keep',
mapping={True: 'last', False: 'first'})
@@ -3002,7 +3003,7 @@ def value_counts(self, normalize=False, sort=True, ascending=False,
# multi-index components
labels = list(map(rep, self.grouper.recons_labels)) + [lab[inc]]
levels = [ping.group_index for ping in self.grouper.groupings] + [lev]
names = self.grouper.names + [self.name]
names = self.grouper.names + [self._selection_name]

if dropna:
mask = labels[-1] != -1
@@ -3037,7 +3038,7 @@ def value_counts(self, normalize=False, sort=True, ascending=False,

if is_integer_dtype(out):
out = _ensure_int64(out)
return Series(out, index=mi, name=self.name)
return Series(out, index=mi, name=self._selection_name)

# for compat. with algos.value_counts need to ensure every
# bin is present at every index level, null filled with zeros
@@ -3068,7 +3069,7 @@ def value_counts(self, normalize=False, sort=True, ascending=False,

if is_integer_dtype(out):
out = _ensure_int64(out)
return Series(out, index=mi, name=self.name)
return Series(out, index=mi, name=self._selection_name)

def count(self):
""" Compute count of group, excluding missing values """
@@ -3081,7 +3082,7 @@ def count(self):

return Series(out,
index=self.grouper.result_index,
name=self.name,
name=self._selection_name,
dtype='int64')

def _apply_to_column_groupbys(self, func):
@@ -3191,7 +3192,7 @@ def aggregate(self, arg, *args, **kwargs):
try:
assert not args and not kwargs
result = self._aggregate_multiple_funcs(
[arg], _level=_level)
[arg], _level=_level, _axis=self.axis)
result.columns = Index(
result.columns.levels[0],
name=self._selected_obj.columns.name)
@@ -3422,7 +3423,7 @@ def first_non_None_value(values):
except (ValueError, AttributeError):
# GH1738: values is list of arrays of unequal lengths fall
# through to the outer else clause
return Series(values, index=key_index, name=self.name)
return Series(values, index=key_index,
name=self._selection_name)

# if we have date/time like in the original, then coerce dates
# as we are stacking can easily have object dtypes here
@@ -3445,8 +3447,9 @@ def first_non_None_value(values):
# only coerce dates if we find at least 1 datetime
coerce = True if any([isinstance(x, Timestamp)
for x in values]) else False
# self.name not passed through to Series as the result
# should not take the name of original selection of columns
# self._selection_name not passed through to Series as the
# result should not take the name of original selection
# of columns
return (Series(values, index=key_index)
._convert(datetime=True,
coerce=coerce))
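
The rename to _selection_name means an aggregated result still carries the name
of the selected column, while GroupBy objects no longer conflict with a
user-set .name attribute. For illustration (hypothetical data; output omitted):

    >>> import pandas as pd
    >>> df = pd.DataFrame({'g': ['a', 'a', 'b'], 'v': [1, 2, 3]})
    >>> out = df.groupby('g')['v'].agg('sum')
    >>> out.name   # 'v' -- taken from the selection, tracked internally as _selection_name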