From 2e45a27d7142cf52ce57924926b957a0a68187c3 Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Wed, 7 Feb 2018 08:06:46 -0500
Subject: [PATCH] API/BUG: .apply will correctly infer output shape when
 axis=1 (#18577)

closes #16353
closes #17348
closes #17437
closes #18573
closes #17970
closes #17892
closes #17602
closes #18775
closes #18901
closes #18919
---
 doc/source/basics.rst                   |  10 +-
 doc/source/whatsnew/v0.23.0.txt         |  73 ++++-
 pandas/core/apply.py                    | 288 ++++++++++++------
 pandas/core/frame.py                    | 136 ++++++++-
 pandas/core/sparse/frame.py             |  42 ++-
 pandas/io/formats/style.py              |   4 +-
 pandas/tests/frame/test_apply.py        | 386 ++++++++++++++++++++++--
 pandas/tests/sparse/frame/test_apply.py |  92 ++++++
 pandas/tests/sparse/frame/test_frame.py |  46 ---
 9 files changed, 885 insertions(+), 192 deletions(-)
 create mode 100644 pandas/tests/sparse/frame/test_apply.py

diff --git a/doc/source/basics.rst b/doc/source/basics.rst
index 18da53506f018..fb9e5a6cc75cb 100644
--- a/doc/source/basics.rst
+++ b/doc/source/basics.rst
@@ -793,8 +793,14 @@ The :meth:`~DataFrame.apply` method will also dispatch on a string method name.
    df.apply('mean')
    df.apply('mean', axis=1)
 
-Depending on the return type of the function passed to :meth:`~DataFrame.apply`,
-the result will either be of lower dimension or the same dimension.
+The return type of the function passed to :meth:`~DataFrame.apply` affects the
+type of the ultimate output from :meth:`~DataFrame.apply`:
+
+* If the applied function returns a ``Series``, the ultimate output is a ``DataFrame``.
+  The columns match the index of the ``Series`` returned by the applied function.
+* If the applied function returns any other type, the ultimate output is a ``Series``.
+* A ``result_type`` kwarg is accepted with the options ``reduce``, ``broadcast``, and ``expand``.
+  These determine whether list-like return values are expanded into a ``DataFrame`` or not.
 
 :meth:`~DataFrame.apply` combined with some cleverness can be used to answer many questions
 about a data set. For example, suppose we wanted to extract the date where the
diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
index 80c5352701540..1c6b698605521 100644
--- a/doc/source/whatsnew/v0.23.0.txt
+++ b/doc/source/whatsnew/v0.23.0.txt
@@ -142,7 +142,7 @@ Previous Behavior:
     4    NaN
     dtype: float64
 
-Current Behavior
+Current Behavior:
 
 .. ipython:: python
 
@@ -167,7 +167,7 @@ Previous Behavior:
     3    2.5
     dtype: float64
 
-Current Behavior
+Current Behavior:
 
 .. ipython:: python
 
@@ -332,6 +332,73 @@ Convert to an xarray DataArray
 
     p.to_xarray()
 
+.. _whatsnew_0230.api_breaking.apply:
+
+Apply Changes
+~~~~~~~~~~~~~
+
+:func:`DataFrame.apply` was inconsistent when applying an arbitrary user-defined function that returned a list-like with ``axis=1``. Several bugs and inconsistencies
+are resolved. If the applied function returns a ``Series``, then pandas will return a ``DataFrame``; otherwise a ``Series`` will be returned. This includes the case
+where a list-like (e.g. a ``tuple`` or ``list``) is returned (:issue:`16353`, :issue:`17437`, :issue:`17970`, :issue:`17348`, :issue:`17892`, :issue:`18573`,
+:issue:`17602`, :issue:`18775`, :issue:`18901`, :issue:`18919`)
+
+.. ipython:: python
+
+   df = pd.DataFrame(np.tile(np.arange(3), 6).reshape(6, -1) + 1, columns=['A', 'B', 'C'])
+   df
+
+Previous Behavior. If the returned shape happened to match the original columns, this would return a ``DataFrame``.
+If the return shape did not match, a ``Series`` with lists was returned.
+
+.. code-block:: python
+
+   In [3]: df.apply(lambda x: [1, 2, 3], axis=1)
+   Out[3]:
+      A  B  C
+   0  1  2  3
+   1  1  2  3
+   2  1  2  3
+   3  1  2  3
+   4  1  2  3
+   5  1  2  3
+
+   In [4]: df.apply(lambda x: [1, 2], axis=1)
+   Out[4]:
+   0    [1, 2]
+   1    [1, 2]
+   2    [1, 2]
+   3    [1, 2]
+   4    [1, 2]
+   5    [1, 2]
+   dtype: object
+
+
+New Behavior. The behavior is consistent. These will *always* return a ``Series``.
+
+.. ipython:: python
+
+   df.apply(lambda x: [1, 2, 3], axis=1)
+   df.apply(lambda x: [1, 2], axis=1)
+
+To have expanded columns, you can use ``result_type='expand'``.
+
+.. ipython:: python
+
+   df.apply(lambda x: [1, 2, 3], axis=1, result_type='expand')
+
+To broadcast the result across the original columns, you can use ``result_type='broadcast'``. The shape
+must match the original columns.
+
+.. ipython:: python
+
+   df.apply(lambda x: [1, 2, 3], axis=1, result_type='broadcast')
+
+Returning a ``Series`` allows one to control the exact return structure and column names:
+
+.. ipython:: python
+
+   df.apply(lambda x: Series([1, 2, 3], index=x.index), axis=1)
+
 
 .. _whatsnew_0230.api_breaking.build_changes:
 
@@ -456,6 +523,8 @@ Deprecations
 - The ``is_copy`` attribute is deprecated and will be removed in a future version (:issue:`18801`).
 - ``IntervalIndex.from_intervals`` is deprecated in favor of the :class:`IntervalIndex` constructor (:issue:`19263`)
 - :func:``DataFrame.from_items`` is deprecated. Use :func:``DataFrame.from_dict()`` instead, or :func:``DataFrame.from_dict(OrderedDict())`` if you wish to preserve the key order (:issue:`17320`)
+- The ``broadcast`` parameter of ``.apply()`` is deprecated in favor of ``result_type='broadcast'`` (:issue:`18577`)
+- The ``reduce`` parameter of ``.apply()`` is deprecated in favor of ``result_type='reduce'`` (:issue:`18577`)
 
 .. _whatsnew_0230.prior_deprecations:
 
diff --git a/pandas/core/apply.py b/pandas/core/apply.py
index 4cdec54b9a07a..c65943fbbb201 100644
--- a/pandas/core/apply.py
+++ b/pandas/core/apply.py
@@ -1,15 +1,20 @@
+import warnings
 import numpy as np
 
 from pandas import compat
 from pandas._libs import reduction
+from pandas.core.dtypes.generic import ABCSeries
 from pandas.core.dtypes.common import (
     is_extension_type,
     is_sequence)
+from pandas.util._decorators import cache_readonly
 
 from pandas.io.formats.printing import pprint_thing
 
 
-def frame_apply(obj, func, axis=0, broadcast=False,
-                raw=False, reduce=None, args=(), **kwds):
+def frame_apply(obj, func, axis=0, broadcast=None,
+                raw=False, reduce=None, result_type=None,
+                ignore_failures=False,
+                args=None, kwds=None):
     """ construct and return a row or column based frame apply object """
 
     axis = obj._get_axis_number(axis)
@@ -19,20 +24,49 @@ def frame_apply(obj, func, axis=0, broadcast=False,
         klass = FrameColumnApply
 
     return klass(obj, func, broadcast=broadcast,
-                 raw=raw, reduce=reduce, args=args, kwds=kwds)
+                 raw=raw, reduce=reduce, result_type=result_type,
+                 ignore_failures=ignore_failures,
+                 args=args, kwds=kwds)
 
 
 class FrameApply(object):
 
-    def __init__(self, obj, func, broadcast, raw, reduce, args, kwds):
+    def __init__(self, obj, func, broadcast, raw, reduce, result_type,
+                 ignore_failures, args, kwds):
         self.obj = obj
-        self.broadcast = broadcast
         self.raw = raw
-        self.reduce = reduce
-        self.args = args
-
-        self.ignore_failures = kwds.pop('ignore_failures', False)
-        self.kwds = kwds
+        self.ignore_failures = ignore_failures
+        self.args = args or ()
+        self.kwds = kwds or {}
+
+        if result_type not in [None, 'reduce', 'broadcast', 'expand']:
+            raise ValueError("invalid value for result_type, must be one "
+                             "of {None, 'reduce', 
'broadcast', 'expand'}") + + if broadcast is not None: + warnings.warn("The broadcast argument is deprecated and will " + "be removed in a future version. You can specify " + "result_type='broadcast' to broadcast the result " + "to the original dimensions", + FutureWarning, stacklevel=4) + if broadcast: + result_type = 'broadcast' + + if reduce is not None: + warnings.warn("The reduce argument is deprecated and will " + "be removed in a future version. You can specify " + "result_type='reduce' to try to reduce the result " + "to the original dimensions", + FutureWarning, stacklevel=4) + if reduce: + + if result_type is not None: + raise ValueError( + "cannot pass both reduce=True and result_type") + + result_type = 'reduce' + + self.result_type = result_type # curry if needed if kwds or args and not isinstance(func, np.ufunc): @@ -43,6 +77,11 @@ def f(x): self.f = f + # results + self.result = None + self.res_index = None + self.res_columns = None + @property def columns(self): return self.obj.columns @@ -51,10 +90,14 @@ def columns(self): def index(self): return self.obj.index - @property + @cache_readonly def values(self): return self.obj.values + @cache_readonly + def dtypes(self): + return self.obj.dtypes + @property def agg_axis(self): return self.obj._get_agg_axis(self.axis) @@ -68,8 +111,7 @@ def get_result(self): # string dispatch if isinstance(self.f, compat.string_types): - if self.axis: - self.kwds['axis'] = self.axis + self.kwds['axis'] = self.axis return getattr(self.obj, self.f)(*self.args, **self.kwds) # ufunc @@ -80,25 +122,37 @@ def get_result(self): columns=self.columns, copy=False) # broadcasting - if self.broadcast: + if self.result_type == 'broadcast': return self.apply_broadcast() # one axis empty - if not all(self.obj.shape): + elif not all(self.obj.shape): return self.apply_empty_result() # raw - if self.raw and not self.obj._is_mixed_type: + elif self.raw and not self.obj._is_mixed_type: return self.apply_raw() return self.apply_standard() def apply_empty_result(self): - from pandas import Series - reduce = self.reduce + """ + we have an empty result; at least 1 axis is 0 + + we will try to apply the function to an empty + series in order to see if this is a reduction function + """ + + # we are not asked to reduce or infer reduction + # so just return a copy of the existing object + if self.result_type not in ['reduce', None]: + return self.obj.copy() + + # we may need to infer + reduce = self.result_type == 'reduce' - if reduce is None: - reduce = False + from pandas import Series + if not reduce: EMPTY_SERIES = Series([]) try: @@ -113,6 +167,8 @@ def apply_empty_result(self): return self.obj.copy() def apply_raw(self): + """ apply to the values as a numpy array """ + try: result = reduction.reduce(self.values, self.f, axis=self.axis) except Exception: @@ -125,49 +181,70 @@ def apply_raw(self): else: return Series(result, index=self.agg_axis) - def apply_standard(self): - from pandas import Series + def apply_broadcast(self, target): + result_values = np.empty_like(target.values) + + # axis which we want to compare compliance + result_compare = target.shape[0] + + for i, col in enumerate(target.columns): + res = self.f(target[col]) + ares = np. 
asarray(res).ndim + + # must be a scalar or 1d + if ares > 1: + raise ValueError("too many dims to broadcast") + elif ares == 1: + + # must match return dim + if result_compare != len(res): + raise ValueError("cannot broadcast result") - reduce = self.reduce - if reduce is None: - reduce = True + result_values[:, i] = res + + # we *always* preserve the original index / columns + result = self.obj._constructor(result_values, + index=target.index, + columns=target.columns) + return result + + def apply_standard(self): # try to reduce first (by default) # this only matters if the reduction in values is of different dtype # e.g. if we want to apply to a SparseFrame, then can't directly reduce - if reduce: - values = self.values - # we cannot reduce using non-numpy dtypes, - # as demonstrated in gh-12244 - if not is_extension_type(values): + # we cannot reduce using non-numpy dtypes, + # as demonstrated in gh-12244 + if (self.result_type in ['reduce', None] and + not self.dtypes.apply(is_extension_type).any()): - # Create a dummy Series from an empty array - index = self.obj._get_axis(self.axis) - empty_arr = np.empty(len(index), dtype=values.dtype) - - dummy = Series(empty_arr, index=index, dtype=values.dtype) + # Create a dummy Series from an empty array + from pandas import Series + values = self.values + index = self.obj._get_axis(self.axis) + labels = self.agg_axis + empty_arr = np.empty(len(index), dtype=values.dtype) + dummy = Series(empty_arr, index=index, dtype=values.dtype) - try: - labels = self.agg_axis - result = reduction.reduce(values, self.f, - axis=self.axis, - dummy=dummy, - labels=labels) - return Series(result, index=labels) - except Exception: - pass + try: + result = reduction.reduce(values, self.f, + axis=self.axis, + dummy=dummy, + labels=labels) + return Series(result, index=labels) + except Exception: + pass # compute the result using the series generator - results, res_index, res_columns = self._apply_series_generator() + self.apply_series_generator() # wrap results - return self.wrap_results(results, res_index, res_columns) + return self.wrap_results() - def _apply_series_generator(self): + def apply_series_generator(self): series_gen = self.series_generator res_index = self.result_index - res_columns = self.result_columns i = None keys = [] @@ -201,40 +278,23 @@ def _apply_series_generator(self): pprint_thing(k), ) raise - return results, res_index, res_columns + self.results = results + self.res_index = res_index + self.res_columns = self.result_columns - def wrap_results(self, results, res_index, res_columns): - from pandas import Series + def wrap_results(self): + results = self.results + # see if we can infer the results if len(results) > 0 and is_sequence(results[0]): - if not isinstance(results[0], Series): - index = res_columns - else: - index = None - result = self.obj._constructor(data=results, index=index) - result.columns = res_index + return self.wrap_results_for_axis() - if self.axis == 1: - result = result.T - result = result._convert( - datetime=True, timedelta=True, copy=False) - - else: - - result = Series(results) - result.index = res_index - - return result - - def _apply_broadcast(self, target): - result_values = np.empty_like(target.values) - columns = target.columns - for i, col in enumerate(columns): - result_values[:, i] = self.f(target[col]) + # dict of scalars + from pandas import Series + result = Series(results) + result.index = self.res_index - result = self.obj._constructor(result_values, index=target.index, - columns=target.columns) 
return result @@ -251,7 +311,7 @@ def get_result(self): return super(FrameRowApply, self).get_result() def apply_broadcast(self): - return self._apply_broadcast(self.obj) + return super(FrameRowApply, self).apply_broadcast(self.obj) @property def series_generator(self): @@ -266,29 +326,37 @@ def result_index(self): def result_columns(self): return self.index + def wrap_results_for_axis(self): + """ return the results for the rows """ -class FrameColumnApply(FrameApply): - axis = 1 + results = self.results + result = self.obj._constructor(data=results) - def __init__(self, obj, func, broadcast, raw, reduce, args, kwds): - super(FrameColumnApply, self).__init__(obj, func, broadcast, - raw, reduce, args, kwds) + if not isinstance(results[0], ABCSeries): + try: + result.index = self.res_columns + except ValueError: + pass - # skip if we are mixed datelike and trying reduce across axes - # GH6125 - if self.reduce: - if self.obj._is_mixed_type and self.obj._is_datelike_mixed_type: - self.reduce = False + try: + result.columns = self.res_index + except ValueError: + pass + + return result + + +class FrameColumnApply(FrameApply): + axis = 1 def apply_broadcast(self): - return self._apply_broadcast(self.obj.T).T + result = super(FrameColumnApply, self).apply_broadcast(self.obj.T) + return result.T @property def series_generator(self): - from pandas import Series - dtype = object if self.obj._is_mixed_type else None - return (Series._from_array(arr, index=self.columns, name=name, - dtype=dtype) + constructor = self.obj._constructor_sliced + return (constructor(arr, index=self.columns, name=name) for i, (arr, name) in enumerate(zip(self.values, self.index))) @@ -299,3 +367,39 @@ def result_index(self): @property def result_columns(self): return self.columns + + def wrap_results_for_axis(self): + """ return the results for the columns """ + results = self.results + + # we have requested to expand + if self.result_type == 'expand': + result = self.infer_to_same_shape() + + # we have a non-series and don't want inference + elif not isinstance(results[0], ABCSeries): + from pandas import Series + + result = Series(results) + result.index = self.res_index + + # we may want to infer results + else: + result = self.infer_to_same_shape() + + return result + + def infer_to_same_shape(self): + """ infer the results to the same shape as the input object """ + results = self.results + + result = self.obj._constructor(data=results) + result = result.T + + # set the index + result.index = self.res_index + + # infer dtypes + result = result.infer_objects() + + return result diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b0ead3f0c7f00..9487f51919108 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4820,8 +4820,8 @@ def aggregate(self, func, axis=0, *args, **kwargs): agg = aggregate - def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None, - args=(), **kwds): + def apply(self, func, axis=0, broadcast=None, raw=False, reduce=None, + result_type=None, args=(), **kwds): """Applies function along input axis of DataFrame. Objects passed to functions are Series objects having index @@ -4836,9 +4836,14 @@ def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None, axis : {0 or 'index', 1 or 'columns'}, default 0 * 0 or 'index': apply function to each column * 1 or 'columns': apply function to each row - broadcast : boolean, default False + broadcast : boolean, optional For aggregation functions, return object of same size with values propagated + + .. 
deprecated:: 0.23.0
+            This argument will be removed in a future version, replaced
+            by result_type='broadcast'.
+
         raw : boolean, default False
             If False, convert each row or column into a Series. If raw=True
             the passed function will receive ndarray objects instead. If you are
@@ -4852,6 +4857,24 @@ def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None,
             while guessing, exceptions raised by func will be ignored). If
             reduce is True a Series will always be returned, and if False a
             DataFrame will always be returned.
+
+            .. deprecated:: 0.23.0
+               This argument will be removed in a future version, replaced
+               by result_type='reduce'.
+
+        result_type : {'expand', 'reduce', 'broadcast', None}
+            These only act when axis=1 (columns):
+            * 'expand' : list-like results will be turned into columns.
+            * 'reduce' : return a Series if possible rather than expanding
+              list-like results. This is the opposite of 'expand'.
+            * 'broadcast' : results will be broadcast to the original shape
+              of the frame, the original index & columns will be retained.
+            * None : list-like results will be returned as a list
+              in a single column. However if the apply function
+              returns a Series these are expanded to columns.
+
+            .. versionadded:: 0.23.0
+
         args : tuple
             Positional arguments to pass to function in addition to the
            array/series
@@ -4867,9 +4890,96 @@ def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None,
 
         Examples
         --------
-        >>> df.apply(numpy.sqrt) # returns DataFrame
-        >>> df.apply(numpy.sum, axis=0) # equiv to df.sum(0)
-        >>> df.apply(numpy.sum, axis=1) # equiv to df.sum(1)
+
+        We use this DataFrame to illustrate
+
+        >>> df = DataFrame(np.tile(np.arange(3), 6).reshape(6, -1) + 1,
+        ...                columns=['A', 'B', 'C'])
+        >>> df
+           A  B  C
+        0  1  2  3
+        1  1  2  3
+        2  1  2  3
+        3  1  2  3
+        4  1  2  3
+        5  1  2  3
+
+        Using a ufunc
+
+        >>> df.apply(np.sqrt)
+             A         B         C
+        0  1.0  1.414214  1.732051
+        1  1.0  1.414214  1.732051
+        2  1.0  1.414214  1.732051
+        3  1.0  1.414214  1.732051
+        4  1.0  1.414214  1.732051
+        5  1.0  1.414214  1.732051
+
+        Using a reducing function on either axis
+
+        >>> df.apply(np.sum, axis=0)
+        A     6
+        B    12
+        C    18
+        dtype: int64
+
+        >>> df.apply(np.sum, axis=1)
+        0    6
+        1    6
+        2    6
+        3    6
+        4    6
+        5    6
+        dtype: int64
+
+        Returning a list-like will result in a Series
+
+        >>> df.apply(lambda x: [1, 2], axis=1)
+        0    [1, 2]
+        1    [1, 2]
+        2    [1, 2]
+        3    [1, 2]
+        4    [1, 2]
+        5    [1, 2]
+
+        Passing result_type='expand' will expand list-like results
+        to columns of a DataFrame
+
+        >>> df.apply(lambda x: [1, 2], axis=1, result_type='expand')
+           0  1
+        0  1  2
+        1  1  2
+        2  1  2
+        3  1  2
+        4  1  2
+        5  1  2
+
+        Returning a Series inside the function is similar to passing
+        result_type='expand'. The resulting column names
+        will be the Series index.
+
+        >>> df.apply(lambda x: Series([1, 2], index=['foo', 'bar']), axis=1)
+           foo  bar
+        0    1    2
+        1    1    2
+        2    1    2
+        3    1    2
+        4    1    2
+        5    1    2
+
+
+        Passing result_type='broadcast' will take a same-shape
+        result, whether list-like or scalar, and broadcast it
+        along the axis. The resulting column names will be the originals.
+ + >>> df.apply(lambda x: [1, 2, 3], axis=1, result_type='broadcast') + A B C + 0 1 2 3 + 1 1 2 3 + 2 1 2 3 + 3 1 2 3 + 4 1 2 3 + 5 1 2 3 See also -------- @@ -4888,7 +4998,9 @@ def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None, broadcast=broadcast, raw=raw, reduce=reduce, - args=args, **kwds) + result_type=result_type, + args=args, + kwds=kwds) return op.get_result() def applymap(self, func): @@ -5592,12 +5704,16 @@ def f(x): # numeric_only and yet we have tried a # column-by-column reduction, where we have mixed type. # So let's just do what we can - result = self.apply(f, reduce=False, - ignore_failures=True) + from pandas.core.apply import frame_apply + opa = frame_apply(self, + func=f, + result_type='expand', + ignore_failures=True) + result = opa.get_result() if result.ndim == self.ndim: result = result.iloc[0] return result - except: + except Exception: pass if filter_type is None or filter_type == 'numeric': diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index 122c2b11f25f9..371377ce2899c 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -829,7 +829,8 @@ def notna(self): return self._apply_columns(lambda x: x.notna()) notnull = notna - def apply(self, func, axis=0, broadcast=False, reduce=False): + def apply(self, func, axis=0, broadcast=None, reduce=None, + result_type=None): """ Analogous to DataFrame.apply, for SparseDataFrame @@ -842,6 +843,35 @@ def apply(self, func, axis=0, broadcast=False, reduce=False): For aggregation functions, return object of same size with values propagated + .. deprecated:: 0.23.0 + This argument will be removed in a future version, replaced + by result_type='broadcast'. + + reduce : boolean or None, default None + Try to apply reduction procedures. If the DataFrame is empty, + apply will use reduce to determine whether the result should be a + Series or a DataFrame. If reduce is None (the default), apply's + return value will be guessed by calling func an empty Series (note: + while guessing, exceptions raised by func will be ignored). If + reduce is True a Series will always be returned, and if False a + DataFrame will always be returned. + + .. deprecated:: 0.23.0 + This argument will be removed in a future version, replaced + by result_type='reduce'. + + result_type : {'expand', 'reduce', 'broadcast, None} + These only act when axis=1 {columns} + * 'expand' : list-like results will be turned into columns + * 'reduce' : return a Series if possible rather than expanding + list-like results. This is the opposite to 'expand' + * 'broadcast' : scalar results will be broadcast to all columns + * None : list-like results will be returned as a list + in a single column. However if the apply function + returns a Series these are expanded to columns. + + .. 
versionadded:: 0.23.0 + Returns ------- applied : Series or SparseDataFrame @@ -865,12 +895,10 @@ def apply(self, func, axis=0, broadcast=False, reduce=False): op = frame_apply(self, func=func, axis=axis, - reduce=reduce) - - if broadcast: - return op.apply_broadcast() - - return op.apply_standard() + reduce=reduce, + broadcast=broadcast, + result_type=result_type) + return op.get_result() def applymap(self, func): """ diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 20e72dd6bde91..525f487d8aa39 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -509,7 +509,9 @@ def _apply(self, func, axis=0, subset=None, **kwargs): subset = _non_reducing_slice(subset) data = self.data.loc[subset] if axis is not None: - result = data.apply(func, axis=axis, **kwargs) + result = data.apply(func, axis=axis, + result_type='expand', **kwargs) + result.columns = data.columns else: result = func(data, **kwargs) if not isinstance(result, pd.DataFrame): diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index d69ddcd8f14d4..d1ad9f71e6350 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -82,24 +82,30 @@ def test_apply_empty(self): rs = xp.apply(lambda x: x['a'], axis=1) assert_frame_equal(xp, rs) + def test_apply_with_reduce_empty(self): # reduce with an empty DataFrame x = [] - result = self.empty.apply(x.append, axis=1, reduce=False) + result = self.empty.apply(x.append, axis=1, result_type='expand') assert_frame_equal(result, self.empty) - result = self.empty.apply(x.append, axis=1, reduce=True) + result = self.empty.apply(x.append, axis=1, result_type='reduce') assert_series_equal(result, Series( [], index=pd.Index([], dtype=object))) empty_with_cols = DataFrame(columns=['a', 'b', 'c']) - result = empty_with_cols.apply(x.append, axis=1, reduce=False) + result = empty_with_cols.apply(x.append, axis=1, result_type='expand') assert_frame_equal(result, empty_with_cols) - result = empty_with_cols.apply(x.append, axis=1, reduce=True) + result = empty_with_cols.apply(x.append, axis=1, result_type='reduce') assert_series_equal(result, Series( [], index=pd.Index([], dtype=object))) # Ensure that x.append hasn't been called assert x == [] + def test_apply_deprecate_reduce(self): + with warnings.catch_warnings(record=True): + x = [] + self.empty.apply(x.append, axis=1, result_type='reduce') + def test_apply_standard_nonunique(self): df = DataFrame( [[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=['a', 'a', 'c']) @@ -121,17 +127,79 @@ def test_with_string_args(self): expected = getattr(self.frame, arg)(axis=1) tm.assert_series_equal(result, expected) + def test_apply_broadcast_deprecated(self): + with tm.assert_produces_warning(FutureWarning): + self.frame.apply(np.mean, broadcast=True) + def test_apply_broadcast(self): - broadcasted = self.frame.apply(np.mean, broadcast=True) - agged = self.frame.apply(np.mean) - for col, ts in compat.iteritems(broadcasted): - assert (ts == agged[col]).all() + # scalars + result = self.frame.apply(np.mean, result_type='broadcast') + expected = DataFrame([self.frame.mean()], index=self.frame.index) + tm.assert_frame_equal(result, expected) + + result = self.frame.apply(np.mean, axis=1, result_type='broadcast') + m = self.frame.mean(axis=1) + expected = DataFrame({c: m for c in self.frame.columns}) + tm.assert_frame_equal(result, expected) + + # lists + result = self.frame.apply( + lambda x: list(range(len(self.frame.columns))), + axis=1, + result_type='broadcast') + m = 
list(range(len(self.frame.columns))) + expected = DataFrame([m] * len(self.frame.index), + dtype='float64', + index=self.frame.index, + columns=self.frame.columns) + tm.assert_frame_equal(result, expected) - broadcasted = self.frame.apply(np.mean, axis=1, broadcast=True) - agged = self.frame.apply(np.mean, axis=1) - for idx in broadcasted.index: - assert (broadcasted.xs(idx) == agged[idx]).all() + result = self.frame.apply(lambda x: list(range(len(self.frame.index))), + result_type='broadcast') + m = list(range(len(self.frame.index))) + expected = DataFrame({c: m for c in self.frame.columns}, + dtype='float64', + index=self.frame.index) + tm.assert_frame_equal(result, expected) + + # preserve columns + df = DataFrame(np.tile(np.arange(3), 6).reshape(6, -1) + 1, + columns=list('ABC')) + result = df.apply(lambda x: [1, 2, 3], + axis=1, + result_type='broadcast') + tm.assert_frame_equal(result, df) + + df = DataFrame(np.tile(np.arange(3), 6).reshape(6, -1) + 1, + columns=list('ABC')) + result = df.apply(lambda x: Series([1, 2, 3], index=list('abc')), + axis=1, + result_type='broadcast') + expected = df.copy() + tm.assert_frame_equal(result, expected) + + def test_apply_broadcast_error(self): + df = DataFrame( + np.tile(np.arange(3, dtype='int64'), 6).reshape(6, -1) + 1, + columns=['A', 'B', 'C']) + + # > 1 ndim + with pytest.raises(ValueError): + df.apply(lambda x: np.array([1, 2]).reshape(-1, 2), + axis=1, + result_type='broadcast') + + # cannot broadcast + with pytest.raises(ValueError): + df.apply(lambda x: [1, 2], + axis=1, + result_type='broadcast') + + with pytest.raises(ValueError): + df.apply(lambda x: Series([1, 2]), + axis=1, + result_type='broadcast') def test_apply_raw(self): result0 = self.frame.apply(np.mean, raw=True) @@ -208,7 +276,7 @@ def _checkit(axis=0, raw=False): _check(no_index, lambda x: x) _check(no_index, lambda x: x.mean()) - result = no_cols.apply(lambda x: x.mean(), broadcast=True) + result = no_cols.apply(lambda x: x.mean(), result_type='broadcast') assert isinstance(result, DataFrame) def test_apply_with_args_kwds(self): @@ -350,33 +418,37 @@ def test_apply_attach_name(self): result = self.frame.apply(lambda x: np.repeat(x.name, len(x)), axis=1) - expected = DataFrame(np.tile(self.frame.index, - (len(self.frame.columns), 1)).T, - index=self.frame.index, - columns=self.frame.columns) - assert_frame_equal(result, expected) + expected = Series(np.repeat(t[0], len(self.frame.columns)) + for t in self.frame.itertuples()) + expected.index = self.frame.index + assert_series_equal(result, expected) def test_apply_multi_index(self): - s = DataFrame([[1, 2], [3, 4], [5, 6]]) - s.index = MultiIndex.from_arrays([['a', 'a', 'b'], ['c', 'd', 'd']]) - s.columns = ['col1', 'col2'] - res = s.apply(lambda x: Series({'min': min(x), 'max': max(x)}), 1) - assert isinstance(res.index, MultiIndex) + index = MultiIndex.from_arrays([['a', 'a', 'b'], ['c', 'd', 'd']]) + s = DataFrame([[1, 2], [3, 4], [5, 6]], + index=index, + columns=['col1', 'col2']) + result = s.apply( + lambda x: Series({'min': min(x), 'max': max(x)}), 1) + expected = DataFrame([[1, 2], [3, 4], [5, 6]], + index=index, + columns=['min', 'max']) + assert_frame_equal(result, expected, check_like=True) def test_apply_dict(self): # GH 8735 A = DataFrame([['foo', 'bar'], ['spam', 'eggs']]) - A_dicts = pd.Series([dict([(0, 'foo'), (1, 'spam')]), - dict([(0, 'bar'), (1, 'eggs')])]) + A_dicts = Series([dict([(0, 'foo'), (1, 'spam')]), + dict([(0, 'bar'), (1, 'eggs')])]) B = DataFrame([[0, 1], [2, 3]]) - B_dicts = 
pd.Series([dict([(0, 0), (1, 2)]), dict([(0, 1), (1, 3)])]) + B_dicts = Series([dict([(0, 0), (1, 2)]), dict([(0, 1), (1, 3)])]) fn = lambda x: x.to_dict() for df, dicts in [(A, A_dicts), (B, B_dicts)]: - reduce_true = df.apply(fn, reduce=True) - reduce_false = df.apply(fn, reduce=False) - reduce_none = df.apply(fn, reduce=None) + reduce_true = df.apply(fn, result_type='reduce') + reduce_false = df.apply(fn, result_type='expand') + reduce_none = df.apply(fn) assert_series_equal(reduce_true, dicts) assert_frame_equal(reduce_false, df) @@ -465,8 +537,8 @@ def test_frame_apply_dont_convert_datetime64(self): assert df.x1.dtype == 'M8[ns]' - # See gh-12244 def test_apply_non_numpy_dtype(self): + # See gh-12244 df = DataFrame({'dt': pd.date_range( "2015-01-01", periods=3, tz='Europe/Brussels')}) result = df.apply(lambda x: x) @@ -482,6 +554,256 @@ def test_apply_non_numpy_dtype(self): assert_frame_equal(result, df) +class TestInferOutputShape(object): + # the user has supplied an opaque UDF where + # they are transforming the input that requires + # us to infer the output + + def test_infer_row_shape(self): + # gh-17437 + # if row shape is changing, infer it + df = pd.DataFrame(np.random.rand(10, 2)) + result = df.apply(np.fft.fft, axis=0) + assert result.shape == (10, 2) + + result = df.apply(np.fft.rfft, axis=0) + assert result.shape == (6, 2) + + def test_with_dictlike_columns(self): + # gh 17602 + df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) + result = df.apply(lambda x: {'s': x['a'] + x['b']}, + axis=1) + expected = Series([{'s': 3} for t in df.itertuples()]) + assert_series_equal(result, expected) + + df['tm'] = [pd.Timestamp('2017-05-01 00:00:00'), + pd.Timestamp('2017-05-02 00:00:00')] + result = df.apply(lambda x: {'s': x['a'] + x['b']}, + axis=1) + assert_series_equal(result, expected) + + # compose a series + result = (df['a'] + df['b']).apply(lambda x: {'s': x}) + expected = Series([{'s': 3}, {'s': 3}]) + assert_series_equal(result, expected) + + # gh-18775 + df = DataFrame() + df["author"] = ["X", "Y", "Z"] + df["publisher"] = ["BBC", "NBC", "N24"] + df["date"] = pd.to_datetime(['17-10-2010 07:15:30', + '13-05-2011 08:20:35', + '15-01-2013 09:09:09']) + result = df.apply(lambda x: {}, axis=1) + expected = Series([{}, {}, {}]) + assert_series_equal(result, expected) + + def test_with_dictlike_columns_with_infer(self): + # gh 17602 + df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) + result = df.apply(lambda x: {'s': x['a'] + x['b']}, + axis=1, result_type='expand') + expected = DataFrame({'s': [3, 3]}) + assert_frame_equal(result, expected) + + df['tm'] = [pd.Timestamp('2017-05-01 00:00:00'), + pd.Timestamp('2017-05-02 00:00:00')] + result = df.apply(lambda x: {'s': x['a'] + x['b']}, + axis=1, result_type='expand') + assert_frame_equal(result, expected) + + def test_with_listlike_columns(self): + # gh-17348 + df = DataFrame({'a': Series(np.random.randn(4)), + 'b': ['a', 'list', 'of', 'words'], + 'ts': date_range('2016-10-01', periods=4, freq='H')}) + + result = df[['a', 'b']].apply(tuple, axis=1) + expected = Series([t[1:] for t in df[['a', 'b']].itertuples()]) + assert_series_equal(result, expected) + + result = df[['a', 'ts']].apply(tuple, axis=1) + expected = Series([t[1:] for t in df[['a', 'ts']].itertuples()]) + assert_series_equal(result, expected) + + # gh-18919 + df = DataFrame({'x': Series([['a', 'b'], ['q']]), + 'y': Series([['z'], ['q', 't']])}) + df.index = MultiIndex.from_tuples([('i0', 'j0'), ('i1', 'j1')]) + + result = df.apply( + lambda row: [el for el in 
row['x'] if el in row['y']], + axis=1) + expected = Series([[], ['q']], index=df.index) + assert_series_equal(result, expected) + + def test_infer_output_shape_columns(self): + # gh-18573 + + df = DataFrame({'number': [1., 2.], + 'string': ['foo', 'bar'], + 'datetime': [pd.Timestamp('2017-11-29 03:30:00'), + pd.Timestamp('2017-11-29 03:45:00')]}) + result = df.apply(lambda row: (row.number, row.string), axis=1) + expected = Series([t[2:] for t in df.itertuples()]) + assert_series_equal(result, expected) + + def test_infer_output_shape_listlike_columns(self): + # gh-16353 + + df = DataFrame(np.random.randn(6, 3), columns=['A', 'B', 'C']) + + result = df.apply(lambda x: [1, 2, 3], axis=1) + expected = Series([[1, 2, 3] for t in df.itertuples()]) + assert_series_equal(result, expected) + + result = df.apply(lambda x: [1, 2], axis=1) + expected = Series([[1, 2] for t in df.itertuples()]) + assert_series_equal(result, expected) + + # gh-17970 + df = DataFrame({"a": [1, 2, 3]}, index=list('abc')) + + result = df.apply(lambda row: np.ones(1), axis=1) + expected = Series([np.ones(1) for t in df.itertuples()], + index=df.index) + assert_series_equal(result, expected) + + result = df.apply(lambda row: np.ones(2), axis=1) + expected = Series([np.ones(2) for t in df.itertuples()], + index=df.index) + assert_series_equal(result, expected) + + # gh-17892 + df = pd.DataFrame({'a': [pd.Timestamp('2010-02-01'), + pd.Timestamp('2010-02-04'), + pd.Timestamp('2010-02-05'), + pd.Timestamp('2010-02-06')], + 'b': [9, 5, 4, 3], + 'c': [5, 3, 4, 2], + 'd': [1, 2, 3, 4]}) + + def fun(x): + return (1, 2) + + result = df.apply(fun, axis=1) + expected = Series([(1, 2) for t in df.itertuples()]) + assert_series_equal(result, expected) + + def test_consistent_coerce_for_shapes(self): + # we want column names to NOT be propagated + # just because the shape matches the input shape + df = DataFrame(np.random.randn(4, 3), columns=['A', 'B', 'C']) + + result = df.apply(lambda x: [1, 2, 3], axis=1) + expected = Series([[1, 2, 3] for t in df.itertuples()]) + assert_series_equal(result, expected) + + result = df.apply(lambda x: [1, 2], axis=1) + expected = Series([[1, 2] for t in df.itertuples()]) + assert_series_equal(result, expected) + + def test_consistent_names(self): + # if a Series is returned, we should use the resulting index names + df = DataFrame( + np.tile(np.arange(3, dtype='int64'), 6).reshape(6, -1) + 1, + columns=['A', 'B', 'C']) + + result = df.apply(lambda x: Series([1, 2, 3], + index=['test', 'other', 'cols']), + axis=1) + expected = DataFrame( + np.tile(np.arange(3, dtype='int64'), 6).reshape(6, -1) + 1, + columns=['test', 'other', 'cols']) + assert_frame_equal(result, expected) + + result = df.apply( + lambda x: pd.Series([1, 2], index=['test', 'other']), axis=1) + expected = DataFrame( + np.tile(np.arange(2, dtype='int64'), 6).reshape(6, -1) + 1, + columns=['test', 'other']) + assert_frame_equal(result, expected) + + def test_result_type(self): + # result_type should be consistent no matter which + # path we take in the code + df = DataFrame( + np.tile(np.arange(3, dtype='int64'), 6).reshape(6, -1) + 1, + columns=['A', 'B', 'C']) + + result = df.apply(lambda x: [1, 2, 3], axis=1, result_type='expand') + expected = df.copy() + expected.columns = [0, 1, 2] + assert_frame_equal(result, expected) + + result = df.apply(lambda x: [1, 2], axis=1, result_type='expand') + expected = df[['A', 'B']].copy() + expected.columns = [0, 1] + assert_frame_equal(result, expected) + + # broadcast result + result = 
df.apply(lambda x: [1, 2, 3], axis=1, result_type='broadcast') + expected = df.copy() + assert_frame_equal(result, expected) + + columns = ['other', 'col', 'names'] + result = df.apply( + lambda x: pd.Series([1, 2, 3], + index=columns), + axis=1, + result_type='broadcast') + expected = df.copy() + assert_frame_equal(result, expected) + + # series result + result = df.apply(lambda x: Series([1, 2, 3], index=x.index), axis=1) + expected = df.copy() + assert_frame_equal(result, expected) + + # series result with other index + columns = ['other', 'col', 'names'] + result = df.apply( + lambda x: pd.Series([1, 2, 3], index=columns), + axis=1) + expected = df.copy() + expected.columns = columns + assert_frame_equal(result, expected) + + @pytest.mark.parametrize("result_type", ['foo', 1]) + def test_result_type_error(self, result_type): + # allowed result_type + df = DataFrame( + np.tile(np.arange(3, dtype='int64'), 6).reshape(6, -1) + 1, + columns=['A', 'B', 'C']) + + with pytest.raises(ValueError): + df.apply(lambda x: [1, 2, 3], + axis=1, + result_type=result_type) + + @pytest.mark.parametrize( + "box", + [lambda x: list(x), + lambda x: tuple(x), + lambda x: np.array(x, dtype='int64')], + ids=['list', 'tuple', 'array']) + def test_consistency_for_boxed(self, box): + # passing an array or list should not affect the output shape + df = DataFrame( + np.tile(np.arange(3, dtype='int64'), 6).reshape(6, -1) + 1, + columns=['A', 'B', 'C']) + + result = df.apply(lambda x: box([1, 2]), axis=1) + expected = Series([box([1, 2]) for t in df.itertuples()]) + assert_series_equal(result, expected) + + result = df.apply(lambda x: box([1, 2]), axis=1, result_type='expand') + expected = DataFrame( + np.tile(np.arange(2, dtype='int64'), 6).reshape(6, -1) + 1) + assert_frame_equal(result, expected) + + def zip_frames(*frames): """ take a list of frames, zip the columns together for each @@ -657,13 +979,13 @@ def test_non_callable_aggregates(self): # Function aggregate result = df.agg({'A': 'count'}) - expected = pd.Series({'A': 2}) + expected = Series({'A': 2}) assert_series_equal(result, expected) # Non-function aggregate result = df.agg({'A': 'size'}) - expected = pd.Series({'A': 3}) + expected = Series({'A': 3}) assert_series_equal(result, expected) diff --git a/pandas/tests/sparse/frame/test_apply.py b/pandas/tests/sparse/frame/test_apply.py new file mode 100644 index 0000000000000..07e4b1bf7c913 --- /dev/null +++ b/pandas/tests/sparse/frame/test_apply.py @@ -0,0 +1,92 @@ +import pytest +import numpy as np +from pandas import SparseDataFrame, DataFrame, Series, bdate_range +from pandas.core import nanops +from pandas.util import testing as tm + + +@pytest.fixture +def dates(): + return bdate_range('1/1/2011', periods=10) + + +@pytest.fixture +def empty(): + return SparseDataFrame() + + +@pytest.fixture +def frame(dates): + data = {'A': [np.nan, np.nan, np.nan, 0, 1, 2, 3, 4, 5, 6], + 'B': [0, 1, 2, np.nan, np.nan, np.nan, 3, 4, 5, 6], + 'C': np.arange(10, dtype=np.float64), + 'D': [0, 1, 2, 3, 4, 5, np.nan, np.nan, np.nan, np.nan]} + + return SparseDataFrame(data, index=dates) + + +@pytest.fixture +def fill_frame(frame): + values = frame.values.copy() + values[np.isnan(values)] = 2 + + return SparseDataFrame(values, columns=['A', 'B', 'C', 'D'], + default_fill_value=2, + index=frame.index) + + +def test_apply(frame): + applied = frame.apply(np.sqrt) + assert isinstance(applied, SparseDataFrame) + tm.assert_almost_equal(applied.values, np.sqrt(frame.values)) + + # agg / broadcast + with 
tm.assert_produces_warning(FutureWarning): + broadcasted = frame.apply(np.sum, broadcast=True) + assert isinstance(broadcasted, SparseDataFrame) + + with tm.assert_produces_warning(FutureWarning): + exp = frame.to_dense().apply(np.sum, broadcast=True) + tm.assert_frame_equal(broadcasted.to_dense(), exp) + + applied = frame.apply(np.sum) + tm.assert_series_equal(applied, + frame.to_dense().apply(nanops.nansum)) + + +def test_apply_fill(fill_frame): + applied = fill_frame.apply(np.sqrt) + assert applied['A'].fill_value == np.sqrt(2) + + +def test_apply_empty(empty): + assert empty.apply(np.sqrt) is empty + + +def test_apply_nonuq(): + orig = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], + index=['a', 'a', 'c']) + sparse = orig.to_sparse() + res = sparse.apply(lambda s: s[0], axis=1) + exp = orig.apply(lambda s: s[0], axis=1) + + # dtype must be kept + assert res.dtype == np.int64 + + # ToDo: apply must return subclassed dtype + assert isinstance(res, Series) + tm.assert_series_equal(res.to_dense(), exp) + + # df.T breaks + sparse = orig.T.to_sparse() + res = sparse.apply(lambda s: s[0], axis=0) # noqa + exp = orig.T.apply(lambda s: s[0], axis=0) + + # TODO: no non-unique columns supported in sparse yet + # tm.assert_series_equal(res.to_dense(), exp) + + +def test_applymap(frame): + # just test that it works + result = frame.applymap(lambda x: x * 2) + assert isinstance(result, SparseDataFrame) diff --git a/pandas/tests/sparse/frame/test_frame.py b/pandas/tests/sparse/frame/test_frame.py index 54f567bcd2a8c..29fad3c8eefaf 100644 --- a/pandas/tests/sparse/frame/test_frame.py +++ b/pandas/tests/sparse/frame/test_frame.py @@ -621,52 +621,6 @@ def test_append(self): tm.assert_sp_frame_equal(appended.iloc[:, :3], self.frame.iloc[:, :3], exact_indices=False) - def test_apply(self): - applied = self.frame.apply(np.sqrt) - assert isinstance(applied, SparseDataFrame) - tm.assert_almost_equal(applied.values, np.sqrt(self.frame.values)) - - applied = self.fill_frame.apply(np.sqrt) - assert applied['A'].fill_value == np.sqrt(2) - - # agg / broadcast - broadcasted = self.frame.apply(np.sum, broadcast=True) - assert isinstance(broadcasted, SparseDataFrame) - - exp = self.frame.to_dense().apply(np.sum, broadcast=True) - tm.assert_frame_equal(broadcasted.to_dense(), exp) - - assert self.empty.apply(np.sqrt) is self.empty - - from pandas.core import nanops - applied = self.frame.apply(np.sum) - tm.assert_series_equal(applied, - self.frame.to_dense().apply(nanops.nansum)) - - def test_apply_nonuq(self): - orig = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], - index=['a', 'a', 'c']) - sparse = orig.to_sparse() - res = sparse.apply(lambda s: s[0], axis=1) - exp = orig.apply(lambda s: s[0], axis=1) - # dtype must be kept - assert res.dtype == np.int64 - # ToDo: apply must return subclassed dtype - assert isinstance(res, pd.Series) - tm.assert_series_equal(res.to_dense(), exp) - - # df.T breaks - sparse = orig.T.to_sparse() - res = sparse.apply(lambda s: s[0], axis=0) # noqa - exp = orig.T.apply(lambda s: s[0], axis=0) - # TODO: no non-unique columns supported in sparse yet - # tm.assert_series_equal(res.to_dense(), exp) - - def test_applymap(self): - # just test that it works - result = self.frame.applymap(lambda x: x * 2) - assert isinstance(result, SparseDataFrame) - def test_astype(self): sparse = pd.SparseDataFrame({'A': SparseArray([1, 2, 3, 4], dtype=np.int64),