Skip to content

Commit

Permalink
API/BUG: .apply will correctly infer output shape when axis=1
Browse files Browse the repository at this point in the history
  • Loading branch information
jreback committed Jan 13, 2018
1 parent 0477880 commit 52b6ea1
Show file tree
Hide file tree
Showing 3 changed files with 253 additions and 34 deletions.
50 changes: 48 additions & 2 deletions doc/source/whatsnew/v0.23.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ Previous Behavior:
4 NaN
dtype: float64

Current Behavior
Current Behavior:

.. ipython:: python

Expand All @@ -139,7 +139,7 @@ Previous Behavior:
3 2.5
dtype: float64

Current Behavior
Current Behavior:

.. ipython:: python

Expand Down Expand Up @@ -224,6 +224,52 @@ If installed, we now require:
| openpyxl | 2.4.0 | |
+-----------------+-----------------+----------+

.. _whatsnew_0230.api_breaking.apply:

Apply Changes
~~~~~~~~~~~~~

:func:`DataFrame.apply` was inconsistent when applying an arbitrary user-defined function that returned a list-like with ``axis=1``. Several bugs and inconsistencies
are resolved. If the applied function returns a Series, then pandas will return a DataFrame; otherwise a Series will be returned. This includes the case
where a list-like (e.g. ``tuple`` or ``list``) is returned (:issue:`16353`, :issue:`17437`, :issue:`17970`, :issue:`17348`, :issue:`17892`, :issue:`18573`,
:issue:`17602`, :issue:`18775`, :issue:`18901`, :issue:`18919`).

.. ipython:: python

df = pd.DataFrame(np.random.randn(6, 3), columns=['A', 'B', 'C'])
df

Previous Behavior. If the returned shape happened to match the number of columns, this would return a ``DataFrame``; if it did not match, a ``Series`` of lists was returned.

.. code-block:: python

In [3]: df.apply(lambda x: [1, 2, 3], axis=1)
Out[3]:
A B C
0 1 2 3
1 1 2 3
2 1 2 3
3 1 2 3
4 1 2 3
5 1 2 3

In [4]: df.apply(lambda x: [1, 2], axis=1)
Out[4]:
0 [1, 2]
1 [1, 2]
2 [1, 2]
3 [1, 2]
4 [1, 2]
5 [1, 2]
dtype: object


New Behavior. The behavior is consistent. When the applied function returns a list-like, this will now *always* return a ``Series``.

.. ipython:: python

df.apply(lambda x: [1, 2, 3], axis=1)
df.apply(lambda x: [1, 2], axis=1)

Build Changes
^^^^^^^^^^^^^
Expand Down
83 changes: 61 additions & 22 deletions pandas/core/apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,18 +19,20 @@ def frame_apply(obj, func, axis=0, broadcast=False,
klass = FrameColumnApply

return klass(obj, func, broadcast=broadcast,
raw=raw, reduce=reduce, args=args, kwds=kwds)
raw=raw, reduce=reduce,
args=args, kwds=kwds)


class FrameApply(object):

def __init__(self, obj, func, broadcast, raw, reduce, args, kwds):
def __init__(self, obj, func, broadcast, raw, reduce,
args, kwds):
self.obj = obj
self.broadcast = broadcast
self.raw = raw
self.reduce = reduce
self.args = args

self.args = args
self.ignore_failures = kwds.pop('ignore_failures', False)
self.kwds = kwds

Expand Down Expand Up @@ -94,6 +96,13 @@ def get_result(self):
return self.apply_standard()

def apply_empty_result(self):
"""
we have an empty result; at least 1 axis is 0
we will try to apply the function to an empty
series in order to see if this is a reduction function
"""

from pandas import Series
reduce = self.reduce

Expand All @@ -113,6 +122,8 @@ def apply_empty_result(self):
return self.obj.copy()

def apply_raw(self):
""" apply to the values as a numpy array """

try:
result = lib.reduce(self.values, self.f, axis=self.axis)
except Exception:
Expand Down Expand Up @@ -207,19 +218,57 @@ def wrap_results(self, results, res_index, res_columns):
from pandas import Series

if len(results) > 0 and is_sequence(results[0]):
if not isinstance(results[0], Series):
index = res_columns

# map to rows
if self.axis == 0:
result = self.obj._constructor(data=results)

if not isinstance(results[0], Series):
try:
result.index = res_columns
except ValueError:
pass

try:
result.columns = res_index
except ValueError:
pass

# map to columns
else:
index = None

result = self.obj._constructor(data=results, index=index)
result.columns = res_index
def infer_to_same_shape():
result = self.obj._constructor(data=results)
result = result.T

# try to assign the result indices;
# this may fail, if so we have
# received an invalid return shape
try:
result.index = res_index
except ValueError:
pass

try:
result.columns = res_columns
except ValueError:
pass

# infer dtypes
result = result.infer_objects()

if self.axis == 1:
result = result.T
result = result._convert(
datetime=True, timedelta=True, copy=False)
return result

# we have a non-series and don't want inference
if not isinstance(results[0], Series):
result = Series(results)
result.index = res_index

# we may want to infer results
else:
result = infer_to_same_shape()

# dict of scalars
else:

result = Series(results)
Expand Down Expand Up @@ -270,16 +319,6 @@ def result_columns(self):
class FrameColumnApply(FrameApply):
axis = 1

def __init__(self, obj, func, broadcast, raw, reduce, args, kwds):
super(FrameColumnApply, self).__init__(obj, func, broadcast,
raw, reduce, args, kwds)

# skip if we are mixed datelike and trying reduce across axes
# GH6125
if self.reduce:
if self.obj._is_mixed_type and self.obj._is_datelike_mixed_type:
self.reduce = False

def apply_broadcast(self):
return self._apply_broadcast(self.obj.T).T

Expand Down
154 changes: 144 additions & 10 deletions pandas/tests/frame/test_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,11 +350,10 @@ def test_apply_attach_name(self):

result = self.frame.apply(lambda x: np.repeat(x.name, len(x)),
axis=1)
expected = DataFrame(np.tile(self.frame.index,
(len(self.frame.columns), 1)).T,
index=self.frame.index,
columns=self.frame.columns)
assert_frame_equal(result, expected)
expected = Series(np.repeat(t[0], len(self.frame.columns))
for t in self.frame.itertuples())
expected.index = self.frame.index
assert_series_equal(result, expected)

def test_apply_multi_index(self):
s = DataFrame([[1, 2], [3, 4], [5, 6]])
Expand All @@ -367,10 +366,10 @@ def test_apply_dict(self):

# GH 8735
A = DataFrame([['foo', 'bar'], ['spam', 'eggs']])
A_dicts = pd.Series([dict([(0, 'foo'), (1, 'spam')]),
dict([(0, 'bar'), (1, 'eggs')])])
A_dicts = Series([dict([(0, 'foo'), (1, 'spam')]),
dict([(0, 'bar'), (1, 'eggs')])])
B = DataFrame([[0, 1], [2, 3]])
B_dicts = pd.Series([dict([(0, 0), (1, 2)]), dict([(0, 1), (1, 3)])])
B_dicts = Series([dict([(0, 0), (1, 2)]), dict([(0, 1), (1, 3)])])
fn = lambda x: x.to_dict()

for df, dicts in [(A, A_dicts), (B, B_dicts)]:
Expand Down Expand Up @@ -472,6 +471,141 @@ def test_apply_non_numpy_dtype(self):
assert_frame_equal(result, df)


class TestInferOutputShape(object):
    """Tests for output-shape inference in ``DataFrame.apply``.

    The user has supplied an opaque UDF that transforms its input, so the
    shape of the output has to be inferred from what the UDF returns.
    """

    def test_infer_row_shape(self):
        """The row count of the result follows the UDF output (``axis=0``)."""
        # gh-17437
        # if row shape is changing, infer it
        df = pd.DataFrame(np.random.rand(10, 2))
        result = df.apply(np.fft.fft, axis=0)
        assert result.shape == (10, 2)

        result = df.apply(np.fft.rfft, axis=0)
        assert result.shape == (6, 2)

    def test_with_dictlike_columns(self):
        """A UDF returning a dict per row yields a Series of dicts."""
        # gh 17602

        df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
        result = df.apply(lambda x: {'s': x['a'] + x['b']}, axis=1)
        expected = Series([{'s': 3} for t in df.itertuples()])
        assert_series_equal(result, expected)

        df['tm'] = [pd.Timestamp('2017-05-01 00:00:00'),
                    pd.Timestamp('2017-05-02 00:00:00')]
        # re-apply on the now mixed-dtype frame; re-checking the earlier
        # result would not exercise the datetime column at all
        result = df.apply(lambda x: {'s': x['a'] + x['b']}, axis=1)
        assert_series_equal(result, expected)

        # compose a series
        result = (df['a'] + df['b']).apply(lambda x: {'s': x})
        expected = Series([{'s': 3}, {'s': 3}])
        assert_series_equal(result, expected)

        # gh-18775
        df = DataFrame()
        df["author"] = ["X", "Y", "Z"]
        df["publisher"] = ["BBC", "NBC", "N24"]
        # the strings are day-first; say so explicitly rather than
        # relying on the parser guessing the layout
        df["date"] = pd.to_datetime(['17-10-2010 07:15:30',
                                     '13-05-2011 08:20:35',
                                     '15-01-2013 09:09:09'],
                                    dayfirst=True)
        result = df.apply(lambda x: {}, axis=1)
        expected = Series([{}, {}, {}])
        assert_series_equal(result, expected)

    def test_with_listlike_columns(self):
        """A UDF returning a tuple/list per row yields a Series of them."""
        # gh-17348
        # lowercase 'h' alias: the uppercase 'H' spelling is deprecated
        # in newer pandas releases
        df = DataFrame({'a': Series(np.random.randn(4)),
                        'b': ['a', 'list', 'of', 'words'],
                        'ts': date_range('2016-10-01', periods=4, freq='h')})

        result = df[['a', 'b']].apply(tuple, axis=1)
        expected = Series([t[1:] for t in df[['a', 'b']].itertuples()])
        assert_series_equal(result, expected)

        result = df[['a', 'ts']].apply(tuple, axis=1)
        expected = Series([t[1:] for t in df[['a', 'ts']].itertuples()])
        assert_series_equal(result, expected)

        # gh-18919
        df = DataFrame({'x': Series([['a', 'b'], ['q']]),
                        'y': Series([['z'], ['q', 't']])})
        df.index = MultiIndex.from_tuples([('i0', 'j0'), ('i1', 'j1')])

        result = df.apply(
            lambda row: [el for el in row['x'] if el in row['y']],
            axis=1)
        expected = Series([[], ['q']], index=df.index)
        assert_series_equal(result, expected)

    def test_infer_output_shape_columns(self):
        """A tuple shorter than the row width is not expanded to columns."""
        # gh-18573

        df = DataFrame({'number': [1., 2.],
                        'string': ['foo', 'bar'],
                        'datetime': [pd.Timestamp('2017-11-29 03:30:00'),
                                     pd.Timestamp('2017-11-29 03:45:00')]})
        result = df.apply(lambda row: (row.number, row.string), axis=1)
        # pick the fields by name: a positional slice of the row tuple
        # depends on the column order of the dict-constructed frame
        expected = Series([(t.number, t.string) for t in df.itertuples()])
        assert_series_equal(result, expected)

    def test_infer_output_shape_listlike_columns(self):
        """List-likes of any length are returned as a Series of objects."""
        # gh-16353

        df = DataFrame(np.random.randn(6, 3), columns=['A', 'B', 'C'])

        result = df.apply(lambda x: [1, 2, 3], axis=1)
        expected = Series([[1, 2, 3] for t in df.itertuples()])
        assert_series_equal(result, expected)

        result = df.apply(lambda x: [1, 2], axis=1)
        expected = Series([[1, 2] for t in df.itertuples()])
        assert_series_equal(result, expected)

        # gh-17970
        df = DataFrame({"a": [1, 2, 3]}, index=list('abc'))

        result = df.apply(lambda row: np.ones(1), axis=1)
        expected = Series([np.ones(1) for t in df.itertuples()],
                          index=df.index)
        assert_series_equal(result, expected)

        result = df.apply(lambda row: np.ones(2), axis=1)
        expected = Series([np.ones(2) for t in df.itertuples()],
                          index=df.index)
        assert_series_equal(result, expected)

        # gh-17892
        df = pd.DataFrame({'a': [pd.Timestamp('2010-02-01'),
                                 pd.Timestamp('2010-02-04'),
                                 pd.Timestamp('2010-02-05'),
                                 pd.Timestamp('2010-02-06')],
                           'b': [9, 5, 4, 3],
                           'c': [5, 3, 4, 2],
                           'd': [1, 2, 3, 4]})

        def fun(x):
            return (1, 2)

        result = df.apply(fun, axis=1)
        expected = Series([(1, 2) for t in df.itertuples()])
        assert_series_equal(result, expected)

    def test_consistent_coerce_for_shapes(self):
        """Column names are not propagated just because the width matches."""
        # we want column names to NOT be propagated
        # just because the shape matches the input shape
        df = DataFrame(np.random.randn(4, 3), columns=['A', 'B', 'C'])

        result = df.apply(lambda x: [1, 2, 3], axis=1)
        expected = Series([[1, 2, 3] for t in df.itertuples()])
        assert_series_equal(result, expected)

        result = df.apply(lambda x: [1, 2], axis=1)
        expected = Series([[1, 2] for t in df.itertuples()])
        assert_series_equal(result, expected)


def zip_frames(*frames):
"""
take a list of frames, zip the columns together for each
Expand Down Expand Up @@ -649,13 +783,13 @@ def test_non_callable_aggregates(self):

# Function aggregate
result = df.agg({'A': 'count'})
expected = pd.Series({'A': 2})
expected = Series({'A': 2})

assert_series_equal(result, expected)

# Non-function aggregate
result = df.agg({'A': 'size'})
expected = pd.Series({'A': 3})
expected = Series({'A': 3})

assert_series_equal(result, expected)

Expand Down

0 comments on commit 52b6ea1

Please sign in to comment.