Skip to content

Commit

Permalink
API/BUG: .apply will correctly infer output shape when axis=1
Browse files Browse the repository at this point in the history
  • Loading branch information
jreback committed Nov 30, 2017
1 parent a47ad56 commit dce186a
Show file tree
Hide file tree
Showing 3 changed files with 213 additions and 14 deletions.
73 changes: 72 additions & 1 deletion doc/source/whatsnew/v0.22.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,80 @@ Backwards incompatible API changes
-


.. _whatsnew_0220.api_breaking.apply:

Apply Changes
~~~~~~~~~~~~~

:func:`DataFrame.apply` was inconsistent when applying an arbitrary user-defined function that returned a list-like with ``axis=1``. Several bugs and inconsistencies
are resolved (:issue:`16353`, :issue:`17437`, :issue:`17970`, :issue:`17348`, :issue:`17892`, :issue:`18573`, :issue:`17602`).

.. ipython:: python

df = pd.DataFrame(np.random.randn(6, 3), columns=['A', 'B', 'C'])
df

Previous Behavior. If the length of the returned list-like happened to match the number of columns, this would return a ``DataFrame``; otherwise it would return a ``Series`` of list-likes.

.. code-block:: python

In [3]: df.apply(lambda x: [1, 2, 3], axis=1)
Out[3]:
A B C
0 1 2 3
1 1 2 3
2 1 2 3
3 1 2 3
4 1 2 3
5 1 2 3

In [4]: df.apply(lambda x: [1, 2], axis=1)
Out[4]:
0 [1, 2]
1 [1, 2]
2 [1, 2]
3 [1, 2]
4 [1, 2]
5 [1, 2]
dtype: object


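New Behavior. The result is now consistent: a list-like return value is always expanded into the columns of a ``DataFrame``, whatever its length.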

.. ipython:: python

df.apply(lambda x: [1, 2, 3], axis=1)
df.apply(lambda x: [1, 2], axis=1)

A dict-like return value is likewise no longer wrapped in an object-dtype ``Series`` as it was previously; it is expanded into columns.

.. ipython:: python

df = pd.DataFrame([[1,2], [1,2]], columns=['a','b'])


Previous Behavior

.. code-block:: python

In [3]: df.apply(lambda x: {'s':x['a'] + x['b']}, 1)
Out[3]:
0 {'s': 3}
1 {'s': 3}
dtype: object


New Behavior

.. ipython:: python

df.apply(lambda x: {'s':x['a'] + x['b']}, 1)

To achieve the original effect, you can operate on a ``Series`` instead:

.. ipython:: python

(df['a'] + df['b']).apply(lambda x: {'s': x})


.. _whatsnew_0220.api:
Expand Down Expand Up @@ -185,7 +257,6 @@ Sparse
Reshaping
^^^^^^^^^

-
-
-

Expand Down
49 changes: 36 additions & 13 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2133,7 +2133,7 @@ def __getitem__(self, key):
try:
if key in self.columns and not is_mi_columns:
return self._getitem_column(key)
except:
except Exception:
pass

# see if we can slice the rows
Expand Down Expand Up @@ -2576,7 +2576,7 @@ def _ensure_valid_index(self, value):
if not len(self.index) and is_list_like(value):
try:
value = Series(value)
except:
except Exception:
raise ValueError('Cannot set a frame with no defined index '
'and a value that cannot be converted to a '
'Series')
Expand Down Expand Up @@ -4916,8 +4916,7 @@ def _apply_standard(self, func, axis, ignore_failures=False, reduce=True):

# skip if we are mixed datelike and trying reduce across axes
# GH6125
if (reduce and axis == 1 and self._is_mixed_type and
self._is_datelike_mixed_type):
if reduce and axis == 1:
reduce = False

# try to reduce first (by default)
Expand Down Expand Up @@ -4990,16 +4989,40 @@ def _apply_standard(self, func, axis, ignore_failures=False, reduce=True):
raise

if len(results) > 0 and is_sequence(results[0]):
if not isinstance(results[0], Series):
index = res_columns
else:
index = None
# map to rows
if axis == 0:
result = self._constructor(data=results)

if not isinstance(results[0], Series):
try:
result.index = res_columns
except ValueError:
pass

result = self._constructor(data=results, index=index)
result.columns = res_index
try:
result.columns = res_index
except ValueError:
pass

if axis == 1:
# map to columns
else:

result = self._constructor(data=results)
result = result.T

# try to assign the result indices;
# this may fail, if so we have
# received an invalid return shape
try:
result.index = res_index
except ValueError:
pass

try:
result.columns = res_columns
except ValueError:
pass

result = result._convert(datetime=True, timedelta=True, copy=False)

else:
Expand Down Expand Up @@ -5732,7 +5755,7 @@ def f(x):
if result.ndim == self.ndim:
result = result.iloc[0]
return result
except:
except Exception:
pass

if filter_type is None or filter_type == 'numeric':
Expand Down Expand Up @@ -6247,7 +6270,7 @@ def convert(v):
values = np.array([convert(v) for v in values])
else:
values = convert(values)
except:
except Exception:
values = convert(values)

else:
Expand Down
105 changes: 105 additions & 0 deletions pandas/tests/frame/test_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -470,6 +470,111 @@ def test_apply_non_numpy_dtype(self):
assert_frame_equal(result, df)


class TestInferOutputShape(object):
# the user has supplied an opaque UDF where
# they are transforming the input that requires
# us to infer the output

def test_infer_row_shape(self):
# gh-17437
# if row shape is changing, infer it
df = pd.DataFrame(np.random.rand(10, 2))
result = df.apply(np.fft.fft, axis=0)
assert result.shape == (10, 2)

result = df.apply(np.fft.rfft, axis=0)
assert result.shape == (6, 2)

def test_with_dictlike_columns(self):
# gh 17602

df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
result = df.apply(lambda x: {'s': x['a'] + x['b']}, 1)
expected = DataFrame({'s': df['a'].values + df['b'].values})
assert_frame_equal(result, expected)

df['tm'] = [pd.Timestamp('2017-05-01 00:00:00'),
pd.Timestamp('2017-05-02 00:00:00')]
result = df.apply(lambda x: {'s': x['a'] + x['b']}, 1)
assert_frame_equal(result, expected)

# compose a series
result = (df['a'] + df['b']).apply(lambda x: {'s': x})
expected = Series([{'s': 3}, {'s': 3}])
assert_series_equal(result, expected)

def test_with_listlike_columns(self):
# gh-17348
df = DataFrame({'a': Series(np.random.randn(4)),
'b': ['a', 'list', 'of', 'words'],
'ts': date_range('2016-10-01', periods=4, freq='H')})

result = df[['a', 'b']].apply(tuple, axis=1)
expected = df[['a', 'b']]
assert_frame_equal(result, expected)

result = df[['a', 'ts']].apply(tuple, axis=1)
expected = df[['a', 'ts']]
assert_frame_equal(result, expected)

def test_infer_output_shape_columns(self):
# gh-18573

df = DataFrame({'number': [1., 2.],
'string': ['foo', 'bar'],
'datetime': [pd.Timestamp('2017-11-29 03:30:00'),
pd.Timestamp('2017-11-29 03:45:00')]})
result = df.apply(lambda row: (row.number, row.string), axis=1)
expected = df[['number', 'string']].copy()
expected.columns = [0, 1]
assert_frame_equal(result, expected)

def test_infer_output_shape_listlike_columns(self):
# gh-16353

df = DataFrame(np.random.randn(6, 3), columns=['A', 'B', 'C'])

result = df.apply(lambda x: [1, 2, 3], axis=1)
expected = DataFrame({'A': 1, 'B': 2, 'C': 3},
index=range(6)).reindex(columns=df.columns)
assert_frame_equal(result, expected)

result = df.apply(lambda x: [1, 2], axis=1)
expected = DataFrame({0: 1, 1: 2},
index=range(6)).reindex(columns=[0, 1])
assert_frame_equal(result, expected)

# gh-17970
df = DataFrame({"a": [1, 2, 3]})

result = df.apply(lambda row: np.ones(1), axis=1)
expected = DataFrame({'a': 1.0},
index=range(3))
assert_frame_equal(result, expected)

result = df.apply(lambda row: np.ones(2), axis=1)
expected = DataFrame({0: 1., 1: 1.},
index=range(3)).reindex(columns=[0, 1])
assert_frame_equal(result, expected)

# gh-17892
df = pd.DataFrame({'a': [pd.Timestamp('2010-02-01'),
pd.Timestamp('2010-02-04'),
pd.Timestamp('2010-02-05'),
pd.Timestamp('2010-02-06')],
'b': [9, 5, 4, 3],
'c': [5, 3, 4, 2],
'd': [1, 2, 3, 4]})

def fun(x):
return (1, 2)

result = df.apply(fun, axis=1)
expected = DataFrame({0: 1, 1: 2},
index=range(4)).reindex(columns=[0, 1])
assert_frame_equal(result, expected)


def zip_frames(*frames):
"""
take a list of frames, zip the columns together for each
Expand Down

0 comments on commit dce186a

Please sign in to comment.