Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG/ENH: Fix apply to only call func once on the first column/row #34183

Merged
merged 15 commits into from
Jun 2, 2020
39 changes: 39 additions & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -630,6 +630,45 @@ Using :meth:`DataFrame.groupby` with ``as_index=False`` and the function ``idxma

df.groupby("a", as_index=False).nunique()

.. _whatsnew_110.api_breaking.apply_applymap_first_once:

apply and applymap on ``DataFrame`` evaluate the first row/column only once
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. ipython:: python

df = pd.DataFrame({'a': [1, 2], 'b': [3, 6]})

def func(row):
print(row)
return row

*Previous behavior*:

.. code-block:: ipython

In [4]: df.apply(func, axis=1)
a 1
b 3
Name: 0, dtype: int64
a 1
b 3
Name: 0, dtype: int64
a 2
b 6
Name: 1, dtype: int64
Out[4]:
a b
0 1 3
1 2 6

*New behavior*:

.. ipython:: python

df.apply(func, axis=1)


.. _whatsnew_110.deprecations:

Deprecations
Expand Down
25 changes: 20 additions & 5 deletions pandas/_libs/reduction.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ cdef class Reducer:

result = np.empty(self.nresults, dtype='O')
it = <flatiter>PyArray_IterNew(result)
reduction_success = True

try:
for i in range(self.nresults):
Expand Down Expand Up @@ -134,21 +135,35 @@ cdef class Reducer:
res = self.f(chunk)

# TODO: reason for not squeezing here?
res = _extract_result(res, squeeze=False)
extracted_res = _extract_result(res, squeeze=False)
if i == 0:
# On the first pass, we check the output shape to see
# if this looks like a reduction.
_check_result_array(res, len(self.dummy))

PyArray_SETITEM(result, PyArray_ITER_DATA(it), res)
# If it does not, return the computed value to be used by the
jreback marked this conversation as resolved.
Show resolved Hide resolved
# pure python implementation,
# so the function won't be called twice on the same object,
# and side effects would occur twice
try:
_check_result_array(extracted_res, len(self.dummy))
except ValueError as err:
if "Function does not reduce" not in str(err):
# catch only the specific exception
raise

reduction_success = False
PyArray_SETITEM(result, PyArray_ITER_DATA(it), copy(res))
jreback marked this conversation as resolved.
Show resolved Hide resolved
break

PyArray_SETITEM(result, PyArray_ITER_DATA(it), extracted_res)
chunk.data = chunk.data + self.increment
PyArray_ITER_NEXT(it)

finally:
# so we don't free the wrong memory
chunk.data = dummy_buf

result = maybe_convert_objects(result)
return result
return result, reduction_success


cdef class _BaseGrouper:
Expand Down
55 changes: 33 additions & 22 deletions pandas/core/apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,14 +220,12 @@ def apply_empty_result(self):

def apply_raw(self):
""" apply to the values as a numpy array """
try:
result = libreduction.compute_reduction(self.values, self.f, axis=self.axis)
except ValueError as err:
if "Function does not reduce" not in str(err):
# catch only ValueError raised intentionally in libreduction
raise
# We expect np.apply_along_axis to give a two-dimensional result, or
# also raise.
result, reduction_success = libreduction.compute_reduction(
self.values, self.f, axis=self.axis
)

# We expect np.apply_along_axis to give a two-dimensional result, or raise.
if not reduction_success:
result = np.apply_along_axis(self.f, self.axis, self.values)

# TODO: mixed type case
Expand Down Expand Up @@ -265,6 +263,9 @@ def apply_broadcast(self, target: "DataFrame") -> "DataFrame":

def apply_standard(self):

# partial result that may be returned from reduction
partial_result = None

# try to reduce first (by default)
# this only matters if the reduction in values is of different dtype
# e.g. if we want to apply to a SparseFrame, then can't directly reduce
Expand Down Expand Up @@ -292,13 +293,9 @@ def apply_standard(self):
)

try:
result = libreduction.compute_reduction(
result, reduction_success = libreduction.compute_reduction(
values, self.f, axis=self.axis, dummy=dummy, labels=labels
)
except ValueError as err:
if "Function does not reduce" not in str(err):
# catch only ValueError raised intentionally in libreduction
raise
except TypeError:
# e.g. test_apply_ignore_failures we just ignore
if not self.ignore_failures:
Expand All @@ -307,39 +304,53 @@ def apply_standard(self):
# reached via numexpr; fall back to python implementation
pass
else:
return self.obj._constructor_sliced(result, index=labels)
if reduction_success:
return self.obj._constructor_sliced(result, index=labels)

# compute the result using the series generator
results, res_index = self.apply_series_generator()
# no exceptions - however reduction was unsuccessful,
# use the computed function result for first element
partial_result = result[0]
if isinstance(partial_result, ABCSeries):
partial_result = partial_result.infer_objects()

# compute the result using the series generator,
# use the result computed while trying to reduce if available.
results, res_index = self.apply_series_generator(partial_result)

# wrap results
return self.wrap_results(results, res_index)

def apply_series_generator(self) -> Tuple[ResType, "Index"]:
def apply_series_generator(self, partial_result=None) -> "Tuple[ResType, Index]":
    """
    Apply ``self.f`` to every series yielded by ``self.series_generator``.

    Parameters
    ----------
    partial_result : object, optional
        Result already computed for the *first* series during the
        libreduction fast-path attempt.  When given, it is reused instead
        of calling ``self.f`` on the first row/column a second time, so
        any side effects of the user function happen only once (GH#30815).

    Returns
    -------
    results : dict
        Mapping of integer position -> result of ``self.f``.
    res_index : Index
        Result index; when ``self.ignore_failures`` is set it is reduced
        to the positions whose application succeeded.
    """
    series_gen = self.series_generator
    res_index = self.result_index

    keys = []
    results = {}

    # If a partial result was already computed,
    # use it instead of running func on the first element again.
    series_gen_enumeration = enumerate(series_gen)
    if partial_result is not None:
        i, v = next(series_gen_enumeration)
        results[i] = partial_result

    if self.ignore_failures:
        successes = []
        for i, v in series_gen_enumeration:
            try:
                results[i] = self.f(v)
            except Exception:
                pass
            else:
                keys.append(v.name)
                successes.append(i)

        # len check so .take also works with MultiIndex
        if len(successes) < len(res_index):
            res_index = res_index.take(successes)

    else:
        for i, v in series_gen_enumeration:
            results[i] = self.f(v)
            keys.append(v.name)

    return results, res_index

Expand Down
8 changes: 0 additions & 8 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -7421,14 +7421,6 @@ def applymap(self, func) -> "DataFrame":
--------
DataFrame.apply : Apply a function along input axis of DataFrame.

Notes
-----
In the current implementation applymap calls `func` twice on the
first column/row to decide whether it can take a fast or slow
code path. This can lead to unexpected behavior if `func` has
side-effects, as they will take effect twice for the first
column/row.

Examples
--------
>>> df = pd.DataFrame([[1, 2.12], [3.356, 4.567]])
Expand Down
63 changes: 62 additions & 1 deletion pandas/tests/frame/test_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -718,12 +718,73 @@ def apply_list(row):

def test_apply_noreduction_tzaware_object(self):
# https://github.com/pandas-dev/pandas/issues/31505
df = pd.DataFrame({"foo": [pd.Timestamp("2020", tz="UTC")]}, dtype="object")
df = pd.DataFrame(
{"foo": [pd.Timestamp("2020", tz="UTC")]}, dtype="datetime64[ns, UTC]"
jreback marked this conversation as resolved.
Show resolved Hide resolved
)
result = df.apply(lambda x: x)
tm.assert_frame_equal(result, df)
result = df.apply(lambda x: x.copy())
tm.assert_frame_equal(result, df)

def test_apply_function_runs_once(self):
alonme marked this conversation as resolved.
Show resolved Hide resolved
# https://github.com/pandas-dev/pandas/issues/30815

df = pd.DataFrame({"a": [1, 2, 3]})
names = [] # Save row names function is applied to

def reducing_function(row):
names.append(row.name)

def non_reducing_function(row):
names.append(row.name)
return row

for func in [reducing_function, non_reducing_function]:
del names[:]

df.apply(func, axis=1)
assert names == list(df.index)

@pytest.mark.xfail(
    reason="GH#34506: the 'run once' enhancement for apply_raw is not "
    "implemented yet."
)
def test_apply_raw_function_runs_once(self):
    # raw=True goes through np.apply_along_axis, which still evaluates
    # func twice on the first row; expected to fail until fixed.
    # https://github.com/pandas-dev/pandas/issues/34506

    df = pd.DataFrame({"a": [1, 2, 3]})
    values = []  # Save row values function is applied to

    def reducing_function(row):
        values.extend(row)

    def non_reducing_function(row):
        values.extend(row)
        return row

    for func in [reducing_function, non_reducing_function]:
        del values[:]

        df.apply(func, raw=True, axis=1)
        assert values == df.a.to_list()

def test_applymap_function_runs_once(self):

jreback marked this conversation as resolved.
Show resolved Hide resolved
df = pd.DataFrame({"a": [1, 2, 3]})
values = [] # Save values function is applied to

def reducing_function(val):
values.append(val)

def non_reducing_function(val):
values.append(val)
return val

for func in [reducing_function, non_reducing_function]:
del values[:]

df.applymap(func)
assert values == df.a.to_list()


class TestInferOutputShape:
# the user has supplied an opaque UDF where
Expand Down
6 changes: 3 additions & 3 deletions pandas/tests/groupby/test_bin_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,20 +153,20 @@ def test_int_index(self):
)

dummy = Series(0.0, index=np.arange(100))
result = libreduction.compute_reduction(
result, _ = libreduction.compute_reduction(
jreback marked this conversation as resolved.
Show resolved Hide resolved
arr, np.sum, dummy=dummy, labels=Index(np.arange(4))
)
expected = arr.sum(0)
tm.assert_almost_equal(result, expected)

dummy = Series(0.0, index=np.arange(4))
result = libreduction.compute_reduction(
result, _ = libreduction.compute_reduction(
arr, np.sum, axis=1, dummy=dummy, labels=Index(np.arange(100))
)
expected = arr.sum(1)
tm.assert_almost_equal(result, expected)

result = libreduction.compute_reduction(
result, _ = libreduction.compute_reduction(
arr, np.sum, axis=1, dummy=dummy, labels=Index(np.arange(100))
)
tm.assert_almost_equal(result, expected)