PERF: avoid creating many Series in apply_standard #34909

Merged · 9 commits · Jun 25, 2020
pandas/core/apply.py — 48 additions & 65 deletions (113 changes)
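To ground the perf claim, here is a minimal sketch — not from the PR; the frame shape and function are illustrative — of the row-wise apply workload the old `series_generator` penalized by constructing a brand-new Series per row:

```python
import numpy as np
import pandas as pd

# Illustrative frame: with many rows, per-row Series construction in the
# old series_generator dominates the cost of the apply call.
df = pd.DataFrame(np.random.randn(10_000, 4), columns=list("abcd"))

# Row-wise apply: after this PR, a single Series is reused across rows,
# with its underlying data swapped in place on each iteration.
result = df.apply(lambda row: row["a"] + row["b"], axis=1)
```

Timing this call before and after the branch (e.g. with `%timeit`) is where the win shows up; the exact speedup depends on the frame's shape and dtypes.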
@@ -4,16 +4,13 @@
 
 import numpy as np
 
+from pandas._config import option_context
+
 from pandas._libs import reduction as libreduction
 from pandas._typing import Axis
 from pandas.util._decorators import cache_readonly
 
-from pandas.core.dtypes.common import (
-    is_dict_like,
-    is_extension_array_dtype,
-    is_list_like,
-    is_sequence,
-)
+from pandas.core.dtypes.common import is_dict_like, is_list_like, is_sequence
 from pandas.core.dtypes.generic import ABCSeries
 
 from pandas.core.construction import create_series_with_explicit_dtype
@@ -266,53 +263,6 @@ def apply_standard(self):
         # partial result that may be returned from reduction
         partial_result = None
 
-        # try to reduce first (by default)
-        # this only matters if the reduction in values is of different dtype
-        # e.g. if we want to apply to a SparseFrame, then can't directly reduce
-
-        # we cannot reduce using non-numpy dtypes,
-        # as demonstrated in gh-12244
-        if (
-            self.result_type in ["reduce", None]
-            and not self.dtypes.apply(is_extension_array_dtype).any()
-            # Disallow dtypes where setting _index_data will break
-            #  ExtensionArray values, see GH#31182
-            and not self.dtypes.apply(lambda x: x.kind in ["m", "M"]).any()
-            # Disallow complex_internals since libreduction shortcut raises a TypeError
-            and not self.agg_axis._has_complex_internals
-        ):
-
-            values = self.values
-            index = self.obj._get_axis(self.axis)
-            labels = self.agg_axis
-            empty_arr = np.empty(len(index), dtype=values.dtype)
-
-            # Preserve subclass for e.g. test_subclassed_apply
-            dummy = self.obj._constructor_sliced(
-                empty_arr, index=index, dtype=values.dtype
-            )
-
-            try:
-                result, reduction_success = libreduction.compute_reduction(
-                    values, self.f, axis=self.axis, dummy=dummy, labels=labels
-                )
-            except TypeError:
-                # e.g. test_apply_ignore_failures we just ignore
-                if not self.ignore_failures:
-                    raise
-            except ZeroDivisionError:
-                # reached via numexpr; fall back to python implementation
-                pass
-            else:
-                if reduction_success:
-                    return self.obj._constructor_sliced(result, index=labels)
-
-                # no exceptions - however reduction was unsuccessful,
-                #  use the computed function result for first element
-                partial_result = result[0]
-                if isinstance(partial_result, ABCSeries):
-                    partial_result = partial_result.infer_objects()
-
         # compute the result using the series generator,
         #  use the result computed while trying to reduce if available.
         results, res_index = self.apply_series_generator(partial_result)
@@ -350,7 +300,14 @@ def apply_series_generator(self, partial_result=None) -> Tuple[ResType, "Index"]
         else:
             for i, v in series_gen_enumeration:
 
-                results[i] = self.f(v)
+                with option_context("mode.chained_assignment", None):
+                    # ignore SettingWithCopy here in case the user mutates
+                    results[i] = self.f(v)
+
+                if isinstance(results[i], ABCSeries):
+                    # If we have a view on v, we need to make a copy because
+                    #  series_generator will swap out the underlying data
+                    results[i] = results[i].copy(deep=False)
 
         return results, res_index
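Two user-facing situations motivate the added lines; a sketch of both, assuming `self.f` is an arbitrary user function (the frame below is illustrative):

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})

def mutating(row):
    # Assigning into the received row would trip SettingWithCopy checks
    # against the reused Series; the option_context above silences that
    # while self.f(v) runs.
    row["a"] += 1
    return row["a"] + row["b"]

out1 = df.apply(mutating, axis=1)

# Identity apply: each result *is* the reused row Series, so without the
# shallow copy every stored result would end up reflecting only the final
# row's data once the generator swaps in new values.
out2 = df.apply(lambda row: row, axis=1)
```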

@@ -361,7 +318,6 @@ def wrap_results(
 
         # see if we can infer the results
         if len(results) > 0 and 0 in results and is_sequence(results[0]):
-
             return self.wrap_results_for_axis(results, res_index)
 
         # dict of scalars
@@ -401,9 +357,30 @@ def result_columns(self) -> "Index":
 
     def wrap_results_for_axis(
         self, results: ResType, res_index: "Index"
-    ) -> "DataFrame":
+    ) -> Union["Series", "DataFrame"]:
         """ return the results for the rows """
-        result = self.obj._constructor(data=results)
+
+        if self.result_type == "reduce":
+            # e.g. test_apply_dict GH#8735
+            return self.obj._constructor_sliced(results)
+        elif self.result_type is None and all(
+            isinstance(x, dict) for x in results.values()
+        ):
+            # Our operation was a to_dict op e.g.
+            #  test_apply_dict GH#8735, test_apply_reduce_rows_to_dict GH#25196
+            return self.obj._constructor_sliced(results)
+
+        try:
+            result = self.obj._constructor(data=results)
+        except ValueError as err:
+            if "arrays must all be same length" in str(err):
+                # e.g. result = [[2, 3], [1.5], ['foo', 'bar']]
+                #  see test_agg_listlike_result GH#29587
+                res = self.obj._constructor_sliced(results)
+                res.index = res_index
+                return res
+            else:
+                raise
 
         if not isinstance(results[0], ABCSeries):
             if len(result.index) == len(self.res_columns):
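The three new branches correspond to concrete apply/agg calls; a sketch of each, reconstructed from the tests cited in the comments (the first frame is illustrative; the second mirrors test_agg_listlike_result):

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})

# GH#8735: result_type="reduce" always collapses to a Series, here a
# Series of dicts, one entry per column.
s1 = df.apply(lambda col: col.to_dict(), result_type="reduce")

# result_type=None with all-dict results takes the same path, instead of
# being coerced into a DataFrame (GH#8735, GH#25196).
s2 = df.apply(lambda col: col.to_dict())

# GH#29587: ragged list-likes cannot form a rectangular DataFrame, so the
# constructor raises "arrays must all be same length" and we fall back to
# a Series of lists.
df2 = pd.DataFrame({"A": [2, 2, 3], "B": [1.5, None, 1.5], "C": ["foo", None, "bar"]})
s3 = df2.agg(lambda col: list(col.dropna().unique()))
# -> Series([[2, 3], [1.5], ["foo", "bar"]], index=["A", "B", "C"])
```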
@@ -424,11 +401,19 @@ def apply_broadcast(self, target: "DataFrame") -> "DataFrame":
 
     @property
     def series_generator(self):
-        constructor = self.obj._constructor_sliced
-        return (
-            constructor(arr, index=self.columns, name=name)
-            for i, (arr, name) in enumerate(zip(self.values, self.index))
-        )
+        values = self.values
+        assert len(values) > 0
+
+        # We create one Series object, and will swap out the data inside
+        #  of it.  Kids: don't do this at home.
+        ser = self.obj._ixs(0, axis=0)
+        mgr = ser._mgr
+        blk = mgr.blocks[0]
+
+        for (arr, name) in zip(values, self.index):
Review thread on the line above:

Contributor: can you push this to an internals method instead?

Member (Author): I'm looking at that now. The other place where this pattern could be really useful is in groupby.ops, but it's tougher there.

Contributor: Sure; exposing an API for this would also be OK (e.g. another internals method).

Member (Author): I'm still troubleshooting the groupby.ops usage; I'd like to punt on making this an internals method for the time being.
+            blk.values = arr
+            ser.name = name
+            yield ser
 
     @property
     def result_index(self) -> "Index":
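To make the reuse trick concrete outside of apply, here is a hedged sketch against pandas internals of this era (`Series._mgr` exists as of pandas 1.1, and `Block.values` is a private, unstable attribute — illustration only, not supported API):

```python
import numpy as np
import pandas as pd

# One Series is allocated up front; each iteration rebinds its block's
# values to the next row's array instead of constructing a new Series.
ser = pd.Series(np.zeros(2), index=["a", "b"])
blk = ser._mgr.blocks[0]  # private internals, pandas ~1.1

for name, arr in [("x", np.array([1.0, 2.0])), ("y", np.array([3.0, 4.0]))]:
    blk.values = arr  # the "don't do this at home" swap
    ser.name = name
    print(name, ser.to_list())  # x [1.0, 2.0], then y [3.0, 4.0]
```

This is also why apply_series_generator shallow-copies Series results: `copy(deep=False)` creates a fresh block pointing at the current array, so a later rebind of `blk.values` no longer affects the stored result.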
@@ -450,9 +435,7 @@ def wrap_results_for_axis(
 
         # we have a non-series and don't want inference
         elif not isinstance(results[0], ABCSeries):
-            from pandas import Series
-
-            result = Series(results)
+            result = self.obj._constructor_sliced(results)
             result.index = res_index
 
         # we may want to infer results