Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PERF: do DataFrame.op(series, axis=0) blockwise #31296

Merged
merged 16 commits into from
Mar 8, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
5532b2e
PERF: do DataFrame.op(series, axis=0) blockwise
jbrockmendel Jan 24, 2020
1011185
whatsnew
jbrockmendel Jan 24, 2020
a77057b
update GH ref
jbrockmendel Jan 24, 2020
c9c2fbb
Fix+test numexpr-fallback case
jbrockmendel Jan 25, 2020
b50c0b8
Merge branch 'master' of https://github.com/pandas-dev/pandas into pe…
jbrockmendel Jan 26, 2020
e50f7d5
Merge branch 'master' of https://github.com/pandas-dev/pandas into pe…
jbrockmendel Jan 31, 2020
1dc2deb
Merge branch 'master' of https://github.com/pandas-dev/pandas into pe…
jbrockmendel Feb 9, 2020
0af4ffe
Merge branch 'master' of https://github.com/pandas-dev/pandas into pe…
jbrockmendel Feb 22, 2020
f02b8c0
Merge branch 'master' of https://github.com/pandas-dev/pandas into pe…
jbrockmendel Feb 22, 2020
231a316
whatsnew, docstrings
jbrockmendel Feb 22, 2020
bd889ef
Merge branch 'master' of https://github.com/pandas-dev/pandas into pe…
jbrockmendel Feb 23, 2020
4cbb7db
Merge branch 'master' of https://github.com/pandas-dev/pandas into pe…
jbrockmendel Feb 24, 2020
f29e09b
Merge branch 'master' of https://github.com/pandas-dev/pandas into pe…
jbrockmendel Feb 26, 2020
54df295
Merge branch 'master' of https://github.com/pandas-dev/pandas into pe…
jbrockmendel Mar 2, 2020
c899b9e
Merge branch 'master' of https://github.com/pandas-dev/pandas into pe…
jbrockmendel Mar 5, 2020
3110628
Merge branch 'master' of https://github.com/pandas-dev/pandas into pe…
jbrockmendel Mar 5, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions asv_bench/benchmarks/arithmetic.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,36 @@ def time_frame_op_with_scalar(self, dtype, scalar, op):
op(self.df, scalar)


class MixedFrameWithSeriesAxis0:
    """
    Benchmark DataFrame flex methods called with a Series and ``axis=0``.

    The frame built in ``setup`` combines integer columns with one float
    column, so it holds more than one dtype.
    """

    # A single parameter axis: the name of the DataFrame method to time.
    params = [
        [
            "eq", "ne", "lt", "le", "ge", "gt",
            "add", "sub", "div", "floordiv", "mul", "pow",
        ]
    ]
    param_names = ["opname"]

    def setup(self, opname):
        """Create the 100-row frame and the Series (its first column)."""
        frame = DataFrame(np.arange(10 ** 6).reshape(100, -1))
        # The float column makes the frame mixed-dtype.
        frame["C"] = 1.0
        self.df = frame
        self.ser = frame[0]

    def time_frame_op_with_series_axis0(self, opname):
        """Time ``df.<opname>(ser, axis=0)``."""
        flex_method = getattr(self.df, opname)
        flex_method(self.ser, axis=0)


class Ops:

params = [[True, False], ["default", 1]]
Expand Down
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ Performance improvements

- Performance improvement in :class:`Timedelta` constructor (:issue:`30543`)
- Performance improvement in :class:`Timestamp` constructor (:issue:`30543`)
-
- Performance improvement in flex arithmetic ops between :class:`DataFrame` and :class:`Series` with ``axis=0`` (:issue:`31296`)
-

.. ---------------------------------------------------------------------------
Expand Down
2 changes: 1 addition & 1 deletion pandas/_libs/ops.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ def scalar_compare(object[:] values, object val, object op):

@cython.wraparound(False)
@cython.boundscheck(False)
def vec_compare(object[:] left, object[:] right, object op):
def vec_compare(ndarray[object] left, ndarray[object] right, object op):
"""
Compare the elements of `left` with the elements of `right` pointwise,
with the comparison operation described by `op`.
Expand Down
14 changes: 0 additions & 14 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -5212,20 +5212,6 @@ def _arith_op(left, right):

return new_data

def _combine_match_index(self, other: Series, func):
    """
    Apply the binary operator ``func`` between this frame and ``other``.

    Callers are expected to have aligned the operands already, i.e. at this
    point we have `self.index.equals(other.index)`.

    Parameters
    ----------
    other : Series
    func : binary operator

    Returns
    -------
    The result of either the column-wise dispatch or of ``func`` applied
    directly to the underlying values.
    """
    if not ops.should_series_dispatch(self, other, func):
        # fastpath --> operate directly on values; the Series is reshaped to
        # a single column so it lines up with the rows of ``self.values``
        column = other.values.reshape(-1, 1)
        with np.errstate(all="ignore"):
            raw = func(self.values, column)
        return dispatch_fill_zeros(func, self.values, column, raw)

    # operate column-wise; avoid costly object-casting in `.values`
    return ops.dispatch_to_series(self, other, func)

def _construct_result(self, result) -> "DataFrame":
"""
Wrap the result of an arithmetic, comparison, or logical operation.
Expand Down
21 changes: 17 additions & 4 deletions pandas/core/ops/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -585,7 +585,7 @@ def flex_wrapper(self, other, level=None, fill_value=None, axis=0):
# DataFrame


def _combine_series_frame(left, right, func, axis: int):
def _combine_series_frame(left, right, func, axis: int, str_rep: str):
"""
Apply binary operator `func` to self, other using alignment and fill
conventions determined by the axis argument.
Expand All @@ -596,14 +596,25 @@ def _combine_series_frame(left, right, func, axis: int):
right : Series
func : binary operator
axis : {0, 1}
str_rep : str

Returns
-------
result : DataFrame
"""
# We assume that self.align(other, ...) has already been called
if axis == 0:
new_data = left._combine_match_index(right, func)
values = right._values
if isinstance(values, np.ndarray):
jreback marked this conversation as resolved.
Show resolved Hide resolved
# We can operate block-wise
values = values.reshape(-1, 1)

array_op = get_array_op(func, str_rep=str_rep)
bm = left._data.apply(array_op, right=values.T)
return type(left)(bm)

new_data = dispatch_to_series(left, right, func)

else:
new_data = dispatch_to_series(left, right, func, axis="columns")

Expand Down Expand Up @@ -791,7 +802,9 @@ def f(self, other, axis=default_axis, level=None, fill_value=None):
raise NotImplementedError(f"fill_value {fill_value} not supported.")

axis = self._get_axis_number(axis) if axis is not None else 1
return _combine_series_frame(self, other, pass_op, axis=axis)
return _combine_series_frame(
jreback marked this conversation as resolved.
Show resolved Hide resolved
self, other, pass_op, axis=axis, str_rep=str_rep
)
else:
# in this case we always have `np.ndim(other) == 0`
if fill_value is not None:
Expand Down Expand Up @@ -826,7 +839,7 @@ def f(self, other, axis=default_axis, level=None):

elif isinstance(other, ABCSeries):
axis = self._get_axis_number(axis) if axis is not None else 1
return _combine_series_frame(self, other, op, axis=axis)
return _combine_series_frame(self, other, op, axis=axis, str_rep=str_rep)
else:
# in this case we always have `np.ndim(other) == 0`
new_data = dispatch_to_series(self, other, op, str_rep)
Expand Down
74 changes: 68 additions & 6 deletions pandas/core/ops/array_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
ABCDatetimeArray,
ABCExtensionArray,
ABCIndex,
ABCIndexClass,
ABCSeries,
ABCTimedeltaArray,
)
Expand All @@ -53,13 +52,15 @@ def comp_method_OBJECT_ARRAY(op, x, y):
if isinstance(y, (ABCSeries, ABCIndex)):
y = y.values

result = libops.vec_compare(x.ravel(), y, op)
if x.shape != y.shape:
raise ValueError("Shapes must match", x.shape, y.shape)
jreback marked this conversation as resolved.
Show resolved Hide resolved
result = libops.vec_compare(x.ravel(), y.ravel(), op)
jreback marked this conversation as resolved.
Show resolved Hide resolved
else:
result = libops.scalar_compare(x.ravel(), y, op)
return result.reshape(x.shape)


def masked_arith_op(x, y, op):
def masked_arith_op(x: np.ndarray, y, op):
"""
If the given arithmetic operation fails, attempt it again on
only the non-null elements of the input array(s).
Expand All @@ -78,10 +79,22 @@ def masked_arith_op(x, y, op):
dtype = find_common_type([x.dtype, y.dtype])
result = np.empty(x.size, dtype=dtype)

if len(x) != len(y):
if not _can_broadcast(x, y):
raise ValueError(x.shape, y.shape)

# Call notna on pre-broadcasted y for performance
ymask = notna(y)
y = np.broadcast_to(y, x.shape)
ymask = np.broadcast_to(ymask, x.shape)

else:
ymask = notna(y)

# NB: ravel() is only safe since y is ndarray; for e.g. PeriodIndex
# we would get int64 dtype, see GH#19956
yrav = y.ravel()
mask = notna(xrav) & notna(yrav)
mask = notna(xrav) & ymask.ravel()

if yrav.shape != mask.shape:
# FIXME: GH#5284, GH#5035, GH#19448
Expand Down Expand Up @@ -211,6 +224,51 @@ def arithmetic_op(left: ArrayLike, right: Any, op, str_rep: str):
return res_values


def _broadcast_comparison_op(lvalues, rvalues, op) -> np.ndarray:
"""
Broadcast a comparison operation between two 2D arrays.

Parameters
----------
lvalues : np.ndarray or ExtensionArray
rvalues : np.ndarray or ExtensionArray

Returns
-------
np.ndarray[bool]
"""
if isinstance(rvalues, np.ndarray):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this looks like the case on L81 in array_ops.py

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you follow up with consolidate if this is the case

rvalues = np.broadcast_to(rvalues, lvalues.shape)
result = comparison_op(lvalues, rvalues, op)
else:
result = np.empty(lvalues.shape, dtype=bool)
for i in range(len(lvalues)):
result[i, :] = comparison_op(lvalues[i], rvalues[:, 0], op)
return result


def _can_broadcast(lvalues, rvalues) -> bool:
"""
Check if we can broadcast rvalues to match the shape of lvalues.

Parameters
----------
lvalues : np.ndarray or ExtensionArray
rvalues : np.ndarray or ExtensionArray

Returns
-------
bool
"""
# We assume that lengths dont match
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same

if lvalues.ndim == rvalues.ndim == 2:
# See if we can broadcast unambiguously
if lvalues.shape[1] == rvalues.shape[-1]:
if rvalues.shape[0] == 1:
return True
return False


def comparison_op(
left: ArrayLike, right: Any, op, str_rep: Optional[str] = None,
) -> ArrayLike:
Expand All @@ -237,12 +295,16 @@ def comparison_op(
# TODO: same for tuples?
rvalues = np.asarray(rvalues)

if isinstance(rvalues, (np.ndarray, ABCExtensionArray, ABCIndexClass)):
if isinstance(rvalues, (np.ndarray, ABCExtensionArray)):
# TODO: make this treatment consistent across ops and classes.
# We are not catching all listlikes here (e.g. frozenset, tuple)
# The ambiguous case is object-dtype. See GH#27803
if len(lvalues) != len(rvalues):
raise ValueError("Lengths must match to compare")
if _can_broadcast(lvalues, rvalues):
return _broadcast_comparison_op(lvalues, rvalues, op)
raise ValueError(
"Lengths must match to compare", lvalues.shape, rvalues.shape
)

if should_extension_dispatch(lvalues, rvalues):
res_values = dispatch_to_extension_op(op, lvalues, rvalues)
Expand Down
17 changes: 16 additions & 1 deletion pandas/tests/arithmetic/test_array_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import pytest

import pandas._testing as tm
from pandas.core.ops.array_ops import na_logical_op
from pandas.core.ops.array_ops import comparison_op, na_logical_op


def test_na_logical_op_2d():
Expand All @@ -19,3 +19,18 @@ def test_na_logical_op_2d():
result = na_logical_op(left, right, operator.or_)
expected = right
tm.assert_numpy_array_equal(result, expected)


def test_object_comparison_2d():
    """
    Check ``comparison_op`` on 2D object arrays, including read-only input.

    ``left`` holds 0..8 in a 3x3 grid and ``right`` is its transpose, so the
    two agree exactly on the diagonal.
    """
    left = np.arange(9).reshape(3, 3).astype(object)
    right = left.T

    result = comparison_op(left, right, operator.eq)
    expected = np.eye(3).astype(bool)
    tm.assert_numpy_array_equal(result, expected)

    # Ensure that cython doesn't raise on non-writeable arg, which
    #  we can get from np.broadcast_to
    right.flags.writeable = False
    result = comparison_op(left, right, operator.ne)
    tm.assert_numpy_array_equal(result, ~expected)

    # ravel() of a transposed (non C-contiguous) array returns a fresh,
    # writeable copy, so code that ravels its operands never sees the
    # read-only flag set above. Repeat the check with a C-contiguous
    # read-only operand, whose ravel() is a read-only view.
    right_readonly = np.ascontiguousarray(left.T)
    right_readonly.flags.writeable = False
    assert not right_readonly.ravel().flags.writeable
    result = comparison_op(left, right_readonly, operator.ne)
    tm.assert_numpy_array_equal(result, ~expected)
19 changes: 19 additions & 0 deletions pandas/tests/frame/test_arithmetic.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,25 @@ def test_floordiv_axis0(self):
result2 = df.floordiv(ser.values, axis=0)
tm.assert_frame_equal(result2, expected)

@pytest.mark.slow
@pytest.mark.parametrize("opname", ["floordiv", "pow"])
def test_floordiv_axis0_numexpr_path(self, opname):
    """
    Compare ``df.<opname>(ser, axis=0)`` with a column-by-column result.

    The same expectation is checked for both the Series and its underlying
    ndarray as the right-hand operand.
    """
    # case that goes through numexpr and has to fall back to masked_arith_op
    binop = getattr(operator, opname)

    values = np.arange(10 ** 6).reshape(100, -1)
    df = pd.DataFrame(values)
    df["C"] = 1.0
    ser = df[0]

    # Reference result: apply the plain operator to each column separately.
    expected = pd.DataFrame({col: binop(df[col], ser) for col in df.columns})

    flex_method = getattr(df, opname)

    result = flex_method(ser, axis=0)
    tm.assert_frame_equal(result, expected)

    result2 = flex_method(ser.values, axis=0)
    tm.assert_frame_equal(result2, expected)

def test_df_add_td64_columnwise(self):
# GH 22534 Check that column-wise addition broadcasts correctly
dti = pd.date_range("2016-01-01", periods=10)
Expand Down