From 5532b2e47c39c42810b9cff559bfb80bb3e39de2 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 24 Jan 2020 14:50:06 -0800 Subject: [PATCH 1/5] PERF: do DataFrame.op(series, axis=0) blockwise --- asv_bench/benchmarks/binary_ops.py | 30 ++++++++++++++++++++ pandas/_libs/ops.pyx | 2 +- pandas/core/frame.py | 12 -------- pandas/core/ops/__init__.py | 21 +++++++++++--- pandas/core/ops/array_ops.py | 34 ++++++++++++++++++++--- pandas/tests/arithmetic/test_array_ops.py | 17 +++++++++++- pandas/tests/frame/test_arithmetic.py | 15 ++++++++++ 7 files changed, 109 insertions(+), 22 deletions(-) diff --git a/asv_bench/benchmarks/binary_ops.py b/asv_bench/benchmarks/binary_ops.py index 64e067d25a454..35fc249fb1e4e 100644 --- a/asv_bench/benchmarks/binary_ops.py +++ b/asv_bench/benchmarks/binary_ops.py @@ -41,6 +41,36 @@ def time_frame_op_with_scalar(self, dtype, scalar, op): op(self.df, scalar) +class MixedFrameWithSeriesAxis0: + params = [ + [ + "eq", + "ne", + "lt", + "le", + "ge", + "gt", + "add", + "sub", + "div", + "floordiv", + "mul", + "pow", + ] + ] + param_names = ["opname"] + + def setup(self, opname): + arr = np.arange(10**6).reshape(100, -1) + df = DataFrame(arr) + df["C"] = 1.0 + self.df = df + self.ser = df[0] + + def time_frame_op_with_series_axis0(self, opname): + getattr(self.df, opname)(self.ser, axis=0) + + class Ops: params = [[True, False], ["default", 1]] diff --git a/pandas/_libs/ops.pyx b/pandas/_libs/ops.pyx index abe1484e3763d..c0971b91a2fa1 100644 --- a/pandas/_libs/ops.pyx +++ b/pandas/_libs/ops.pyx @@ -100,7 +100,7 @@ def scalar_compare(object[:] values, object val, object op): @cython.wraparound(False) @cython.boundscheck(False) -def vec_compare(object[:] left, object[:] right, object op): +def vec_compare(ndarray[object] left, ndarray[object] right, object op): """ Compare the elements of `left` with the elements of `right` pointwise, with the comparison operation described by `op`. 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 012fb1d0c2eb7..bdddbdfb57064 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5368,18 +5368,6 @@ def _arith_op(left, right): return new_data - def _combine_match_index(self, other, func): - # at this point we have `self.index.equals(other.index)` - - if ops.should_series_dispatch(self, other, func): - # operate column-wise; avoid costly object-casting in `.values` - new_data = ops.dispatch_to_series(self, other, func) - else: - # fastpath --> operate directly on values - with np.errstate(all="ignore"): - new_data = func(self.values.T, other.values).T - return new_data - def _construct_result(self, result) -> "DataFrame": """ Wrap the result of an arithmetic, comparison, or logical operation. diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 9ed233cad65ce..2abac8e0bb368 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -584,7 +584,7 @@ def flex_wrapper(self, other, level=None, fill_value=None, axis=0): # DataFrame -def _combine_series_frame(left, right, func, axis: int): +def _combine_series_frame(left, right, func, axis: int, str_rep: str): """ Apply binary operator `func` to self, other using alignment and fill conventions determined by the axis argument. @@ -595,6 +595,7 @@ def _combine_series_frame(left, right, func, axis: int): right : Series func : binary operator axis : {0, 1} + str_rep : str Returns ------- @@ -602,7 +603,17 @@ def _combine_series_frame(left, right, func, axis: int): """ # We assume that self.align(other, ...) 
has already been called if axis == 0: - new_data = left._combine_match_index(right, func) + values = right._values + if isinstance(values, np.ndarray): + # We can operate block-wise + values = values.reshape(-1, 1) + + array_op = get_array_op(func, str_rep=str_rep) + bm = left._data.apply(array_op, right=values.T) + return type(left)(bm) + + new_data = dispatch_to_series(left, right, func) + else: new_data = dispatch_to_series(left, right, func, axis="columns") @@ -705,7 +716,9 @@ def f(self, other, axis=default_axis, level=None, fill_value=None): self, other = self.align( other, join="outer", axis=axis, level=level, copy=False ) - return _combine_series_frame(self, other, pass_op, axis=axis) + return _combine_series_frame( + self, other, pass_op, axis=axis, str_rep=str_rep + ) else: # in this case we always have `np.ndim(other) == 0` if fill_value is not None: @@ -745,7 +758,7 @@ def f(self, other, axis=default_axis, level=None): self, other = self.align( other, join="outer", axis=axis, level=level, copy=False ) - return _combine_series_frame(self, other, op, axis=axis) + return _combine_series_frame(self, other, op, axis=axis, str_rep=str_rep) else: # in this case we always have `np.ndim(other) == 0` new_data = dispatch_to_series(self, other, op) diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index c393b8028113b..190c88a1b99dc 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -27,7 +27,6 @@ ABCDatetimeArray, ABCExtensionArray, ABCIndex, - ABCIndexClass, ABCSeries, ABCTimedeltaArray, ) @@ -52,7 +51,9 @@ def comp_method_OBJECT_ARRAY(op, x, y): if isinstance(y, (ABCSeries, ABCIndex)): y = y.values - result = libops.vec_compare(x.ravel(), y, op) + if x.shape != y.shape: + raise ValueError("Shapes must match", x.shape, y.shape) + result = libops.vec_compare(x.ravel(), y.ravel(), op) else: result = libops.scalar_compare(x.ravel(), y, op) return result.reshape(x.shape) @@ -200,6 +201,27 @@ def arithmetic_op( return 
res_values +def _broadcast_comparison_op(lvalues, rvalues, op): + if isinstance(rvalues, np.ndarray): + rvalues = np.broadcast_to(rvalues, lvalues.shape) + result = comparison_op(lvalues, rvalues, op) + else: + result = np.empty(lvalues.shape, dtype=bool) + for i in range(len(lvalues)): + result[i, :] = comparison_op(lvalues[i], rvalues[:, 0], op) + return result + + +def _can_broadcast(lvalues, rvalues) -> bool: + # We assume that lengths dont match + if lvalues.ndim == rvalues.ndim == 2: + # See if we can broadcast unambiguously + if lvalues.shape[1] == rvalues.shape[-1]: + if rvalues.shape[0] == 1: + return True + return False + + def comparison_op( left: Union[np.ndarray, ABCExtensionArray], right: Any, op ) -> Union[np.ndarray, ABCExtensionArray]: @@ -227,12 +249,16 @@ def comparison_op( # TODO: same for tuples? rvalues = np.asarray(rvalues) - if isinstance(rvalues, (np.ndarray, ABCExtensionArray, ABCIndexClass)): + if isinstance(rvalues, (np.ndarray, ABCExtensionArray)): # TODO: make this treatment consistent across ops and classes. # We are not catching all listlikes here (e.g. frozenset, tuple) # The ambiguous case is object-dtype. 
See GH#27803 if len(lvalues) != len(rvalues): - raise ValueError("Lengths must match to compare") + if _can_broadcast(lvalues, rvalues): + return _broadcast_comparison_op(lvalues, rvalues, op) + raise ValueError( + "Lengths must match to compare", lvalues.shape, rvalues.shape + ) if should_extension_dispatch(lvalues, rvalues): res_values = dispatch_to_extension_op(op, lvalues, rvalues) diff --git a/pandas/tests/arithmetic/test_array_ops.py b/pandas/tests/arithmetic/test_array_ops.py index d8aaa3183a1c6..53cb10ba9fc5e 100644 --- a/pandas/tests/arithmetic/test_array_ops.py +++ b/pandas/tests/arithmetic/test_array_ops.py @@ -4,7 +4,7 @@ import pytest import pandas._testing as tm -from pandas.core.ops.array_ops import na_logical_op +from pandas.core.ops.array_ops import comparison_op, na_logical_op def test_na_logical_op_2d(): @@ -19,3 +19,18 @@ def test_na_logical_op_2d(): result = na_logical_op(left, right, operator.or_) expected = right tm.assert_numpy_array_equal(result, expected) + + +def test_object_comparison_2d(): + left = np.arange(9).reshape(3, 3).astype(object) + right = left.T + + result = comparison_op(left, right, operator.eq) + expected = np.eye(3).astype(bool) + tm.assert_numpy_array_equal(result, expected) + + # Ensure that cython doesn't raise on non-writeable arg, which + # we can get from np.broadcast_to + right.flags.writeable = False + result = comparison_op(left, right, operator.ne) + tm.assert_numpy_array_equal(result, ~expected) diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 659b55756c4b6..c6eacf2bbcd84 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -332,6 +332,21 @@ def test_df_flex_cmp_constant_return_types_empty(self, opname): class TestFrameFlexArithmetic: + def test_floordiv_axis0(self): + # make sure we df.floordiv(ser, axis=0) matches column-wise result + arr = np.arange(3) + ser = pd.Series(arr) + df = pd.DataFrame({"A": ser, "B": ser}) 
+ + result = df.floordiv(ser, axis=0) + + expected = pd.DataFrame({col: df[col] // ser for col in df.columns}) + + tm.assert_frame_equal(result, expected) + + result2 = df.floordiv(ser.values, axis=0) + tm.assert_frame_equal(result2, expected) + def test_df_add_td64_columnwise(self): # GH 22534 Check that column-wise addition broadcasts correctly dti = pd.date_range("2016-01-01", periods=10) From 1011185967d822a25f2e22b8fefc3ec08d75e966 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 24 Jan 2020 14:50:50 -0800 Subject: [PATCH 2/5] whatsnew --- doc/source/whatsnew/v1.1.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index d0644fbb7ef54..45dfb29f1c788 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -119,6 +119,7 @@ Timezones Numeric ^^^^^^^ +- Bug in :meth:`DataFrame.floordiv` with ``axis=0`` not treating division-by-zero like :meth:`Series.floordiv` (:issue:`31271`) - - From a77057b474286b8a1a01e33bf391f70917b73908 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 24 Jan 2020 14:57:35 -0800 Subject: [PATCH 3/5] update GH ref --- doc/source/whatsnew/v1.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 45dfb29f1c788..c70e1daea2fe7 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -119,7 +119,7 @@ Timezones Numeric ^^^^^^^ -- Bug in :meth:`DataFrame.floordiv` with ``axis=0`` not treating division-by-zero like :meth:`Series.floordiv` (:issue:`31271`) +- Bug in :meth:`DataFrame.floordiv` with ``axis=0`` not treating division-by-zero like :meth:`Series.floordiv` (:issue:`31296`) - - From c9c2fbb67f864bc73dbc17aa6fb96f0568f11c7e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 24 Jan 2020 16:27:38 -0800 Subject: [PATCH 4/5] Fix+test numexpr-fallback case --- asv_bench/benchmarks/binary_ops.py | 2 +- 
pandas/core/ops/array_ops.py | 16 ++++++++++++++-- pandas/tests/frame/test_arithmetic.py | 19 +++++++++++++++++++ 3 files changed, 34 insertions(+), 3 deletions(-) diff --git a/asv_bench/benchmarks/binary_ops.py b/asv_bench/benchmarks/binary_ops.py index 35fc249fb1e4e..065b131cd000d 100644 --- a/asv_bench/benchmarks/binary_ops.py +++ b/asv_bench/benchmarks/binary_ops.py @@ -61,7 +61,7 @@ class MixedFrameWithSeriesAxis0: param_names = ["opname"] def setup(self, opname): - arr = np.arange(10**6).reshape(100, -1) + arr = np.arange(10 ** 6).reshape(100, -1) df = DataFrame(arr) df["C"] = 1.0 self.df = df diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index 190c88a1b99dc..db086c3bbaf29 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -59,7 +59,7 @@ def comp_method_OBJECT_ARRAY(op, x, y): return result.reshape(x.shape) -def masked_arith_op(x, y, op): +def masked_arith_op(x: np.ndarray, y, op): """ If the given arithmetic operation fails, attempt it again on only the non-null elements of the input array(s). @@ -78,10 +78,22 @@ def masked_arith_op(x, y, op): dtype = find_common_type([x.dtype, y.dtype]) result = np.empty(x.size, dtype=dtype) + if len(x) != len(y): + if not _can_broadcast(x, y): + raise ValueError(x.shape, y.shape) + + # Call notna on pre-broadcasted y for performance + ymask = notna(y) + y = np.broadcast_to(y, x.shape) + ymask = np.broadcast_to(ymask, x.shape) + + else: + ymask = notna(y) + # NB: ravel() is only safe since y is ndarray; for e.g. 
PeriodIndex # we would get int64 dtype, see GH#19956 yrav = y.ravel() - mask = notna(xrav) & notna(yrav) + mask = notna(xrav) & ymask.ravel() if yrav.shape != mask.shape: # FIXME: GH#5284, GH#5035, GH#19448 diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index c6eacf2bbcd84..15444010a3e52 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -347,6 +347,25 @@ def test_floordiv_axis0(self): result2 = df.floordiv(ser.values, axis=0) tm.assert_frame_equal(result2, expected) + @pytest.mark.slow + @pytest.mark.parametrize("opname", ["floordiv", "pow"]) + def test_floordiv_axis0_numexpr_path(self, opname): + # case that goes through numexpr and has to fall back to masked_arith_op + op = getattr(operator, opname) + + arr = np.arange(10 ** 6).reshape(100, -1) + df = pd.DataFrame(arr) + df["C"] = 1.0 + + ser = df[0] + result = getattr(df, opname)(ser, axis=0) + + expected = pd.DataFrame({col: op(df[col], ser) for col in df.columns}) + tm.assert_frame_equal(result, expected) + + result2 = getattr(df, opname)(ser.values, axis=0) + tm.assert_frame_equal(result2, expected) + def test_df_add_td64_columnwise(self): # GH 22534 Check that column-wise addition broadcasts correctly dti = pd.date_range("2016-01-01", periods=10) From 231a31647db5cbfc34499da8202f3efc25c06941 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 22 Feb 2020 13:23:50 -0800 Subject: [PATCH 5/5] whatsnew, docstrings --- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/core/ops/array_ops.py | 26 +++++++++++++++++++++++++- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 7449c62a5ad31..fc27068298a7b 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -90,7 +90,7 @@ Performance improvements - Performance improvement in :class:`Timedelta` constructor (:issue:`30543`) - Performance improvement in :class:`Timestamp` 
constructor (:issue:`30543`) -- +- Performance improvement in flex arithmetic ops between :class:`DataFrame` and :class:`Series` with ``axis=0`` (:issue:`31296`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index 6ccb02c1631db..73ec994a72bad 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -213,7 +213,19 @@ def arithmetic_op( return res_values -def _broadcast_comparison_op(lvalues, rvalues, op): +def _broadcast_comparison_op(lvalues, rvalues, op) -> np.ndarray: + """ + Broadcast a comparison operation between two 2D arrays. + + Parameters + ---------- + lvalues : np.ndarray or ExtensionArray + rvalues : np.ndarray or ExtensionArray + + Returns + ------- + np.ndarray[bool] + """ if isinstance(rvalues, np.ndarray): rvalues = np.broadcast_to(rvalues, lvalues.shape) result = comparison_op(lvalues, rvalues, op) @@ -225,6 +237,18 @@ def _broadcast_comparison_op(lvalues, rvalues, op): def _can_broadcast(lvalues, rvalues) -> bool: + """ + Check if we can broadcast rvalues to match the shape of lvalues. + + Parameters + ---------- + lvalues : np.ndarray or ExtensionArray + rvalues : np.ndarray or ExtensionArray + + Returns + ------- + bool + """ # We assume that lengths dont match if lvalues.ndim == rvalues.ndim == 2: # See if we can broadcast unambiguously