PERF: implement scalar ops blockwise (pandas-dev#29853)

AlexKirko · Dec 29, 2019 · e98f9b7 · e98f9b7
1 parent 9a8f8a6
commit e98f9b7
Show file tree

Hide file tree

Showing 11 changed files with 124 additions and 24 deletions.
diff --git a/asv_bench/benchmarks/binary_ops.py b/asv_bench/benchmarks/binary_ops.py
@@ -1,3 +1,5 @@
+import operator
+
 import numpy as np
 
 from pandas import DataFrame, Series, date_range
@@ -9,6 +11,36 @@
     import pandas.computation.expressions as expr
 
 
+class IntFrameWithScalar:
+    """
+    Benchmark binary operators between a DataFrame and a scalar.
+
+    Parametrized over the dtype of the frame, the scalar operand and the
+    operator.  Despite the class name, float64 frames are timed as well
+    as int64 ones.
+    """
+
+    # One list per entry of ``param_names``, in the same order:
+    # frame dtypes, scalar operands (Python and NumPy scalars), operators.
+    params = [
+        [np.float64, np.int64],
+        [2, 3.0, np.int32(4), np.float64(5)],
+        [
+            operator.add,
+            operator.sub,
+            operator.mul,
+            operator.truediv,
+            operator.floordiv,
+            operator.pow,
+            operator.mod,
+            operator.eq,
+            operator.ne,
+            operator.gt,
+            operator.ge,
+            operator.lt,
+            operator.le,
+        ],
+    ]
+    param_names = ["dtype", "scalar", "op"]
+
+    def setup(self, dtype, scalar, op):
+        # Build a 20000 x 100 frame of random normal values cast to ``dtype``.
+        # ``scalar`` and ``op`` are unused here; they are only part of the
+        # signature because every parameter is passed to each method.
+        arr = np.random.randn(20000, 100)
+        self.df = DataFrame(arr.astype(dtype))
+
+    def time_frame_op_with_scalar(self, dtype, scalar, op):
+        # The timed statement: apply ``op`` with the frame on the left and
+        # the scalar on the right.  The result is discarded.
+        op(self.df, scalar)
+
+
 class Ops:
 
     params = [[True, False], ["default", 1]]

diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
@@ -672,6 +672,7 @@ or ``matplotlib.Axes.plot``. See :ref:`plotting.formatters` for more.
 Performance improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
 
+- Performance improvement in :class:`DataFrame` arithmetic and comparison operations with scalars (:issue:`24990`, :issue:`29853`)
 - Performance improvement in indexing with a non-unique :class:`IntervalIndex` (:issue:`27489`)
 - Performance improvement in :attr:`MultiIndex.is_monotonic` (:issue:`27495`)
 - Performance improvement in :func:`cut` when ``bins`` is an :class:`IntervalIndex` (:issue:`27668`)

diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
@@ -325,6 +325,24 @@ class DatetimeLikeArrayMixin(ExtensionOpsMixin, AttributesMixin, ExtensionArray)
         _generate_range
     """
 
+    @property
+    def ndim(self) -> int:
+        """
+        Number of dimensions of the underlying ``_data`` array.
+        """
+        return self._data.ndim
+
+    @property
+    def shape(self):
+        """
+        Shape of the underlying ``_data`` array.
+        """
+        return self._data.shape
+
+    def reshape(self, *args, **kwargs):
+        """
+        Return a new array of the same type holding the reshaped data.
+
+        All positional and keyword arguments are forwarded unchanged to
+        ``self._data.reshape``.  The result is constructed from the reshaped
+        data and ``self.dtype`` only.
+        """
+        # Note: we drop any freq
+        data = self._data.reshape(*args, **kwargs)
+        return type(self)(data, dtype=self.dtype)
+
+    def ravel(self, *args, **kwargs):
+        """
+        Return a new array of the same type holding the raveled data.
+
+        All positional and keyword arguments are forwarded unchanged to
+        ``self._data.ravel``.  The result is constructed from the raveled
+        data and ``self.dtype`` only.
+        """
+        # Note: we drop any freq
+        data = self._data.ravel(*args, **kwargs)
+        return type(self)(data, dtype=self.dtype)
+
     @property
     def _box_func(self):
         """
@@ -413,7 +431,10 @@ def __getitem__(self, key):
         getitem = self._data.__getitem__
         if is_int:
             val = getitem(key)
-            return self._box_func(val)
+            if lib.is_scalar(val):
+                # i.e. self.ndim == 1
+                return self._box_func(val)
+            return type(self)(val, dtype=self.dtype)
 
         if com.is_bool_indexer(key):
             key = np.asarray(key, dtype=bool)
@@ -823,6 +844,8 @@ def inferred_freq(self):
         generated by infer_freq.  Returns None if it can't autodetect the
         frequency.
         """
+        if self.ndim != 1:
+            return None
         try:
             return frequencies.infer_freq(self)
         except ValueError:
@@ -968,7 +991,7 @@ def _add_timedeltalike_scalar(self, other):
         """
         if isna(other):
             # i.e np.timedelta64("NaT"), not recognized by delta_to_nanoseconds
-            new_values = np.empty(len(self), dtype="i8")
+            new_values = np.empty(self.shape, dtype="i8")
             new_values[:] = iNaT
             return new_values
 
@@ -1014,7 +1037,7 @@ def _add_nat(self):
 
         # GH#19124 pd.NaT is treated like a timedelta for both timedelta
         # and datetime dtypes
-        result = np.zeros(len(self), dtype=np.int64)
+        result = np.zeros(self.shape, dtype=np.int64)
         result.fill(iNaT)
         return type(self)(result, dtype=self.dtype, freq=None)
 
@@ -1028,7 +1051,7 @@ def _sub_nat(self):
         # For datetime64 dtypes by convention we treat NaT as a datetime, so
         # this subtraction returns a timedelta64 dtype.
         # For period dtype, timedelta64 is a close-enough return dtype.
-        result = np.zeros(len(self), dtype=np.int64)
+        result = np.zeros(self.shape, dtype=np.int64)
         result.fill(iNaT)
         return result.view("timedelta64[ns]")
 

diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py
@@ -339,7 +339,7 @@ def __init__(self, values, dtype=_NS_DTYPE, freq=None, copy=False):
                 " those."
             )
             raise ValueError(msg)
-        if values.ndim != 1:
+        if values.ndim not in [1, 2]:
             raise ValueError("Only 1-dimensional input arrays are supported.")
 
         if values.dtype == "i8":
@@ -788,6 +788,9 @@ def _sub_datetime_arraylike(self, other):
         return new_values.view("timedelta64[ns]")
 
     def _add_offset(self, offset):
+        if self.ndim == 2:
+            return self.ravel()._add_offset(offset).reshape(self.shape)
+
         assert not isinstance(offset, Tick)
         try:
             if self.tz is not None:

diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py
@@ -217,7 +217,7 @@ def __init__(self, values, dtype=_TD_DTYPE, freq=None, copy=False):
                 " TimedeltaArray ndarray, or Series or Index containing one of those."
             )
             raise ValueError(msg)
-        if values.ndim != 1:
+        if values.ndim not in [1, 2]:
             raise ValueError("Only 1-dimensional input arrays are supported.")
 
         if values.dtype == "i8":
@@ -1036,8 +1036,6 @@ def sequence_to_td64ns(data, copy=False, unit="ns", errors="raise"):
         raise TypeError(f"dtype {data.dtype} cannot be converted to timedelta64[ns]")
 
     data = np.array(data, copy=copy)
-    if data.ndim != 1:
-        raise ValueError("Only 1-dimensional input arrays are supported.")
 
     assert data.dtype == "m8[ns]", data
     return data, inferred_freq

diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
@@ -368,7 +368,19 @@ def apply(self, func, **kwargs):
         """
         with np.errstate(all="ignore"):
             result = func(self.values, **kwargs)
+
+        if is_extension_array_dtype(result) and result.ndim > 1:
+            # if we get a 2D ExtensionArray, we need to split it into 1D pieces
+            nbs = []
+            for i, loc in enumerate(self.mgr_locs):
+                vals = result[i]
+                nv = _block_shape(vals, ndim=self.ndim)
+                block = self.make_block(values=nv, placement=[loc])
+                nbs.append(block)
+            return nbs
+
         if not isinstance(result, Block):
+            # Exclude the 0-dim case so we can do reductions
             result = self.make_block(values=_block_shape(result, ndim=self.ndim))
 
         return result

diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
@@ -340,13 +340,13 @@ def _verify_integrity(self):
                 f"tot_items: {tot_items}"
             )
 
-    def apply(self, f: str, filter=None, **kwargs):
+    def apply(self, f, filter=None, **kwargs):
         """
         Iterate over the blocks, collect and create a new BlockManager.
 
         Parameters
         ----------
-        f : str
+        f : str or callable
             Name of the Block method to apply.
         filter : list, if supplied, only call the block if the filter is in
                  the block
@@ -411,7 +411,10 @@ def apply(self, f: str, filter=None, **kwargs):
                     axis = obj._info_axis_number
                     kwargs[k] = obj.reindex(b_items, axis=axis, copy=align_copy)
 
-            applied = getattr(b, f)(**kwargs)
+            if callable(f):
+                applied = b.apply(f, **kwargs)
+            else:
+                applied = getattr(b, f)(**kwargs)
             result_blocks = _extend_blocks(applied, result_blocks)
 
         if len(result_blocks) == 0:

diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py
@@ -26,6 +26,7 @@
     arithmetic_op,
     comparison_op,
     define_na_arithmetic_op,
+    get_array_op,
     logical_op,
 )
 from pandas.core.ops.array_ops import comp_method_OBJECT_ARRAY  # noqa:F401
@@ -372,8 +373,10 @@ def dispatch_to_series(left, right, func, str_rep=None, axis=None):
     right = lib.item_from_zerodim(right)
     if lib.is_scalar(right) or np.ndim(right) == 0:
 
-        def column_op(a, b):
-            return {i: func(a.iloc[:, i], b) for i in range(len(a.columns))}
+        # Get the appropriate array-op to apply to each block's values.
+        array_op = get_array_op(func, str_rep=str_rep)
+        bm = left._data.apply(array_op, right=right)
+        return type(left)(bm)
 
     elif isinstance(right, ABCDataFrame):
         assert right._indexed_same(left)
@@ -713,7 +716,7 @@ def f(self, other, axis=default_axis, level=None, fill_value=None):
             if fill_value is not None:
                 self = self.fillna(fill_value)
 
-            new_data = dispatch_to_series(self, other, op)
+            new_data = dispatch_to_series(self, other, op, str_rep)
             return self._construct_result(new_data)
 
     f.__name__ = op_name

diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py
@@ -2,8 +2,9 @@
 Functions for arithmetic and comparison operations on NumPy arrays and
 ExtensionArrays.
 """
+from functools import partial
 import operator
-from typing import Any, Union
+from typing import Any, Optional, Union
 
 import numpy as np
 
@@ -51,10 +52,10 @@ def comp_method_OBJECT_ARRAY(op, x, y):
         if isinstance(y, (ABCSeries, ABCIndex)):
             y = y.values
 
-        result = libops.vec_compare(x, y, op)
+        result = libops.vec_compare(x.ravel(), y, op)
     else:
-        result = libops.scalar_compare(x, y, op)
-    return result
+        result = libops.scalar_compare(x.ravel(), y, op)
+    return result.reshape(x.shape)
 
 
 def masked_arith_op(x, y, op):
@@ -237,9 +238,9 @@ def comparison_op(
     elif is_scalar(rvalues) and isna(rvalues):
         # numpy does not like comparisons vs None
         if op is operator.ne:
-            res_values = np.ones(len(lvalues), dtype=bool)
+            res_values = np.ones(lvalues.shape, dtype=bool)
         else:
-            res_values = np.zeros(len(lvalues), dtype=bool)
+            res_values = np.zeros(lvalues.shape, dtype=bool)
 
     elif is_object_dtype(lvalues.dtype):
         res_values = comp_method_OBJECT_ARRAY(op, lvalues, rvalues)
@@ -367,3 +368,27 @@ def fill_bool(x, left=None):
         res_values = filler(res_values)  # type: ignore
 
     return res_values
+
+
+def get_array_op(op, str_rep: Optional[str] = None):
+    """
+    Return a binary array operation corresponding to the given operator op.
+
+    The operator is classified by its ``__name__`` (with leading and
+    trailing underscores removed) as a comparison, a logical operation or,
+    failing both, an arithmetic operation.
+
+    Parameters
+    ----------
+    op : function
+        Binary operator from operator or roperator module.
+    str_rep : str or None, default None
+        str_rep to pass to arithmetic_op.  Only used when ``op`` is dispatched
+        to arithmetic_op; ignored for comparison and logical operators.
+
+    Returns
+    -------
+    function
+        A ``functools.partial`` of comparison_op, logical_op or arithmetic_op
+        with ``op`` (and, for arithmetic_op, ``str_rep``) bound as keyword
+        arguments.
+    """
+    # Strip underscores so that e.g. "and_" and "__add__" become "and" and
+    # "add" before matching against the name sets below.
+    op_name = op.__name__.strip("_")
+    if op_name in {"eq", "ne", "lt", "le", "gt", "ge"}:
+        return partial(comparison_op, op=op)
+    elif op_name in {"and", "or", "xor", "rand", "ror", "rxor"}:
+        return partial(logical_op, op=op)
+    else:
+        # Every name not matched above is treated as arithmetic.
+        return partial(arithmetic_op, op=op, str_rep=str_rep)
diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py
@@ -24,8 +24,8 @@ def test_only_1dim_accepted(self):
         arr = np.array([0, 1, 2, 3], dtype="M8[h]").astype("M8[ns]")
 
         with pytest.raises(ValueError, match="Only 1-dimensional"):
-            # 2-dim
-            DatetimeArray(arr.reshape(2, 2))
+            # 3-dim, we allow 2D to sneak in for ops purposes GH#29853
+            DatetimeArray(arr.reshape(2, 2, 1))
 
         with pytest.raises(ValueError, match="Only 1-dimensional"):
             # 0-dim

diff --git a/pandas/tests/arrays/test_timedeltas.py b/pandas/tests/arrays/test_timedeltas.py
@@ -12,8 +12,8 @@ def test_only_1dim_accepted(self):
         arr = np.array([0, 1, 2, 3], dtype="m8[h]").astype("m8[ns]")
 
         with pytest.raises(ValueError, match="Only 1-dimensional"):
-            # 2-dim
-            TimedeltaArray(arr.reshape(2, 2))
+            # 3-dim, we allow 2D to sneak in for ops purposes GH#29853
+            TimedeltaArray(arr.reshape(2, 2, 1))
 
         with pytest.raises(ValueError, match="Only 1-dimensional"):
             # 0-dim