Skip to content

Commit

Permalink
PERF: implement scalar ops blockwise (pandas-dev#29853)
Browse files Browse the repository at this point in the history
  • Loading branch information
jbrockmendel authored and AlexKirko committed Dec 29, 2019
1 parent 9a8f8a6 commit e98f9b7
Show file tree
Hide file tree
Showing 11 changed files with 124 additions and 24 deletions.
32 changes: 32 additions & 0 deletions asv_bench/benchmarks/binary_ops.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import operator

import numpy as np

from pandas import DataFrame, Series, date_range
Expand All @@ -9,6 +11,36 @@
import pandas.computation.expressions as expr


class IntFrameWithScalar:
    """
    Benchmark binary operators applied between a DataFrame and a scalar.

    The parameter grid is the product of the frame dtype, the scalar
    operand, and the binary operator taken from the ``operator`` module.
    """

    params = [
        [np.float64, np.int64],
        [2, 3.0, np.int32(4), np.float64(5)],
        [
            getattr(operator, op_name)
            for op_name in (
                # arithmetic
                "add",
                "sub",
                "mul",
                "truediv",
                "floordiv",
                "pow",
                "mod",
                # comparison
                "eq",
                "ne",
                "gt",
                "ge",
                "lt",
                "le",
            )
        ],
    ]
    param_names = ["dtype", "scalar", "op"]

    def setup(self, dtype, scalar, op):
        """
        Build the 20000 x 100 frame that the timed operation runs against.

        Only ``dtype`` is used here; ``scalar`` and ``op`` are accepted
        but not read by this method.
        """
        values = np.random.randn(20000, 100).astype(dtype)
        self.df = DataFrame(values)

    def time_frame_op_with_scalar(self, dtype, scalar, op):
        """
        Apply ``op`` to the prepared frame and ``scalar``; result is discarded.
        """
        op(self.df, scalar)


class Ops:

params = [[True, False], ["default", 1]]
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -672,6 +672,7 @@ or ``matplotlib.Axes.plot``. See :ref:`plotting.formatters` for more.
Performance improvements
~~~~~~~~~~~~~~~~~~~~~~~~

- Performance improvement in :class:`DataFrame` arithmetic and comparison operations with scalars (:issue:`24990`, :issue:`29853`)
- Performance improvement in indexing with a non-unique :class:`IntervalIndex` (:issue:`27489`)
- Performance improvement in :attr:`MultiIndex.is_monotonic` (:issue:`27495`)
- Performance improvement in :func:`cut` when ``bins`` is an :class:`IntervalIndex` (:issue:`27668`)
Expand Down
31 changes: 27 additions & 4 deletions pandas/core/arrays/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,24 @@ class DatetimeLikeArrayMixin(ExtensionOpsMixin, AttributesMixin, ExtensionArray)
_generate_range
"""

@property
def ndim(self) -> int:
    """
    Number of dimensions, as reported by the backing ``_data`` array.
    """
    backing = self._data
    return backing.ndim

@property
def shape(self):
    """
    Shape of the array, as reported by the backing ``_data`` array.
    """
    backing = self._data
    return backing.shape

def reshape(self, *args, **kwargs):
    """
    Return a new array of the same type wrapping the reshaped ``_data``.

    All positional and keyword arguments are forwarded unchanged to
    ``self._data.reshape``. The result is constructed with this array's
    dtype; no ``freq`` is passed along, so any freq is dropped.
    """
    cls = type(self)
    reshaped = self._data.reshape(*args, **kwargs)
    return cls(reshaped, dtype=self.dtype)

def ravel(self, *args, **kwargs):
    """
    Return a new array of the same type wrapping the raveled ``_data``.

    All positional and keyword arguments are forwarded unchanged to
    ``self._data.ravel``. The result is constructed with this array's
    dtype; no ``freq`` is passed along, so any freq is dropped.
    """
    cls = type(self)
    flattened = self._data.ravel(*args, **kwargs)
    return cls(flattened, dtype=self.dtype)

@property
def _box_func(self):
"""
Expand Down Expand Up @@ -413,7 +431,10 @@ def __getitem__(self, key):
getitem = self._data.__getitem__
if is_int:
val = getitem(key)
return self._box_func(val)
if lib.is_scalar(val):
# i.e. self.ndim == 1
return self._box_func(val)
return type(self)(val, dtype=self.dtype)

if com.is_bool_indexer(key):
key = np.asarray(key, dtype=bool)
Expand Down Expand Up @@ -823,6 +844,8 @@ def inferred_freq(self):
generated by infer_freq. Returns None if it can't autodetect the
frequency.
"""
if self.ndim != 1:
return None
try:
return frequencies.infer_freq(self)
except ValueError:
Expand Down Expand Up @@ -968,7 +991,7 @@ def _add_timedeltalike_scalar(self, other):
"""
if isna(other):
# i.e np.timedelta64("NaT"), not recognized by delta_to_nanoseconds
new_values = np.empty(len(self), dtype="i8")
new_values = np.empty(self.shape, dtype="i8")
new_values[:] = iNaT
return new_values

Expand Down Expand Up @@ -1014,7 +1037,7 @@ def _add_nat(self):

# GH#19124 pd.NaT is treated like a timedelta for both timedelta
# and datetime dtypes
result = np.zeros(len(self), dtype=np.int64)
result = np.zeros(self.shape, dtype=np.int64)
result.fill(iNaT)
return type(self)(result, dtype=self.dtype, freq=None)

Expand All @@ -1028,7 +1051,7 @@ def _sub_nat(self):
# For datetime64 dtypes by convention we treat NaT as a datetime, so
# this subtraction returns a timedelta64 dtype.
# For period dtype, timedelta64 is a close-enough return dtype.
result = np.zeros(len(self), dtype=np.int64)
result = np.zeros(self.shape, dtype=np.int64)
result.fill(iNaT)
return result.view("timedelta64[ns]")

Expand Down
5 changes: 4 additions & 1 deletion pandas/core/arrays/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -339,7 +339,7 @@ def __init__(self, values, dtype=_NS_DTYPE, freq=None, copy=False):
" those."
)
raise ValueError(msg)
if values.ndim != 1:
if values.ndim not in [1, 2]:
raise ValueError("Only 1-dimensional input arrays are supported.")

if values.dtype == "i8":
Expand Down Expand Up @@ -788,6 +788,9 @@ def _sub_datetime_arraylike(self, other):
return new_values.view("timedelta64[ns]")

def _add_offset(self, offset):
if self.ndim == 2:
return self.ravel()._add_offset(offset).reshape(self.shape)

assert not isinstance(offset, Tick)
try:
if self.tz is not None:
Expand Down
4 changes: 1 addition & 3 deletions pandas/core/arrays/timedeltas.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ def __init__(self, values, dtype=_TD_DTYPE, freq=None, copy=False):
" TimedeltaArray ndarray, or Series or Index containing one of those."
)
raise ValueError(msg)
if values.ndim != 1:
if values.ndim not in [1, 2]:
raise ValueError("Only 1-dimensional input arrays are supported.")

if values.dtype == "i8":
Expand Down Expand Up @@ -1036,8 +1036,6 @@ def sequence_to_td64ns(data, copy=False, unit="ns", errors="raise"):
raise TypeError(f"dtype {data.dtype} cannot be converted to timedelta64[ns]")

data = np.array(data, copy=copy)
if data.ndim != 1:
raise ValueError("Only 1-dimensional input arrays are supported.")

assert data.dtype == "m8[ns]", data
return data, inferred_freq
Expand Down
12 changes: 12 additions & 0 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -368,7 +368,19 @@ def apply(self, func, **kwargs):
"""
with np.errstate(all="ignore"):
result = func(self.values, **kwargs)

if is_extension_array_dtype(result) and result.ndim > 1:
# if we get a 2D ExtensionArray, we need to split it into 1D pieces
nbs = []
for i, loc in enumerate(self.mgr_locs):
vals = result[i]
nv = _block_shape(vals, ndim=self.ndim)
block = self.make_block(values=nv, placement=[loc])
nbs.append(block)
return nbs

if not isinstance(result, Block):
# Exclude the 0-dim case so we can do reductions
result = self.make_block(values=_block_shape(result, ndim=self.ndim))

return result
Expand Down
9 changes: 6 additions & 3 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,13 +340,13 @@ def _verify_integrity(self):
f"tot_items: {tot_items}"
)

def apply(self, f: str, filter=None, **kwargs):
def apply(self, f, filter=None, **kwargs):
"""
Iterate over the blocks, collect and create a new BlockManager.
Parameters
----------
f : str
f : str or callable
Name of the Block method to apply, or a callable that is passed to each block's ``apply`` method.
filter : list, if supplied, only call the block if the filter is in
the block
Expand Down Expand Up @@ -411,7 +411,10 @@ def apply(self, f: str, filter=None, **kwargs):
axis = obj._info_axis_number
kwargs[k] = obj.reindex(b_items, axis=axis, copy=align_copy)

applied = getattr(b, f)(**kwargs)
if callable(f):
applied = b.apply(f, **kwargs)
else:
applied = getattr(b, f)(**kwargs)
result_blocks = _extend_blocks(applied, result_blocks)

if len(result_blocks) == 0:
Expand Down
9 changes: 6 additions & 3 deletions pandas/core/ops/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
arithmetic_op,
comparison_op,
define_na_arithmetic_op,
get_array_op,
logical_op,
)
from pandas.core.ops.array_ops import comp_method_OBJECT_ARRAY # noqa:F401
Expand Down Expand Up @@ -372,8 +373,10 @@ def dispatch_to_series(left, right, func, str_rep=None, axis=None):
right = lib.item_from_zerodim(right)
if lib.is_scalar(right) or np.ndim(right) == 0:

def column_op(a, b):
return {i: func(a.iloc[:, i], b) for i in range(len(a.columns))}
# Get the appropriate array-op to apply to each block's values.
array_op = get_array_op(func, str_rep=str_rep)
bm = left._data.apply(array_op, right=right)
return type(left)(bm)

elif isinstance(right, ABCDataFrame):
assert right._indexed_same(left)
Expand Down Expand Up @@ -713,7 +716,7 @@ def f(self, other, axis=default_axis, level=None, fill_value=None):
if fill_value is not None:
self = self.fillna(fill_value)

new_data = dispatch_to_series(self, other, op)
new_data = dispatch_to_series(self, other, op, str_rep)
return self._construct_result(new_data)

f.__name__ = op_name
Expand Down
37 changes: 31 additions & 6 deletions pandas/core/ops/array_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@
Functions for arithmetic and comparison operations on NumPy arrays and
ExtensionArrays.
"""
from functools import partial
import operator
from typing import Any, Union
from typing import Any, Optional, Union

import numpy as np

Expand Down Expand Up @@ -51,10 +52,10 @@ def comp_method_OBJECT_ARRAY(op, x, y):
if isinstance(y, (ABCSeries, ABCIndex)):
y = y.values

result = libops.vec_compare(x, y, op)
result = libops.vec_compare(x.ravel(), y, op)
else:
result = libops.scalar_compare(x, y, op)
return result
result = libops.scalar_compare(x.ravel(), y, op)
return result.reshape(x.shape)


def masked_arith_op(x, y, op):
Expand Down Expand Up @@ -237,9 +238,9 @@ def comparison_op(
elif is_scalar(rvalues) and isna(rvalues):
# numpy does not like comparisons vs None
if op is operator.ne:
res_values = np.ones(len(lvalues), dtype=bool)
res_values = np.ones(lvalues.shape, dtype=bool)
else:
res_values = np.zeros(len(lvalues), dtype=bool)
res_values = np.zeros(lvalues.shape, dtype=bool)

elif is_object_dtype(lvalues.dtype):
res_values = comp_method_OBJECT_ARRAY(op, lvalues, rvalues)
Expand Down Expand Up @@ -367,3 +368,27 @@ def fill_bool(x, left=None):
res_values = filler(res_values) # type: ignore

return res_values


def get_array_op(op, str_rep: Optional[str] = None):
    """
    Return a binary array operation corresponding to the given operator op.

    Parameters
    ----------
    op : function
        Binary operator from operator or roperator module.
    str_rep : str or None, default None
        str_rep to pass to arithmetic_op. It is not used when ``op`` is
        dispatched to comparison_op or logical_op.

    Returns
    -------
    function
        A ``functools.partial`` of comparison_op, logical_op or
        arithmetic_op with ``op`` already bound.
    """
    comparison_names = ("eq", "ne", "lt", "le", "gt", "ge")
    logical_names = ("and", "or", "xor", "rand", "ror", "rxor")

    # Drop leading/trailing underscores so that e.g. ``and_`` or ``__add__``
    # are matched by their bare names.
    name = op.__name__.strip("_")

    if name in comparison_names:
        return partial(comparison_op, op=op)
    if name in logical_names:
        return partial(logical_op, op=op)
    # Every other operator name is handled as arithmetic.
    return partial(arithmetic_op, op=op, str_rep=str_rep)
4 changes: 2 additions & 2 deletions pandas/tests/arrays/test_datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ def test_only_1dim_accepted(self):
arr = np.array([0, 1, 2, 3], dtype="M8[h]").astype("M8[ns]")

with pytest.raises(ValueError, match="Only 1-dimensional"):
# 2-dim
DatetimeArray(arr.reshape(2, 2))
# 3-dim, we allow 2D to sneak in for ops purposes GH#29853
DatetimeArray(arr.reshape(2, 2, 1))

with pytest.raises(ValueError, match="Only 1-dimensional"):
# 0-dim
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/arrays/test_timedeltas.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ def test_only_1dim_accepted(self):
arr = np.array([0, 1, 2, 3], dtype="m8[h]").astype("m8[ns]")

with pytest.raises(ValueError, match="Only 1-dimensional"):
# 2-dim
TimedeltaArray(arr.reshape(2, 2))
# 3-dim, we allow 2D to sneak in for ops purposes GH#29853
TimedeltaArray(arr.reshape(2, 2, 1))

with pytest.raises(ValueError, match="Only 1-dimensional"):
# 0-dim
Expand Down

0 comments on commit e98f9b7

Please sign in to comment.