Skip to content

Commit

Permalink
PERF: Faster SparseArray.__getitem__ for boolean masks(pandas-dev#23122)
Browse files Browse the repository at this point in the history
BUG: unary operators for SparseArray doesn't recalc indexes(pandas-dev#44956)
  • Loading branch information
bdrum committed Dec 21, 2021
1 parent 7973c8b commit 1bb10c7
Show file tree
Hide file tree
Showing 5 changed files with 130 additions and 37 deletions.
30 changes: 26 additions & 4 deletions asv_bench/benchmarks/sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,16 +196,38 @@ def time_take(self, indices, allow_fill):


class GetItem:
def setup(self):
def setup(self, fill_value):
N = 1_000_000
arr = make_array(N, 1e-5, np.nan, np.float64)
d = 1e-5
arr = make_array(N, d, np.nan, np.float64)
self.sp_arr = SparseArray(arr)

def time_integer_indexing(self):
def time_integer_indexing(self, fill_value):
self.sp_arr[78]

def time_slice(self):
def time_slice(self, fill_value):
self.sp_arr[1:]


class GetItemMask:
    """Benchmark indexing a ``SparseArray`` with a boolean ``SparseArray`` mask.

    The benchmark is parametrized over the mask's ``fill_value`` so that both
    a mostly-True and a mostly-False mask are timed.
    """

    params = [True, False]
    param_names = ["fill_value"]

    def setup(self, fill_value):
        """Build the array to index and a boolean sparse mask of equal length."""
        N = 1_000_000
        d = 1e-5
        # NOTE(review): ``d`` is presumably the dense fraction expected by
        # ``make_array`` (defined elsewhere in this module) -- confirm.
        arr = make_array(N, d, np.nan, np.float64)
        self.sp_arr = SparseArray(arr)
        # ``np.bool_`` is used instead of the ``np.bool8`` alias, which was
        # deprecated in NumPy 1.24 and removed in NumPy 2.0.
        b_arr = np.full(shape=N, fill_value=fill_value, dtype=np.bool_)
        # Pick up to ``int(N * d)`` distinct positions (``np.unique`` drops
        # duplicates) and flip them so they differ from ``fill_value``.
        # ``high`` is exclusive, so the drawn indices lie in ``[0, N - 2]``.
        fv_inds = np.unique(
            np.random.randint(low=0, high=N - 1, size=int(N * d), dtype=np.int32)
        )
        b_arr[fv_inds] = not fill_value
        self.sp_b_arr = SparseArray(b_arr, dtype=np.bool_, fill_value=fill_value)

    def time_mask(self, fill_value):
        """Time ``__getitem__`` with the boolean sparse mask as the key."""
        self.sp_arr[self.sp_b_arr]


from .pandas_vb_common import setup # noqa: F401 isort:skip
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -536,6 +536,7 @@ Other Deprecations
- Deprecated passing ``skipna=None`` for :meth:`DataFrame.mad` and :meth:`Series.mad`, pass ``skipna=True`` instead (:issue:`44580`)
- Deprecated :meth:`DateOffset.apply`, use ``offset + other`` instead (:issue:`44522`)
- A deprecation warning is now shown for :meth:`DataFrame.to_latex` indicating the arguments signature may change and emulate more the arguments to :meth:`.Styler.to_latex` in future versions (:issue:`44411`)
- Deprecated passing a non-boolean, non-NaN value as ``fill_value`` to :class:`SparseDtype` when the dtype is boolean (:pull:`44955`)
-

.. ---------------------------------------------------------------------------
Expand Down Expand Up @@ -589,6 +590,7 @@ Performance improvements
- Performance improvement in :func:`merge` (:issue:`43332`)
- Performance improvement in :func:`read_csv` when ``index_col`` was set with a numeric column (:issue:`44158`)
- Performance improvement in :func:`concat` (:issue:`43354`)
- Performance improvement in :meth:`SparseArray.__getitem__` (:issue:`23122`)
-

.. ---------------------------------------------------------------------------
Expand Down Expand Up @@ -807,6 +809,7 @@ Sparse
- Bug in :meth:`SparseArray.max` and :meth:`SparseArray.min` raising ``ValueError`` for arrays with 0 non-null elements (:issue:`43527`)
- Bug in :meth:`DataFrame.sparse.to_coo` silently converting non-zero fill values to zero (:issue:`24817`)
- Bug in :class:`SparseArray` comparison methods with an array-like operand of mismatched length raising ``AssertionError`` or unclear ``ValueError`` depending on the input (:issue:`43863`)
- Bug in :class:`SparseArray` unary methods as well as :meth:`SparseArray.isna` not recalculating indexes (:pull:`44955`)
-

ExtensionArray
Expand Down
34 changes: 20 additions & 14 deletions pandas/core/arrays/sparse/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,6 @@
)
from pandas.core.dtypes.common import (
is_array_like,
is_bool,
is_bool_dtype,
is_datetime64_any_dtype,
is_datetime64tz_dtype,
Expand Down Expand Up @@ -950,17 +949,19 @@ def __getitem__(

else:
if isinstance(key, SparseArray):
# NOTE: If we guarantee that SparseDType(bool)
# has only fill_value - true, false or nan
# (see GH PR 44955)
# we can apply mask very fast:
if is_bool_dtype(key):
if is_bool(key.fill_value):
msk = np.full(
shape=len(self),
fill_value=key.fill_value,
dtype=np.bool8,
)
msk[key.sp_index.indices] = not key.fill_value
return self.take(np.arange(len(self), dtype=np.int32)[msk])
else:
key = key.to_dense()
if isna(key.fill_value):
return self.take(key.sp_index.indices[key.sp_values])
if not key.fill_value:
return self.take(key.sp_index.indices)
n = len(self)
mask = np.full(n, True, dtype=np.bool8)
mask[key.sp_index.indices] = False
return self.take(np.arange(n)[mask])
else:
key = np.asarray(key)

Expand Down Expand Up @@ -1691,9 +1692,14 @@ def _cmp_method(self, other, op) -> SparseArray:

def _unary_method(self, op) -> SparseArray:
fill_value = op(np.array(self.fill_value)).item()
values = op(self.sp_values)
dtype = SparseDtype(values.dtype, fill_value)
return type(self)._simple_new(values, self.sp_index, dtype)
dtype = SparseDtype(self.dtype.subtype, fill_value)
# NOTE: if fill_value doesn't change
# we just have to apply op to sp_values
if isna(self.fill_value) or fill_value == self.fill_value:
values = op(self.sp_values)
return type(self)._simple_new(values, self.sp_index, self.dtype)
# In the other case we have to recalc indexes
return type(self)(op(self.to_dense()), dtype=dtype)

def __pos__(self) -> SparseArray:
return self._unary_method(operator.pos)
Expand Down
17 changes: 15 additions & 2 deletions pandas/core/arrays/sparse/dtype.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
)
from pandas.core.dtypes.cast import astype_nansafe
from pandas.core.dtypes.common import (
is_bool,
is_bool_dtype,
is_object_dtype,
is_scalar,
Expand Down Expand Up @@ -95,10 +96,9 @@ def __init__(self, dtype: Dtype = np.float64, fill_value: Any = None):
if fill_value is None:
fill_value = na_value_for_dtype(dtype)

if not is_scalar(fill_value):
raise ValueError(f"fill_value must be a scalar. Got {fill_value} instead")
self._dtype = dtype
self._fill_value = fill_value
self._check_fill_value()

def __hash__(self):
# Python3 doesn't inherit __hash__ when a base class overrides
Expand Down Expand Up @@ -149,6 +149,19 @@ def fill_value(self):
"""
return self._fill_value

def _check_fill_value(self):
    """
    Validate ``self._fill_value`` against the dtype.

    Raises
    ------
    ValueError
        If the fill value is not a scalar, or if ``self._is_boolean`` is
        true and the fill value is neither a bool nor a missing value.
    """
    fill = self._fill_value

    if not is_scalar(fill):
        raise ValueError(f"fill_value must be a scalar. Got {fill} instead")

    # Only boolean dtypes restrict which scalars are acceptable.
    if not self._is_boolean:
        return

    if is_bool(fill) or isna(fill):
        return

    raise ValueError(
        "fill_value must be True, False or nan "
        f"for boolean type. Got {fill} instead"
    )

@property
def _is_na_fill_value(self) -> bool:
return isna(self.fill_value)
Expand Down
83 changes: 66 additions & 17 deletions pandas/tests/arrays/sparse/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,14 +249,22 @@ def test_scalar_with_index_infer_dtype(self, scalar, dtype):
assert exp.dtype == dtype

# GH 23122
def test_get_item_bool_sparse_array(self):
def test_getitem_bool_sparse_array(self):
spar_bool = SparseArray([False, True] * 5, dtype=np.bool8, fill_value=True)
exp = SparseArray([np.nan, 2, np.nan, 5, 6])
tm.assert_sp_array_equal(self.arr[spar_bool], exp)

spar_bool = SparseArray(~spar_bool.to_dense(), dtype=np.bool8, fill_value=False)
spar_bool = ~spar_bool
res = self.arr[spar_bool]
exp = SparseArray([np.nan, 1, 3, 4, np.nan])
tm.assert_sp_array_equal(self.arr[spar_bool], exp)
tm.assert_sp_array_equal(res, exp)

spar_bool = SparseArray(
[False, True, np.nan] * 3, dtype=np.bool8, fill_value=np.nan
)
res = self.arr[spar_bool]
exp = SparseArray([np.nan, 3, 5])
tm.assert_sp_array_equal(res, exp)

def test_get_item(self):

Expand Down Expand Up @@ -515,7 +523,9 @@ def test_astype(self):
def test_astype_bool(self):
a = SparseArray([1, 0, 0, 1], dtype=SparseDtype(int, 0))
result = a.astype(bool)
expected = SparseArray([True, 0, 0, True], dtype=SparseDtype(bool, 0))
expected = SparseArray(
[True, False, False, True], dtype=SparseDtype(bool, False)
)
tm.assert_sp_array_equal(result, expected)

# update fill value
Expand Down Expand Up @@ -615,10 +625,9 @@ def test_set_fill_value(self):
assert arr.fill_value

# coerces to bool
# msg = "unable to set fill_value 0 to bool dtype"
# with pytest.raises(ValueError, match=msg):
arr.fill_value = 0
assert arr.fill_value == 0
msg = "fill_value must be True, False or nan"
with pytest.raises(ValueError, match=msg):
arr.fill_value = 0

# msg = "unable to set fill_value nan to bool dtype"
# with pytest.raises(ValueError, match=msg):
Expand Down Expand Up @@ -747,6 +756,41 @@ def test_boolean_slice_empty(self):
res = arr[[False, False, False]]
assert res.dtype == arr.dtype

def test_neg_operator(self):
    """Unary minus must negate the stored values and the fill value."""
    # Each case: (input data, input fill, expected data, expected fill).
    # The first keeps a NaN fill value; the second has a fill value whose
    # sign flips under negation.
    cases = [
        ([-1, -2, np.nan, 3], np.nan, [1, 2, np.nan, -3], np.nan),
        ([-1, -2, 1, 3], -1, [1, 2, -1, -3], 1),
    ]
    for data, fill, exp_data, exp_fill in cases:
        sparse = SparseArray(data, fill_value=fill, dtype=np.int8)
        negated = -sparse
        expected = SparseArray(exp_data, fill_value=exp_fill, dtype=np.int8)
        tm.assert_sp_array_equal(expected, negated)

def test_abs_operator(self):
    """``abs`` must apply to the stored values and to the fill value."""
    # Each case: (input data, input fill, expected data, expected fill).
    # In the second case the fill value itself changes (-1 -> 1).
    cases = [
        ([-1, -2, np.nan, 3], np.nan, [1, 2, np.nan, 3], np.nan),
        ([-1, -2, 1, 3], -1, [1, 2, 1, 3], 1),
    ]
    for data, fill, exp_data, exp_fill in cases:
        sparse = SparseArray(data, fill_value=fill, dtype=np.int8)
        absolute = abs(sparse)
        expected = SparseArray(exp_data, fill_value=exp_fill, dtype=np.int8)
        tm.assert_sp_array_equal(expected, absolute)

def test_invert_operator(self):
    """``~`` must invert the stored values and the fill value.

    Covers a boolean array (fill value ``False`` -> ``True``) and an
    integer array (fill value ``0`` -> ``-1``).
    """
    # ``np.bool_`` is used instead of the ``np.bool8`` alias, which was
    # deprecated in NumPy 1.24 and removed in NumPy 2.0.
    arr = SparseArray([False, True, False, True], fill_value=False, dtype=np.bool_)
    exp = SparseArray(
        np.invert([False, True, False, True]), fill_value=True, dtype=np.bool_
    )
    res = ~arr
    tm.assert_sp_array_equal(exp, res)

    arr = SparseArray([0, 1, 0, 2, 3, 0], fill_value=0, dtype=np.int32)
    res = ~arr
    exp = SparseArray([-1, -2, -1, -3, -4, -1], fill_value=-1, dtype=np.int32)
    # The integer case must be compared too; without this assertion the
    # scenario would only check that ``~arr`` does not raise.
    tm.assert_sp_array_equal(exp, res)

@pytest.mark.parametrize("op", ["add", "sub", "mul", "truediv", "floordiv", "pow"])
def test_binary_operators(self, op):
op = getattr(operator, op)
Expand Down Expand Up @@ -1015,13 +1059,9 @@ def test_sum(self):

@pytest.mark.parametrize(
"arr",
[
np.array([0, 1, np.nan, 1]),
np.array([0, 1, 1]),
np.array([True, True, False]),
],
[np.array([0, 1, np.nan, 1]), np.array([0, 1, 1])],
)
@pytest.mark.parametrize("fill_value", [0, 1, np.nan, True, False])
@pytest.mark.parametrize("fill_value", [0, 1, np.nan])
@pytest.mark.parametrize("min_count, expected", [(3, 2), (4, np.nan)])
def test_sum_min_count(self, arr, fill_value, min_count, expected):
# https://github.com/pandas-dev/pandas/issues/25777
Expand All @@ -1032,6 +1072,15 @@ def test_sum_min_count(self, arr, fill_value, min_count, expected):
else:
assert result == expected

def test_bool_sum_min_count(self):
    """``sum`` on a boolean SparseArray honours ``min_count``.

    The array holds ten values, five of them ``True``: the sum is 5 when
    ``min_count`` is satisfied and missing when ``min_count`` exceeds the
    number of values.
    """
    # ``np.bool_`` is used instead of the ``np.bool8`` alias, which was
    # deprecated in NumPy 1.24 and removed in NumPy 2.0.
    spar_bool = SparseArray([False, True] * 5, dtype=np.bool_, fill_value=True)
    res = spar_bool.sum(min_count=1)
    assert res == 5
    res = spar_bool.sum(min_count=11)
    assert isna(res)

def test_numpy_sum(self):
data = np.arange(10).astype(float)
out = np.sum(SparseArray(data))
Expand Down Expand Up @@ -1131,9 +1180,9 @@ def test_ufunc(self):
tm.assert_sp_array_equal(np.abs(sparse), result)

sparse = SparseArray([1, -1, 2, -2], fill_value=-1)
result = SparseArray([1, 2, 2], sparse_index=sparse.sp_index, fill_value=1)
tm.assert_sp_array_equal(abs(sparse), result)
tm.assert_sp_array_equal(np.abs(sparse), result)
exp = SparseArray([1, 1, 2, 2], fill_value=1)
tm.assert_sp_array_equal(abs(sparse), exp)
tm.assert_sp_array_equal(np.abs(sparse), exp)

sparse = SparseArray([1, np.nan, 2, np.nan, -2])
result = SparseArray(np.sin([1, np.nan, 2, np.nan, -2]))
Expand Down

0 comments on commit 1bb10c7

Please sign in to comment.