From f299229de0f02816cf961bbc4480fcf89d264840 Mon Sep 17 00:00:00 2001 From: Boris Rumyantsev Date: Fri, 10 Dec 2021 15:03:42 +0300 Subject: [PATCH] PERF: Faster SparseArray.__get_item__ for boolean masks (#23122) --- asv_bench/benchmarks/sparse.py | 30 ++++++-- doc/source/whatsnew/v1.4.0.rst | 2 + pandas/core/arrays/sparse/array.py | 34 ++++++--- pandas/core/arrays/sparse/dtype.py | 21 +++++- pandas/tests/arrays/sparse/test_array.py | 87 ++++++++++++++++++++---- pandas/tests/extension/test_sparse.py | 19 +++--- 6 files changed, 154 insertions(+), 39 deletions(-) diff --git a/asv_bench/benchmarks/sparse.py b/asv_bench/benchmarks/sparse.py index 8969beb17f2e3..65b1f25e7b666 100644 --- a/asv_bench/benchmarks/sparse.py +++ b/asv_bench/benchmarks/sparse.py @@ -196,16 +196,38 @@ def time_take(self, indices, allow_fill): class GetItem: - def setup(self): + def setup(self, fill_value): N = 1_000_000 - arr = make_array(N, 1e-5, np.nan, np.float64) + d = 1e-5 + arr = make_array(N, d, np.nan, np.float64) self.sp_arr = SparseArray(arr) - def time_integer_indexing(self): + def time_integer_indexing(self, fill_value): self.sp_arr[78] - def time_slice(self): + def time_slice(self, fill_value): self.sp_arr[1:] +class GetItemMask: + + params = [True, False, np.nan] + param_names = ["fill_value"] + + def setup(self, fill_value): + N = 1_000_000 + d = 1e-5 + arr = make_array(N, d, np.nan, np.float64) + self.sp_arr = SparseArray(arr) + b_arr = np.full(shape=N, fill_value=fill_value, dtype=np.bool8) + fv_inds = np.unique( + np.random.randint(low=0, high=N - 1, size=int(N * d), dtype=np.int32) + ) + b_arr[fv_inds] = True if pd.isna(fill_value) else not fill_value + self.sp_b_arr = SparseArray(b_arr, dtype=np.bool8, fill_value=fill_value) + + def time_mask(self, fill_value): + self.sp_arr[self.sp_b_arr] + + from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index febf08f2c47aa..0e67080fc10a7 100644 --- 
a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -602,6 +602,7 @@ Performance improvements - Performance improvement in :func:`to_csv` when :class:`MultiIndex` contains a lot of unused levels (:issue:`37484`) - Performance improvement in :func:`read_csv` when ``index_col`` was set with a numeric column (:issue:`44158`) - Performance improvement in :func:`concat` (:issue:`43354`) +- Performance improvement in :meth:`SparseArray.__getitem__` (:issue:`23122`) - Performance improvement in constructing a :class:`DataFrame` from array-like objects like a ``Pytorch`` tensor (:issue:`44616`) - @@ -847,6 +848,7 @@ Sparse - Bug in :meth:`DataFrame.sparse.to_coo` silently converting non-zero fill values to zero (:issue:`24817`) - Bug in :class:`SparseArray` comparison methods with an array-like operand of mismatched length raising ``AssertionError`` or unclear ``ValueError`` depending on the input (:issue:`43863`) - Bug in :class:`SparseArray` arithmetic methods ``floordiv`` and ``mod`` behaviors when dividing by zero not matching the non-sparse :class:`Series` behavior (:issue:`38172`) +- Bug in :class:`SparseArray` unary methods as well as :meth:`SparseArray.isna` not recalculating indexes (:pull:`44955`) - ExtensionArray diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 17c5320b1e941..538d4e7e4a7aa 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -719,7 +719,11 @@ def isna(self): # If null fill value, we want SparseDtype[bool, true] # to preserve the same memory usage. 
dtype = SparseDtype(bool, self._null_fill_value) - return type(self)._simple_new(isna(self.sp_values), self.sp_index, dtype) + if self._null_fill_value: + return type(self)._simple_new(isna(self.sp_values), self.sp_index, dtype) + mask = np.full(len(self), False, dtype=np.bool8) + mask[self.sp_index.indices] = isna(self.sp_values) + return type(self)(mask, fill_value=False, dtype=dtype) def fillna( self: SparseArrayT, @@ -963,13 +967,20 @@ def __getitem__( ) else: - # TODO: I think we can avoid densifying when masking a - # boolean SparseArray with another. Need to look at the - # key's fill_value for True / False, and then do an intersection - # on the indices of the sp_values. if isinstance(key, SparseArray): + # NOTE: If we guarantee that SparseDType(bool) + # has only fill_value - true, false or nan + # (see GH PR 44955) + # we can apply mask very fast: if is_bool_dtype(key): - key = key.to_dense() + if isna(key.fill_value): + return self.take(key.sp_index.indices[key.sp_values]) + if not key.fill_value: + return self.take(key.sp_index.indices) + n = len(self) + mask = np.full(n, True, dtype=np.bool8) + mask[key.sp_index.indices] = False + return self.take(np.arange(n)[mask]) else: key = np.asarray(key) @@ -1684,9 +1695,14 @@ def _cmp_method(self, other, op) -> SparseArray: def _unary_method(self, op) -> SparseArray: fill_value = op(np.array(self.fill_value)).item() - values = op(self.sp_values) - dtype = SparseDtype(values.dtype, fill_value) - return type(self)._simple_new(values, self.sp_index, dtype) + dtype = SparseDtype(self.dtype.subtype, fill_value) + # NOTE: if fill_value doesn't change + # we just have to apply op to sp_values + if isna(self.fill_value) or fill_value == self.fill_value: + values = op(self.sp_values) + return type(self)._simple_new(values, self.sp_index, self.dtype) + # In the other case we have to recalc indexes + return type(self)(op(self.to_dense()), dtype=dtype) def __pos__(self) -> SparseArray: return 
self._unary_method(operator.pos) diff --git a/pandas/core/arrays/sparse/dtype.py b/pandas/core/arrays/sparse/dtype.py index 20bb880586bf9..f1da2421c4106 100644 --- a/pandas/core/arrays/sparse/dtype.py +++ b/pandas/core/arrays/sparse/dtype.py @@ -95,10 +95,9 @@ def __init__(self, dtype: Dtype = np.float64, fill_value: Any = None): if fill_value is None: fill_value = na_value_for_dtype(dtype) - if not is_scalar(fill_value): - raise ValueError(f"fill_value must be a scalar. Got {fill_value} instead") self._dtype = dtype self._fill_value = fill_value + self._check_fill_value() def __hash__(self): # Python3 doesn't inherit __hash__ when a base class overrides @@ -149,6 +148,24 @@ def fill_value(self): """ return self._fill_value + def _check_fill_value(self): + if not is_scalar(self._fill_value): + raise ValueError( + f"fill_value must be a scalar. Got {self._fill_value} instead" + ) + # TODO: Right now a sparse boolean array can be built + # with any fill_value. There was an attempt + # to allow only 3 values: True, False or nan, + # but plenty of tests failed. + # See pull 44955 + # if self._is_boolean and not ( + # is_bool(self._fill_value) or isna(self._fill_value) + # ): + # raise ValueError( + # "fill_value must be True, False or nan " + # f"for boolean type. 
Got {self._fill_value} instead" + # ) + @property def _is_na_fill_value(self) -> bool: return isna(self.fill_value) diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index d0e7b7d0a35fe..2c3dcdeeaf8dc 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -248,6 +248,24 @@ def test_scalar_with_index_infer_dtype(self, scalar, dtype): assert arr.dtype == dtype assert exp.dtype == dtype + # GH 23122 + def test_getitem_bool_sparse_array(self): + spar_bool = SparseArray([False, True] * 5, dtype=np.bool8, fill_value=True) + exp = SparseArray([np.nan, 2, np.nan, 5, 6]) + tm.assert_sp_array_equal(self.arr[spar_bool], exp) + + spar_bool = ~spar_bool + res = self.arr[spar_bool] + exp = SparseArray([np.nan, 1, 3, 4, np.nan]) + tm.assert_sp_array_equal(res, exp) + + spar_bool = SparseArray( + [False, True, np.nan] * 3, dtype=np.bool8, fill_value=np.nan + ) + res = self.arr[spar_bool] + exp = SparseArray([np.nan, 3, 5]) + tm.assert_sp_array_equal(res, exp) + def test_get_item(self): assert np.isnan(self.arr[1]) @@ -505,7 +523,9 @@ def test_astype(self): def test_astype_bool(self): a = SparseArray([1, 0, 0, 1], dtype=SparseDtype(int, 0)) result = a.astype(bool) - expected = SparseArray([True, 0, 0, True], dtype=SparseDtype(bool, 0)) + expected = SparseArray( + [True, False, False, True], dtype=SparseDtype(bool, False) + ) tm.assert_sp_array_equal(result, expected) # update fill value @@ -605,10 +625,11 @@ def test_set_fill_value(self): assert arr.fill_value # coerces to bool - # msg = "unable to set fill_value 0 to bool dtype" + # XXX: we can construct a sparse array of bool + # type and use any value as fill_value + # msg = "fill_value must be True, False or nan" # with pytest.raises(ValueError, match=msg): - arr.fill_value = 0 - assert arr.fill_value == 0 + # arr.fill_value = 0 # msg = "unable to set fill_value nan to bool dtype" # with pytest.raises(ValueError, match=msg): @@ 
-737,6 +758,41 @@ def test_boolean_slice_empty(self): res = arr[[False, False, False]] assert res.dtype == arr.dtype + def test_neg_operator(self): + arr = SparseArray([-1, -2, np.nan, 3], fill_value=np.nan, dtype=np.int8) + res = -arr + exp = SparseArray([1, 2, np.nan, -3], fill_value=np.nan, dtype=np.int8) + tm.assert_sp_array_equal(exp, res) + + arr = SparseArray([-1, -2, 1, 3], fill_value=-1, dtype=np.int8) + res = -arr + exp = SparseArray([1, 2, -1, -3], fill_value=1, dtype=np.int8) + tm.assert_sp_array_equal(exp, res) + + def test_abs_operator(self): + arr = SparseArray([-1, -2, np.nan, 3], fill_value=np.nan, dtype=np.int8) + res = abs(arr) + exp = SparseArray([1, 2, np.nan, 3], fill_value=np.nan, dtype=np.int8) + tm.assert_sp_array_equal(exp, res) + + arr = SparseArray([-1, -2, 1, 3], fill_value=-1, dtype=np.int8) + res = abs(arr) + exp = SparseArray([1, 2, 1, 3], fill_value=1, dtype=np.int8) + tm.assert_sp_array_equal(exp, res) + + def test_invert_operator(self): + arr = SparseArray([False, True, False, True], fill_value=False, dtype=np.bool8) + res = ~arr + exp = SparseArray( + np.invert([False, True, False, True]), fill_value=True, dtype=np.bool8 + ) + res = ~arr + tm.assert_sp_array_equal(exp, res) + + arr = SparseArray([0, 1, 0, 2, 3, 0], fill_value=0, dtype=np.int32) + res = ~arr + exp = SparseArray([-1, -2, -1, -3, -4, -1], fill_value=-1, dtype=np.int32) + @pytest.mark.parametrize("op", ["add", "sub", "mul", "truediv", "floordiv", "pow"]) def test_binary_operators(self, op): op = getattr(operator, op) @@ -1005,13 +1061,9 @@ def test_sum(self): @pytest.mark.parametrize( "arr", - [ - np.array([0, 1, np.nan, 1]), - np.array([0, 1, 1]), - np.array([True, True, False]), - ], + [np.array([0, 1, np.nan, 1]), np.array([0, 1, 1])], ) - @pytest.mark.parametrize("fill_value", [0, 1, np.nan, True, False]) + @pytest.mark.parametrize("fill_value", [0, 1, np.nan]) @pytest.mark.parametrize("min_count, expected", [(3, 2), (4, np.nan)]) def test_sum_min_count(self, 
arr, fill_value, min_count, expected): # https://github.com/pandas-dev/pandas/issues/25777 @@ -1022,6 +1074,15 @@ def test_sum_min_count(self, arr, fill_value, min_count, expected): else: assert result == expected + def test_bool_sum_min_count(self): + spar_bool = pd.arrays.SparseArray( + [False, True] * 5, dtype=np.bool8, fill_value=True + ) + res = spar_bool.sum(min_count=1) + assert res == 5 + res = spar_bool.sum(min_count=11) + assert isna(res) + def test_numpy_sum(self): data = np.arange(10).astype(float) out = np.sum(SparseArray(data)) @@ -1121,9 +1182,9 @@ def test_ufunc(self): tm.assert_sp_array_equal(np.abs(sparse), result) sparse = SparseArray([1, -1, 2, -2], fill_value=-1) - result = SparseArray([1, 2, 2], sparse_index=sparse.sp_index, fill_value=1) - tm.assert_sp_array_equal(abs(sparse), result) - tm.assert_sp_array_equal(np.abs(sparse), result) + exp = SparseArray([1, 1, 2, 2], fill_value=1) + tm.assert_sp_array_equal(abs(sparse), exp) + tm.assert_sp_array_equal(np.abs(sparse), exp) sparse = SparseArray([1, np.nan, 2, np.nan, -2]) result = SparseArray(np.sin([1, np.nan, 2, np.nan, -2])) diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 59f2682756a35..f7809dc2e4217 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -193,20 +193,17 @@ def test_reindex(self, data, na_value): class TestMissing(BaseSparseTests, base.BaseMissingTests): def test_isna(self, data_missing): + sarr = SparseArray(data_missing) expected_dtype = SparseDtype(bool, pd.isna(data_missing.dtype.fill_value)) expected = SparseArray([True, False], dtype=expected_dtype) + result = sarr.isna() + tm.assert_sp_array_equal(result, expected) - result = pd.isna(data_missing) - self.assert_equal(result, expected) - - result = pd.Series(data_missing).isna() - expected = pd.Series(expected) - self.assert_series_equal(result, expected) - - # GH 21189 - result = pd.Series(data_missing).drop([0, 1]).isna() - 
expected = pd.Series([], dtype=expected_dtype) - self.assert_series_equal(result, expected) + # test isna for arr without na + sarr = sarr.fillna(0) + expected_dtype = SparseDtype(bool, pd.isna(data_missing.dtype.fill_value)) + expected = SparseArray([False, False], fill_value=False, dtype=expected_dtype) + self.assert_equal(sarr.isna(), expected) def test_fillna_limit_pad(self, data_missing): with tm.assert_produces_warning(PerformanceWarning):