PERF: Faster SparseArray.__get_item__ for boolean masks (pandas-dev#2…

…3122)
bdrum · Dec 24, 2021 · f299229 · f299229
1 parent 7131268
commit f299229
Show file tree

Hide file tree

Showing 6 changed files with 154 additions and 39 deletions.
diff --git a/asv_bench/benchmarks/sparse.py b/asv_bench/benchmarks/sparse.py
@@ -196,16 +196,38 @@ def time_take(self, indices, allow_fill):
 
 
 class GetItem:
-    def setup(self):
+    def setup(self, fill_value):
         N = 1_000_000
-        arr = make_array(N, 1e-5, np.nan, np.float64)
+        d = 1e-5
+        arr = make_array(N, d, np.nan, np.float64)
         self.sp_arr = SparseArray(arr)
 
-    def time_integer_indexing(self):
+    def time_integer_indexing(self, fill_value):
         self.sp_arr[78]
 
-    def time_slice(self):
+    def time_slice(self, fill_value):
         self.sp_arr[1:]
 
 
+class GetItemMask:
+
+    params = [True, False, np.nan]
+    param_names = ["fill_value"]
+
+    def setup(self, fill_value):
+        N = 1_000_000
+        d = 1e-5
+        arr = make_array(N, d, np.nan, np.float64)
+        self.sp_arr = SparseArray(arr)
+        b_arr = np.full(shape=N, fill_value=fill_value, dtype=np.bool8)
+        fv_inds = np.unique(
+            np.random.randint(low=0, high=N - 1, size=int(N * d), dtype=np.int32)
+        )
+        b_arr[fv_inds] = True if pd.isna(fill_value) else not fill_value
+        self.sp_b_arr = SparseArray(b_arr, dtype=np.bool8, fill_value=fill_value)
+
+    def time_mask(self, fill_value):
+        self.sp_arr[self.sp_b_arr]
+
+
 from .pandas_vb_common import setup  # noqa: F401 isort:skip
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
@@ -602,6 +602,7 @@ Performance improvements
 - Performance improvement in :func:`to_csv` when :class:`MultiIndex` contains a lot of unused levels (:issue:`37484`)
 - Performance improvement in :func:`read_csv` when ``index_col`` was set with a numeric column (:issue:`44158`)
 - Performance improvement in :func:`concat` (:issue:`43354`)
+- Performance improvement in :meth:`SparseArray.__getitem__` (:issue:`23122`)
 - Performance improvement in constructing a :class:`DataFrame` from array-like objects like a ``Pytorch`` tensor (:issue:`44616`)
 -
 
@@ -847,6 +848,7 @@ Sparse
 - Bug in :meth:`DataFrame.sparse.to_coo` silently converting non-zero fill values to zero (:issue:`24817`)
 - Bug in :class:`SparseArray` comparison methods with an array-like operand of mismatched length raising ``AssertionError`` or unclear ``ValueError`` depending on the input (:issue:`43863`)
 - Bug in :class:`SparseArray` arithmetic methods ``floordiv`` and ``mod`` behaviors when dividing by zero not matching the non-sparse :class:`Series` behavior (:issue:`38172`)
+- Bug in :class:`SparseArray` unary methods as well as :meth:`SparseArray.isna` doesn't recalculate indexes (:pull:`44955`)
 -
 
 ExtensionArray

diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py
@@ -719,7 +719,11 @@ def isna(self):
         # If null fill value, we want SparseDtype[bool, true]
         # to preserve the same memory usage.
         dtype = SparseDtype(bool, self._null_fill_value)
-        return type(self)._simple_new(isna(self.sp_values), self.sp_index, dtype)
+        if self._null_fill_value:
+            return type(self)._simple_new(isna(self.sp_values), self.sp_index, dtype)
+        mask = np.full(len(self), False, dtype=np.bool8)
+        mask[self.sp_index.indices] = isna(self.sp_values)
+        return type(self)(mask, fill_value=False, dtype=dtype)
 
     def fillna(
         self: SparseArrayT,
@@ -963,13 +967,20 @@ def __getitem__(
             )
 
         else:
-            # TODO: I think we can avoid densifying when masking a
-            # boolean SparseArray with another. Need to look at the
-            # key's fill_value for True / False, and then do an intersection
-            # on the indices of the sp_values.
             if isinstance(key, SparseArray):
+                # NOTE: If we guarantee that SparseDType(bool)
+                # has only fill_value - true, false or nan
+                # (see GH PR 44955)
+                # we can apply mask very fast:
                 if is_bool_dtype(key):
-                    key = key.to_dense()
+                    if isna(key.fill_value):
+                        return self.take(key.sp_index.indices[key.sp_values])
+                    if not key.fill_value:
+                        return self.take(key.sp_index.indices)
+                    n = len(self)
+                    mask = np.full(n, True, dtype=np.bool8)
+                    mask[key.sp_index.indices] = False
+                    return self.take(np.arange(n)[mask])
                 else:
                     key = np.asarray(key)
 
@@ -1684,9 +1695,14 @@ def _cmp_method(self, other, op) -> SparseArray:
 
     def _unary_method(self, op) -> SparseArray:
         fill_value = op(np.array(self.fill_value)).item()
-        values = op(self.sp_values)
-        dtype = SparseDtype(values.dtype, fill_value)
-        return type(self)._simple_new(values, self.sp_index, dtype)
+        dtype = SparseDtype(self.dtype.subtype, fill_value)
+        # NOTE: if fill_value doesn't change
+        # we just have to apply op to sp_values
+        if isna(self.fill_value) or fill_value == self.fill_value:
+            values = op(self.sp_values)
+            return type(self)._simple_new(values, self.sp_index, self.dtype)
+        # In the other case we have to recalc indexes
+        return type(self)(op(self.to_dense()), dtype=dtype)
 
     def __pos__(self) -> SparseArray:
         return self._unary_method(operator.pos)

diff --git a/pandas/core/arrays/sparse/dtype.py b/pandas/core/arrays/sparse/dtype.py
@@ -95,10 +95,9 @@ def __init__(self, dtype: Dtype = np.float64, fill_value: Any = None):
         if fill_value is None:
             fill_value = na_value_for_dtype(dtype)
 
-        if not is_scalar(fill_value):
-            raise ValueError(f"fill_value must be a scalar. Got {fill_value} instead")
         self._dtype = dtype
         self._fill_value = fill_value
+        self._check_fill_value()
 
     def __hash__(self):
         # Python3 doesn't inherit __hash__ when a base class overrides
@@ -149,6 +148,24 @@ def fill_value(self):
         """
         return self._fill_value
 
+    def _check_fill_value(self):
+        if not is_scalar(self._fill_value):
+            raise ValueError(
+                f"fill_value must be a scalar. Got {self._fill_value} instead"
+            )
+        # TODO: Right now we can use Sparse boolean array
+        #       with any fill_value. Here was an attempt
+        #       to allow only 3 value: True, False or nan
+        #       but plenty test has failed.
+        # see pull 44955
+        # if self._is_boolean and not (
+        #    is_bool(self._fill_value) or isna(self._fill_value)
+        # ):
+        #    raise ValueError(
+        #        "fill_value must be True, False or nan "
+        #        f"for boolean type. Got {self._fill_value} instead"
+        #    )
+
     @property
     def _is_na_fill_value(self) -> bool:
         return isna(self.fill_value)

diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py
@@ -248,6 +248,24 @@ def test_scalar_with_index_infer_dtype(self, scalar, dtype):
         assert arr.dtype == dtype
         assert exp.dtype == dtype
 
+    # GH 23122
+    def test_getitem_bool_sparse_array(self):
+        spar_bool = SparseArray([False, True] * 5, dtype=np.bool8, fill_value=True)
+        exp = SparseArray([np.nan, 2, np.nan, 5, 6])
+        tm.assert_sp_array_equal(self.arr[spar_bool], exp)
+
+        spar_bool = ~spar_bool
+        res = self.arr[spar_bool]
+        exp = SparseArray([np.nan, 1, 3, 4, np.nan])
+        tm.assert_sp_array_equal(res, exp)
+
+        spar_bool = SparseArray(
+            [False, True, np.nan] * 3, dtype=np.bool8, fill_value=np.nan
+        )
+        res = self.arr[spar_bool]
+        exp = SparseArray([np.nan, 3, 5])
+        tm.assert_sp_array_equal(res, exp)
+
     def test_get_item(self):
 
         assert np.isnan(self.arr[1])
@@ -505,7 +523,9 @@ def test_astype(self):
     def test_astype_bool(self):
         a = SparseArray([1, 0, 0, 1], dtype=SparseDtype(int, 0))
         result = a.astype(bool)
-        expected = SparseArray([True, 0, 0, True], dtype=SparseDtype(bool, 0))
+        expected = SparseArray(
+            [True, False, False, True], dtype=SparseDtype(bool, False)
+        )
         tm.assert_sp_array_equal(result, expected)
 
         # update fill value
@@ -605,10 +625,11 @@ def test_set_fill_value(self):
         assert arr.fill_value
 
         # coerces to bool
-        # msg = "unable to set fill_value 0 to bool dtype"
+        # XXX: we can construct an sparse array of bool
+        #      type and use as fill_value any value
+        # msg = "fill_value must be True, False or nan"
         # with pytest.raises(ValueError, match=msg):
-        arr.fill_value = 0
-        assert arr.fill_value == 0
+        #    arr.fill_value = 0
 
         # msg = "unable to set fill_value nan to bool dtype"
         # with pytest.raises(ValueError, match=msg):
@@ -737,6 +758,41 @@ def test_boolean_slice_empty(self):
         res = arr[[False, False, False]]
         assert res.dtype == arr.dtype
 
+    def test_neg_operator(self):
+        arr = SparseArray([-1, -2, np.nan, 3], fill_value=np.nan, dtype=np.int8)
+        res = -arr
+        exp = SparseArray([1, 2, np.nan, -3], fill_value=np.nan, dtype=np.int8)
+        tm.assert_sp_array_equal(exp, res)
+
+        arr = SparseArray([-1, -2, 1, 3], fill_value=-1, dtype=np.int8)
+        res = -arr
+        exp = SparseArray([1, 2, -1, -3], fill_value=1, dtype=np.int8)
+        tm.assert_sp_array_equal(exp, res)
+
+    def test_abs_operator(self):
+        arr = SparseArray([-1, -2, np.nan, 3], fill_value=np.nan, dtype=np.int8)
+        res = abs(arr)
+        exp = SparseArray([1, 2, np.nan, 3], fill_value=np.nan, dtype=np.int8)
+        tm.assert_sp_array_equal(exp, res)
+
+        arr = SparseArray([-1, -2, 1, 3], fill_value=-1, dtype=np.int8)
+        res = abs(arr)
+        exp = SparseArray([1, 2, 1, 3], fill_value=1, dtype=np.int8)
+        tm.assert_sp_array_equal(exp, res)
+
+    def test_invert_operator(self):
+        arr = SparseArray([False, True, False, True], fill_value=False, dtype=np.bool8)
+        res = ~arr
+        exp = SparseArray(
+            np.invert([False, True, False, True]), fill_value=True, dtype=np.bool8
+        )
+        res = ~arr
+        tm.assert_sp_array_equal(exp, res)
+
+        arr = SparseArray([0, 1, 0, 2, 3, 0], fill_value=0, dtype=np.int32)
+        res = ~arr
+        exp = SparseArray([-1, -2, -1, -3, -4, -1], fill_value=-1, dtype=np.int32)
+
     @pytest.mark.parametrize("op", ["add", "sub", "mul", "truediv", "floordiv", "pow"])
     def test_binary_operators(self, op):
         op = getattr(operator, op)
@@ -1005,13 +1061,9 @@ def test_sum(self):
 
     @pytest.mark.parametrize(
         "arr",
-        [
-            np.array([0, 1, np.nan, 1]),
-            np.array([0, 1, 1]),
-            np.array([True, True, False]),
-        ],
+        [np.array([0, 1, np.nan, 1]), np.array([0, 1, 1])],
     )
-    @pytest.mark.parametrize("fill_value", [0, 1, np.nan, True, False])
+    @pytest.mark.parametrize("fill_value", [0, 1, np.nan])
     @pytest.mark.parametrize("min_count, expected", [(3, 2), (4, np.nan)])
     def test_sum_min_count(self, arr, fill_value, min_count, expected):
         # https://github.com/pandas-dev/pandas/issues/25777
@@ -1022,6 +1074,15 @@ def test_sum_min_count(self, arr, fill_value, min_count, expected):
         else:
             assert result == expected
 
+    def test_bool_sum_min_count(self):
+        spar_bool = pd.arrays.SparseArray(
+            [False, True] * 5, dtype=np.bool8, fill_value=True
+        )
+        res = spar_bool.sum(min_count=1)
+        assert res == 5
+        res = spar_bool.sum(min_count=11)
+        assert isna(res)
+
     def test_numpy_sum(self):
         data = np.arange(10).astype(float)
         out = np.sum(SparseArray(data))
@@ -1121,9 +1182,9 @@ def test_ufunc(self):
         tm.assert_sp_array_equal(np.abs(sparse), result)
 
         sparse = SparseArray([1, -1, 2, -2], fill_value=-1)
-        result = SparseArray([1, 2, 2], sparse_index=sparse.sp_index, fill_value=1)
-        tm.assert_sp_array_equal(abs(sparse), result)
-        tm.assert_sp_array_equal(np.abs(sparse), result)
+        exp = SparseArray([1, 1, 2, 2], fill_value=1)
+        tm.assert_sp_array_equal(abs(sparse), exp)
+        tm.assert_sp_array_equal(np.abs(sparse), exp)
 
         sparse = SparseArray([1, np.nan, 2, np.nan, -2])
         result = SparseArray(np.sin([1, np.nan, 2, np.nan, -2]))

diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py
@@ -193,20 +193,17 @@ def test_reindex(self, data, na_value):
 
 class TestMissing(BaseSparseTests, base.BaseMissingTests):
     def test_isna(self, data_missing):
+        sarr = SparseArray(data_missing)
         expected_dtype = SparseDtype(bool, pd.isna(data_missing.dtype.fill_value))
         expected = SparseArray([True, False], dtype=expected_dtype)
+        result = sarr.isna()
+        tm.assert_sp_array_equal(result, expected)
 
-        result = pd.isna(data_missing)
-        self.assert_equal(result, expected)
-
-        result = pd.Series(data_missing).isna()
-        expected = pd.Series(expected)
-        self.assert_series_equal(result, expected)
-
-        # GH 21189
-        result = pd.Series(data_missing).drop([0, 1]).isna()
-        expected = pd.Series([], dtype=expected_dtype)
-        self.assert_series_equal(result, expected)
+        # test isna for arr without na
+        sarr = sarr.fillna(0)
+        expected_dtype = SparseDtype(bool, pd.isna(data_missing.dtype.fill_value))
+        expected = SparseArray([False, False], fill_value=False, dtype=expected_dtype)
+        self.assert_equal(sarr.isna(), expected)
 
     def test_fillna_limit_pad(self, data_missing):
         with tm.assert_produces_warning(PerformanceWarning):