Skip to content

Commit

Permalink
PERF: Faster SparseArray.__getitem__ for boolean masks (pandas-dev#2…
Browse files Browse the repository at this point in the history
  • Loading branch information
bdrum committed Dec 10, 2021
1 parent 097322f commit a8d517a
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 6 deletions.
17 changes: 11 additions & 6 deletions pandas/core/arrays/sparse/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
)
from pandas.core.dtypes.common import (
is_array_like,
is_bool,
is_bool_dtype,
is_datetime64_any_dtype,
is_datetime64tz_dtype,
Expand Down Expand Up @@ -181,7 +182,6 @@ def _sparse_array_op(
ltype = SparseDtype(subtype, left.fill_value)
rtype = SparseDtype(subtype, right.fill_value)

# TODO(GH-23092): pass copy=False. Need to fix astype_nansafe
left = left.astype(ltype)
right = right.astype(rtype)
dtype = ltype.subtype
Expand Down Expand Up @@ -945,13 +945,18 @@ def __getitem__(
)

else:
# TODO: I think we can avoid densifying when masking a
# boolean SparseArray with another. Need to look at the
# key's fill_value for True / False, and then do an intersection
# on the indices of the sp_values.
if isinstance(key, SparseArray):
if is_bool_dtype(key):
key = key.to_dense()
if is_bool(key.fill_value):
msk = np.full(
shape=len(self),
fill_value=key.fill_value,
dtype=np.bool8,
)
msk[key.sp_index.indices] = not key.fill_value
return self.take(np.arange(len(self), dtype=np.int32)[msk])
else:
key = key.to_dense()
else:
key = np.asarray(key)

Expand Down
10 changes: 10 additions & 0 deletions pandas/tests/arrays/sparse/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,16 @@ def test_scalar_with_index_infer_dtype(self, scalar, dtype):
assert arr.dtype == dtype
assert exp.dtype == dtype

# GH 23122
def test_get_item_bool_sparse_array(self):
    """Index ``self.arr`` with boolean ``SparseArray`` masks (GH 23122).

    Two masks are checked so that both possible boolean ``fill_value``
    settings of the key are exercised:

    * a mask that is True at the odd positions, stored with
      ``fill_value=True``;
    * the element-wise inverse of that mask, stored with
      ``fill_value=False``.

    The expected results depend on the contents of ``self.arr``, which is
    set up outside this method.
    """
    # ``np.bool_`` is used rather than the ``np.bool8`` alias: the alias
    # was deprecated in NumPy 1.24 and removed in NumPy 2.0.
    spar_bool = SparseArray([False, True] * 5, dtype=np.bool_, fill_value=True)
    exp = SparseArray([np.nan, 2, np.nan, 5, 6])
    tm.assert_sp_array_equal(self.arr[spar_bool], exp)

    # Invert the dense form of the first mask and re-sparsify it with the
    # opposite fill value.
    spar_bool = SparseArray(~spar_bool.to_dense(), dtype=np.bool_, fill_value=False)
    exp = SparseArray([np.nan, 1, 3, 4, np.nan])
    tm.assert_sp_array_equal(self.arr[spar_bool], exp)

def test_get_item(self):

assert np.isnan(self.arr[1])
Expand Down

0 comments on commit a8d517a

Please sign in to comment.