Skip to content

Commit

Permalink
PERF: Add __contains__ to CategoricalIndex (pandas-dev#21369)
Browse files Browse the repository at this point in the history
  • Loading branch information
topper-123 authored and david-liu-brattle-1 committed Jun 18, 2018
1 parent 37e652d commit 19b3598
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 9 deletions.
13 changes: 13 additions & 0 deletions asv_bench/benchmarks/categoricals.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,3 +193,16 @@ def time_categorical_series_is_monotonic_increasing(self):

def time_categorical_series_is_monotonic_decreasing(self):
    # Timed body: evaluate the attribute on the series prepared elsewhere
    # as ``self.s``; the result is intentionally discarded.
    self.s.is_monotonic_decreasing


class Contains(object):
    """Benchmark membership checks (``key in index``) on a categorical index."""

    goal_time = 0.2

    def setup(self):
        """Build the index under test and pick its first category as probe."""
        size = 10 ** 5
        index = tm.makeCategoricalIndex(size)
        self.ci = index
        self.cat = index.categories[0]

    def time_contains(self):
        """Run one ``in`` check; the boolean result is discarded."""
        self.cat in self.ci
4 changes: 3 additions & 1 deletion doc/source/whatsnew/v0.23.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,9 @@ Fixed Regressions
Performance Improvements
~~~~~~~~~~~~~~~~~~~~~~~~

-
- Improved performance of membership checks in :class:`CategoricalIndex`
(i.e. ``x in ci``-style checks are much faster). :meth:`CategoricalIndex.contains`
is likewise much faster (:issue:`21369`)
-

Documentation Changes
Expand Down
28 changes: 20 additions & 8 deletions pandas/core/indexes/category.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,19 +325,31 @@ def _reverse_indexer(self):
def __contains__(self, key):
    """
    Return whether ``key`` is one of the values held by this index.

    Parameters
    ----------
    key : hashable
        Candidate value. ``hash(key)`` is evaluated first, so an
        unhashable key raises ``TypeError`` instead of returning False.

    Returns
    -------
    bool-like
        Truthy if ``key`` (or, for a missing-value key, any missing value)
        is present in the index, falsy otherwise.
    """
    hash(key)

    # NOTE: this lookup supersedes the earlier implementation, which
    # returned ``key in self.categories`` when
    # ``self.categories._defer_to_indexing`` was set and otherwise fell
    # back to ``key in self.values``.
    if isna(key):  # if key is a NaN, check if any NaN is in self.
        return self.isna().any()

    # is key in self.categories? Then get its location.
    # If not (i.e. KeyError), it logically can't be in self either
    try:
        loc = self.categories.get_loc(key)
    except KeyError:
        return False

    # loc is the location of key in self.categories, but also the value
    # for key in self.codes and in self._engine. key may be in categories,
    # but still not in self, check this. Example:
    # 'b' in CategoricalIndex(['a'], categories=['a', 'b'])  # False
    if is_scalar(loc):
        return loc in self._engine

    # if self.categories is IntervalIndex, loc is an array
    # check if any scalar of the array is in self._engine
    return any(loc_ in self._engine for loc_ in loc)

@Appender(_index_shared_docs['contains'] % _index_doc_kwargs)
def contains(self, key):
    # The decorator presumably attaches the shared 'contains' docstring
    # template, so no literal docstring is written here.
    # Fail early with TypeError on unhashable keys.
    hash(key)

    # Delegate to ``__contains__`` so the method and the ``in`` operator
    # always agree. This replaces the earlier body, which returned
    # ``self.categories.contains(key)`` when
    # ``self.categories._defer_to_indexing`` was set and otherwise
    # ``key in self.values``.
    return key in self

def __array__(self, dtype=None):
""" the array interface, return my values """
Expand Down

0 comments on commit 19b3598

Please sign in to comment.