Skip to content

Commit

Permalink
BUG pandas-dev#16012 - fix isin for large object arrays
Browse files Browse the repository at this point in the history
  • Loading branch information
Morgan243 committed Jul 18, 2017
1 parent 81f8ace commit 186607b
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 1 deletion.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.21.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -204,3 +204,4 @@ Categorical
Other
^^^^^
- Bug in :func:`eval` where the ``inplace`` parameter was being incorrectly handled (:issue:`16732`)
- Bug in :func:`isin` where a large object series combined with a large comparison array was dispatched to numpy's ``in1d``, which does not support object arrays in most conditions (:issue:`16012`)
5 changes: 4 additions & 1 deletion pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -402,7 +402,10 @@ def isin(comps, values):
# work-around for numpy < 1.8 and comparisons on py3
# faster for larger cases to use np.in1d
f = lambda x, y: htable.ismember_object(x, values)
if (_np_version_under1p8 and compat.PY3) or len(comps) > 1000000:
# GH16012
# Ensure np.in1d doesn't get object types or it *may* throw an exception
if ((_np_version_under1p8 and compat.PY3) or len(comps) > 1000000 and
not is_object_dtype(comps)):
f = lambda x, y: np.in1d(x, y)
elif is_integer_dtype(comps):
try:
Expand Down
9 changes: 9 additions & 0 deletions pandas/tests/series/test_analytics.py
Original file line number Diff line number Diff line change
Expand Up @@ -1092,6 +1092,15 @@ def test_isin(self):
expected = Series([True, False, True, False, False, False, True, True])
assert_series_equal(result, expected)

# GH: 16012
# Reproducing this issue requires a series longer than 1e6 elements, and
# the comparison array (in_list) must also be large enough that numpy does
# not use its manual masking trick, which avoids the issue altogether
s = Series(list('abcdefghijk' * 10 ** 5))
in_list = [-1, 'a', 'b', 'G', 'Y', 'Z', 'E', 'K', 'E', 'S', 'I', 'R', 'R']*6

assert s.isin(in_list).sum() == 200000

def test_isin_with_string_scalar(self):
# GH4763
s = Series(['A', 'B', 'C', 'a', 'B', 'B', 'A', 'C'])
Expand Down

0 comments on commit 186607b

Please sign in to comment.