Skip to content

Commit

Permalink
BUG: fix isin with nans and large arrays (pandas-dev#36266)
Browse files Browse the repository at this point in the history
  • Loading branch information
Hanspagh authored and Kevin D Smith committed Nov 2, 2020
1 parent 14322ea commit 8523c68
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 2 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.1.3.rst
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ Bug fixes
- Bug in :class:`Series` constructor where integer overflow would occur for sufficiently large scalar inputs when an index was provided (:issue:`36291`)
- Bug in :meth:`DataFrame.sort_values` raising an ``AttributeError`` when sorting on a key that casts column to categorical dtype (:issue:`36383`)
- Bug in :meth:`DataFrame.stack` raising a ``ValueError`` when stacking :class:`MultiIndex` columns based on position when the levels had duplicate names (:issue:`36353`)
- Bug in :meth:`Series.isin` and :meth:`DataFrame.isin` where ``NaN`` values were not matched when the object had more than 1,000,000 rows (:issue:`22205`)

.. ---------------------------------------------------------------------------
Expand Down
7 changes: 6 additions & 1 deletion pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -440,7 +440,12 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray:
# GH16012
# Ensure np.in1d doesn't get object types or it *may* throw an exception
if len(comps) > 1_000_000 and not is_object_dtype(comps):
f = np.in1d
# If the values include nan we need to check for nan explicitly
# since np.nan is not equal to np.nan
if np.isnan(values).any():
f = lambda c, v: np.logical_or(np.in1d(c, v), np.isnan(c))
else:
f = np.in1d
elif is_integer_dtype(comps):
try:
values = values.astype("int64", copy=False)
Expand Down
18 changes: 17 additions & 1 deletion pandas/tests/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -801,7 +801,6 @@ def test_i8(self):
tm.assert_numpy_array_equal(result, expected)

def test_large(self):

s = pd.date_range("20000101", periods=2000000, freq="s").values
result = algos.isin(s, s[0:2])
expected = np.zeros(len(s), dtype=bool)
Expand Down Expand Up @@ -841,6 +840,23 @@ def test_same_nan_is_in(self):
result = algos.isin(comps, values)
tm.assert_numpy_array_equal(expected, result)

def test_same_nan_is_in_large(self):
# https://github.com/pandas-dev/pandas/issues/22205
s = np.tile(1.0, 1_000_001)
s[0] = np.nan
result = algos.isin(s, [np.nan, 1])
expected = np.ones(len(s), dtype=bool)
tm.assert_numpy_array_equal(result, expected)

def test_same_nan_is_in_large_series(self):
# https://github.com/pandas-dev/pandas/issues/22205
s = np.tile(1.0, 1_000_001)
series = pd.Series(s)
s[0] = np.nan
result = series.isin([np.nan, 1])
expected = pd.Series(np.ones(len(s), dtype=bool))
tm.assert_series_equal(result, expected)

def test_same_object_is_in(self):
# GH 22160
# there could be special treatment for nans
Expand Down

0 comments on commit 8523c68

Please sign in to comment.