From 85dd25ce659ba51822e0316062c6b08dfbc15727 Mon Sep 17 00:00:00 2001
From: Alex Lim
Date: Fri, 2 Jul 2021 15:25:29 -0700
Subject: [PATCH] BUG: Index.get_indexer_non_unique misbehaves when index contains multiple nan (#35392)

---
 doc/source/whatsnew/v1.4.0.rst               |  1 +
 pandas/_libs/index.pyx                       | 22 ++++++++++++--
 pandas/core/indexes/base.py                  |  6 ++++
 pandas/tests/indexes/object/test_indexing.py | 31 ++++++++++++++++++++
 pandas/tests/indexes/test_indexing.py        | 31 ++++++++++++++++++++
 5 files changed, 89 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
index 24f307f23f435..292bbe1f8053a 100644
--- a/doc/source/whatsnew/v1.4.0.rst
+++ b/doc/source/whatsnew/v1.4.0.rst
@@ -159,6 +159,7 @@ Interval
 Indexing
 ^^^^^^^^
 - Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`DatetimeIndex` when passing a string, the return type depended on whether the index was monotonic (:issue:`24892`)
+- Bug in :meth:`Index.get_indexer_non_unique` when index contains multiple ``np.nan`` (:issue:`35392`)
 -
 
 Missing
diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
index 3351bb7cac7d6..f2e2abd16b985 100644
--- a/pandas/_libs/index.pyx
+++ b/pandas/_libs/index.pyx
@@ -288,10 +288,12 @@ cdef class IndexEngine:
             object val
             int count = 0, count_missing = 0
             Py_ssize_t i, j, n, n_t, n_alloc
+            bint d_has_nan = False, stargets_has_nan = False, need_nan_check = True
 
         self._ensure_mapping_populated()
         values = np.array(self._get_index_values(), copy=False)
         stargets = set(targets)
+
         n = len(values)
         n_t = len(targets)
         if n > 10_000:
@@ -321,6 +323,7 @@ cdef class IndexEngine:
 
         if stargets:
             # otherwise, map by iterating through all items in the index
+
             for i in range(n):
                 val = values[i]
                 if val in stargets:
@@ -328,12 +331,27 @@ cdef class IndexEngine:
                         d[val] = []
                     d[val].append(i)
 
+                elif util.is_nan(val):
+                    # GH#35392
+                    if need_nan_check:
+                        # Do this check only once
+                        stargets_has_nan = any(util.is_nan(x) for x in stargets)
+                        need_nan_check = False
+
+                    if stargets_has_nan:
+                        if not d_has_nan:
+                            # use a canonical nan object
+                            d[np.nan] = []
+                            d_has_nan = True
+                        d[np.nan].append(i)
+
         for i in range(n_t):
             val = targets[i]
 
             # found
-            if val in d:
-                for j in d[val]:
+            if val in d or (d_has_nan and util.is_nan(val)):
+                key = val if not util.is_nan(val) else np.nan
+                for j in d[key]:
 
                     # realloc if needed
                     if count >= n_alloc:
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 6070d6863039e..469e5f81703b8 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -5360,6 +5360,12 @@ def get_indexer_for(self, target) -> np.ndarray:
         -------
         np.ndarray[np.intp]
             List of indices.
+
+        Examples
+        --------
+        >>> idx = pd.Index([np.nan, 'var1', np.nan])
+        >>> idx.get_indexer_for([np.nan])
+        array([0, 2])
         """
         if self._index_as_unique:
             return self.get_indexer(target)
diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py
index b26676a0d83cf..039483cc948df 100644
--- a/pandas/tests/indexes/object/test_indexing.py
+++ b/pandas/tests/indexes/object/test_indexing.py
@@ -1,6 +1,8 @@
 import numpy as np
 import pytest
 
+from pandas._libs.missing import is_matching_na
+
 import pandas as pd
 from pandas import Index
 import pandas._testing as tm
@@ -66,6 +68,35 @@ def test_get_indexer_with_NA_values(
         tm.assert_numpy_array_equal(result, expected)
 
 
+class TestGetIndexerNonUnique:
+    def test_get_indexer_non_unique_nas(self, nulls_fixture):
+        # even though this isn't non-unique, this should still work
+        index = Index(["a", "b", nulls_fixture])
+        indexer, missing = index.get_indexer_non_unique([nulls_fixture])
+
+        expected_indexer = np.array([2], dtype=np.intp)
+        expected_missing = np.array([], dtype=np.intp)
+        tm.assert_numpy_array_equal(indexer, expected_indexer)
+        tm.assert_numpy_array_equal(missing, expected_missing)
+
+        # actually non-unique
+        index = Index(["a", nulls_fixture, "b", nulls_fixture])
+        indexer, missing = index.get_indexer_non_unique([nulls_fixture])
+
+        expected_indexer = np.array([1, 3], dtype=np.intp)
+        tm.assert_numpy_array_equal(indexer, expected_indexer)
+        tm.assert_numpy_array_equal(missing, expected_missing)
+
+        # matching-but-not-identical nans
+        if is_matching_na(nulls_fixture, float("NaN")):
+            index = Index(["a", float("NaN"), "b", float("NaN")])
+            indexer, missing = index.get_indexer_non_unique([nulls_fixture])
+
+            expected_indexer = np.array([1, 3], dtype=np.intp)
+            tm.assert_numpy_array_equal(indexer, expected_indexer)
+            tm.assert_numpy_array_equal(missing, expected_missing)
+
+
 class TestSliceLocs:
     @pytest.mark.parametrize(
         "in_slice,expected",
diff --git a/pandas/tests/indexes/test_indexing.py b/pandas/tests/indexes/test_indexing.py
index 5f6d0155ae6cf..80237baeb9594 100644
--- a/pandas/tests/indexes/test_indexing.py
+++ b/pandas/tests/indexes/test_indexing.py
@@ -7,6 +7,7 @@
     take
     where
     get_indexer
+    get_indexer_for
     slice_locs
     asof_locs
 
@@ -25,6 +26,7 @@
     Int64Index,
     IntervalIndex,
     MultiIndex,
+    NaT,
     PeriodIndex,
     RangeIndex,
     Series,
@@ -294,3 +296,32 @@ def test_maybe_cast_slice_bound_kind_deprecated(index):
     with tm.assert_produces_warning(FutureWarning):
         # pass as positional
         index._maybe_cast_slice_bound(index[0], "left", "loc")
+
+
+@pytest.mark.parametrize(
+    "idx,target,expected",
+    [
+        ([np.nan, "var1", np.nan], [np.nan], np.array([0, 2], dtype=np.intp)),
+        (
+            [np.nan, "var1", np.nan],
+            [np.nan, "var1"],
+            np.array([0, 2, 1], dtype=np.intp),
+        ),
+        (
+            np.array([np.nan, "var1", np.nan], dtype=object),
+            [np.nan],
+            np.array([0, 2], dtype=np.intp),
+        ),
+        (
+            DatetimeIndex(["2020-08-05", NaT, NaT]),
+            [NaT],
+            np.array([1, 2], dtype=np.intp),
+        ),
+        (["a", "b", "a", np.nan], [np.nan], np.array([3], dtype=np.intp)),
+    ],
+)
+def test_get_indexer_non_unique_multiple_nans(idx, target, expected):
+    # GH 35392
+    axis = Index(idx)
+    actual = axis.get_indexer_for(target)
+    tm.assert_numpy_array_equal(actual, expected)