BUG: Index.get_indexer_non_unique misbehaves when index contains mult…

…iple nan (#35392) (#35498)
pandas-dev · Aug 5, 2021 · bf267b4 · jbrockmendel · Sep 27, 2021 · alexhlim
1 parent 929c98e
commit bf267b4
Show file tree

Hide file tree

Showing 5 changed files with 89 additions and 2 deletions.
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
@@ -236,6 +236,7 @@ Indexing
 - Bug in indexing on a :class:`MultiIndex` failing to drop scalar levels when the indexer is a tuple containing a datetime-like string (:issue:`42476`)
 - Bug in :meth:`DataFrame.sort_values` and :meth:`Series.sort_values` when passing an ascending value, failed to raise or incorrectly raising ``ValueError`` (:issue:`41634`)
 - Bug in updating values of :class:`pandas.Series` using boolean index, created by using :meth:`pandas.DataFrame.pop` (:issue:`42530`)
+- Bug in :meth:`Index.get_indexer_non_unique` when index contains multiple ``np.nan`` (:issue:`35392`)
 - Bug in :meth:`DataFrame.query` did not handle the degree sign in a backticked column name, such as \`Temp(°C)\`, used in an expression to query a dataframe (:issue:`42826`)
 -
 

diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
@@ -288,10 +288,12 @@ cdef class IndexEngine:
             object val
             int count = 0, count_missing = 0
             Py_ssize_t i, j, n, n_t, n_alloc
+            bint d_has_nan = False, stargets_has_nan = False, need_nan_check = True
 
         self._ensure_mapping_populated()
         values = np.array(self._get_index_values(), copy=False)
         stargets = set(targets)
+
         n = len(values)
         n_t = len(targets)
         if n > 10_000:
@@ -321,19 +323,35 @@ cdef class IndexEngine:
 
         if stargets:
             # otherwise, map by iterating through all items in the index
+
             for i in range(n):
                 val = values[i]
                 if val in stargets:
                     if val not in d:
                         d[val] = []
                     d[val].append(i)
 
+                elif util.is_nan(val):
+                    # GH#35392
+                    if need_nan_check:
+                        # Do this check only once
+                        stargets_has_nan = any(util.is_nan(val) for x in stargets)
+                        need_nan_check = False
+
+                    if stargets_has_nan:
+                        if not d_has_nan:
+                            # use a canonical nan object
+                            d[np.nan] = []
+                            d_has_nan = True
+                        d[np.nan].append(i)
+
         for i in range(n_t):
             val = targets[i]
 
             # found
-            if val in d:
-                for j in d[val]:
+            if val in d or (d_has_nan and util.is_nan(val)):
+                key = val if not util.is_nan(val) else np.nan
+                for j in d[key]:
 
                     # realloc if needed
                     if count >= n_alloc:

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -5383,6 +5383,12 @@ def get_indexer_for(self, target) -> npt.NDArray[np.intp]:
         -------
         np.ndarray[np.intp]
             List of indices.
+
+        Examples
+        --------
+        >>> idx = pd.Index([np.nan, 'var1', np.nan])
+        >>> idx.get_indexer_for([np.nan])
+        array([0, 2])
         """
         if self._index_as_unique:
             return self.get_indexer(target)

diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py
@@ -1,6 +1,8 @@
 import numpy as np
 import pytest
 
+from pandas._libs.missing import is_matching_na
+
 import pandas as pd
 from pandas import Index
 import pandas._testing as tm
@@ -66,6 +68,35 @@ def test_get_indexer_with_NA_values(
         tm.assert_numpy_array_equal(result, expected)
 
 
+class TestGetIndexerNonUnique:
+    def test_get_indexer_non_unique_nas(self, nulls_fixture):
+        # even though this isn't non-unique, this should still work
+        index = Index(["a", "b", nulls_fixture])
+        indexer, missing = index.get_indexer_non_unique([nulls_fixture])
+
+        expected_indexer = np.array([2], dtype=np.intp)
+        expected_missing = np.array([], dtype=np.intp)
+        tm.assert_numpy_array_equal(indexer, expected_indexer)
+        tm.assert_numpy_array_equal(missing, expected_missing)
+
+        # actually non-unique
+        index = Index(["a", nulls_fixture, "b", nulls_fixture])
+        indexer, missing = index.get_indexer_non_unique([nulls_fixture])
+
+        expected_indexer = np.array([1, 3], dtype=np.intp)
+        tm.assert_numpy_array_equal(indexer, expected_indexer)
+        tm.assert_numpy_array_equal(missing, expected_missing)
+
+        # matching-but-not-identical nans
+        if is_matching_na(nulls_fixture, float("NaN")):
+            index = Index(["a", float("NaN"), "b", float("NaN")])
+            indexer, missing = index.get_indexer_non_unique([nulls_fixture])
+
+            expected_indexer = np.array([1, 3], dtype=np.intp)
+            tm.assert_numpy_array_equal(indexer, expected_indexer)
+            tm.assert_numpy_array_equal(missing, expected_missing)
+
+
 class TestSliceLocs:
     @pytest.mark.parametrize(
         "in_slice,expected",

diff --git a/pandas/tests/indexes/test_indexing.py b/pandas/tests/indexes/test_indexing.py
@@ -7,6 +7,7 @@
     take
     where
     get_indexer
+    get_indexer_for
     slice_locs
     asof_locs
 
@@ -25,6 +26,7 @@
     Int64Index,
     IntervalIndex,
     MultiIndex,
+    NaT,
     PeriodIndex,
     RangeIndex,
     Series,
@@ -294,3 +296,32 @@ def test_maybe_cast_slice_bound_kind_deprecated(index):
     with tm.assert_produces_warning(FutureWarning):
         # pass as positional
         index._maybe_cast_slice_bound(index[0], "left", "loc")
+
+
+@pytest.mark.parametrize(
+    "idx,target,expected",
+    [
+        ([np.nan, "var1", np.nan], [np.nan], np.array([0, 2], dtype=np.intp)),
+        (
+            [np.nan, "var1", np.nan],
+            [np.nan, "var1"],
+            np.array([0, 2, 1], dtype=np.intp),
+        ),
+        (
+            np.array([np.nan, "var1", np.nan], dtype=object),
+            [np.nan],
+            np.array([0, 2], dtype=np.intp),
+        ),
+        (
+            DatetimeIndex(["2020-08-05", NaT, NaT]),
+            [NaT],
+            np.array([1, 2], dtype=np.intp),
+        ),
+        (["a", "b", "a", np.nan], [np.nan], np.array([3], dtype=np.intp)),
+    ],
+)
+def test_get_indexer_non_unique_multiple_nans(idx, target, expected):
+    # GH 35392
+    axis = Index(idx)
+    actual = axis.get_indexer_for(target)
+    tm.assert_numpy_array_equal(actual, expected)