Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: Index.get_indexer_non_unique misbehaves with multiple nan #35498

Merged
merged 4 commits into from
Aug 5, 2021
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,8 @@ Indexing
- Bug in indexing on a :class:`MultiIndex` failing to drop scalar levels when the indexer is a tuple containing a datetime-like string (:issue:`42476`)
- Bug in :meth:`DataFrame.sort_values` and :meth:`Series.sort_values` when passing an ascending value, failed to raise or incorrectly raising ``ValueError`` (:issue:`41634`)
- Bug in updating values of :class:`pandas.Series` using boolean index, created by using :meth:`pandas.DataFrame.pop` (:issue:`42530`)
- Bug in :meth:`Index.get_indexer_non_unique` when index contains multiple ``np.nan`` (:issue:`35392`)
-

Missing
^^^^^^^
Expand Down
22 changes: 20 additions & 2 deletions pandas/_libs/index.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -288,10 +288,12 @@ cdef class IndexEngine:
object val
int count = 0, count_missing = 0
Py_ssize_t i, j, n, n_t, n_alloc
bint d_has_nan = False, stargets_has_nan = False, need_nan_check = True

self._ensure_mapping_populated()
values = np.array(self._get_index_values(), copy=False)
stargets = set(targets)

n = len(values)
n_t = len(targets)
if n > 10_000:
Expand Down Expand Up @@ -321,19 +323,35 @@ cdef class IndexEngine:

if stargets:
# otherwise, map by iterating through all items in the index

for i in range(n):
val = values[i]
if val in stargets:
if val not in d:
d[val] = []
d[val].append(i)

elif util.is_nan(val):
# GH#35392
if need_nan_check:
# Do this check only once
stargets_has_nan = any(util.is_nan(val) for x in stargets)
need_nan_check = False

if stargets_has_nan:
if not d_has_nan:
# use a canonical nan object
d[np.nan] = []
d_has_nan = True
d[np.nan].append(i)

for i in range(n_t):
val = targets[i]

# found
if val in d:
for j in d[val]:
if val in d or (d_has_nan and util.is_nan(val)):
key = val if not util.is_nan(val) else np.nan
for j in d[key]:

# realloc if needed
if count >= n_alloc:
Expand Down
6 changes: 6 additions & 0 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -5383,6 +5383,12 @@ def get_indexer_for(self, target) -> npt.NDArray[np.intp]:
-------
np.ndarray[np.intp]
List of indices.

Examples
--------
>>> idx = pd.Index([np.nan, 'var1', np.nan])
>>> idx.get_indexer_for([np.nan])
array([0, 2])
"""
if self._index_as_unique:
return self.get_indexer(target)
Expand Down
31 changes: 31 additions & 0 deletions pandas/tests/indexes/object/test_indexing.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import numpy as np
import pytest

from pandas._libs.missing import is_matching_na

import pandas as pd
from pandas import Index
import pandas._testing as tm
Expand Down Expand Up @@ -66,6 +68,35 @@ def test_get_indexer_with_NA_values(
tm.assert_numpy_array_equal(result, expected)


class TestGetIndexerNonUnique:
def test_get_indexer_non_unique_nas(self, nulls_fixture):
# even though this isn't non-unique, this should still work
index = Index(["a", "b", nulls_fixture])
indexer, missing = index.get_indexer_non_unique([nulls_fixture])

expected_indexer = np.array([2], dtype=np.intp)
expected_missing = np.array([], dtype=np.intp)
tm.assert_numpy_array_equal(indexer, expected_indexer)
tm.assert_numpy_array_equal(missing, expected_missing)

# actually non-unique
index = Index(["a", nulls_fixture, "b", nulls_fixture])
indexer, missing = index.get_indexer_non_unique([nulls_fixture])

expected_indexer = np.array([1, 3], dtype=np.intp)
tm.assert_numpy_array_equal(indexer, expected_indexer)
tm.assert_numpy_array_equal(missing, expected_missing)

# matching-but-not-identical nans
if is_matching_na(nulls_fixture, float("NaN")):
index = Index(["a", float("NaN"), "b", float("NaN")])
indexer, missing = index.get_indexer_non_unique([nulls_fixture])

expected_indexer = np.array([1, 3], dtype=np.intp)
tm.assert_numpy_array_equal(indexer, expected_indexer)
tm.assert_numpy_array_equal(missing, expected_missing)


class TestSliceLocs:
@pytest.mark.parametrize(
"in_slice,expected",
Expand Down
31 changes: 31 additions & 0 deletions pandas/tests/indexes/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
take
where
get_indexer
get_indexer_for
slice_locs
asof_locs
Expand All @@ -25,6 +26,7 @@
Int64Index,
IntervalIndex,
MultiIndex,
NaT,
PeriodIndex,
RangeIndex,
Series,
Expand Down Expand Up @@ -294,3 +296,32 @@ def test_maybe_cast_slice_bound_kind_deprecated(index):
with tm.assert_produces_warning(FutureWarning):
# pass as positional
index._maybe_cast_slice_bound(index[0], "left", "loc")


@pytest.mark.parametrize(
"idx,target,expected",
[
([np.nan, "var1", np.nan], [np.nan], np.array([0, 2], dtype=np.intp)),
alexhlim marked this conversation as resolved.
Show resolved Hide resolved
(
[np.nan, "var1", np.nan],
[np.nan, "var1"],
np.array([0, 2, 1], dtype=np.intp),
),
(
np.array([np.nan, "var1", np.nan], dtype=object),
[np.nan],
np.array([0, 2], dtype=np.intp),
),
(
DatetimeIndex(["2020-08-05", NaT, NaT]),
[NaT],
np.array([1, 2], dtype=np.intp),
),
(["a", "b", "a", np.nan], [np.nan], np.array([3], dtype=np.intp)),
],
)
def test_get_indexer_non_unique_multiple_nans(idx, target, expected):
# GH 35392
axis = Index(idx)
alexhlim marked this conversation as resolved.
Show resolved Hide resolved
actual = axis.get_indexer_for(target)
tm.assert_numpy_array_equal(actual, expected)