Skip to content

Commit

Permalink
REF: de-duplicate get_indexer methods (#38372)
Browse files Browse the repository at this point in the history
  • Loading branch information
jbrockmendel authored Dec 11, 2020
1 parent 4b022e6 commit 5a7514c
Show file tree
Hide file tree
Showing 10 changed files with 60 additions and 76 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,7 @@ Interval

Indexing
^^^^^^^^
- Bug in :meth:`CategoricalIndex.get_indexer` failing to raise ``InvalidIndexError`` when non-unique (:issue:`38372`)
- Bug in inserting many new columns into a :class:`DataFrame` causing incorrect subsequent indexing behavior (:issue:`38380`)
-
-
Expand Down
30 changes: 25 additions & 5 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3147,6 +3147,11 @@ def get_indexer(
method = missing.clean_reindex_fill_method(method)
target = ensure_index(target)

self._check_indexing_method(method)

if not self._index_as_unique:
raise InvalidIndexError(self._requires_unique_msg)

# Treat boolean labels passed to a numeric index as not found. Without
# this fix False and True would be treated as 0 and 1 respectively.
# (GH #16877)
Expand Down Expand Up @@ -3174,11 +3179,6 @@ def _get_indexer(
target, method=method, limit=limit, tolerance=tolerance
)

if not self.is_unique:
raise InvalidIndexError(
"Reindexing only valid with uniquely valued Index objects"
)

if method == "pad" or method == "backfill":
indexer = self._get_fill_indexer(target, method, limit, tolerance)
elif method == "nearest":
Expand All @@ -3199,6 +3199,24 @@ def _get_indexer(

return ensure_platform_int(indexer)

def _check_indexing_method(self, method):
"""
Raise if we have a get_indexer `method` that is not supported or valid.
"""
# GH#37871 for now this is only for IntervalIndex and CategoricalIndex
if not (is_interval_dtype(self.dtype) or is_categorical_dtype(self.dtype)):
return

if method is None:
return

if method in ["bfill", "backfill", "pad", "ffill", "nearest"]:
raise NotImplementedError(
f"method {method} not yet implemented for {type(self).__name__}"
)

raise ValueError("Invalid fill method")

def _convert_tolerance(self, tolerance, target):
# override this method on subclasses
tolerance = np.asarray(tolerance)
Expand Down Expand Up @@ -5014,6 +5032,8 @@ def _index_as_unique(self):
"""
return self.is_unique

_requires_unique_msg = "Reindexing only valid with uniquely valued Index objects"

@final
def _maybe_promote(self, other: "Index"):
"""
Expand Down
5 changes: 1 addition & 4 deletions pandas/core/indexes/category.py
Original file line number Diff line number Diff line change
Expand Up @@ -494,14 +494,11 @@ def _reindex_non_unique(self, target):
def _maybe_cast_indexer(self, key) -> int:
return self._data._unbox_scalar(key)

@Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs)
def _get_indexer(
self, target: "Index", method=None, limit=None, tolerance=None
) -> np.ndarray:

self._check_indexing_method(method)

if self.is_unique and self.equals(target):
if self.equals(target):
return np.arange(len(self), dtype="intp")

return self._get_indexer_non_unique(target._values)[0]
Expand Down
15 changes: 0 additions & 15 deletions pandas/core/indexes/extension.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,21 +254,6 @@ def searchsorted(self, value, side="left", sorter=None) -> np.ndarray:

# ---------------------------------------------------------------------

def _check_indexing_method(self, method):
"""
Raise if we have a get_indexer `method` that is not supported or valid.
"""
# GH#37871 for now this is only for IntervalIndex and CategoricalIndex
if method is None:
return

if method in ["bfill", "backfill", "pad", "ffill", "nearest"]:
raise NotImplementedError(
f"method {method} not yet implemented for {type(self).__name__}"
)

raise ValueError("Invalid fill method")

def _get_engine_target(self) -> np.ndarray:
return np.asarray(self._data)

Expand Down
31 changes: 5 additions & 26 deletions pandas/core/indexes/interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from pandas._libs.tslibs import BaseOffset, Timedelta, Timestamp, to_offset
from pandas._typing import AnyArrayLike, DtypeObj, Label
from pandas.errors import InvalidIndexError
from pandas.util._decorators import Appender, Substitution, cache_readonly
from pandas.util._decorators import Appender, cache_readonly
from pandas.util._exceptions import rewrite_exception

from pandas.core.dtypes.cast import (
Expand Down Expand Up @@ -646,23 +646,6 @@ def get_loc(
return mask.argmax()
return lib.maybe_booleans_to_slice(mask.view("u1"))

@Substitution(
**dict(
_index_doc_kwargs,
**{
"raises_section": textwrap.dedent(
"""
Raises
------
NotImplementedError
If any method argument other than the default of
None is specified as these are not yet implemented.
"""
)
},
)
)
@Appender(_index_shared_docs["get_indexer"])
def _get_indexer(
self,
target: Index,
Expand All @@ -671,14 +654,6 @@ def _get_indexer(
tolerance: Optional[Any] = None,
) -> np.ndarray:

self._check_indexing_method(method)

if self.is_overlapping:
raise InvalidIndexError(
"cannot handle overlapping indices; "
"use IntervalIndex.get_indexer_non_unique"
)

if isinstance(target, IntervalIndex):
# equal indexes -> 1:1 positional match
if self.equals(target):
Expand Down Expand Up @@ -767,6 +742,10 @@ def _get_indexer_pointwise(self, target: Index) -> Tuple[np.ndarray, np.ndarray]
def _index_as_unique(self):
return not self.is_overlapping

_requires_unique_msg = (
"cannot handle overlapping indices; use IntervalIndex.get_indexer_non_unique"
)

def _convert_slice_indexer(self, key: slice, kind: str):
if not (key.step is None or key.step == 1):
# GH#31658 if label-based, we require step == 1,
Expand Down
6 changes: 1 addition & 5 deletions pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -2595,11 +2595,10 @@ def _get_partial_string_timestamp_match_key(self, key):

return key

@Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs)
def _get_indexer(self, target: Index, method=None, limit=None, tolerance=None):

# empty indexer
if is_list_like(target) and not len(target):
if not len(target):
return ensure_platform_int(np.array([]))

if not isinstance(target, MultiIndex):
Expand All @@ -2613,9 +2612,6 @@ def _get_indexer(self, target: Index, method=None, limit=None, tolerance=None):
target, method=method, limit=limit, tolerance=tolerance
)

if not self.is_unique:
raise ValueError("Reindexing only valid with uniquely valued Index objects")

if method == "pad" or method == "backfill":
if tolerance is not None:
raise NotImplementedError(
Expand Down
9 changes: 2 additions & 7 deletions pandas/core/indexes/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from pandas._libs.tslibs.parsing import DateParseError, parse_time_string
from pandas._typing import DtypeObj
from pandas.errors import InvalidIndexError
from pandas.util._decorators import Appender, cache_readonly, doc
from pandas.util._decorators import cache_readonly, doc

from pandas.core.dtypes.common import (
is_bool_dtype,
Expand All @@ -31,11 +31,7 @@
)
import pandas.core.common as com
import pandas.core.indexes.base as ibase
from pandas.core.indexes.base import (
_index_shared_docs,
ensure_index,
maybe_extract_name,
)
from pandas.core.indexes.base import ensure_index, maybe_extract_name
from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin
from pandas.core.indexes.datetimes import DatetimeIndex, Index
from pandas.core.indexes.extension import inherit_names
Expand Down Expand Up @@ -448,7 +444,6 @@ def join(self, other, how="left", level=None, return_indexers=False, sort=False)
# ------------------------------------------------------------------------
# Indexing Methods

@Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs)
def _get_indexer(self, target: Index, method=None, limit=None, tolerance=None):

if not self._should_compare(target):
Expand Down
5 changes: 2 additions & 3 deletions pandas/core/indexes/range.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from pandas._libs.lib import no_default
from pandas._typing import Label
from pandas.compat.numpy import function as nv
from pandas.util._decorators import Appender, cache_readonly, doc
from pandas.util._decorators import cache_readonly, doc

from pandas.core.dtypes.common import (
ensure_platform_int,
Expand All @@ -28,7 +28,7 @@
import pandas.core.common as com
from pandas.core.construction import extract_array
import pandas.core.indexes.base as ibase
from pandas.core.indexes.base import _index_shared_docs, maybe_extract_name
from pandas.core.indexes.base import maybe_extract_name
from pandas.core.indexes.numeric import Float64Index, Int64Index
from pandas.core.ops.common import unpack_zerodim_and_defer

Expand Down Expand Up @@ -354,7 +354,6 @@ def get_loc(self, key, method=None, tolerance=None):
raise KeyError(key)
return super().get_loc(key, method=method, tolerance=tolerance)

@Appender(_index_shared_docs["get_indexer"])
def _get_indexer(self, target, method=None, limit=None, tolerance=None):
if com.any_not_none(method, tolerance, limit) or not is_list_like(target):
return super()._get_indexer(
Expand Down
31 changes: 21 additions & 10 deletions pandas/tests/indexes/categorical/test_indexing.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import numpy as np
import pytest

from pandas.errors import InvalidIndexError

import pandas as pd
from pandas import CategoricalIndex, Index, IntervalIndex, Timestamp
import pandas._testing as tm
Expand Down Expand Up @@ -204,18 +206,19 @@ def test_get_indexer_base(self):
with pytest.raises(ValueError, match="Invalid fill method"):
idx.get_indexer(idx, method="invalid")

def test_get_indexer_non_unique(self):
def test_get_indexer_requires_unique(self):
np.random.seed(123456789)

ci = CategoricalIndex(list("aabbca"), categories=list("cab"), ordered=False)
oidx = Index(np.array(ci))

msg = "Reindexing only valid with uniquely valued Index objects"

for n in [1, 2, 5, len(ci)]:
finder = oidx[np.random.randint(0, len(ci), size=n)]
expected = oidx.get_indexer_non_unique(finder)[0]

actual = ci.get_indexer(finder)
tm.assert_numpy_array_equal(expected, actual)
with pytest.raises(InvalidIndexError, match=msg):
ci.get_indexer(finder)

# see gh-17323
#
Expand All @@ -224,19 +227,27 @@ def test_get_indexer_non_unique(self):
# respect duplicates instead of taking
# the fast-track path.
for finder in [list("aabbca"), list("aababca")]:
expected = oidx.get_indexer_non_unique(finder)[0]

actual = ci.get_indexer(finder)
tm.assert_numpy_array_equal(expected, actual)
with pytest.raises(InvalidIndexError, match=msg):
ci.get_indexer(finder)

def test_get_indexer(self):
def test_get_indexer_non_unique(self):

idx1 = CategoricalIndex(list("aabcde"), categories=list("edabc"))
idx2 = CategoricalIndex(list("abf"))

for indexer in [idx2, list("abf"), Index(list("abf"))]:
r1 = idx1.get_indexer(idx2)
tm.assert_almost_equal(r1, np.array([0, 1, 2, -1], dtype=np.intp))
msg = "Reindexing only valid with uniquely valued Index objects"
with pytest.raises(InvalidIndexError, match=msg):
idx1.get_indexer(idx2)

r1, _ = idx1.get_indexer_non_unique(idx2)
expected = np.array([0, 1, 2, -1], dtype=np.intp)
tm.assert_almost_equal(r1, expected)

def test_get_indexer_method(self):
idx1 = CategoricalIndex(list("aabcde"), categories=list("edabc"))
idx2 = CategoricalIndex(list("abf"))

msg = "method pad not yet implemented for CategoricalIndex"
with pytest.raises(NotImplementedError, match=msg):
Expand Down
3 changes: 2 additions & 1 deletion pandas/tests/indexes/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,9 +205,10 @@ def test_reindex_base(self):
def test_get_indexer_consistency(self, index):
# See GH 16819
if isinstance(index, IntervalIndex):
# requires index.is_non_overlapping
return

if index.is_unique or isinstance(index, CategoricalIndex):
if index.is_unique:
indexer = index.get_indexer(index[0:2])
assert isinstance(indexer, np.ndarray)
assert indexer.dtype == np.intp
Expand Down

0 comments on commit 5a7514c

Please sign in to comment.