diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index d0afc24aaecac..ae3ac44a8388c 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -127,6 +127,7 @@ Interval Indexing ^^^^^^^^ +- Bug in :meth:`CategoricalIndex.get_indexer` failing to raise ``InvalidIndexError`` when non-unique (:issue:`38372`) - Bug in inserting many new columns into a :class:`DataFrame` causing incorrect subsequent indexing behavior (:issue:`38380`) - - diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 11b7acc0a9deb..4220061b2385b 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3147,6 +3147,11 @@ def get_indexer( method = missing.clean_reindex_fill_method(method) target = ensure_index(target) + self._check_indexing_method(method) + + if not self._index_as_unique: + raise InvalidIndexError(self._requires_unique_msg) + # Treat boolean labels passed to a numeric index as not found. Without # this fix False and True would be treated as 0 and 1 respectively. # (GH #16877) @@ -3174,11 +3179,6 @@ def _get_indexer( target, method=method, limit=limit, tolerance=tolerance ) - if not self.is_unique: - raise InvalidIndexError( - "Reindexing only valid with uniquely valued Index objects" - ) - if method == "pad" or method == "backfill": indexer = self._get_fill_indexer(target, method, limit, tolerance) elif method == "nearest": @@ -3199,6 +3199,24 @@ def _get_indexer( return ensure_platform_int(indexer) + def _check_indexing_method(self, method): + """ + Raise if we have a get_indexer `method` that is not supported or valid. 
+ """ + # GH#37871 for now this is only for IntervalIndex and CategoricalIndex + if not (is_interval_dtype(self.dtype) or is_categorical_dtype(self.dtype)): + return + + if method is None: + return + + if method in ["bfill", "backfill", "pad", "ffill", "nearest"]: + raise NotImplementedError( + f"method {method} not yet implemented for {type(self).__name__}" + ) + + raise ValueError("Invalid fill method") + def _convert_tolerance(self, tolerance, target): # override this method on subclasses tolerance = np.asarray(tolerance) @@ -5014,6 +5032,8 @@ def _index_as_unique(self): """ return self.is_unique + _requires_unique_msg = "Reindexing only valid with uniquely valued Index objects" + @final def _maybe_promote(self, other: "Index"): """ diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 6c9f839f4b8b2..e2a7752cf3f0d 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -494,14 +494,11 @@ def _reindex_non_unique(self, target): def _maybe_cast_indexer(self, key) -> int: return self._data._unbox_scalar(key) - @Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs) def _get_indexer( self, target: "Index", method=None, limit=None, tolerance=None ) -> np.ndarray: - self._check_indexing_method(method) - - if self.is_unique and self.equals(target): + if self.equals(target): return np.arange(len(self), dtype="intp") return self._get_indexer_non_unique(target._values)[0] diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py index 92bd82f8263e9..73f96b2f6ad41 100644 --- a/pandas/core/indexes/extension.py +++ b/pandas/core/indexes/extension.py @@ -254,21 +254,6 @@ def searchsorted(self, value, side="left", sorter=None) -> np.ndarray: # --------------------------------------------------------------------- - def _check_indexing_method(self, method): - """ - Raise if we have a get_indexer `method` that is not supported or valid. 
- """ - # GH#37871 for now this is only for IntervalIndex and CategoricalIndex - if method is None: - return - - if method in ["bfill", "backfill", "pad", "ffill", "nearest"]: - raise NotImplementedError( - f"method {method} not yet implemented for {type(self).__name__}" - ) - - raise ValueError("Invalid fill method") - def _get_engine_target(self) -> np.ndarray: return np.asarray(self._data) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index ee25a9d81a60f..0d0489ebaafa7 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -13,7 +13,7 @@ from pandas._libs.tslibs import BaseOffset, Timedelta, Timestamp, to_offset from pandas._typing import AnyArrayLike, DtypeObj, Label from pandas.errors import InvalidIndexError -from pandas.util._decorators import Appender, Substitution, cache_readonly +from pandas.util._decorators import Appender, cache_readonly from pandas.util._exceptions import rewrite_exception from pandas.core.dtypes.cast import ( @@ -646,23 +646,6 @@ def get_loc( return mask.argmax() return lib.maybe_booleans_to_slice(mask.view("u1")) - @Substitution( - **dict( - _index_doc_kwargs, - **{ - "raises_section": textwrap.dedent( - """ - Raises - ------ - NotImplementedError - If any method argument other than the default of - None is specified as these are not yet implemented. 
- """ - ) - }, - ) - ) - @Appender(_index_shared_docs["get_indexer"]) def _get_indexer( self, target: Index, @@ -671,14 +654,6 @@ def _get_indexer( tolerance: Optional[Any] = None, ) -> np.ndarray: - self._check_indexing_method(method) - - if self.is_overlapping: - raise InvalidIndexError( - "cannot handle overlapping indices; " - "use IntervalIndex.get_indexer_non_unique" - ) - if isinstance(target, IntervalIndex): # equal indexes -> 1:1 positional match if self.equals(target): @@ -767,6 +742,10 @@ def _get_indexer_pointwise(self, target: Index) -> Tuple[np.ndarray, np.ndarray] def _index_as_unique(self): return not self.is_overlapping + _requires_unique_msg = ( + "cannot handle overlapping indices; use IntervalIndex.get_indexer_non_unique" + ) + def _convert_slice_indexer(self, key: slice, kind: str): if not (key.step is None or key.step == 1): # GH#31658 if label-based, we require step == 1, diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index e4e29f32e62e6..75386a8779b20 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2595,11 +2595,10 @@ def _get_partial_string_timestamp_match_key(self, key): return key - @Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs) def _get_indexer(self, target: Index, method=None, limit=None, tolerance=None): # empty indexer - if is_list_like(target) and not len(target): + if not len(target): return ensure_platform_int(np.array([])) if not isinstance(target, MultiIndex): @@ -2613,9 +2612,6 @@ def _get_indexer(self, target: Index, method=None, limit=None, tolerance=None): target, method=method, limit=limit, tolerance=tolerance ) - if not self.is_unique: - raise ValueError("Reindexing only valid with uniquely valued Index objects") - if method == "pad" or method == "backfill": if tolerance is not None: raise NotImplementedError( diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index ac9fb31a4c35b..063dfa5cc1c5a 100644 --- 
a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -9,7 +9,7 @@ from pandas._libs.tslibs.parsing import DateParseError, parse_time_string from pandas._typing import DtypeObj from pandas.errors import InvalidIndexError -from pandas.util._decorators import Appender, cache_readonly, doc +from pandas.util._decorators import cache_readonly, doc from pandas.core.dtypes.common import ( is_bool_dtype, @@ -31,11 +31,7 @@ ) import pandas.core.common as com import pandas.core.indexes.base as ibase -from pandas.core.indexes.base import ( - _index_shared_docs, - ensure_index, - maybe_extract_name, -) +from pandas.core.indexes.base import ensure_index, maybe_extract_name from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin from pandas.core.indexes.datetimes import DatetimeIndex, Index from pandas.core.indexes.extension import inherit_names @@ -448,7 +444,6 @@ def join(self, other, how="left", level=None, return_indexers=False, sort=False) # ------------------------------------------------------------------------ # Indexing Methods - @Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs) def _get_indexer(self, target: Index, method=None, limit=None, tolerance=None): if not self._should_compare(target): diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index f14c126180642..e716605245da5 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -10,7 +10,7 @@ from pandas._libs.lib import no_default from pandas._typing import Label from pandas.compat.numpy import function as nv -from pandas.util._decorators import Appender, cache_readonly, doc +from pandas.util._decorators import cache_readonly, doc from pandas.core.dtypes.common import ( ensure_platform_int, @@ -28,7 +28,7 @@ import pandas.core.common as com from pandas.core.construction import extract_array import pandas.core.indexes.base as ibase -from pandas.core.indexes.base import _index_shared_docs, maybe_extract_name +from 
pandas.core.indexes.base import maybe_extract_name from pandas.core.indexes.numeric import Float64Index, Int64Index from pandas.core.ops.common import unpack_zerodim_and_defer @@ -354,7 +354,6 @@ def get_loc(self, key, method=None, tolerance=None): raise KeyError(key) return super().get_loc(key, method=method, tolerance=tolerance) - @Appender(_index_shared_docs["get_indexer"]) def _get_indexer(self, target, method=None, limit=None, tolerance=None): if com.any_not_none(method, tolerance, limit) or not is_list_like(target): return super()._get_indexer( diff --git a/pandas/tests/indexes/categorical/test_indexing.py b/pandas/tests/indexes/categorical/test_indexing.py index 617ffdb48b3b7..13e622a61b4bd 100644 --- a/pandas/tests/indexes/categorical/test_indexing.py +++ b/pandas/tests/indexes/categorical/test_indexing.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas.errors import InvalidIndexError + import pandas as pd from pandas import CategoricalIndex, Index, IntervalIndex, Timestamp import pandas._testing as tm @@ -204,18 +206,19 @@ def test_get_indexer_base(self): with pytest.raises(ValueError, match="Invalid fill method"): idx.get_indexer(idx, method="invalid") - def test_get_indexer_non_unique(self): + def test_get_indexer_requires_unique(self): np.random.seed(123456789) ci = CategoricalIndex(list("aabbca"), categories=list("cab"), ordered=False) oidx = Index(np.array(ci)) + msg = "Reindexing only valid with uniquely valued Index objects" + for n in [1, 2, 5, len(ci)]: finder = oidx[np.random.randint(0, len(ci), size=n)] - expected = oidx.get_indexer_non_unique(finder)[0] - actual = ci.get_indexer(finder) - tm.assert_numpy_array_equal(expected, actual) + with pytest.raises(InvalidIndexError, match=msg): + ci.get_indexer(finder) # see gh-17323 # @@ -224,19 +227,27 @@ def test_get_indexer_non_unique(self): # respect duplicates instead of taking # the fast-track path. 
for finder in [list("aabbca"), list("aababca")]: - expected = oidx.get_indexer_non_unique(finder)[0] - actual = ci.get_indexer(finder) - tm.assert_numpy_array_equal(expected, actual) + with pytest.raises(InvalidIndexError, match=msg): + ci.get_indexer(finder) - def test_get_indexer(self): + def test_get_indexer_non_unique(self): idx1 = CategoricalIndex(list("aabcde"), categories=list("edabc")) idx2 = CategoricalIndex(list("abf")) for indexer in [idx2, list("abf"), Index(list("abf"))]: - r1 = idx1.get_indexer(idx2) - tm.assert_almost_equal(r1, np.array([0, 1, 2, -1], dtype=np.intp)) + msg = "Reindexing only valid with uniquely valued Index objects" + with pytest.raises(InvalidIndexError, match=msg): + idx1.get_indexer(indexer) + + r1, _ = idx1.get_indexer_non_unique(indexer) + expected = np.array([0, 1, 2, -1], dtype=np.intp) + tm.assert_almost_equal(r1, expected) + + def test_get_indexer_method(self): + idx1 = CategoricalIndex(list("aabcde"), categories=list("edabc")) + idx2 = CategoricalIndex(list("abf")) msg = "method pad not yet implemented for CategoricalIndex" with pytest.raises(NotImplementedError, match=msg): diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index d098e5b639f25..6874db66a8597 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -205,9 +205,10 @@ def test_reindex_base(self): def test_get_indexer_consistency(self, index): # See GH 16819 if isinstance(index, IntervalIndex): + # requires index.is_non_overlapping return - if index.is_unique or isinstance(index, CategoricalIndex): + if index.is_unique: indexer = index.get_indexer(index[0:2]) assert isinstance(indexer, np.ndarray) assert indexer.dtype == np.intp