REF: de-duplicate get_indexer methods (#38372)

pandas-dev · Dec 11, 2020 · 5a7514c · 5a7514c
1 parent 4b022e6
commit 5a7514c
Show file tree

Hide file tree

Showing 10 changed files with 60 additions and 76 deletions.
diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
@@ -127,6 +127,7 @@ Interval
 
 Indexing
 ^^^^^^^^
+- Bug in :meth:`CategoricalIndex.get_indexer` failing to raise ``InvalidIndexError`` when non-unique (:issue:`38372`)
 - Bug in inserting many new columns into a :class:`DataFrame` causing incorrect subsequent indexing behavior (:issue:`38380`)
 -
 -

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -3147,6 +3147,11 @@ def get_indexer(
         method = missing.clean_reindex_fill_method(method)
         target = ensure_index(target)
 
+        self._check_indexing_method(method)
+
+        if not self._index_as_unique:
+            raise InvalidIndexError(self._requires_unique_msg)
+
         # Treat boolean labels passed to a numeric index as not found. Without
         # this fix False and True would be treated as 0 and 1 respectively.
         # (GH #16877)
@@ -3174,11 +3179,6 @@ def _get_indexer(
                 target, method=method, limit=limit, tolerance=tolerance
             )
 
-        if not self.is_unique:
-            raise InvalidIndexError(
-                "Reindexing only valid with uniquely valued Index objects"
-            )
-
         if method == "pad" or method == "backfill":
             indexer = self._get_fill_indexer(target, method, limit, tolerance)
         elif method == "nearest":
@@ -3199,6 +3199,24 @@ def _get_indexer(
 
         return ensure_platform_int(indexer)
 
+    def _check_indexing_method(self, method):
+        """
+        Raise if we have a get_indexer `method` that is not supported or valid.
+        """
+        # GH#37871 for now this is only for IntervalIndex and CategoricalIndex
+        if not (is_interval_dtype(self.dtype) or is_categorical_dtype(self.dtype)):
+            return
+
+        if method is None:
+            return
+
+        if method in ["bfill", "backfill", "pad", "ffill", "nearest"]:
+            raise NotImplementedError(
+                f"method {method} not yet implemented for {type(self).__name__}"
+            )
+
+        raise ValueError("Invalid fill method")
+
     def _convert_tolerance(self, tolerance, target):
         # override this method on subclasses
         tolerance = np.asarray(tolerance)
@@ -5014,6 +5032,8 @@ def _index_as_unique(self):
         """
         return self.is_unique
 
+    _requires_unique_msg = "Reindexing only valid with uniquely valued Index objects"
+
     @final
     def _maybe_promote(self, other: "Index"):
         """

diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
@@ -494,14 +494,11 @@ def _reindex_non_unique(self, target):
     def _maybe_cast_indexer(self, key) -> int:
         return self._data._unbox_scalar(key)
 
-    @Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs)
     def _get_indexer(
         self, target: "Index", method=None, limit=None, tolerance=None
     ) -> np.ndarray:
 
-        self._check_indexing_method(method)
-
-        if self.is_unique and self.equals(target):
+        if self.equals(target):
             return np.arange(len(self), dtype="intp")
 
         return self._get_indexer_non_unique(target._values)[0]

diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py
@@ -254,21 +254,6 @@ def searchsorted(self, value, side="left", sorter=None) -> np.ndarray:
 
     # ---------------------------------------------------------------------
 
-    def _check_indexing_method(self, method):
-        """
-        Raise if we have a get_indexer `method` that is not supported or valid.
-        """
-        # GH#37871 for now this is only for IntervalIndex and CategoricalIndex
-        if method is None:
-            return
-
-        if method in ["bfill", "backfill", "pad", "ffill", "nearest"]:
-            raise NotImplementedError(
-                f"method {method} not yet implemented for {type(self).__name__}"
-            )
-
-        raise ValueError("Invalid fill method")
-
     def _get_engine_target(self) -> np.ndarray:
         return np.asarray(self._data)
 

diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py
@@ -13,7 +13,7 @@
 from pandas._libs.tslibs import BaseOffset, Timedelta, Timestamp, to_offset
 from pandas._typing import AnyArrayLike, DtypeObj, Label
 from pandas.errors import InvalidIndexError
-from pandas.util._decorators import Appender, Substitution, cache_readonly
+from pandas.util._decorators import Appender, cache_readonly
 from pandas.util._exceptions import rewrite_exception
 
 from pandas.core.dtypes.cast import (
@@ -646,23 +646,6 @@ def get_loc(
             return mask.argmax()
         return lib.maybe_booleans_to_slice(mask.view("u1"))
 
-    @Substitution(
-        **dict(
-            _index_doc_kwargs,
-            **{
-                "raises_section": textwrap.dedent(
-                    """
-        Raises
-        ------
-        NotImplementedError
-            If any method argument other than the default of
-            None is specified as these are not yet implemented.
-        """
-                )
-            },
-        )
-    )
-    @Appender(_index_shared_docs["get_indexer"])
     def _get_indexer(
         self,
         target: Index,
@@ -671,14 +654,6 @@ def _get_indexer(
         tolerance: Optional[Any] = None,
     ) -> np.ndarray:
 
-        self._check_indexing_method(method)
-
-        if self.is_overlapping:
-            raise InvalidIndexError(
-                "cannot handle overlapping indices; "
-                "use IntervalIndex.get_indexer_non_unique"
-            )
-
         if isinstance(target, IntervalIndex):
             # equal indexes -> 1:1 positional match
             if self.equals(target):
@@ -767,6 +742,10 @@ def _get_indexer_pointwise(self, target: Index) -> Tuple[np.ndarray, np.ndarray]
     def _index_as_unique(self):
         return not self.is_overlapping
 
+    _requires_unique_msg = (
+        "cannot handle overlapping indices; use IntervalIndex.get_indexer_non_unique"
+    )
+
     def _convert_slice_indexer(self, key: slice, kind: str):
         if not (key.step is None or key.step == 1):
             # GH#31658 if label-based, we require step == 1,

diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
@@ -2595,11 +2595,10 @@ def _get_partial_string_timestamp_match_key(self, key):
 
         return key
 
-    @Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs)
     def _get_indexer(self, target: Index, method=None, limit=None, tolerance=None):
 
         # empty indexer
-        if is_list_like(target) and not len(target):
+        if not len(target):
             return ensure_platform_int(np.array([]))
 
         if not isinstance(target, MultiIndex):
@@ -2613,9 +2612,6 @@ def _get_indexer(self, target: Index, method=None, limit=None, tolerance=None):
                         target, method=method, limit=limit, tolerance=tolerance
                     )
 
-        if not self.is_unique:
-            raise ValueError("Reindexing only valid with uniquely valued Index objects")
-
         if method == "pad" or method == "backfill":
             if tolerance is not None:
                 raise NotImplementedError(

diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py
@@ -9,7 +9,7 @@
 from pandas._libs.tslibs.parsing import DateParseError, parse_time_string
 from pandas._typing import DtypeObj
 from pandas.errors import InvalidIndexError
-from pandas.util._decorators import Appender, cache_readonly, doc
+from pandas.util._decorators import cache_readonly, doc
 
 from pandas.core.dtypes.common import (
     is_bool_dtype,
@@ -31,11 +31,7 @@
 )
 import pandas.core.common as com
 import pandas.core.indexes.base as ibase
-from pandas.core.indexes.base import (
-    _index_shared_docs,
-    ensure_index,
-    maybe_extract_name,
-)
+from pandas.core.indexes.base import ensure_index, maybe_extract_name
 from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin
 from pandas.core.indexes.datetimes import DatetimeIndex, Index
 from pandas.core.indexes.extension import inherit_names
@@ -448,7 +444,6 @@ def join(self, other, how="left", level=None, return_indexers=False, sort=False)
     # ------------------------------------------------------------------------
     # Indexing Methods
 
-    @Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs)
     def _get_indexer(self, target: Index, method=None, limit=None, tolerance=None):
 
         if not self._should_compare(target):

diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py
@@ -10,7 +10,7 @@
 from pandas._libs.lib import no_default
 from pandas._typing import Label
 from pandas.compat.numpy import function as nv
-from pandas.util._decorators import Appender, cache_readonly, doc
+from pandas.util._decorators import cache_readonly, doc
 
 from pandas.core.dtypes.common import (
     ensure_platform_int,
@@ -28,7 +28,7 @@
 import pandas.core.common as com
 from pandas.core.construction import extract_array
 import pandas.core.indexes.base as ibase
-from pandas.core.indexes.base import _index_shared_docs, maybe_extract_name
+from pandas.core.indexes.base import maybe_extract_name
 from pandas.core.indexes.numeric import Float64Index, Int64Index
 from pandas.core.ops.common import unpack_zerodim_and_defer
 
@@ -354,7 +354,6 @@ def get_loc(self, key, method=None, tolerance=None):
             raise KeyError(key)
         return super().get_loc(key, method=method, tolerance=tolerance)
 
-    @Appender(_index_shared_docs["get_indexer"])
     def _get_indexer(self, target, method=None, limit=None, tolerance=None):
         if com.any_not_none(method, tolerance, limit) or not is_list_like(target):
             return super()._get_indexer(

diff --git a/pandas/tests/indexes/categorical/test_indexing.py b/pandas/tests/indexes/categorical/test_indexing.py
@@ -1,6 +1,8 @@
 import numpy as np
 import pytest
 
+from pandas.errors import InvalidIndexError
+
 import pandas as pd
 from pandas import CategoricalIndex, Index, IntervalIndex, Timestamp
 import pandas._testing as tm
@@ -204,18 +206,19 @@ def test_get_indexer_base(self):
         with pytest.raises(ValueError, match="Invalid fill method"):
             idx.get_indexer(idx, method="invalid")
 
-    def test_get_indexer_non_unique(self):
+    def test_get_indexer_requires_unique(self):
         np.random.seed(123456789)
 
         ci = CategoricalIndex(list("aabbca"), categories=list("cab"), ordered=False)
         oidx = Index(np.array(ci))
 
+        msg = "Reindexing only valid with uniquely valued Index objects"
+
         for n in [1, 2, 5, len(ci)]:
             finder = oidx[np.random.randint(0, len(ci), size=n)]
-            expected = oidx.get_indexer_non_unique(finder)[0]
 
-            actual = ci.get_indexer(finder)
-            tm.assert_numpy_array_equal(expected, actual)
+            with pytest.raises(InvalidIndexError, match=msg):
+                ci.get_indexer(finder)
 
         # see gh-17323
         #
@@ -224,19 +227,27 @@ def test_get_indexer_non_unique(self):
         # respect duplicates instead of taking
         # the fast-track path.
         for finder in [list("aabbca"), list("aababca")]:
-            expected = oidx.get_indexer_non_unique(finder)[0]
 
-            actual = ci.get_indexer(finder)
-            tm.assert_numpy_array_equal(expected, actual)
+            with pytest.raises(InvalidIndexError, match=msg):
+                ci.get_indexer(finder)
 
-    def test_get_indexer(self):
+    def test_get_indexer_non_unique(self):
 
         idx1 = CategoricalIndex(list("aabcde"), categories=list("edabc"))
         idx2 = CategoricalIndex(list("abf"))
 
         for indexer in [idx2, list("abf"), Index(list("abf"))]:
-            r1 = idx1.get_indexer(idx2)
-            tm.assert_almost_equal(r1, np.array([0, 1, 2, -1], dtype=np.intp))
+            msg = "Reindexing only valid with uniquely valued Index objects"
+            with pytest.raises(InvalidIndexError, match=msg):
+                idx1.get_indexer(idx2)
+
+            r1, _ = idx1.get_indexer_non_unique(idx2)
+            expected = np.array([0, 1, 2, -1], dtype=np.intp)
+            tm.assert_almost_equal(r1, expected)
+
+    def test_get_indexer_method(self):
+        idx1 = CategoricalIndex(list("aabcde"), categories=list("edabc"))
+        idx2 = CategoricalIndex(list("abf"))
 
         msg = "method pad not yet implemented for CategoricalIndex"
         with pytest.raises(NotImplementedError, match=msg):

diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py
@@ -205,9 +205,10 @@ def test_reindex_base(self):
     def test_get_indexer_consistency(self, index):
         # See GH 16819
         if isinstance(index, IntervalIndex):
+            # requires index.is_non_overlapping
             return
 
-        if index.is_unique or isinstance(index, CategoricalIndex):
+        if index.is_unique:
             indexer = index.get_indexer(index[0:2])
             assert isinstance(indexer, np.ndarray)
             assert indexer.dtype == np.intp