BUG: CategoricalIndex allowed reindexing duplicate sources (pandas-dev#28257)
proost · Dec 19, 2019 · b7be0e9 · b7be0e9
1 parent e336fe1
commit b7be0e9
Show file tree

Hide file tree

Showing 10 changed files with 126 additions and 86 deletions.
diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst
@@ -783,27 +783,41 @@ values **not** in the categories, similarly to how you can reindex **any** panda
 
 .. ipython:: python
 
-   df2.reindex(['a', 'e'])
-   df2.reindex(['a', 'e']).index
-   df2.reindex(pd.Categorical(['a', 'e'], categories=list('abcde')))
-   df2.reindex(pd.Categorical(['a', 'e'], categories=list('abcde'))).index
+   df3 = pd.DataFrame({'A': np.arange(3),
+                       'B': pd.Series(list('abc')).astype('category')})
+   df3 = df3.set_index('B')
+   df3
+
+.. ipython:: python
+
+   df3.reindex(['a', 'e'])
+   df3.reindex(['a', 'e']).index
+   df3.reindex(pd.Categorical(['a', 'e'], categories=list('abe')))
+   df3.reindex(pd.Categorical(['a', 'e'], categories=list('abe'))).index
 
 .. warning::
 
    Reshaping and Comparison operations on a ``CategoricalIndex`` must have the same categories
    or a ``TypeError`` will be raised.
 
-   .. code-block:: ipython
+   .. ipython:: python
 
-    In [9]: df3 = pd.DataFrame({'A': np.arange(6), 'B': pd.Series(list('aabbca')).astype('category')})
+      df4 = pd.DataFrame({'A': np.arange(2),
+                          'B': list('ba')})
+      df4['B'] = df4['B'].astype(CategoricalDtype(list('ab')))
+      df4 = df4.set_index('B')
+      df4.index
 
-    In [11]: df3 = df3.set_index('B')
+      df5 = pd.DataFrame({'A': np.arange(2),
+                          'B': list('bc')})
+      df5['B'] = df5['B'].astype(CategoricalDtype(list('bc')))
+      df5 = df5.set_index('B')
+      df5.index
 
-    In [11]: df3.index
-    Out[11]: CategoricalIndex(['a', 'a', 'b', 'b', 'c', 'a'], categories=['a', 'b', 'c'], ordered=False, name='B', dtype='category')
+   .. code-block:: ipython
 
-    In [12]: pd.concat([df2, df3])
-    TypeError: categories must match existing categories when appending
+      In [1]: pd.concat([df4, df5])
+      TypeError: categories must match existing categories when appending
 
 .. _indexing.rangeindex:
 

diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
@@ -223,6 +223,7 @@ Categorical
 
 - Added test to assert the :func:`fillna` raises the correct ValueError message when the value isn't a value from categories (:issue:`13628`)
 - Bug in :meth:`Categorical.astype` where ``NaN`` values were handled incorrectly when casting to int (:issue:`28406`)
+- Bug in :meth:`DataFrame.reindex` with a :class:`CategoricalIndex` where reindexing would fail when the target contained duplicates, and would not fail when the source contained duplicates (:issue:`28107`)
 - Bug in :meth:`Categorical.astype` not allowing for casting to extension dtypes (:issue:`28668`)
 - Bug where :func:`merge` was unable to join on categorical and extension dtype columns (:issue:`28668`)
 - :meth:`Categorical.searchsorted` and :meth:`CategoricalIndex.searchsorted` now work on unordered categoricals also (:issue:`21667`)
@@ -292,6 +293,9 @@ Indexing
 - Bug in reindexing a :meth:`PeriodIndex` with another type of index that contained a `Period` (:issue:`28323`) (:issue:`28337`)
 - Fix assignment of column via `.loc` with numpy non-ns datetime type (:issue:`27395`)
 - Bug in :meth:`Float64Index.astype` where ``np.inf`` was not handled properly when casting to an integer dtype (:issue:`28475`)
+- :meth:`Index.union` could fail when the left contained duplicates (:issue:`28257`)
+- :meth:`Index.get_indexer_non_unique` could fail with `TypeError` in some cases, such as when searching for ints in a string index (:issue:`28257`)
+-
 
 Missing
 ^^^^^^^

diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
@@ -286,7 +286,7 @@ cdef class IndexEngine:
         cdef:
             ndarray values, x
             ndarray[int64_t] result, missing
-            set stargets
+            set stargets, remaining_stargets
             dict d = {}
             object val
             int count = 0, count_missing = 0
@@ -309,12 +309,20 @@ cdef class IndexEngine:
         if stargets and len(stargets) < 5 and self.is_monotonic_increasing:
             # if there are few enough stargets and the index is monotonically
             # increasing, then use binary search for each starget
+            remaining_stargets = set()
             for starget in stargets:
-                start = values.searchsorted(starget, side='left')
-                end = values.searchsorted(starget, side='right')
-                if start != end:
-                    d[starget] = list(range(start, end))
-        else:
+                try:
+                    start = values.searchsorted(starget, side='left')
+                    end = values.searchsorted(starget, side='right')
+                except TypeError:  # e.g. if we tried to search for string in int array
+                    remaining_stargets.add(starget)
+                else:
+                    if start != end:
+                        d[starget] = list(range(start, end))
+
+            stargets = remaining_stargets
+
+        if stargets:
             # otherwise, map by iterating through all items in the index
             for i in range(n):
                 val = values[i]

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -2493,8 +2493,12 @@ def _union(self, other, sort):
                 value_set = set(lvals)
                 result.extend([x for x in rvals if x not in value_set])
         else:
-            indexer = self.get_indexer(other)
-            indexer, = (indexer == -1).nonzero()
+            # find indexes of things in "other" that are not in "self"
+            if self.is_unique:
+                indexer = self.get_indexer(other)
+                indexer = (indexer == -1).nonzero()[0]
+            else:
+                indexer = algos.unique1d(self.get_indexer_non_unique(other)[1])
 
             if len(indexer) > 0:
                 other_diff = algos.take_nd(rvals, indexer, allow_fill=False)

diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
@@ -552,10 +552,6 @@ def get_value(self, series: AnyArrayLike, key: Any):
         # we might be a positional indexer
         return super().get_value(series, key)
 
-    def _can_reindex(self, indexer):
-        """ always allow reindexing """
-        pass
-
     @Substitution(klass="CategoricalIndex")
     @Appender(_shared_docs["searchsorted"])
     def searchsorted(self, value, side="left", sorter=None):
@@ -585,7 +581,6 @@ def reindex(self, target, method=None, level=None, limit=None, tolerance=None):
             Indices of output values in original index
 
         """
-
         if method is not None:
             raise NotImplementedError(
                 "argument method is not implemented for CategoricalIndex.reindex"
@@ -605,9 +600,6 @@ def reindex(self, target, method=None, level=None, limit=None, tolerance=None):
             indexer = None
             missing = []
         else:
-            if not target.is_unique:
-                raise ValueError("cannot reindex with a non-unique indexer")
-
             indexer, missing = self.get_indexer_non_unique(np.array(target))
 
         if len(self.codes) and indexer is not None:

diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py
@@ -599,15 +599,19 @@ def test_reindex_dtype(self):
         tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp))
 
     def test_reindex_duplicate_target(self):
-        # See GH23963
-        c = CategoricalIndex(["a", "b", "c", "a"], categories=["a", "b", "c", "d"])
-        with pytest.raises(ValueError, match="non-unique indexer"):
-            c.reindex(["a", "a", "c"])
+        # See GH25459
+        cat = CategoricalIndex(["a", "b", "c"], categories=["a", "b", "c", "d"])
+        res, indexer = cat.reindex(["a", "c", "c"])
+        exp = Index(["a", "c", "c"], dtype="object")
+        tm.assert_index_equal(res, exp, exact=True)
+        tm.assert_numpy_array_equal(indexer, np.array([0, 2, 2], dtype=np.intp))
 
-        with pytest.raises(ValueError, match="non-unique indexer"):
-            c.reindex(
-                CategoricalIndex(["a", "a", "c"], categories=["a", "b", "c", "d"])
-            )
+        res, indexer = cat.reindex(
+            CategoricalIndex(["a", "c", "c"], categories=["a", "b", "c", "d"])
+        )
+        exp = CategoricalIndex(["a", "c", "c"], categories=["a", "b", "c", "d"])
+        tm.assert_index_equal(res, exp, exact=True)
+        tm.assert_numpy_array_equal(indexer, np.array([0, 2, 2], dtype=np.intp))
 
     def test_reindex_empty_index(self):
         # See GH16770

diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py
@@ -561,92 +561,89 @@ def test_read_only_source(self):
         assert_frame_equal(rw_df.loc[1:3], ro_df.loc[1:3])
 
     def test_reindexing(self):
+        df = DataFrame(
+            {
+                "A": np.arange(3, dtype="int64"),
+                "B": Series(list("abc")).astype(CDT(list("cabe"))),
+            }
+        ).set_index("B")
 
         # reindexing
         # convert to a regular index
-        result = self.df2.reindex(["a", "b", "e"])
-        expected = DataFrame(
-            {"A": [0, 1, 5, 2, 3, np.nan], "B": Series(list("aaabbe"))}
-        ).set_index("B")
+        result = df.reindex(["a", "b", "e"])
+        expected = DataFrame({"A": [0, 1, np.nan], "B": Series(list("abe"))}).set_index(
+            "B"
+        )
         assert_frame_equal(result, expected, check_index_type=True)
 
-        result = self.df2.reindex(["a", "b"])
-        expected = DataFrame(
-            {"A": [0, 1, 5, 2, 3], "B": Series(list("aaabb"))}
-        ).set_index("B")
+        result = df.reindex(["a", "b"])
+        expected = DataFrame({"A": [0, 1], "B": Series(list("ab"))}).set_index("B")
         assert_frame_equal(result, expected, check_index_type=True)
 
-        result = self.df2.reindex(["e"])
+        result = df.reindex(["e"])
         expected = DataFrame({"A": [np.nan], "B": Series(["e"])}).set_index("B")
         assert_frame_equal(result, expected, check_index_type=True)
 
-        result = self.df2.reindex(["d"])
+        result = df.reindex(["d"])
         expected = DataFrame({"A": [np.nan], "B": Series(["d"])}).set_index("B")
         assert_frame_equal(result, expected, check_index_type=True)
 
         # since we are actually reindexing with a Categorical
         # then return a Categorical
         cats = list("cabe")
 
-        result = self.df2.reindex(Categorical(["a", "d"], categories=cats))
+        result = df.reindex(Categorical(["a", "e"], categories=cats))
         expected = DataFrame(
-            {"A": [0, 1, 5, np.nan], "B": Series(list("aaad")).astype(CDT(cats))}
+            {"A": [0, np.nan], "B": Series(list("ae")).astype(CDT(cats))}
         ).set_index("B")
         assert_frame_equal(result, expected, check_index_type=True)
 
-        result = self.df2.reindex(Categorical(["a"], categories=cats))
+        result = df.reindex(Categorical(["a"], categories=cats))
         expected = DataFrame(
-            {"A": [0, 1, 5], "B": Series(list("aaa")).astype(CDT(cats))}
+            {"A": [0], "B": Series(list("a")).astype(CDT(cats))}
         ).set_index("B")
         assert_frame_equal(result, expected, check_index_type=True)
 
-        result = self.df2.reindex(["a", "b", "e"])
-        expected = DataFrame(
-            {"A": [0, 1, 5, 2, 3, np.nan], "B": Series(list("aaabbe"))}
-        ).set_index("B")
+        result = df.reindex(["a", "b", "e"])
+        expected = DataFrame({"A": [0, 1, np.nan], "B": Series(list("abe"))}).set_index(
+            "B"
+        )
         assert_frame_equal(result, expected, check_index_type=True)
 
-        result = self.df2.reindex(["a", "b"])
-        expected = DataFrame(
-            {"A": [0, 1, 5, 2, 3], "B": Series(list("aaabb"))}
-        ).set_index("B")
+        result = df.reindex(["a", "b"])
+        expected = DataFrame({"A": [0, 1], "B": Series(list("ab"))}).set_index("B")
         assert_frame_equal(result, expected, check_index_type=True)
 
-        result = self.df2.reindex(["e"])
+        result = df.reindex(["e"])
         expected = DataFrame({"A": [np.nan], "B": Series(["e"])}).set_index("B")
         assert_frame_equal(result, expected, check_index_type=True)
 
         # give back the type of categorical that we received
-        result = self.df2.reindex(
-            Categorical(["a", "d"], categories=cats, ordered=True)
-        )
+        result = df.reindex(Categorical(["a", "e"], categories=cats, ordered=True))
         expected = DataFrame(
-            {
-                "A": [0, 1, 5, np.nan],
-                "B": Series(list("aaad")).astype(CDT(cats, ordered=True)),
-            }
+            {"A": [0, np.nan], "B": Series(list("ae")).astype(CDT(cats, ordered=True))}
         ).set_index("B")
         assert_frame_equal(result, expected, check_index_type=True)
 
-        result = self.df2.reindex(Categorical(["a", "d"], categories=["a", "d"]))
+        result = df.reindex(Categorical(["a", "d"], categories=["a", "d"]))
         expected = DataFrame(
-            {"A": [0, 1, 5, np.nan], "B": Series(list("aaad")).astype(CDT(["a", "d"]))}
+            {"A": [0, np.nan], "B": Series(list("ad")).astype(CDT(["a", "d"]))}
         ).set_index("B")
         assert_frame_equal(result, expected, check_index_type=True)
 
         # reindexing from a source axis that contains duplicates is not allowed
-        msg = "cannot reindex with a non-unique indexer"
+        msg = "cannot reindex from a duplicate axis"
         with pytest.raises(ValueError, match=msg):
-            self.df2.reindex(["a", "a"])
+            self.df2.reindex(["a", "b"])
 
         # args NotImplemented ATM
         msg = r"argument {} is not implemented for CategoricalIndex\.reindex"
         with pytest.raises(NotImplementedError, match=msg.format("method")):
-            self.df2.reindex(["a"], method="ffill")
+            df.reindex(["a"], method="ffill")
         with pytest.raises(NotImplementedError, match=msg.format("level")):
-            self.df2.reindex(["a"], level=1)
+            df.reindex(["a"], level=1)
         with pytest.raises(NotImplementedError, match=msg.format("limit")):
-            self.df2.reindex(["a"], limit=2)
+            df.reindex(["a"], limit=2)
 
     def test_loc_slice(self):
         # slicing

diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py
@@ -7,7 +7,6 @@
 import pandas as pd
 from pandas import Categorical, DataFrame, Index, Series, bdate_range, date_range, isna
 from pandas.core import ops
-from pandas.core.indexes.base import InvalidIndexError
 import pandas.core.nanops as nanops
 import pandas.util.testing as tm
 from pandas.util.testing import (
@@ -282,44 +281,54 @@ def test_logical_ops_with_index(self, op):
         result = op(ser, idx2)
         assert_series_equal(result, expected)
 
+    def test_reversed_xor_with_index_returns_index(self):
+        # GH#22092, GH#19792
+        ser = Series([True, True, False, False])
+        idx1 = Index([True, False, True, False])
+        idx2 = Index([1, 0, 1, 0])
+
+        expected = Index.symmetric_difference(idx1, ser)
+        result = idx1 ^ ser
+        assert_index_equal(result, expected)
+
+        expected = Index.symmetric_difference(idx2, ser)
+        result = idx2 ^ ser
+        assert_index_equal(result, expected)
+
     @pytest.mark.parametrize(
         "op",
         [
             pytest.param(
                 ops.rand_,
                 marks=pytest.mark.xfail(
-                    reason="GH#22092 Index implementation returns Index",
+                    reason="GH#22092 Index __and__ returns Index intersection",
                     raises=AssertionError,
                     strict=True,
                 ),
             ),
             pytest.param(
                 ops.ror_,
                 marks=pytest.mark.xfail(
-                    reason="Index.get_indexer with non unique index",
-                    raises=InvalidIndexError,
+                    reason="GH#22092 Index __or__ returns Index union",
+                    raises=AssertionError,
                     strict=True,
                 ),
             ),
-            ops.rxor,
         ],
     )
-    def test_reversed_logical_ops_with_index(self, op):
+    def test_reversed_logical_op_with_index_returns_series(self, op):
         # GH#22092, GH#19792
         ser = Series([True, True, False, False])
         idx1 = Index([True, False, True, False])
         idx2 = Index([1, 0, 1, 0])
 
-        # symmetric_difference is only for rxor, but other 2 should fail
-        expected = idx1.symmetric_difference(ser)
-
+        expected = pd.Series(op(idx1.values, ser.values))
         result = op(ser, idx1)
-        assert_index_equal(result, expected)
-
-        expected = idx2.symmetric_difference(ser)
+        assert_series_equal(result, expected)
 
+        expected = pd.Series(op(idx2.values, ser.values))
         result = op(ser, idx2)
-        assert_index_equal(result, expected)
+        assert_series_equal(result, expected)
 
     @pytest.mark.parametrize(
         "op, expected",

diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py
@@ -1009,6 +1009,12 @@ def test_bool_indexing(self, indexer_klass, indexer):
             s = pd.Series(idx)
             tm.assert_series_equal(s[indexer_klass(indexer)], s.iloc[exp_idx])
 
+    def test_get_indexer_non_unique_dtype_mismatch(self):
+        # GH 25459
+        indexes, missing = pd.Index(["A", "B"]).get_indexer_non_unique(pd.Index([0]))
+        tm.assert_numpy_array_equal(np.array([-1], dtype=np.intp), indexes)
+        tm.assert_numpy_array_equal(np.array([0], dtype=np.int64), missing)
+
 
 class TestTranspose(Ops):
     errmsg = "the 'axes' parameter is not supported"