Skip to content

Commit

Permalink
BUG: CategoricalIndex allowed reindexing duplicate sources (pandas-de…
Browse files Browse the repository at this point in the history
  • Loading branch information
batterseapower authored and proost committed Dec 19, 2019
1 parent e336fe1 commit b7be0e9
Show file tree
Hide file tree
Showing 10 changed files with 126 additions and 86 deletions.
36 changes: 25 additions & 11 deletions doc/source/user_guide/advanced.rst
Original file line number Diff line number Diff line change
Expand Up @@ -783,27 +783,41 @@ values **not** in the categories, similarly to how you can reindex **any** panda

.. ipython:: python
df2.reindex(['a', 'e'])
df2.reindex(['a', 'e']).index
df2.reindex(pd.Categorical(['a', 'e'], categories=list('abcde')))
df2.reindex(pd.Categorical(['a', 'e'], categories=list('abcde'))).index
df3 = pd.DataFrame({'A': np.arange(3),
'B': pd.Series(list('abc')).astype('category')})
df3 = df3.set_index('B')
df3
.. ipython:: python
df3.reindex(['a', 'e'])
df3.reindex(['a', 'e']).index
df3.reindex(pd.Categorical(['a', 'e'], categories=list('abe')))
df3.reindex(pd.Categorical(['a', 'e'], categories=list('abe'))).index
.. warning::

Reshaping and Comparison operations on a ``CategoricalIndex`` must have the same categories
or a ``TypeError`` will be raised.

.. code-block:: ipython
.. ipython:: python
In [9]: df3 = pd.DataFrame({'A': np.arange(6), 'B': pd.Series(list('aabbca')).astype('category')})
df4 = pd.DataFrame({'A': np.arange(2),
'B': list('ba')})
df4['B'] = df4['B'].astype(CategoricalDtype(list('ab')))
df4 = df4.set_index('B')
df4.index
In [11]: df3 = df3.set_index('B')
df5 = pd.DataFrame({'A': np.arange(2),
'B': list('bc')})
df5['B'] = df5['B'].astype(CategoricalDtype(list('bc')))
df5 = df5.set_index('B')
df5.index
In [11]: df3.index
Out[11]: CategoricalIndex(['a', 'a', 'b', 'b', 'c', 'a'], categories=['a', 'b', 'c'], ordered=False, name='B', dtype='category')
.. code-block:: ipython
In [12]: pd.concat([df2, df3])
TypeError: categories must match existing categories when appending
In [1]: pd.concat([df4, df5])
TypeError: categories must match existing categories when appending
.. _indexing.rangeindex:

Expand Down
4 changes: 4 additions & 0 deletions doc/source/whatsnew/v1.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,7 @@ Categorical

- Added test to assert the :func:`fillna` raises the correct ValueError message when the value isn't a value from categories (:issue:`13628`)
- Bug in :meth:`Categorical.astype` where ``NaN`` values were handled incorrectly when casting to int (:issue:`28406`)
- :meth:`DataFrame.reindex` with a :class:`CategoricalIndex` would fail when the targets contained duplicates, and wouldn't fail if the source contained duplicates (:issue:`28107`)
- Bug in :meth:`Categorical.astype` not allowing for casting to extension dtypes (:issue:`28668`)
- Bug where :func:`merge` was unable to join on categorical and extension dtype columns (:issue:`28668`)
- :meth:`Categorical.searchsorted` and :meth:`CategoricalIndex.searchsorted` now work on unordered categoricals also (:issue:`21667`)
Expand Down Expand Up @@ -292,6 +293,9 @@ Indexing
- Bug in reindexing a :meth:`PeriodIndex` with another type of index that contained a `Period` (:issue:`28323`) (:issue:`28337`)
- Fix assignment of column via `.loc` with numpy non-ns datetime type (:issue:`27395`)
- Bug in :meth:`Float64Index.astype` where ``np.inf`` was not handled properly when casting to an integer dtype (:issue:`28475`)
- :meth:`Index.union` could fail when the left contained duplicates (:issue:`28257`)
- :meth:`Index.get_indexer_non_unique` could fail with `TypeError` in some cases, such as when searching for ints in a string index (:issue:`28257`)
-

Missing
^^^^^^^
Expand Down
20 changes: 14 additions & 6 deletions pandas/_libs/index.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,7 @@ cdef class IndexEngine:
cdef:
ndarray values, x
ndarray[int64_t] result, missing
set stargets
set stargets, remaining_stargets
dict d = {}
object val
int count = 0, count_missing = 0
Expand All @@ -309,12 +309,20 @@ cdef class IndexEngine:
if stargets and len(stargets) < 5 and self.is_monotonic_increasing:
# if there are few enough stargets and the index is monotonically
# increasing, then use binary search for each starget
remaining_stargets = set()
for starget in stargets:
start = values.searchsorted(starget, side='left')
end = values.searchsorted(starget, side='right')
if start != end:
d[starget] = list(range(start, end))
else:
try:
start = values.searchsorted(starget, side='left')
end = values.searchsorted(starget, side='right')
except TypeError: # e.g. if we tried to search for string in int array
remaining_stargets.add(starget)
else:
if start != end:
d[starget] = list(range(start, end))

stargets = remaining_stargets

if stargets:
# otherwise, map by iterating through all items in the index
for i in range(n):
val = values[i]
Expand Down
8 changes: 6 additions & 2 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2493,8 +2493,12 @@ def _union(self, other, sort):
value_set = set(lvals)
result.extend([x for x in rvals if x not in value_set])
else:
indexer = self.get_indexer(other)
indexer, = (indexer == -1).nonzero()
# find indexes of things in "other" that are not in "self"
if self.is_unique:
indexer = self.get_indexer(other)
indexer = (indexer == -1).nonzero()[0]
else:
indexer = algos.unique1d(self.get_indexer_non_unique(other)[1])

if len(indexer) > 0:
other_diff = algos.take_nd(rvals, indexer, allow_fill=False)
Expand Down
8 changes: 0 additions & 8 deletions pandas/core/indexes/category.py
Original file line number Diff line number Diff line change
Expand Up @@ -552,10 +552,6 @@ def get_value(self, series: AnyArrayLike, key: Any):
# we might be a positional inexer
return super().get_value(series, key)

def _can_reindex(self, indexer):
""" always allow reindexing """
pass

@Substitution(klass="CategoricalIndex")
@Appender(_shared_docs["searchsorted"])
def searchsorted(self, value, side="left", sorter=None):
Expand Down Expand Up @@ -585,7 +581,6 @@ def reindex(self, target, method=None, level=None, limit=None, tolerance=None):
Indices of output values in original index
"""

if method is not None:
raise NotImplementedError(
"argument method is not implemented for CategoricalIndex.reindex"
Expand All @@ -605,9 +600,6 @@ def reindex(self, target, method=None, level=None, limit=None, tolerance=None):
indexer = None
missing = []
else:
if not target.is_unique:
raise ValueError("cannot reindex with a non-unique indexer")

indexer, missing = self.get_indexer_non_unique(np.array(target))

if len(self.codes) and indexer is not None:
Expand Down
20 changes: 12 additions & 8 deletions pandas/tests/indexes/test_category.py
Original file line number Diff line number Diff line change
Expand Up @@ -599,15 +599,19 @@ def test_reindex_dtype(self):
tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp))

def test_reindex_duplicate_target(self):
# See GH23963
c = CategoricalIndex(["a", "b", "c", "a"], categories=["a", "b", "c", "d"])
with pytest.raises(ValueError, match="non-unique indexer"):
c.reindex(["a", "a", "c"])
# See GH25459
cat = CategoricalIndex(["a", "b", "c"], categories=["a", "b", "c", "d"])
res, indexer = cat.reindex(["a", "c", "c"])
exp = Index(["a", "c", "c"], dtype="object")
tm.assert_index_equal(res, exp, exact=True)
tm.assert_numpy_array_equal(indexer, np.array([0, 2, 2], dtype=np.intp))

with pytest.raises(ValueError, match="non-unique indexer"):
c.reindex(
CategoricalIndex(["a", "a", "c"], categories=["a", "b", "c", "d"])
)
res, indexer = cat.reindex(
CategoricalIndex(["a", "c", "c"], categories=["a", "b", "c", "d"])
)
exp = CategoricalIndex(["a", "c", "c"], categories=["a", "b", "c", "d"])
tm.assert_index_equal(res, exp, exact=True)
tm.assert_numpy_array_equal(indexer, np.array([0, 2, 2], dtype=np.intp))

def test_reindex_empty_index(self):
# See GH16770
Expand Down
71 changes: 34 additions & 37 deletions pandas/tests/indexing/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -561,92 +561,89 @@ def test_read_only_source(self):
assert_frame_equal(rw_df.loc[1:3], ro_df.loc[1:3])

def test_reindexing(self):
df = DataFrame(
{
"A": np.arange(3, dtype="int64"),
"B": Series(list("abc")).astype(CDT(list("cabe"))),
}
).set_index("B")

# reindexing
# convert to a regular index
result = self.df2.reindex(["a", "b", "e"])
expected = DataFrame(
{"A": [0, 1, 5, 2, 3, np.nan], "B": Series(list("aaabbe"))}
).set_index("B")
result = df.reindex(["a", "b", "e"])
expected = DataFrame({"A": [0, 1, np.nan], "B": Series(list("abe"))}).set_index(
"B"
)
assert_frame_equal(result, expected, check_index_type=True)

result = self.df2.reindex(["a", "b"])
expected = DataFrame(
{"A": [0, 1, 5, 2, 3], "B": Series(list("aaabb"))}
).set_index("B")
result = df.reindex(["a", "b"])
expected = DataFrame({"A": [0, 1], "B": Series(list("ab"))}).set_index("B")
assert_frame_equal(result, expected, check_index_type=True)

result = self.df2.reindex(["e"])
result = df.reindex(["e"])
expected = DataFrame({"A": [np.nan], "B": Series(["e"])}).set_index("B")
assert_frame_equal(result, expected, check_index_type=True)

result = self.df2.reindex(["d"])
result = df.reindex(["d"])
expected = DataFrame({"A": [np.nan], "B": Series(["d"])}).set_index("B")
assert_frame_equal(result, expected, check_index_type=True)

# since we are actually reindexing with a Categorical
# then return a Categorical
cats = list("cabe")

result = self.df2.reindex(Categorical(["a", "d"], categories=cats))
result = df.reindex(Categorical(["a", "e"], categories=cats))
expected = DataFrame(
{"A": [0, 1, 5, np.nan], "B": Series(list("aaad")).astype(CDT(cats))}
{"A": [0, np.nan], "B": Series(list("ae")).astype(CDT(cats))}
).set_index("B")
assert_frame_equal(result, expected, check_index_type=True)

result = self.df2.reindex(Categorical(["a"], categories=cats))
result = df.reindex(Categorical(["a"], categories=cats))
expected = DataFrame(
{"A": [0, 1, 5], "B": Series(list("aaa")).astype(CDT(cats))}
{"A": [0], "B": Series(list("a")).astype(CDT(cats))}
).set_index("B")
assert_frame_equal(result, expected, check_index_type=True)

result = self.df2.reindex(["a", "b", "e"])
expected = DataFrame(
{"A": [0, 1, 5, 2, 3, np.nan], "B": Series(list("aaabbe"))}
).set_index("B")
result = df.reindex(["a", "b", "e"])
expected = DataFrame({"A": [0, 1, np.nan], "B": Series(list("abe"))}).set_index(
"B"
)
assert_frame_equal(result, expected, check_index_type=True)

result = self.df2.reindex(["a", "b"])
expected = DataFrame(
{"A": [0, 1, 5, 2, 3], "B": Series(list("aaabb"))}
).set_index("B")
result = df.reindex(["a", "b"])
expected = DataFrame({"A": [0, 1], "B": Series(list("ab"))}).set_index("B")
assert_frame_equal(result, expected, check_index_type=True)

result = self.df2.reindex(["e"])
result = df.reindex(["e"])
expected = DataFrame({"A": [np.nan], "B": Series(["e"])}).set_index("B")
assert_frame_equal(result, expected, check_index_type=True)

# give back the type of categorical that we received
result = self.df2.reindex(
Categorical(["a", "d"], categories=cats, ordered=True)
)
result = df.reindex(Categorical(["a", "e"], categories=cats, ordered=True))
expected = DataFrame(
{
"A": [0, 1, 5, np.nan],
"B": Series(list("aaad")).astype(CDT(cats, ordered=True)),
}
{"A": [0, np.nan], "B": Series(list("ae")).astype(CDT(cats, ordered=True))}
).set_index("B")
assert_frame_equal(result, expected, check_index_type=True)

result = self.df2.reindex(Categorical(["a", "d"], categories=["a", "d"]))
result = df.reindex(Categorical(["a", "d"], categories=["a", "d"]))
expected = DataFrame(
{"A": [0, 1, 5, np.nan], "B": Series(list("aaad")).astype(CDT(["a", "d"]))}
{"A": [0, np.nan], "B": Series(list("ad")).astype(CDT(["a", "d"]))}
).set_index("B")
assert_frame_equal(result, expected, check_index_type=True)

# passed duplicate indexers are not allowed
msg = "cannot reindex with a non-unique indexer"
msg = "cannot reindex from a duplicate axis"
with pytest.raises(ValueError, match=msg):
self.df2.reindex(["a", "a"])
self.df2.reindex(["a", "b"])

# args NotImplemented ATM
msg = r"argument {} is not implemented for CategoricalIndex\.reindex"
with pytest.raises(NotImplementedError, match=msg.format("method")):
self.df2.reindex(["a"], method="ffill")
df.reindex(["a"], method="ffill")
with pytest.raises(NotImplementedError, match=msg.format("level")):
self.df2.reindex(["a"], level=1)
df.reindex(["a"], level=1)
with pytest.raises(NotImplementedError, match=msg.format("limit")):
self.df2.reindex(["a"], limit=2)
df.reindex(["a"], limit=2)

def test_loc_slice(self):
# slicing
Expand Down
35 changes: 22 additions & 13 deletions pandas/tests/series/test_operators.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import pandas as pd
from pandas import Categorical, DataFrame, Index, Series, bdate_range, date_range, isna
from pandas.core import ops
from pandas.core.indexes.base import InvalidIndexError
import pandas.core.nanops as nanops
import pandas.util.testing as tm
from pandas.util.testing import (
Expand Down Expand Up @@ -282,44 +281,54 @@ def test_logical_ops_with_index(self, op):
result = op(ser, idx2)
assert_series_equal(result, expected)

def test_reversed_xor_with_index_returns_index(self):
# GH#22092, GH#19792
ser = Series([True, True, False, False])
idx1 = Index([True, False, True, False])
idx2 = Index([1, 0, 1, 0])

expected = Index.symmetric_difference(idx1, ser)
result = idx1 ^ ser
assert_index_equal(result, expected)

expected = Index.symmetric_difference(idx2, ser)
result = idx2 ^ ser
assert_index_equal(result, expected)

@pytest.mark.parametrize(
"op",
[
pytest.param(
ops.rand_,
marks=pytest.mark.xfail(
reason="GH#22092 Index implementation returns Index",
reason="GH#22092 Index __and__ returns Index intersection",
raises=AssertionError,
strict=True,
),
),
pytest.param(
ops.ror_,
marks=pytest.mark.xfail(
reason="Index.get_indexer with non unique index",
raises=InvalidIndexError,
reason="GH#22092 Index __or__ returns Index union",
raises=AssertionError,
strict=True,
),
),
ops.rxor,
],
)
def test_reversed_logical_ops_with_index(self, op):
def test_reversed_logical_op_with_index_returns_series(self, op):
# GH#22092, GH#19792
ser = Series([True, True, False, False])
idx1 = Index([True, False, True, False])
idx2 = Index([1, 0, 1, 0])

# symmetric_difference is only for rxor, but other 2 should fail
expected = idx1.symmetric_difference(ser)

expected = pd.Series(op(idx1.values, ser.values))
result = op(ser, idx1)
assert_index_equal(result, expected)

expected = idx2.symmetric_difference(ser)
assert_series_equal(result, expected)

expected = pd.Series(op(idx2.values, ser.values))
result = op(ser, idx2)
assert_index_equal(result, expected)
assert_series_equal(result, expected)

@pytest.mark.parametrize(
"op, expected",
Expand Down
6 changes: 6 additions & 0 deletions pandas/tests/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1009,6 +1009,12 @@ def test_bool_indexing(self, indexer_klass, indexer):
s = pd.Series(idx)
tm.assert_series_equal(s[indexer_klass(indexer)], s.iloc[exp_idx])

def test_get_indexer_non_unique_dtype_mismatch(self):
# GH 25459
indexes, missing = pd.Index(["A", "B"]).get_indexer_non_unique(pd.Index([0]))
tm.assert_numpy_array_equal(np.array([-1], dtype=np.intp), indexes)
tm.assert_numpy_array_equal(np.array([0], dtype=np.int64), missing)


class TestTranspose(Ops):
errmsg = "the 'axes' parameter is not supported"
Expand Down
Loading

0 comments on commit b7be0e9

Please sign in to comment.