Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: implement _should_compare/_is_comparable_dtype for all Index subclasses #38251

Merged
merged 2 commits into from
Dec 4, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 23 additions & 8 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4904,16 +4904,31 @@ def get_indexer_non_unique(self, target):
# Treat boolean labels passed to a numeric index as not found. Without
# this fix False and True would be treated as 0 and 1 respectively.
# (GH #16877)
no_matches = -1 * np.ones(self.shape, dtype=np.intp)
return no_matches, no_matches
return self._get_indexer_non_comparable(target, method=None, unique=False)

pself, ptarget = self._maybe_promote(target)
if pself is not self or ptarget is not target:
return pself.get_indexer_non_unique(ptarget)

if not self._is_comparable_dtype(target.dtype):
no_matches = -1 * np.ones(self.shape, dtype=np.intp)
return no_matches, no_matches
if not self._should_compare(target):
return self._get_indexer_non_comparable(target, method=None, unique=False)

if not is_dtype_equal(self.dtype, target.dtype):
# TODO: if object, could use infer_dtype to pre-empt costly
# conversion if still non-comparable?
dtype = find_common_type([self.dtype, target.dtype])
if (
dtype.kind in ["i", "u"]
and is_categorical_dtype(target.dtype)
and target.hasnans
):
# FIXME: find_common_type incorrect with Categorical GH#38240
# FIXME: some cases where float64 cast can be lossy?
dtype = np.dtype(np.float64)

this = self.astype(dtype, copy=False)
that = target.astype(dtype, copy=False)
return this.get_indexer_non_unique(that)

if is_categorical_dtype(target.dtype):
tgt_values = np.asarray(target)
Expand Down Expand Up @@ -4966,7 +4981,7 @@ def _get_indexer_non_comparable(self, target: "Index", method, unique: bool = Tr
If doing an inequality check, i.e. method is not None.
"""
if method is not None:
other = _unpack_nested_dtype(target)
other = unpack_nested_dtype(target)
raise TypeError(f"Cannot compare dtypes {self.dtype} and {other.dtype}")

no_matches = -1 * np.ones(target.shape, dtype=np.intp)
Expand Down Expand Up @@ -5017,7 +5032,7 @@ def _should_compare(self, other: "Index") -> bool:
"""
Check if `self == other` can ever have non-False entries.
"""
other = _unpack_nested_dtype(other)
other = unpack_nested_dtype(other)
dtype = other.dtype
return self._is_comparable_dtype(dtype) or is_object_dtype(dtype)

Expand Down Expand Up @@ -6170,7 +6185,7 @@ def get_unanimous_names(*indexes: Index) -> Tuple[Label, ...]:
return names


def _unpack_nested_dtype(other: Index) -> Index:
def unpack_nested_dtype(other: Index) -> Index:
"""
When checking if our dtype is comparable with another, we need
to unpack CategoricalDtype to look at its categories.dtype.
Expand Down
3 changes: 3 additions & 0 deletions pandas/core/indexes/category.py
Original file line number Diff line number Diff line change
Expand Up @@ -554,6 +554,9 @@ def _maybe_cast_slice_bound(self, label, side: str, kind):

# --------------------------------------------------------------------

# Comparability is delegated to the categories: a dtype is considered
# comparable with this index exactly when self.categories reports it as
# comparable.
def _is_comparable_dtype(self, dtype):
return self.categories._is_comparable_dtype(dtype)

def take_nd(self, *args, **kwargs):
"""Alias for `take`"""
warnings.warn(
Expand Down
20 changes: 17 additions & 3 deletions pandas/core/indexes/interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from pandas._libs import lib
from pandas._libs.interval import Interval, IntervalMixin, IntervalTree
from pandas._libs.tslibs import BaseOffset, Timedelta, Timestamp, to_offset
from pandas._typing import AnyArrayLike, Label
from pandas._typing import AnyArrayLike, DtypeObj, Label
from pandas.errors import InvalidIndexError
from pandas.util._decorators import Appender, Substitution, cache_readonly
from pandas.util._exceptions import rewrite_exception
Expand All @@ -38,6 +38,7 @@
is_object_dtype,
is_scalar,
)
from pandas.core.dtypes.dtypes import IntervalDtype

from pandas.core.algorithms import take_1d
from pandas.core.arrays.interval import IntervalArray, _interval_shared_docs
Expand All @@ -50,6 +51,7 @@
default_pprint,
ensure_index,
maybe_extract_name,
unpack_nested_dtype,
)
from pandas.core.indexes.datetimes import DatetimeIndex, date_range
from pandas.core.indexes.extension import ExtensionIndex, inherit_names
Expand Down Expand Up @@ -807,15 +809,27 @@ def _convert_list_indexer(self, keyarr):

return locs

# Return whether `dtype` can be meaningfully compared with this index's dtype.
# Anything that is not an IntervalDtype is never comparable. For two interval
# dtypes, the subtypes are comparable unless their common type is object.
def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:
if not isinstance(dtype, IntervalDtype):
return False
common_subtype = find_common_type([self.dtype.subtype, dtype.subtype])
return not is_object_dtype(common_subtype)

# Extend the base-class check with an interval-specific requirement: even when
# the parent implementation says the comparison is allowed, `other` must have
# the same `closed` value as self.
def _should_compare(self, other) -> bool:
if not super()._should_compare(other):
return False
# Unpack a nested (categorical) dtype before reading `closed` from `other`.
other = unpack_nested_dtype(other)
return other.closed == self.closed

# TODO: use should_compare and get rid of _is_non_comparable_own_type
def _is_non_comparable_own_type(self, other: "IntervalIndex") -> bool:
# different closed or incompatible subtype -> no matches

# TODO: once closed is part of IntervalDtype, we can just define
# is_comparable_dtype GH#19371
if self.closed != other.closed:
return True
common_subtype = find_common_type([self.dtype.subtype, other.dtype.subtype])
return is_object_dtype(common_subtype)
return not self._is_comparable_dtype(other.dtype)

# --------------------------------------------------------------------

Expand Down
15 changes: 6 additions & 9 deletions pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

from pandas._libs import algos as libalgos, index as libindex, lib
from pandas._libs.hashtable import duplicated_int64
from pandas._typing import AnyArrayLike, Label, Scalar, Shape
from pandas._typing import AnyArrayLike, DtypeObj, Label, Scalar, Shape
from pandas.compat.numpy import function as nv
from pandas.errors import InvalidIndexError, PerformanceWarning, UnsortedIndexError
from pandas.util._decorators import Appender, cache_readonly, doc
Expand Down Expand Up @@ -3582,6 +3582,9 @@ def union(self, other, sort=None):
zip(*uniq_tuples), sortorder=0, names=result_names
)

# Only object dtype is treated as comparable here; every other dtype returns
# False.
def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:
return is_object_dtype(dtype)

def intersection(self, other, sort=False):
"""
Form the intersection of two MultiIndex objects.
Expand Down Expand Up @@ -3617,15 +3620,9 @@ def intersection(self, other, sort=False):
def _intersection(self, other, sort=False):
other, result_names = self._convert_can_do_setop(other)

if not is_object_dtype(other.dtype):
if not self._is_comparable_dtype(other.dtype):
# The intersection is empty
# TODO: we have no tests that get here
return MultiIndex(
levels=self.levels,
codes=[[]] * self.nlevels,
names=result_names,
verify_integrity=False,
)
return self[:0].rename(result_names)

lvals = self._values
rvals = other._values
Expand Down
6 changes: 5 additions & 1 deletion pandas/core/indexes/numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import numpy as np

from pandas._libs import index as libindex, lib
from pandas._typing import Dtype, Label
from pandas._typing import Dtype, DtypeObj, Label
from pandas.util._decorators import doc

from pandas.core.dtypes.cast import astype_nansafe
Expand Down Expand Up @@ -148,6 +148,10 @@ def _convert_tolerance(self, tolerance, target):
)
return tolerance

# Any dtype for which is_numeric_dtype is True is treated as comparable.
def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:
# If we ever have BoolIndex or ComplexIndex, this may need to be tightened
return is_numeric_dtype(dtype)

@classmethod
def _assert_safe_casting(cls, data, subarr):
"""
Expand Down
8 changes: 4 additions & 4 deletions pandas/tests/indexes/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1249,10 +1249,9 @@ def test_get_indexer_numeric_index_boolean_target(self, method, idx_class):
if method == "get_indexer":
tm.assert_numpy_array_equal(result, expected)
else:
expected = np.array([-1, -1, -1, -1], dtype=np.intp)

missing = np.arange(3, dtype=np.intp)
tm.assert_numpy_array_equal(result[0], expected)
tm.assert_numpy_array_equal(result[1], expected)
tm.assert_numpy_array_equal(result[1], missing)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the reason for changing the behaviour here?
Did you consider this a bug?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. As mentioned in the OP, as far as I can tell it isn't actually affecting anything.


def test_get_indexer_with_NA_values(
self, unique_nulls_fixture, unique_nulls_fixture2
Expand Down Expand Up @@ -2346,5 +2345,6 @@ def construct(dtype):

else:
no_matches = np.array([-1] * 6, dtype=np.intp)
missing = np.arange(6, dtype=np.intp)
tm.assert_numpy_array_equal(result[0], no_matches)
tm.assert_numpy_array_equal(result[1], no_matches)
tm.assert_numpy_array_equal(result[1], missing)