Skip to content

Commit

Permalink
API: This fixes a number of inconsistencies and API issues
Browse files Browse the repository at this point in the history
w.r.t. dtype conversions.

This is a reprise of pandas-dev#14145 & pandas-dev#16408.

This removes some code from the core structures & pushes it to internals,
where the primitives are made more consistent.

This should allow us to be a bit more consistent for pandas2 type things.

closes pandas-dev#16402
supersedes pandas-dev#14145
closes pandas-dev#14001
  • Loading branch information
jreback committed Jul 11, 2017
1 parent 37c1ec8 commit cbd40cf
Show file tree
Hide file tree
Showing 16 changed files with 428 additions and 386 deletions.
12 changes: 12 additions & 0 deletions doc/source/whatsnew/v0.21.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,18 @@ the target. Now, a ``ValueError`` will be raised when such an input is passed in
- Compression defaults in HDF stores now follow pytable standards. Default is no compression and if ``complib`` is missing and ``complevel`` > 0 ``zlib`` is used (:issue:`15943`)
- ``Index.get_indexer_non_unique()`` now returns a ndarray indexer rather than an ``Index``; this is consistent with ``Index.get_indexer()`` (:issue:`16819`)

.. _whatsnew_0210.dtype_conversions:

Dtype Conversions
^^^^^^^^^^^^^^^^^

Example about setitem / where with bools.



- Inconsistent behavior in ``.where()`` with datetimelikes which would raise rather than coerce to ``object`` (:issue:`16402`)
- Bug in assignment against datetime-like data where an ``int`` may be incorrectly converted to datetime-like (:issue:`14145`)
- Bug in assignment against ``int64`` data with ``np.ndarray`` with ``float64`` dtype may keep ``int64`` dtype (:issue:`14001`)

.. _whatsnew_0210.api:

Expand Down
26 changes: 20 additions & 6 deletions pandas/_libs/index.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ cimport tslib
from hashtable cimport *
from pandas._libs import tslib, algos, hashtable as _hash
from pandas._libs.tslib import Timestamp, Timedelta
from datetime import datetime, timedelta

from datetime cimport (get_datetime64_value, _pydatetime_to_dts,
pandas_datetimestruct)
Expand Down Expand Up @@ -507,24 +508,37 @@ cdef class TimedeltaEngine(DatetimeEngine):
return 'm8[ns]'

cpdef convert_scalar(ndarray arr, object value):
# we don't turn integers
# into datetimes/timedeltas

# we don't turn bools into int/float/complex

if arr.descr.type_num == NPY_DATETIME:
if isinstance(value, np.ndarray):
pass
elif isinstance(value, Timestamp):
return value.value
elif isinstance(value, datetime):
return Timestamp(value).value
elif value is None or value != value:
return iNaT
else:
elif util.is_string_object(value):
return Timestamp(value).value
raise ValueError("cannot set a Timestamp with a non-timestamp")

elif arr.descr.type_num == NPY_TIMEDELTA:
if isinstance(value, np.ndarray):
pass
elif isinstance(value, Timedelta):
return value.value
elif isinstance(value, timedelta):
return Timedelta(value).value
elif value is None or value != value:
return iNaT
else:
elif util.is_string_object(value):
return Timedelta(value).value
raise ValueError("cannot set a Timedelta with a non-timedelta")

if (issubclass(arr.dtype.type, (np.integer, np.floating, np.complex)) and not
issubclass(arr.dtype.type, np.bool_)):
if util.is_bool_object(value):
raise ValueError('Cannot assign bool to float/integer series')

if issubclass(arr.dtype.type, (np.integer, np.bool_)):
if util.is_float_object(value) and value != value:
Expand Down
6 changes: 6 additions & 0 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,12 @@ def _reconstruct_data(values, dtype, original):
pass
elif is_datetime64tz_dtype(dtype) or is_period_dtype(dtype):
values = Index(original)._shallow_copy(values, name=None)
elif is_bool_dtype(dtype):
values = values.astype(dtype)

# we only support object dtypes bool Index
if isinstance(original, Index):
values = values.astype(object)
elif dtype is not None:
values = values.astype(dtype)

Expand Down
60 changes: 51 additions & 9 deletions pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,7 @@ def maybe_promote(dtype, fill_value=np.nan):
else:
if issubclass(dtype.type, np.datetime64):
try:
fill_value = Timestamp(fill_value).value
fill_value = tslib.Timestamp(fill_value).value
except:
# the proper thing to do here would probably be to upcast
# to object (but numpy 1.6.1 doesn't do this properly)
Expand Down Expand Up @@ -333,6 +333,23 @@ def maybe_promote(dtype, fill_value=np.nan):
return dtype, fill_value


def infer_dtype_from(val, pandas_dtype=False):
    """
    Infer the dtype of a scalar or an array.

    Convenience dispatcher: a scalar (as decided by ``is_scalar``) is
    handed to ``infer_dtype_from_scalar``, anything else is handed to
    ``infer_dtype_from_array``.

    Parameters
    ----------
    val : scalar or array
        the object whose dtype is inferred
    pandas_dtype : bool, default False
        whether to infer dtype including pandas extension types.
        If False, a scalar/array belonging to pandas extension types is
        inferred as object

    Returns
    -------
    the result of the selected helper, passed through unchanged
    """
    # pick the helper first, then make a single call with the shared kwargs
    if is_scalar(val):
        infer = infer_dtype_from_scalar
    else:
        infer = infer_dtype_from_array
    return infer(val, pandas_dtype=pandas_dtype)


def infer_dtype_from_scalar(val, pandas_dtype=False):
"""
interpret the dtype from a scalar
Expand Down Expand Up @@ -408,23 +425,29 @@ def infer_dtype_from_scalar(val, pandas_dtype=False):
return dtype, val


def infer_dtype_from_array(arr):
def infer_dtype_from_array(arr, pandas_dtype=False):
"""
infer the dtype from a scalar or array
Parameters
----------
arr : scalar or array
pandas_dtype : bool, default False
whether to infer dtype including pandas extension types.
If False, array belongs to pandas extension types
is inferred as object
Returns
-------
tuple (numpy-compat dtype, array)
tuple (numpy-compat/pandas-compat dtype, array)
Notes
-----
These infer to numpy dtypes exactly
with the exception that mixed / object dtypes
are not coerced by stringifying or conversion
if pandas_dtype=False. these infer to numpy dtypes
exactly with the exception that mixed / object dtypes
if pandas_dtype=True. datetime64tz-aware/categorical
types will retain their character.
Examples
--------
Expand All @@ -442,6 +465,13 @@ def infer_dtype_from_array(arr):
if not is_list_like(arr):
arr = [arr]

if pandas_dtype and (is_categorical_dtype(arr) or
is_datetime64tz_dtype(arr)):
return arr.dtype, arr

elif isinstance(arr, ABCSeries):
return arr.dtype, np.asarray(arr)

# don't force numpy coerce with nan's
inferred = lib.infer_dtype(arr)
if inferred in ['string', 'bytes', 'unicode',
Expand Down Expand Up @@ -552,7 +582,7 @@ def conv(r, dtype):
if isnull(r):
pass
elif dtype == _NS_DTYPE:
r = Timestamp(r)
r = tslib.Timestamp(r)
elif dtype == _TD_DTYPE:
r = _coerce_scalar_to_timedelta_type(r)
elif dtype == np.bool_:
Expand Down Expand Up @@ -1028,13 +1058,25 @@ def find_common_type(types):
return np.find_common_type(types, [])


def _cast_scalar_to_array(shape, value, dtype=None):
def cast_scalar_to_array(shape, value, dtype=None):
"""
create np.ndarray of specified shape and dtype, filled with values
Parameters
----------
shape : tuple
value : scalar value
dtype : np.dtype, optional
dtype to coerce
Returns
-------
ndarray of shape, filled with value, of specified / inferred dtype
"""

if dtype is None:
dtype, fill_value = _infer_dtype_from_scalar(value)
dtype, fill_value = infer_dtype_from_scalar(value)
else:
fill_value = value

Expand Down
13 changes: 12 additions & 1 deletion pandas/core/dtypes/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@
ExtensionDtype)
from .generic import (ABCCategorical, ABCPeriodIndex,
ABCDatetimeIndex, ABCSeries,
ABCSparseArray, ABCSparseSeries)
ABCSparseArray, ABCSparseSeries,
ABCIndexClass)
from .inference import is_string_like
from .inference import * # noqa

Expand Down Expand Up @@ -1543,6 +1544,16 @@ def is_bool_dtype(arr_or_dtype):
except ValueError:
# this isn't even a dtype
return False

if isinstance(arr_or_dtype, ABCIndexClass):

# TODO(jreback)
# we don't have a boolean Index class
# so its object, we need to infer to
# guess this
return (arr_or_dtype.is_object and
arr_or_dtype.inferred_type == 'boolean')

return issubclass(tipo, np.bool_)


Expand Down
11 changes: 6 additions & 5 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@
import numpy.ma as ma

from pandas.core.dtypes.cast import (
maybe_upcast, infer_dtype_from_scalar,
maybe_upcast,
cast_scalar_to_array,
maybe_cast_to_datetime,
maybe_infer_to_datetimelike,
maybe_convert_platform,
Expand Down Expand Up @@ -386,8 +387,8 @@ def __init__(self, data=None, index=None, columns=None, dtype=None,
raise_with_traceback(exc)

if arr.ndim == 0 and index is not None and columns is not None:
values = _cast_scalar_to_array((len(index), len(columns)),
data, dtype=dtype)
values = cast_scalar_to_array((len(index), len(columns)),
data, dtype=dtype)
mgr = self._init_ndarray(values, index, columns,
dtype=values.dtype, copy=False)
else:
Expand Down Expand Up @@ -2684,8 +2685,8 @@ def reindexer(value):

else:
# upcast the scalar
value = _cast_scalar_to_array(len(self.index), value)
value = _possibly_cast_to_datetime(value, value.dtype)
value = cast_scalar_to_array(len(self.index), value)
value = maybe_cast_to_datetime(value, value.dtype)

# return internal types directly
if is_extension_type(value):
Expand Down
46 changes: 2 additions & 44 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
from pandas.core.dtypes.common import (
_ensure_int64,
_ensure_object,
needs_i8_conversion,
is_scalar,
is_number,
is_integer, is_bool,
Expand All @@ -26,7 +25,8 @@
is_dict_like,
is_re_compilable,
pandas_dtype)
from pandas.core.dtypes.cast import maybe_promote, maybe_upcast_putmask
from pandas.core.dtypes.cast import (
maybe_promote, maybe_upcast_putmask)
from pandas.core.dtypes.missing import isnull, notnull
from pandas.core.dtypes.generic import ABCSeries, ABCPanel

Expand Down Expand Up @@ -5335,48 +5335,6 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None,
raise NotImplementedError("cannot align with a higher "
"dimensional NDFrame")

elif is_list_like(other):

if self.ndim == 1:

# try to set the same dtype as ourselves
try:
new_other = np.array(other, dtype=self.dtype)
except ValueError:
new_other = np.array(other)
except TypeError:
new_other = other

# we can end up comparing integers and m8[ns]
# which is a numpy no no
is_i8 = needs_i8_conversion(self.dtype)
if is_i8:
matches = False
else:
matches = (new_other == np.array(other))

if matches is False or not matches.all():

# coerce other to a common dtype if we can
if needs_i8_conversion(self.dtype):
try:
other = np.array(other, dtype=self.dtype)
except:
other = np.array(other)
else:
other = np.asarray(other)
other = np.asarray(other,
dtype=np.common_type(other,
new_other))

# we need to use the new dtype
try_quick = False
else:
other = new_other
else:

other = np.array(other)

if isinstance(other, np.ndarray):

if other.shape != self.shape:
Expand Down
12 changes: 11 additions & 1 deletion pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
is_object_dtype,
is_categorical_dtype,
is_interval_dtype,
is_bool,
is_bool_dtype,
is_signed_integer_dtype,
is_unsigned_integer_dtype,
Expand Down Expand Up @@ -610,9 +611,18 @@ def repeat(self, repeats, *args, **kwargs):
def where(self, cond, other=None):
if other is None:
other = self._na_value
values = np.where(cond, self.values, other)

dtype = self.dtype
values = self.values

if is_bool(other) or is_bool_dtype(other):

# bools force casting
values = values.astype(object)
dtype = None

values = np.where(cond, values, other)

if self._is_numeric_dtype and np.any(isnull(values)):
# We can't coerce to the numeric dtype of "self" (unless
# it's float) if there are NaN values in our output.
Expand Down
21 changes: 18 additions & 3 deletions pandas/core/indexes/numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,14 @@
from pandas._libs import (index as libindex,
algos as libalgos, join as libjoin)
from pandas.core.dtypes.common import (
is_dtype_equal, pandas_dtype,
is_float_dtype, is_object_dtype,
is_integer_dtype, is_scalar)
is_dtype_equal,
pandas_dtype,
is_float_dtype,
is_object_dtype,
is_integer_dtype,
is_bool,
is_bool_dtype,
is_scalar)
from pandas.core.common import _asarray_tuplesafe, _values_from_object

from pandas import compat
Expand Down Expand Up @@ -56,6 +61,16 @@ def _maybe_cast_slice_bound(self, label, side, kind):
# we will try to coerce to integers
return self._maybe_cast_indexer(label)

def _convert_for_op(self, value):
""" Convert value to be insertable to ndarray """

if is_bool(value) or is_bool_dtype(value):
# force conversion to object
# so we don't lose the bools
raise TypeError

return value

def _convert_tolerance(self, tolerance):
try:
return float(tolerance)
Expand Down
Loading

0 comments on commit cbd40cf

Please sign in to comment.