API: This fixes a number of inconsistencies and API issues

w.r.t. dtype conversions. This is a reprise of pandas-dev#14145 & pandas-dev#16408. This removes some code from the core structures & pushes it to internals, where the primitives are made more consistent. This should allow us to be a bit more consistent for pandas2 type things. closes pandas-dev#16402 supersedes pandas-dev#14145 closes pandas-dev#14001 CLN: remove unneeded code in internals; use split_and_operate when possible
jreback · Jul 20, 2017 · 676b773 · 676b773
1 parent f19966e
commit 676b773
Show file tree

Hide file tree

Showing 23 changed files with 841 additions and 609 deletions.
diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
@@ -127,6 +127,65 @@ the target. Now, a ``ValueError`` will be raised when such an input is passed in
    ...
    ValueError: Cannot operate inplace if there is no assignment
 
+.. _whatsnew_0210.dtype_conversions:
+
+Dtype Conversions
+^^^^^^^^^^^^^^^^^
+
+- Previously assignments, ``.where()`` and ``.fillna()`` with a ``bool`` assignment, would coerce to the
+  same type (e.g. int / float), or raise for datetimelikes. These will now preserve the bools with ``object`` dtypes (:issue:`16821`).
+
+   .. ipython:: python
+
+      s = Series([1, 2, 3])
+
+   .. code-block:: python
+
+      In [5]: s[1] = True
+
+      In [6]: s
+      Out[6]:
+      0    1
+      1    1
+      2    3
+      dtype: int64
+
+   New Behavior
+
+   .. ipython:: python
+
+      s[1] = True
+      s
+
+- Previously, an assignment to a datetimelike with a non-datetimelike would coerce the
+  non-datetime-like item being assigned (:issue:`14145`).
+
+   .. ipython:: python
+
+      s = pd.Series([pd.Timestamp('2011-01-01'), pd.Timestamp('2012-01-01')])
+
+   .. code-block:: python
+
+      In [1]: s[1] = 1
+
+      In [2]: s
+      Out[2]:
+      0   2011-01-01 00:00:00.000000000
+      1   1970-01-01 00:00:00.000000001
+      dtype: datetime64[ns]
+
+   These now coerce to ``object`` dtype.
+
+   .. ipython:: python
+
+      s[1] = 1
+      s
+
+- Additional bug fixes w.r.t. dtype conversions.
+
+  - Inconsistent behavior in ``.where()`` with datetimelikes which would raise rather than coerce to ``object`` (:issue:`16402`)
+  - Bug in assignment against ``int64`` data with ``np.ndarray`` with ``float64`` dtype may keep ``int64`` dtype (:issue:`14001`)
+
 .. _whatsnew_0210.api:
 
 Other API Changes
@@ -142,13 +201,6 @@ Other API Changes
 - Compression defaults in HDF stores now follow pytable standards. Default is no compression and if ``complib`` is missing and ``complevel`` > 0 ``zlib`` is used (:issue:`15943`)
 - ``Index.get_indexer_non_unique()`` now returns a ndarray indexer rather than an ``Index``; this is consistent with ``Index.get_indexer()`` (:issue:`16819`)
 - Removed the ``@slow`` decorator from ``pandas.util.testing``, which caused issues for some downstream packages' test suites. Use ``@pytest.mark.slow`` instead, which achieves the same thing (:issue:`16850`)
-
-
-.. _whatsnew_0210.api:
-
-Other API Changes
-^^^^^^^^^^^^^^^^^
-
 - Moved definition of ``MergeError`` to the ``pandas.errors`` module.
 
 
@@ -192,7 +244,7 @@ Bug Fixes
 Conversion
 ^^^^^^^^^^
 
-- Bug in assignment against datetime-like data with ``int`` may incorrectly converted to datetime-like (:issue:`14145`)
+- Bug in assignment against datetime-like data with ``int`` may incorrectly convert to datetime-like (:issue:`14145`)
 - Bug in assignment against ``int64`` data with ``np.ndarray`` with ``float64`` dtype may keep ``int64`` dtype (:issue:`14001`)
 
 

diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
@@ -19,6 +19,7 @@ cimport tslib
 from hashtable cimport *
 from pandas._libs import tslib, algos, hashtable as _hash
 from pandas._libs.tslib import Timestamp, Timedelta
+from datetime import datetime, timedelta
 
 from datetime cimport (get_datetime64_value, _pydatetime_to_dts,
                        pandas_datetimestruct)
@@ -507,24 +508,37 @@ cdef class TimedeltaEngine(DatetimeEngine):
         return 'm8[ns]'
 
 cpdef convert_scalar(ndarray arr, object value):
+    # we don't turn integers
+    # into datetimes/timedeltas
+
+    # we don't turn bools into int/float/complex
+
     if arr.descr.type_num == NPY_DATETIME:
         if isinstance(value, np.ndarray):
             pass
-        elif isinstance(value, Timestamp):
-            return value.value
+        elif isinstance(value, datetime):
+            return Timestamp(value).value
         elif value is None or value != value:
             return iNaT
-        else:
+        elif util.is_string_object(value):
             return Timestamp(value).value
+        raise ValueError("cannot set a Timestamp with a non-timestamp")
+
     elif arr.descr.type_num == NPY_TIMEDELTA:
         if isinstance(value, np.ndarray):
             pass
-        elif isinstance(value, Timedelta):
-            return value.value
+        elif isinstance(value, timedelta):
+            return Timedelta(value).value
         elif value is None or value != value:
             return iNaT
-        else:
+        elif util.is_string_object(value):
             return Timedelta(value).value
+        raise ValueError("cannot set a Timedelta with a non-timedelta")
+
+    if (issubclass(arr.dtype.type, (np.integer, np.floating, np.complex)) and
+            not issubclass(arr.dtype.type, np.bool_)):
+        if util.is_bool_object(value):
+            raise ValueError('Cannot assign bool to float/integer series')
 
     if issubclass(arr.dtype.type, (np.integer, np.bool_)):
         if util.is_float_object(value) and value != value:

diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx
@@ -14,6 +14,7 @@ cdef bint PY3 = (sys.version_info[0] >= 3)
 from cpython cimport (
     PyTypeObject,
     PyFloat_Check,
+    PyComplex_Check,
     PyLong_Check,
     PyObject_RichCompareBool,
     PyObject_RichCompare,
@@ -902,7 +903,7 @@ cdef inline bint _checknull_with_nat(object val):
 cdef inline bint _check_all_nulls(object val):
     """ utility to check if a value is any type of null """
     cdef bint res
-    if PyFloat_Check(val):
+    if PyFloat_Check(val) or PyComplex_Check(val):
         res = val != val
     elif val is NaT:
         res = 1

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -151,6 +151,12 @@ def _reconstruct_data(values, dtype, original):
         pass
     elif is_datetime64tz_dtype(dtype) or is_period_dtype(dtype):
         values = Index(original)._shallow_copy(values, name=None)
+    elif is_bool_dtype(dtype):
+        values = values.astype(dtype)
+
+        # we only support object dtypes bool Index
+        if isinstance(original, Index):
+            values = values.astype(object)
     elif dtype is not None:
         values = values.astype(dtype)
 

diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
@@ -273,7 +273,7 @@ def maybe_promote(dtype, fill_value=np.nan):
         else:
             if issubclass(dtype.type, np.datetime64):
                 try:
-                    fill_value = Timestamp(fill_value).value
+                    fill_value = tslib.Timestamp(fill_value).value
                 except:
                     # the proper thing to do here would probably be to upcast
                     # to object (but numpy 1.6.1 doesn't do this properly)
@@ -334,6 +334,23 @@ def maybe_promote(dtype, fill_value=np.nan):
     return dtype, fill_value
 
 
+def infer_dtype_from(val, pandas_dtype=False):
+    """
+    interpret the dtype from a scalar or array. This is a convenience
+    routine to infer dtype from a scalar or an array
+
+    Parameters
+    ----------
+    pandas_dtype : bool, default False
+        whether to infer dtype including pandas extension types.
+        If False, a scalar/array belonging to pandas extension types is
+        inferred as object
+    """
+    if is_scalar(val):
+        return infer_dtype_from_scalar(val, pandas_dtype=pandas_dtype)
+    return infer_dtype_from_array(val, pandas_dtype=pandas_dtype)
+
+
 def infer_dtype_from_scalar(val, pandas_dtype=False):
     """
     interpret the dtype from a scalar
@@ -409,24 +426,31 @@ def infer_dtype_from_scalar(val, pandas_dtype=False):
     return dtype, val
 
 
-def infer_dtype_from_array(arr):
+def infer_dtype_from_array(arr, pandas_dtype=False):
     """
     infer the dtype from a scalar or array
 
     Parameters
     ----------
     arr : scalar or array
+    pandas_dtype : bool, default False
+        whether to infer dtype including pandas extension types.
+        If False, array belongs to pandas extension types
+        is inferred as object
 
     Returns
     -------
-    tuple (numpy-compat dtype, array)
+    tuple (numpy-compat/pandas-compat dtype, array)
 
     Notes
     -----
-    These infer to numpy dtypes exactly
-    with the exception that mixed / object dtypes
+    if pandas_dtype=False, these infer to numpy dtypes
+    exactly with the exception that mixed / object dtypes
     are not coerced by stringifying or conversion
 
+    if pandas_dtype=True, datetime64tz-aware/categorical
+    types will retain their character.
+
     Examples
     --------
     >>> np.asarray([1, '1'])
@@ -443,6 +467,12 @@ def infer_dtype_from_array(arr):
     if not is_list_like(arr):
         arr = [arr]
 
+    if pandas_dtype and is_extension_type(arr):
+        return arr.dtype, arr
+
+    elif isinstance(arr, ABCSeries):
+        return arr.dtype, np.asarray(arr)
+
     # don't force numpy coerce with nan's
     inferred = lib.infer_dtype(arr)
     if inferred in ['string', 'bytes', 'unicode',
@@ -553,7 +583,7 @@ def conv(r, dtype):
             if isnull(r):
                 pass
             elif dtype == _NS_DTYPE:
-                r = Timestamp(r)
+                r = tslib.Timestamp(r)
             elif dtype == _TD_DTYPE:
                 r = _coerce_scalar_to_timedelta_type(r)
             elif dtype == np.bool_:
@@ -1029,13 +1059,25 @@ def find_common_type(types):
     return np.find_common_type(types, [])
 
 
-def _cast_scalar_to_array(shape, value, dtype=None):
+def cast_scalar_to_array(shape, value, dtype=None):
     """
     create np.ndarray of specified shape and dtype, filled with values
+
+    Parameters
+    ----------
+    shape : tuple
+    value : scalar value
+    dtype : np.dtype, optional
+        dtype to coerce
+
+    Returns
+    -------
+    ndarray of shape, filled with value, of specified / inferred dtype
+
     """
 
     if dtype is None:
-        dtype, fill_value = _infer_dtype_from_scalar(value)
+        dtype, fill_value = infer_dtype_from_scalar(value)
     else:
         fill_value = value
 

diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
@@ -11,7 +11,8 @@
                      ExtensionDtype)
 from .generic import (ABCCategorical, ABCPeriodIndex,
                       ABCDatetimeIndex, ABCSeries,
-                      ABCSparseArray, ABCSparseSeries, ABCCategoricalIndex)
+                      ABCSparseArray, ABCSparseSeries, ABCCategoricalIndex,
+                      ABCIndexClass)
 from .inference import is_string_like
 from .inference import *  # noqa
 
@@ -1545,6 +1546,16 @@ def is_bool_dtype(arr_or_dtype):
     except ValueError:
         # this isn't even a dtype
         return False
+
+    if isinstance(arr_or_dtype, ABCIndexClass):
+
+        # TODO(jreback)
+        # we don't have a boolean Index class
+        # so its object, we need to infer to
+        # guess this
+        return (arr_or_dtype.is_object and
+                arr_or_dtype.inferred_type == 'boolean')
+
     return issubclass(tipo, np.bool_)