From de448778809d66427a87361ed4f1fe6c5ede139a Mon Sep 17 00:00:00 2001 From: sinhrks Date: Sun, 3 Apr 2016 03:37:14 +0900 Subject: [PATCH 1/3] BUG: replace coerces incorrect dtype --- pandas/core/internals.py | 20 +++++++++-- pandas/core/missing.py | 4 +++ pandas/tests/indexing/test_coercion.py | 50 +++++++++++++++++++------- pandas/tests/series/test_replace.py | 4 +-- pandas/types/cast.py | 44 ++++++++++++++++++----- 5 files changed, 96 insertions(+), 26 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 289ce150eb46b..ff1357644ab8c 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1890,8 +1890,11 @@ def convert(self, *args, **kwargs): blocks.append(newb) else: - values = fn( - self.values.ravel(), **fn_kwargs).reshape(self.values.shape) + values = fn(self.values.ravel(), **fn_kwargs) + try: + values = values.reshape(self.values.shape) + except NotImplementedError: + pass blocks.append(make_block(values, ndim=self.ndim, placement=self.mgr_locs)) @@ -3233,6 +3236,16 @@ def comp(s): return _possibly_compare(values, getattr(s, 'asm8', s), operator.eq) + def _cast(block, scalar): + dtype, val = _infer_dtype_from_scalar(scalar, pandas_dtype=True) + if not is_dtype_equal(block.dtype, dtype): + dtype = _find_common_type([block.dtype, dtype]) + block = block.astype(dtype) + # use original value + val = scalar + + return block, val + masks = [comp(s) for i, s in enumerate(src_list)] result_blocks = [] @@ -3255,7 +3268,8 @@ def comp(s): # particular block m = masks[i][b.mgr_locs.indexer] if m.any(): - new_rb.extend(b.putmask(m, d, inplace=True)) + b, val = _cast(b, d) + new_rb.extend(b.putmask(m, val, inplace=True)) else: new_rb.append(b) rb = new_rb diff --git a/pandas/core/missing.py b/pandas/core/missing.py index e83a0518d97f6..cc783a88c8482 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -39,6 +39,8 @@ def mask_missing(arr, values_to_mask): # numpy elementwise comparison warning if is_numeric_v_string_like(arr, x): mask = False + # elif is_object_dtype(arr): + # mask = lib.scalar_compare(arr, x, operator.eq) else: mask = arr == x @@ -51,6 +53,8 @@ def mask_missing(arr, values_to_mask): # numpy elementwise comparison warning if is_numeric_v_string_like(arr, x): mask |= False + # elif is_object_dtype(arr): + # mask |= lib.scalar_compare(arr, x, operator.eq) else: mask |= arr == x diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index 0cfa7258461f1..d8e52021eb086 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -1155,12 +1155,27 @@ def setUp(self): self.rep['float64'] = [1.1, 2.2] self.rep['complex128'] = [1 + 1j, 2 + 2j] self.rep['bool'] = [True, False] + self.rep['datetime64[ns]'] = [pd.Timestamp('2011-01-01'), + pd.Timestamp('2011-01-03')] + + for tz in ['UTC', 'US/Eastern']: + # to test tz => different tz replacement + key = 'datetime64[ns, {0}]'.format(tz) + self.rep[key] = [pd.Timestamp('2011-01-01', tz=tz), + pd.Timestamp('2011-01-03', tz=tz)] + + self.rep['timedelta64[ns]'] = [pd.Timedelta('1 day'), + pd.Timedelta('2 day')] def _assert_replace_conversion(self, from_key, to_key, how): index = pd.Index([3, 4], name='xxx') obj = pd.Series(self.rep[from_key], index=index, name='yyy') self.assertEqual(obj.dtype, from_key) + if (from_key.startswith('datetime') and to_key.startswith('datetime')): + # different tz, currently mask_missing raises SystemError + return + if how == 'dict': replacer = dict(zip(self.rep[from_key], self.rep[to_key])) elif how == 'series': @@ -1177,17 +1192,10 @@ def _assert_replace_conversion(self, from_key, to_key, how): raise nose.SkipTest("windows platform buggy: {0} -> {1}".format (from_key, to_key)) - if ((from_key == 'float64' and - to_key in ('bool', 'int64')) or - + if ((from_key == 'float64' and to_key in ('bool', 'int64')) or (from_key == 'complex128' and to_key in ('bool', 'int64', 'float64')) or - - (from_key == 'int64' and - to_key in ('bool')) or - - # TODO_GH12747 The result must be int? - (from_key == 'bool' and to_key == 'int64')): + (from_key == 'int64' and to_key in ('bool'))): # buggy on 32-bit if tm.is_platform_32bit(): @@ -1250,13 +1258,31 @@ def test_replace_series_bool(self): self._assert_replace_conversion(from_key, to_key, how='series') def test_replace_series_datetime64(self): - pass + from_key = 'datetime64[ns]' + for to_key in self.rep: + self._assert_replace_conversion(from_key, to_key, how='dict') + + from_key = 'datetime64[ns]' + for to_key in self.rep: + self._assert_replace_conversion(from_key, to_key, how='series') def test_replace_series_datetime64tz(self): - pass + from_key = 'datetime64[ns, US/Eastern]' + for to_key in self.rep: + self._assert_replace_conversion(from_key, to_key, how='dict') + + from_key = 'datetime64[ns, US/Eastern]' + for to_key in self.rep: + self._assert_replace_conversion(from_key, to_key, how='series') def test_replace_series_timedelta64(self): - pass + from_key = 'timedelta64[ns]' + for to_key in self.rep: + self._assert_replace_conversion(from_key, to_key, how='dict') + + from_key = 'timedelta64[ns]' + for to_key in self.rep: + self._assert_replace_conversion(from_key, to_key, how='series') def test_replace_series_period(self): pass diff --git a/pandas/tests/series/test_replace.py b/pandas/tests/series/test_replace.py index d80328ea3863a..ce555b40c1adb 100644 --- a/pandas/tests/series/test_replace.py +++ b/pandas/tests/series/test_replace.py @@ -134,8 +134,8 @@ def check_replace(to_rep, val, expected): tm.assert_series_equal(expected, r) tm.assert_series_equal(expected, sc) - # should NOT upcast to float - e = pd.Series([0, 1, 2, 3, 4]) + # MUST upcast to float + e = pd.Series([0., 1., 2., 3., 4.]) tr, v = [3], [3.0] check_replace(tr, v, e) diff --git a/pandas/types/cast.py b/pandas/types/cast.py index 6b1c3f9c00351..31fb9293e124a 100644 --- a/pandas/types/cast.py +++ b/pandas/types/cast.py @@ -19,8 +19,8 @@ _ensure_int8, _ensure_int16, _ensure_int32, _ensure_int64, _NS_DTYPE, _TD_DTYPE, _INT64_DTYPE, - _POSSIBLY_CAST_DTYPES) -from .dtypes import ExtensionDtype + _DATELIKE_DTYPES, _POSSIBLY_CAST_DTYPES) +from .dtypes import ExtensionDtype, DatetimeTZDtype, PeriodDtype from .generic import ABCDatetimeIndex, ABCPeriodIndex, ABCSeries from .missing import isnull, notnull from .inference import is_list_like @@ -310,8 +310,17 @@ def _maybe_promote(dtype, fill_value=np.nan): return dtype, fill_value -def _infer_dtype_from_scalar(val): - """ interpret the dtype from a scalar """ +def _infer_dtype_from_scalar(val, pandas_dtype=False): + """ + interpret the dtype from a scalar + + Parameters + ---------- + pandas_dtype : bool, default False + whether to infer dtype including pandas extension types. + If False, scalar belongs to pandas extension types is inferred as + object + """ dtype = np.object_ @@ -334,13 +343,23 @@ def _infer_dtype_from_scalar(val): dtype = np.object_ - elif isinstance(val, (np.datetime64, - datetime)) and getattr(val, 'tzinfo', None) is None: - val = lib.Timestamp(val).value - dtype = np.dtype('M8[ns]') + elif isinstance(val, (np.datetime64, datetime)): + val = tslib.Timestamp(val) + if val is tslib.NaT or val.tz is None: + dtype = np.dtype('M8[ns]') + else: + if pandas_dtype: + dtype = DatetimeTZDtype(unit='ns', tz=val.tz) + # ToDo: This localization is not needed if + # DatetimeTZBlock doesn't localize internal values + val = val.tz_localize(None) + else: + # return datetimetz as object + return np.object_, val + val = val.value elif isinstance(val, (np.timedelta64, timedelta)): - val = lib.Timedelta(val).value + val = tslib.Timedelta(val).value dtype = np.dtype('m8[ns]') elif is_bool(val): @@ -361,6 +380,13 @@ def _infer_dtype_from_scalar(val): elif is_complex(val): dtype = np.complex_ + elif pandas_dtype: + # to do use util + from pandas.tseries.period import Period + if isinstance(val, Period): + dtype = PeriodDtype(freq=val.freq) + val = val.ordinal + return dtype, val From 279fdf6b2c339ed1fc4e75e7ccafa9fa55ed7a72 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Thu, 19 Jan 2017 08:29:51 +0900 Subject: [PATCH 2/3] remove import failure --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/core/internals.py | 4 ++-- pandas/types/cast.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 798151971363e..16efdbaf2bf76 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -384,6 +384,7 @@ Bug Fixes - Bug in ``pd.read_msgpack()`` in which ``Series`` categoricals were being improperly processed (:issue:`14901`) - Bug in ``Series.ffill()`` with mixed dtypes containing tz-aware datetimes. (:issue:`14956`) +- Bug in ``.replace()`` may result in incorrect dtypes. (:issue:`12747`) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index ff1357644ab8c..5c9b67e9c28ad 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -3236,7 +3236,7 @@ def comp(s): return _possibly_compare(values, getattr(s, 'asm8', s), operator.eq) - def _cast(block, scalar): + def _cast_scalar(block, scalar): dtype, val = _infer_dtype_from_scalar(scalar, pandas_dtype=True) if not is_dtype_equal(block.dtype, dtype): dtype = _find_common_type([block.dtype, dtype]) @@ -3268,7 +3268,7 @@ def _cast(block, scalar): # particular block m = masks[i][b.mgr_locs.indexer] if m.any(): - b, val = _cast(b, d) + b, val = _cast_scalar(b, d) new_rb.extend(b.putmask(m, val, inplace=True)) else: new_rb.append(b) diff --git a/pandas/types/cast.py b/pandas/types/cast.py index 31fb9293e124a..cd3f3a2bf5a96 100644 --- a/pandas/types/cast.py +++ b/pandas/types/cast.py @@ -19,7 +19,7 @@ _ensure_int8, _ensure_int16, _ensure_int32, _ensure_int64, _NS_DTYPE, _TD_DTYPE, _INT64_DTYPE, - _DATELIKE_DTYPES, _POSSIBLY_CAST_DTYPES) + _POSSIBLY_CAST_DTYPES) from .dtypes import ExtensionDtype, DatetimeTZDtype, PeriodDtype from .generic import ABCDatetimeIndex, ABCPeriodIndex, ABCSeries from .missing import isnull, notnull From f9154e8a70e128cb0b07edcc325b4444930adcf5 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Mon, 23 Jan 2017 20:12:39 +0900 Subject: [PATCH 3/3] remove unnecessary comments --- pandas/core/missing.py | 4 ---- pandas/types/cast.py | 7 +------ 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index cc783a88c8482..e83a0518d97f6 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -39,8 +39,6 @@ def mask_missing(arr, values_to_mask): # numpy elementwise comparison warning if is_numeric_v_string_like(arr, x): mask = False - # elif is_object_dtype(arr): - # mask = lib.scalar_compare(arr, x, operator.eq) else: mask = arr == x @@ -53,8 +51,6 @@ def mask_missing(arr, values_to_mask): # numpy elementwise comparison warning if is_numeric_v_string_like(arr, x): mask |= False - # elif is_object_dtype(arr): - # mask |= lib.scalar_compare(arr, x, operator.eq) else: mask |= arr == x diff --git a/pandas/types/cast.py b/pandas/types/cast.py index cd3f3a2bf5a96..518b0dad98df5 100644 --- a/pandas/types/cast.py +++ b/pandas/types/cast.py @@ -350,9 +350,6 @@ def _infer_dtype_from_scalar(val, pandas_dtype=False): else: if pandas_dtype: dtype = DatetimeTZDtype(unit='ns', tz=val.tz) - # ToDo: This localization is not needed if - # DatetimeTZBlock doesn't localize internal values - val = val.tz_localize(None) else: # return datetimetz as object return np.object_, val @@ -381,9 +378,7 @@ def _infer_dtype_from_scalar(val, pandas_dtype=False): dtype = np.complex_ elif pandas_dtype: - # to do use util - from pandas.tseries.period import Period - if isinstance(val, Period): + if lib.is_period(val): dtype = PeriodDtype(freq=val.freq) val = val.ordinal