diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index bb513605b1c94..302f8043f3ba7 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -1019,6 +1019,7 @@ Reshaping - Bug in :func:`DataFrame.iterrows`, which would infers strings not compliant to `ISO8601 `_ to datetimes (:issue:`19671`) - Bug in :class:`Series` constructor with ``Categorical`` where a ```ValueError`` is not raised when an index of different length is given (:issue:`19342`) - Bug in :meth:`DataFrame.astype` where column metadata is lost when converting to categorical or a dictionary of dtypes (:issue:`19920`) +- Bug in :func:`cut` and :func:`qcut` where timezone information was dropped (:issue:`19872`) Other ^^^^^ diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 359c030157bd3..30132ddc05c40 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -1,6 +1,7 @@ """ Quantilization functions and related stuff """ +from functools import partial from pandas.core.dtypes.missing import isna from pandas.core.dtypes.common import ( @@ -9,6 +10,7 @@ is_categorical_dtype, is_datetime64_dtype, is_timedelta64_dtype, + is_datetime64tz_dtype, _ensure_int64) import pandas.core.algorithms as algos @@ -239,7 +241,8 @@ def _bins_to_cuts(x, bins, right=True, labels=None, ids = _ensure_int64(bins.searchsorted(x, side=side)) if include_lowest: - ids[x == bins[0]] = 1 + # Numpy 1.9 support: ensure this mask is a Numpy array + ids[np.asarray(x == bins[0])] = 1 na_mask = isna(x) | (ids == len(bins)) | (ids == 0) has_nas = na_mask.any() @@ -284,12 +287,14 @@ def _coerce_to_type(x): """ dtype = None - if is_timedelta64_dtype(x): - x = to_timedelta(x) - dtype = np.timedelta64 + if is_datetime64tz_dtype(x): + dtype = x.dtype elif is_datetime64_dtype(x): x = to_datetime(x) dtype = np.datetime64 + elif is_timedelta64_dtype(x): + x = to_timedelta(x) + dtype = np.timedelta64 if dtype is not None: # GH 19768: force NaT to NaN during integer conversion @@ -305,7 +310,7 @@ def _convert_bin_to_numeric_type(bins, dtype): Parameters ---------- - bins : list-liek of bins + bins : list-like of bins dtype : dtype of data Raises @@ -318,7 +323,7 @@ def _convert_bin_to_numeric_type(bins, dtype): bins = to_timedelta(bins).view(np.int64) else: raise ValueError("bins must be of timedelta64 dtype") - elif is_datetime64_dtype(dtype): + elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype): if bins_dtype in ['datetime', 'datetime64']: bins = to_datetime(bins).view(np.int64) else: @@ -333,7 +338,10 @@ def _format_labels(bins, precision, right=True, closed = 'right' if right else 'left' - if is_datetime64_dtype(dtype): + if is_datetime64tz_dtype(dtype): + formatter = partial(Timestamp, tz=dtype.tz) + adjust = lambda x: x - Timedelta('1ns') + elif is_datetime64_dtype(dtype): formatter = Timestamp adjust = lambda x: x - Timedelta('1ns') elif is_timedelta64_dtype(dtype): @@ -372,7 +380,13 @@ def _preprocess_for_cut(x): series_index = x.index name = x.name - x = np.asarray(x) + # Check that the passed array is a Pandas or Numpy object + # We don't want to strip away a Pandas data-type here (e.g. datetimetz) + ndim = getattr(x, 'ndim', None) + if ndim is None: + x = np.asarray(x) + if x.ndim != 1: + raise ValueError("Input array must be 1 dimensional") return x_is_series, series_index, name, x diff --git a/pandas/tests/reshape/test_tile.py b/pandas/tests/reshape/test_tile.py index ff914273d47b1..8d093f2784ba1 100644 --- a/pandas/tests/reshape/test_tile.py +++ b/pandas/tests/reshape/test_tile.py @@ -4,7 +4,7 @@ import numpy as np from pandas.compat import zip -from pandas import (Series, isna, to_datetime, DatetimeIndex, +from pandas import (DataFrame, Series, isna, to_datetime, DatetimeIndex, Index, Timestamp, Interval, IntervalIndex, Categorical, cut, qcut, date_range, NaT, TimedeltaIndex) from pandas.tseries.offsets import Nano, Day @@ -104,6 +104,12 @@ def test_cut_corner(self): pytest.raises(ValueError, cut, [1, 2, 3], 0.5) + @pytest.mark.parametrize('arg', [2, np.eye(2), DataFrame(np.eye(2))]) + @pytest.mark.parametrize('cut_func', [cut, qcut]) + def test_cut_not_1d_arg(self, arg, cut_func): + with pytest.raises(ValueError): + cut_func(arg, 2) + def test_cut_out_of_range_more(self): # #1511 s = Series([0, -1, 0, 1, -3], name='x') @@ -251,18 +257,6 @@ def test_qcut_nas(self): result = qcut(arr, 4) assert isna(result[:20]).all() - @pytest.mark.parametrize('s', [ - Series(DatetimeIndex(['20180101', NaT, '20180103'])), - Series(TimedeltaIndex(['0 days', NaT, '2 days']))], - ids=lambda x: str(x.dtype)) - def test_qcut_nat(self, s): - # GH 19768 - intervals = IntervalIndex.from_tuples( - [(s[0] - Nano(), s[2] - Day()), np.nan, (s[2] - Day(), s[2])]) - expected = Series(Categorical(intervals, ordered=True)) - result = qcut(s, 2) - tm.assert_series_equal(result, expected) - def test_qcut_index(self): result = qcut([0, 2], 2) intervals = [Interval(-0.001, 1), Interval(1, 2)] @@ -452,6 +446,37 @@ def test_single_bin(self): result = cut(s, 1, labels=False) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( + "array_1_writeable, array_2_writeable", + [(True, True), (True, False), (False, False)]) + def test_cut_read_only(self, array_1_writeable, array_2_writeable): + # issue 18773 + array_1 = np.arange(0, 100, 10) + array_1.flags.writeable = array_1_writeable + + array_2 = np.arange(0, 100, 10) + array_2.flags.writeable = array_2_writeable + + hundred_elements = np.arange(100) + + tm.assert_categorical_equal(cut(hundred_elements, array_1), + cut(hundred_elements, array_2)) + + +class TestDatelike(object): + + @pytest.mark.parametrize('s', [ + Series(DatetimeIndex(['20180101', NaT, '20180103'])), + Series(TimedeltaIndex(['0 days', NaT, '2 days']))], + ids=lambda x: str(x.dtype)) + def test_qcut_nat(self, s): + # GH 19768 + intervals = IntervalIndex.from_tuples( + [(s[0] - Nano(), s[2] - Day()), np.nan, (s[2] - Day(), s[2])]) + expected = Series(Categorical(intervals, ordered=True)) + result = qcut(s, 2) + tm.assert_series_equal(result, expected) + def test_datetime_cut(self): # GH 14714 # testing for time data to be present as series @@ -488,6 +513,47 @@ def test_datetime_cut(self): result, bins = cut(data, 3, retbins=True) tm.assert_series_equal(Series(result), expected) + @pytest.mark.parametrize('bins', [ + 3, [Timestamp('2013-01-01 04:57:07.200000'), + Timestamp('2013-01-01 21:00:00'), + Timestamp('2013-01-02 13:00:00'), + Timestamp('2013-01-03 05:00:00')]]) + @pytest.mark.parametrize('box', [list, np.array, Index, Series]) + def test_datetimetz_cut(self, bins, box): + # GH 19872 + tz = 'US/Eastern' + s = Series(date_range('20130101', periods=3, tz=tz)) + if not isinstance(bins, int): + bins = box(bins) + result = cut(s, bins) + expected = ( + Series(IntervalIndex([ + Interval(Timestamp('2012-12-31 23:57:07.200000', tz=tz), + Timestamp('2013-01-01 16:00:00', tz=tz)), + Interval(Timestamp('2013-01-01 16:00:00', tz=tz), + Timestamp('2013-01-02 08:00:00', tz=tz)), + Interval(Timestamp('2013-01-02 08:00:00', tz=tz), + Timestamp('2013-01-03 00:00:00', tz=tz))])) + .astype(CDT(ordered=True))) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize('bins', [3, np.linspace(0, 1, 4)]) + def test_datetimetz_qcut(self, bins): + # GH 19872 + tz = 'US/Eastern' + s = Series(date_range('20130101', periods=3, tz=tz)) + result = qcut(s, bins) + expected = ( + Series(IntervalIndex([ + Interval(Timestamp('2012-12-31 23:59:59.999999999', tz=tz), + Timestamp('2013-01-01 16:00:00', tz=tz)), + Interval(Timestamp('2013-01-01 16:00:00', tz=tz), + Timestamp('2013-01-02 08:00:00', tz=tz)), + Interval(Timestamp('2013-01-02 08:00:00', tz=tz), + Timestamp('2013-01-03 00:00:00', tz=tz))])) + .astype(CDT(ordered=True))) + tm.assert_series_equal(result, expected) + def test_datetime_bin(self): data = [np.datetime64('2012-12-13'), np.datetime64('2012-12-15')] bin_data = ['2012-12-12', '2012-12-14', '2012-12-16'] @@ -523,19 +589,3 @@ def f(): mask = result.isna() tm.assert_numpy_array_equal( mask, np.array([False, True, True, True, True])) - - @pytest.mark.parametrize( - "array_1_writeable, array_2_writeable", - [(True, True), (True, False), (False, False)]) - def test_cut_read_only(self, array_1_writeable, array_2_writeable): - # issue 18773 - array_1 = np.arange(0, 100, 10) - array_1.flags.writeable = array_1_writeable - - array_2 = np.arange(0, 100, 10) - array_2.flags.writeable = array_2_writeable - - hundred_elements = np.arange(100) - - tm.assert_categorical_equal(cut(hundred_elements, array_1), - cut(hundred_elements, array_2))