Skip to content

Commit

Permalink
API: preserve freq in DTI/TDI.factorize (#38120)
Browse files Browse the repository at this point in the history
  • Loading branch information
jbrockmendel authored Nov 30, 2020
1 parent aad85ad commit c29c176
Show file tree
Hide file tree
Showing 8 changed files with 104 additions and 24 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.1.5.rst
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ Fixed regressions
- Fixed regression in :meth:`DataFrame.loc` and :meth:`Series.loc` for ``__setitem__`` when one-dimensional tuple was given to select from :class:`MultiIndex` (:issue:`37711`)
- Fixed regression in inplace operations on :class:`Series` with ``ExtensionDtype`` with NumPy dtyped operand (:issue:`37910`)
- Fixed regression in metadata propagation for ``groupby`` iterator (:issue:`37343`)
- Fixed regression in :class:`MultiIndex` constructed from a :class:`DatetimeIndex` not retaining frequency (:issue:`35563`)
- Fixed regression in indexing on a :class:`Series` with ``CategoricalDtype`` after unpickling (:issue:`37631`)
- Fixed regression in :meth:`DataFrame.groupby` aggregation with out-of-bounds datetime objects in an object-dtype column (:issue:`36003`)
- Fixed regression in ``df.groupby(..).rolling(..)`` with the resulting :class:`MultiIndex` when grouping by a label that is in the index (:issue:`37641`)
Expand Down
35 changes: 30 additions & 5 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,11 +48,13 @@
pandas_dtype,
)
from pandas.core.dtypes.generic import (
ABCDatetimeArray,
ABCExtensionArray,
ABCIndexClass,
ABCMultiIndex,
ABCRangeIndex,
ABCSeries,
ABCTimedeltaArray,
)
from pandas.core.dtypes.missing import isna, na_value_for_dtype

Expand Down Expand Up @@ -199,8 +201,16 @@ def _reconstruct_data(
-------
ExtensionArray or np.ndarray
"""
if isinstance(values, ABCExtensionArray) and values.dtype == dtype:
# Catch DatetimeArray/TimedeltaArray
return values

if is_extension_array_dtype(dtype):
values = dtype.construct_array_type()._from_sequence(values)
cls = dtype.construct_array_type()
if isinstance(values, cls) and values.dtype == dtype:
return values

values = cls._from_sequence(values)
elif is_bool_dtype(dtype):
values = values.astype(dtype, copy=False)

Expand Down Expand Up @@ -674,8 +684,13 @@ def factorize(
# responsible only for factorization. All data coercion, sorting and boxing
# should happen here.

if isinstance(values, ABCRangeIndex):
return values.factorize(sort=sort)

values = _ensure_arraylike(values)
original = values
if not isinstance(values, ABCMultiIndex):
values = extract_array(values, extract_numpy=True)

# GH35667, if na_sentinel=None, we will not dropna NaNs from the uniques
# of values, assign na_sentinel=-1 to replace code value for NaN.
Expand All @@ -684,10 +699,20 @@ def factorize(
na_sentinel = -1
dropna = False

if isinstance(values, ABCRangeIndex):
return values.factorize(sort=sort)
elif is_extension_array_dtype(values.dtype):
values = extract_array(values)
if (
isinstance(values, (ABCDatetimeArray, ABCTimedeltaArray))
and values.freq is not None
):
codes, uniques = values.factorize(sort=sort)
if isinstance(original, ABCIndexClass):
uniques = original._shallow_copy(uniques, name=None)
elif isinstance(original, ABCSeries):
from pandas import Index

uniques = Index(uniques)
return codes, uniques

if is_extension_array_dtype(values.dtype):
codes, uniques = values.factorize(na_sentinel=na_sentinel)
dtype = original.dtype
else:
Expand Down
18 changes: 18 additions & 0 deletions pandas/core/arrays/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -1645,6 +1645,24 @@ def _with_freq(self, freq):
arr._freq = freq
return arr

# --------------------------------------------------------------

def factorize(self, na_sentinel=-1, sort: bool = False):
    """
    Encode the array as integer codes and an array of unique values.

    Parameters
    ----------
    na_sentinel : int, default -1
        Only forwarded to the parent implementation, which is used when
        ``self.freq`` is None.
    sort : bool, default False
        Only consulted when ``self.freq`` is set: if the freq has a
        negative ``n``, codes and uniques are both reversed.

    Returns
    -------
    codes : np.ndarray
    uniques : same type as self
    """
    if self.freq is None:
        # FIXME: shouldn't get here; we are ignoring sort
        return super().factorize(na_sentinel=na_sentinel)

    # With a freq set we must be unique, so we can short-circuit
    # (and retain freq): every position is its own code.
    codes = np.arange(len(self), dtype=np.intp)
    uniques = self.copy()  # TODO: copy or view?

    reverse = sort and self.freq.n < 0
    if reverse:
        # A negative step means the values run backwards; flip both
        # pieces together so each code still points at its value.
        codes = codes[::-1]
        # TODO: overload __getitem__, a slice indexer returns same type as self
        # error: Incompatible types in assignment (expression has type
        # "Union[DatetimeLikeArrayMixin, Union[Any, Any]]", variable
        # has type "TimelikeOps")  [assignment]
        uniques = uniques[::-1]  # type: ignore[assignment]

    return codes, uniques


# -------------------------------------------------------------------
# Shared Constructor Helpers
Expand Down
51 changes: 35 additions & 16 deletions pandas/tests/indexes/datetimes/test_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,10 +265,12 @@ def test_factorize(self):
arr, idx = idx1.factorize()
tm.assert_numpy_array_equal(arr, exp_arr)
tm.assert_index_equal(idx, exp_idx)
assert idx.freq == exp_idx.freq

arr, idx = idx1.factorize(sort=True)
tm.assert_numpy_array_equal(arr, exp_arr)
tm.assert_index_equal(idx, exp_idx)
assert idx.freq == exp_idx.freq

# tz must be preserved
idx1 = idx1.tz_localize("Asia/Tokyo")
Expand All @@ -277,6 +279,7 @@ def test_factorize(self):
arr, idx = idx1.factorize()
tm.assert_numpy_array_equal(arr, exp_arr)
tm.assert_index_equal(idx, exp_idx)
assert idx.freq == exp_idx.freq

idx2 = DatetimeIndex(
["2014-03", "2014-03", "2014-02", "2014-01", "2014-03", "2014-01"]
Expand All @@ -287,49 +290,65 @@ def test_factorize(self):
arr, idx = idx2.factorize(sort=True)
tm.assert_numpy_array_equal(arr, exp_arr)
tm.assert_index_equal(idx, exp_idx)
assert idx.freq == exp_idx.freq

exp_arr = np.array([0, 0, 1, 2, 0, 2], dtype=np.intp)
exp_idx = DatetimeIndex(["2014-03", "2014-02", "2014-01"])
arr, idx = idx2.factorize()
tm.assert_numpy_array_equal(arr, exp_arr)
tm.assert_index_equal(idx, exp_idx)
assert idx.freq == exp_idx.freq

# freq must be preserved
def test_factorize_preserves_freq(self):
    # GH#38120 freq should be preserved
    # Use a tz-aware monthly index so that both tz and freq must survive.
    dti = date_range("2000-01", periods=4, freq="M", tz="Asia/Tokyo")
    expected_codes = np.array([0, 1, 2, 3], dtype=np.intp)

    def check(codes, uniques):
        # The uniques must match the input index, including its freq.
        tm.assert_numpy_array_equal(codes, expected_codes)
        tm.assert_index_equal(uniques, dti)
        assert uniques.freq == dti.freq

    # Index method first, then the top-level function.
    check(*dti.factorize())
    check(*pd.factorize(dti))

def test_factorize_tz(self, tz_naive_fixture):
def test_factorize_tz(self, tz_naive_fixture, index_or_series):
tz = tz_naive_fixture
# GH#13750
base = date_range("2016-11-05", freq="H", periods=100, tz=tz)
idx = base.repeat(5)

exp_arr = np.arange(100, dtype=np.intp).repeat(5)

for obj in [idx, pd.Series(idx)]:
arr, res = obj.factorize()
tm.assert_numpy_array_equal(arr, exp_arr)
expected = base._with_freq(None)
tm.assert_index_equal(res, expected)
obj = index_or_series(idx)

arr, res = obj.factorize()
tm.assert_numpy_array_equal(arr, exp_arr)
expected = base._with_freq(None)
tm.assert_index_equal(res, expected)
assert res.freq == expected.freq

def test_factorize_dst(self):
def test_factorize_dst(self, index_or_series):
# GH 13750
idx = date_range("2016-11-06", freq="H", periods=12, tz="US/Eastern")
obj = index_or_series(idx)

for obj in [idx, pd.Series(idx)]:
arr, res = obj.factorize()
tm.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp))
tm.assert_index_equal(res, idx)
arr, res = obj.factorize()
tm.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp))
tm.assert_index_equal(res, idx)
if index_or_series is Index:
assert res.freq == idx.freq

idx = date_range("2016-06-13", freq="H", periods=12, tz="US/Eastern")
obj = index_or_series(idx)

for obj in [idx, pd.Series(idx)]:
arr, res = obj.factorize()
tm.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp))
tm.assert_index_equal(res, idx)
arr, res = obj.factorize()
tm.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp))
tm.assert_index_equal(res, idx)
if index_or_series is Index:
assert res.freq == idx.freq

@pytest.mark.parametrize(
"arr, expected",
Expand Down
11 changes: 10 additions & 1 deletion pandas/tests/indexes/timedeltas/test_timedelta.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,17 +75,26 @@ def test_factorize(self):
arr, idx = idx1.factorize()
tm.assert_numpy_array_equal(arr, exp_arr)
tm.assert_index_equal(idx, exp_idx)
assert idx.freq == exp_idx.freq

arr, idx = idx1.factorize(sort=True)
tm.assert_numpy_array_equal(arr, exp_arr)
tm.assert_index_equal(idx, exp_idx)
assert idx.freq == exp_idx.freq

# freq must be preserved
def test_factorize_preserves_freq(self):
    # GH#38120 freq should be preserved
    tdi = timedelta_range("1 day", periods=4, freq="s")
    expected_codes = np.array([0, 1, 2, 3], dtype=np.intp)

    def check(codes, uniques):
        # The uniques must match the input index, including its freq.
        tm.assert_numpy_array_equal(codes, expected_codes)
        tm.assert_index_equal(uniques, tdi)
        assert uniques.freq == tdi.freq

    # Index method first, then the top-level function.
    check(*tdi.factorize())
    check(*pd.factorize(tdi))

def test_sort_values(self):

Expand Down
10 changes: 10 additions & 0 deletions pandas/tests/indexing/multiindex/test_multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,3 +83,13 @@ def test_nested_tuples_duplicates(self):
df3 = df.copy(deep=True)
df3.loc[[(dti[0], "a")], "c2"] = 1.0
tm.assert_frame_equal(df3, expected)

def test_multiindex_with_datatime_level_preserves_freq(self):
    # https://github.com/pandas-dev/pandas/issues/35563
    # Selecting one outer label from a product MultiIndex should give back
    # the datetime level with its freq.
    outer = Index(range(2), name="A")
    dti = pd.date_range("2020-01-01", periods=7, freq="D", name="B")
    product_index = MultiIndex.from_product([outer, dti])
    frame = DataFrame(np.random.randn(14, 2), index=product_index)

    inner = frame.loc[0].index

    tm.assert_index_equal(inner, dti)
    assert inner.freq == dti.freq
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ def test_ewm_pairwise_cov_corr(func, frame):
result = result.loc[(slice(None), 1), 5]
result.index = result.index.droplevel(1)
expected = getattr(frame[1].ewm(span=10, min_periods=5), func)(frame[5])
expected.index = expected.index._with_freq(None)
tm.assert_series_equal(result, expected, check_names=False)


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,6 @@ def test_rolling_pairwise_cov_corr(func, frame):
result = result.loc[(slice(None), 1), 5]
result.index = result.index.droplevel(1)
expected = getattr(frame[1].rolling(window=10, min_periods=5), func)(frame[5])
expected.index = expected.index._with_freq(None)
tm.assert_series_equal(result, expected, check_names=False)


Expand Down

0 comments on commit c29c176

Please sign in to comment.