diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 15a9265a75e..6bc632e0a53 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -17,6 +17,7 @@ Tuple, Type, Union, + cast, ) import cupy @@ -1427,14 +1428,21 @@ def __repr__(self): dtype_index = tmp_meta.rfind(" dtype=") prior_to_dtype = tmp_meta[:dtype_index] lines = lines[:-1] - lines.append(prior_to_dtype + " dtype='%s'" % self.dtype) + keywords = [f"dtype='{self.dtype}'"] if self.name is not None: - lines[-1] = lines[-1] + ", name='%s'" % self.name + keywords.append(f"name={self.name!r}") if "length" in tmp_meta: - lines[-1] = lines[-1] + ", length=%d)" % len(self) - else: - lines[-1] = lines[-1] + ")" - + keywords.append(f"length={len(self)}") + if ( + "freq" in tmp_meta + and isinstance(self, DatetimeIndex) + and self._freq is not None + ): + keywords.append( + f"freq={self._freq._maybe_as_fast_pandas_offset().freqstr!r}" + ) + keywords = ", ".join(keywords) + lines.append(f"{prior_to_dtype} {keywords})") return "\n".join(lines) @_cudf_nvtx_annotate @@ -2125,8 +2133,6 @@ def __init__( # pandas dtindex creation first which. For now # just make sure we handle np.datetime64 arrays # and then just dispatch upstream - if freq is not None: - raise NotImplementedError("Freq is not yet supported") if tz is not None: raise NotImplementedError("tz is not yet supported") if normalize is not False: @@ -2140,6 +2146,8 @@ def __init__( if yearfirst is not False: raise NotImplementedError("yearfirst == True is not yet supported") + self._freq = _validate_freq(freq) + valid_dtypes = tuple( f"datetime64[{res}]" for res in ("s", "ms", "us", "ns") ) @@ -2157,6 +2165,30 @@ def __init__( super().__init__(data, **kwargs) + if self._freq is not None: + unique_vals = self.to_series().diff().unique() + if len(unique_vals) > 2 or ( + len(unique_vals) == 2 + and unique_vals[1] != self._freq._maybe_as_fast_pandas_offset() + ): + raise ValueError("No unique frequency found") + + @_cudf_nvtx_annotate + def _copy_type_metadata( + self: DatetimeIndex, other: DatetimeIndex, *, override_dtypes=None + ) -> GenericIndex: + super()._copy_type_metadata(other, override_dtypes=override_dtypes) + self._freq = _validate_freq(other._freq) + return self + + @classmethod + def _from_data( + cls, data: MutableMapping, name: Any = no_default, freq: Any = None + ): + result = super()._from_data(data, name) + result._freq = _validate_freq(freq) + return result + def __getitem__(self, index): value = super().__getitem__(index) if cudf.get_option("mode.pandas_compatible") and isinstance( @@ -2165,6 +2197,11 @@ def __getitem__(self, index): return pd.Timestamp(value) return value + @_cudf_nvtx_annotate + def copy(self, name=None, deep=False, dtype=None, names=None): + idx_copy = super().copy(name=name, deep=deep, dtype=dtype, names=names) + return idx_copy._copy_type_metadata(self) + def searchsorted( self, value, @@ -2518,7 +2555,13 @@ def to_pandas(self, *, nullable: bool = False) -> pd.DatetimeIndex: ) else: nanos = self._values.astype("datetime64[ns]") - return pd.DatetimeIndex(nanos.to_pandas(), name=self.name) + + freq = ( + self._freq._maybe_as_fast_pandas_offset() + if self._freq is not None + else None + ) + return pd.DatetimeIndex(nanos.to_pandas(), name=self.name, freq=freq) @_cudf_nvtx_annotate def _get_dt_field(self, field): @@ -2663,10 +2706,9 @@ def tz_localize(self, tz, ambiguous="NaT", nonexistent="NaT"): >>> tz_naive = cudf.date_range('2018-03-01 09:00', periods=3, freq='D') >>> tz_aware = tz_naive.tz_localize("America/New_York") >>> tz_aware - DatetimeIndex(['2018-03-01 09:00:00-05:00', - '2018-03-02 09:00:00-05:00', + DatetimeIndex(['2018-03-01 09:00:00-05:00', '2018-03-02 09:00:00-05:00', '2018-03-03 09:00:00-05:00'], - dtype='datetime64[ns, America/New_York]') + dtype='datetime64[ns, America/New_York]', freq='D') Ambiguous or nonexistent datetimes are converted to NaT. @@ -2685,14 +2727,16 @@ def tz_localize(self, tz, ambiguous="NaT", nonexistent="NaT"): ``ambiguous`` and ``nonexistent`` arguments. Any ambiguous or nonexistent timestamps are converted to 'NaT'. - """ + """ # noqa: E501 from cudf.core._internals.timezones import delocalize, localize if tz is None: result_col = delocalize(self._column) else: result_col = localize(self._column, tz, ambiguous, nonexistent) - return DatetimeIndex._from_data({self.name: result_col}) + return DatetimeIndex._from_data( + {self.name: result_col}, freq=self._freq + ) def tz_convert(self, tz): """ @@ -2717,16 +2761,15 @@ def tz_convert(self, tz): >>> dti = cudf.date_range('2018-03-01 09:00', periods=3, freq='D') >>> dti = dti.tz_localize("America/New_York") >>> dti - DatetimeIndex(['2018-03-01 09:00:00-05:00', - '2018-03-02 09:00:00-05:00', + DatetimeIndex(['2018-03-01 09:00:00-05:00', '2018-03-02 09:00:00-05:00', '2018-03-03 09:00:00-05:00'], - dtype='datetime64[ns, America/New_York]') + dtype='datetime64[ns, America/New_York]', freq='D') >>> dti.tz_convert("Europe/London") DatetimeIndex(['2018-03-01 14:00:00+00:00', '2018-03-02 14:00:00+00:00', '2018-03-03 14:00:00+00:00'], dtype='datetime64[ns, Europe/London]') - """ + """ # noqa: E501 from cudf.core._internals.timezones import convert if tz is None: @@ -3625,3 +3668,11 @@ def _extended_gcd(a: int, b: int) -> Tuple[int, int, int]: old_s, s = s, old_s - quotient * s old_t, t = t, old_t - quotient * t return old_r, old_s, old_t + + +def _validate_freq(freq: Any) -> cudf.DateOffset: + if isinstance(freq, str): + return cudf.DateOffset._from_freqstr(freq) + elif freq is not None and not isinstance(freq, cudf.DateOffset): + raise ValueError(f"Invalid frequency: {freq}") + return cast(cudf.DateOffset, freq) diff --git a/python/cudf/cudf/core/resample.py b/python/cudf/cudf/core/resample.py index eb59cf83926..fbf25104303 100644 --- a/python/cudf/cudf/core/resample.py +++ b/python/cudf/cudf/core/resample.py @@ -121,6 +121,10 @@ class _ResampleGrouping(_Grouping): bin_labels: cudf.core.index.Index + def __init__(self, obj, by=None, level=None): + self._freq = getattr(by, "freq", None) + super().__init__(obj, by, level) + def copy(self, deep=True): out = super().copy(deep=deep) result = _ResampleGrouping.__new__(_ResampleGrouping) @@ -128,13 +132,22 @@ def copy(self, deep=True): result._named_columns = out._named_columns result._key_columns = out._key_columns result.bin_labels = self.bin_labels.copy(deep=deep) + result._freq = self._freq return result + @property + def keys(self): + index = super().keys + if self._freq is not None and isinstance(index, cudf.DatetimeIndex): + return cudf.DatetimeIndex._from_data(index._data, freq=self._freq) + return index + def serialize(self): header, frames = super().serialize() labels_head, labels_frames = self.bin_labels.serialize() header["__bin_labels"] = labels_head header["__bin_labels_count"] = len(labels_frames) + header["_freq"] = self._freq frames.extend(labels_frames) return header, frames @@ -152,6 +165,7 @@ def deserialize(cls, header, frames): out.bin_labels = cudf.core.index.Index.deserialize( header["__bin_labels"], frames[-header["__bin_labels_count"] :] ) + out._freq = header["_freq"] return out def _handle_frequency_grouper(self, by): diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 14b27e179a2..6ec9dcb5f44 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -463,13 +463,19 @@ class DateOffset: } _CODES_TO_UNITS = { + "N": "nanoseconds", "ns": "nanoseconds", + "U": "microseconds", "us": "microseconds", "ms": "milliseconds", "L": "milliseconds", "s": "seconds", + "S": "seconds", "m": "minutes", + "min": "minutes", + "T": "minutes", "h": "hours", + "H": "hours", "D": "days", "W": "weeks", "M": "months", @@ -487,7 +493,7 @@ class DateOffset: pd_offset.Nano: "nanoseconds", } - _FREQSTR_REGEX = re.compile("([0-9]*)([a-zA-Z]+)") + _FREQSTR_REGEX = re.compile("([-+]?[0-9]*)([a-zA-Z]+)") def __init__(self, n=1, normalize=False, **kwds): if normalize: @@ -843,10 +849,6 @@ def date_range( arr = cp.linspace(start=start, stop=end, num=periods) result = cudf.core.column.as_column(arr).astype("datetime64[ns]") return cudf.DatetimeIndex._from_data({name: result}) - elif cudf.get_option("mode.pandas_compatible"): - raise NotImplementedError( - "`DatetimeIndex` with `freq` cannot be constructed." - ) # The code logic below assumes `freq` is defined. It is first normalized # into `DateOffset` for further computation with timestamps. @@ -940,7 +942,7 @@ def date_range( arr = cp.arange(start=start, stop=stop, step=step, dtype="int64") res = cudf.core.column.as_column(arr).astype("datetime64[ns]") - return cudf.DatetimeIndex._from_data({name: res}) + return cudf.DatetimeIndex._from_data({name: res}, freq=freq) def _has_fixed_frequency(freq: DateOffset) -> bool: diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py index 193ef404a8c..c50e72b4b12 100644 --- a/python/cudf/cudf/pandas/_wrappers/pandas.py +++ b/python/cudf/cudf/pandas/_wrappers/pandas.py @@ -707,6 +707,14 @@ def Index__new__(cls, *args, **kwargs): "Resampler", cudf.core.resample._Resampler, pd_Resampler ) +DataFrameResampler = make_intermediate_proxy_type( + "DataFrameResampler", cudf.core.resample.DataFrameResampler, pd_Resampler +) + +SeriesResampler = make_intermediate_proxy_type( + "SeriesResampler", cudf.core.resample.SeriesResampler, pd_Resampler +) + StataReader = make_intermediate_proxy_type( "StataReader", _Unusable, diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index ee0985a54dd..07c8c407ab9 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -12,7 +12,7 @@ import cudf import cudf.testing.dataset_generator as dataset_generator from cudf import DataFrame, Series -from cudf.core._compat import PANDAS_GE_150, PANDAS_LT_140 +from cudf.core._compat import PANDAS_GE_150, PANDAS_GE_200, PANDAS_LT_140 from cudf.core.index import DatetimeIndex from cudf.testing._utils import ( DATETIME_TYPES, @@ -1571,6 +1571,44 @@ def test_date_range_start_end_freq(request, start, end, freq): reason="https://github.com/rapidsai/cudf/issues/12133", ) ) + request.applymarker( + pytest.mark.xfail( + condition=( + isinstance(freq, dict) + and freq.get("hours", None) == 10 + and freq.get("days", None) == 57 + and freq.get("nanoseconds", None) == 3 + and ( + ( + start == "1996-11-21 04:05:30" + and end == "2000-02-13 08:41:06" + ) + or ( + start == "1970-01-01 00:00:00" + and end == "2000-02-13 08:41:06" + ) + or ( + start == "1970-01-01 00:00:00" + and end == "1996-11-21 04:05:30" + ) + or ( + start == "1831-05-08 15:23:21" + and end == "2000-02-13 08:41:06" + ) + or ( + start == "1831-05-08 15:23:21" + and end == "1996-11-21 04:05:30" + ) + or ( + start == "1831-05-08 15:23:21" + and end == "1970-01-01 00:00:00" + ) + ) + ), + reason="Nanosecond offsets being dropped by pandas, which is " + "fixed in pandas-2.0+", + ) + ) if isinstance(freq, str): _gfreq = _pfreq = freq else: @@ -1586,7 +1624,29 @@ def test_date_range_start_end_freq(request, start, end, freq): ) -def test_date_range_start_freq_periods(start, freq, periods): +def test_date_range_start_freq_periods(request, start, freq, periods): + request.applymarker( + pytest.mark.xfail( + condition=( + isinstance(freq, dict) + and freq.get("hours", None) == 10 + and freq.get("days", None) == 57 + and freq.get("nanoseconds", None) == 3 + and periods in (10, 100) + and ( + start + in { + "2000-02-13 08:41:06", + "1996-11-21 04:05:30", + "1970-01-01 00:00:00", + "1831-05-08 15:23:21", + } + ) + ), + reason="Nanosecond offsets being dropped by pandas, which is " + "fixed in pandas-2.0+", + ) + ) if isinstance(freq, str): _gfreq = _pfreq = freq else: @@ -1613,6 +1673,28 @@ def test_date_range_end_freq_periods(request, end, freq, periods): reason="https://github.com/pandas-dev/pandas/issues/46877", ) ) + request.applymarker( + pytest.mark.xfail( + condition=( + isinstance(freq, dict) + and freq.get("hours", None) == 10 + and freq.get("days", None) == 57 + and freq.get("nanoseconds", None) == 3 + and periods in (10, 100) + and ( + end + in { + "2000-02-13 08:41:06", + "1996-11-21 04:05:30", + "1970-01-01 00:00:00", + "1831-05-08 15:23:21", + } + ) + ), + reason="Nanosecond offsets being dropped by pandas, which is " + "fixed in pandas-2.0+", + ) + ) if isinstance(freq, str): _gfreq = _pfreq = freq else: @@ -2163,8 +2245,6 @@ def test_datetime_getitem_na(): def test_daterange_pandas_compatibility(): with cudf.option_context("mode.pandas_compatible", True): - with pytest.raises(NotImplementedError): - cudf.date_range("20010101", "20020215", freq="400h", name="times") expected = pd.date_range( "2010-01-01", "2010-02-01", periods=10, name="times" ) @@ -2174,6 +2254,46 @@ def test_daterange_pandas_compatibility(): assert_eq(expected, actual) +@pytest.mark.parametrize( + "data,dtype,freq", + [ + ([10], "datetime64[ns]", "2N"), + ([10, 12, 14, 16], "datetime64[ns]", "2N"), + ([10, 11, 12, 13], "datetime64[ns]", "1N"), + ([100, 200, 300, 400], "datetime64[s]", "100s"), + ([101, 201, 301, 401], "datetime64[ms]", "100ms"), + ], +) +def test_datetime_index_with_freq(request, data, dtype, freq): + request.applymarker( + pytest.mark.xfail( + condition=(not PANDAS_GE_200 and dtype != "datetime64[ns]"), + reason="Pandas < 2.0 lacks non-nano-second dtype support.", + ) + ) + actual = cudf.DatetimeIndex(data, dtype=dtype, freq=freq) + expected = pd.DatetimeIndex(data, dtype=dtype, freq=freq) + assert_eq(actual, expected) + + +@pytest.mark.parametrize( + "data,dtype,freq", + [ + ([10, 1232, 13244, 13426], "datetime64[ns]", "2N"), + ([10, 11, 12, 13], "datetime64[ns]", "1s"), + ([10000, 200, 300, 400], "datetime64[s]", "100s"), + ([107871, 201, 301, 401], "datetime64[ms]", "100ns"), + ], +) +def test_datetime_index_freq_error(data, dtype, freq): + assert_exceptions_equal( + pd.DatetimeIndex, + cudf.DatetimeIndex, + ([data], {"dtype": dtype, "freq": freq}), + ([data], {"dtype": dtype, "freq": freq}), + ) + + def test_strings_with_utc_offset_not_implemented(): with pytest.warns(DeprecationWarning, match="parsing timezone"): # cupy with pytest.raises(NotImplementedError):