From 96027152e94e0a91b114821480e94104246aaa02 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 7 Dec 2023 09:03:36 -0500 Subject: [PATCH 01/14] Support `freq` in DatetimeIndex --- python/cudf/cudf/core/index.py | 41 ++++++++++++++++++++---- python/cudf/cudf/core/tools/datetimes.py | 6 +--- 2 files changed, 36 insertions(+), 11 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 98d537b2a0f..7a9fed86580 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1433,9 +1433,10 @@ def __repr__(self): if self.name is not None: lines[-1] = lines[-1] + ", name='%s'" % self.name if "length" in tmp_meta: - lines[-1] = lines[-1] + ", length=%d)" % len(self) - else: - lines[-1] = lines[-1] + ")" + lines[-1] = lines[-1] + ", length=%d" % len(self) + if "freq" in tmp_meta and self._freq is not None: + lines[-1] = lines[-1] + f", freq={self._freq}" + lines[-1] = lines[-1] + ")" return "\n".join(lines) @@ -2127,8 +2128,6 @@ def __init__( # pandas dtindex creation first which. For now # just make sure we handle np.datetime64 arrays # and then just dispatch upstream - if freq is not None: - raise NotImplementedError("Freq is not yet supported") if tz is not None: raise NotImplementedError("tz is not yet supported") if normalize is not False: @@ -2142,6 +2141,8 @@ def __init__( if yearfirst is not False: raise NotImplementedError("yearfirst == True is not yet supported") + self._freq = _validate_freq(freq) + valid_dtypes = tuple( f"datetime64[{res}]" for res in ("s", "ms", "us", "ns") ) @@ -2159,6 +2160,19 @@ def __init__( super().__init__(data, **kwargs) + if self._freq is not None: + unique_vals = self[1:] - self[:-1] + if len(unique_vals) != 1 or unique_vals[0] != self._freq: + raise ValueError() + + @classmethod + def _from_data( + cls, data: MutableMapping, name: Any = no_default, freq: Any = None + ): + result = super()._from_data(data, name) + result._freq = _validate_freq(freq) + return result + def __getitem__(self, index): value = super().__getitem__(index) if cudf.get_option("mode.pandas_compatible") and isinstance( @@ -2520,7 +2534,13 @@ def to_pandas(self, *, nullable: bool = False) -> pd.DatetimeIndex: ) else: nanos = self._values.astype("datetime64[ns]") - return pd.DatetimeIndex(nanos.to_pandas(), name=self.name) + + freq = ( + self._freq._maybe_as_fast_pandas_offset() + if self._freq is not None + else None + ) + return pd.DatetimeIndex(nanos.to_pandas(), name=self.name, freq=freq) @_cudf_nvtx_annotate def _get_dt_field(self, field): @@ -3625,3 +3645,12 @@ def _extended_gcd(a: int, b: int) -> Tuple[int, int, int]: old_s, s = s, old_s - quotient * s old_t, t = t, old_t - quotient * t return old_r, old_s, old_t + + +def _validate_freq(freq: Any) -> cudf.DateOffset: + if isinstance(freq, str): + return cudf.DateOffset._from_freqstr(freq) + elif freq is not None: + if not isinstance(freq, cudf.DateOffset): + raise ValueError(f"Invalid frequency: {freq}") + return freq diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 14b27e179a2..4ac15773fbf 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -843,10 +843,6 @@ def date_range( arr = cp.linspace(start=start, stop=end, num=periods) result = cudf.core.column.as_column(arr).astype("datetime64[ns]") return cudf.DatetimeIndex._from_data({name: result}) - elif cudf.get_option("mode.pandas_compatible"): - raise NotImplementedError( - "`DatetimeIndex` with `freq` cannot be constructed." - ) # The code logic below assumes `freq` is defined. It is first normalized # into `DateOffset` for further computation with timestamps. @@ -940,7 +936,7 @@ def date_range( arr = cp.arange(start=start, stop=stop, step=step, dtype="int64") res = cudf.core.column.as_column(arr).astype("datetime64[ns]") - return cudf.DatetimeIndex._from_data({name: res}) + return cudf.DatetimeIndex._from_data({name: res}, freq=freq) def _has_fixed_frequency(freq: DateOffset) -> bool: From 98e5e1ee20be9cb12938377d6b3a788fd28bf313 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 7 Dec 2023 11:12:43 -0500 Subject: [PATCH 02/14] "T" is minutes --- python/cudf/cudf/core/tools/datetimes.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 4ac15773fbf..0e6b2ad35c9 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -469,6 +469,7 @@ class DateOffset: "L": "milliseconds", "s": "seconds", "m": "minutes", + "T": "minutes", "h": "hours", "D": "days", "W": "weeks", From 6b0beee7336c96d201fba6fdc08ce551ba85525c Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 7 Dec 2023 11:23:57 -0500 Subject: [PATCH 03/14] Add more string aliases --- python/cudf/cudf/core/tools/datetimes.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 0e6b2ad35c9..9030beeea3f 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -468,9 +468,12 @@ class DateOffset: "ms": "milliseconds", "L": "milliseconds", "s": "seconds", + "S": "seconds", "m": "minutes", + "min": "minutes", "T": "minutes", "h": "hours", + "H": "hours", "D": "days", "W": "weeks", "M": "months", From 20ca2bb41db31cac3f803ae053d3de7f12ca4492 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 7 Dec 2023 12:20:54 -0500 Subject: [PATCH 04/14] Define resamplers --- python/cudf/cudf/pandas/_wrappers/pandas.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py index 193ef404a8c..c50e72b4b12 100644 --- a/python/cudf/cudf/pandas/_wrappers/pandas.py +++ b/python/cudf/cudf/pandas/_wrappers/pandas.py @@ -707,6 +707,14 @@ def Index__new__(cls, *args, **kwargs): "Resampler", cudf.core.resample._Resampler, pd_Resampler ) +DataFrameResampler = make_intermediate_proxy_type( + "DataFrameResampler", cudf.core.resample.DataFrameResampler, pd_Resampler +) + +SeriesResampler = make_intermediate_proxy_type( + "SeriesResampler", cudf.core.resample.SeriesResampler, pd_Resampler +) + StataReader = make_intermediate_proxy_type( "StataReader", _Unusable, From b461ecbc7bf96ed72900f330c5995e82b6446e6c Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 7 Dec 2023 17:33:32 +0000 Subject: [PATCH 05/14] fix metadata issues --- python/cudf/cudf/core/index.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 7a9fed86580..7d852206a87 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1228,7 +1228,8 @@ def copy(self, name=None, deep=False, dtype=None, names=None): name = self.name if name is None else name col = self._values.astype(dtype) - return _index_from_data({name: col.copy(True) if deep else col}) + idx_copy = _index_from_data({name: col.copy(True) if deep else col}) + return idx_copy._copy_type_metadata(self) @_cudf_nvtx_annotate def astype(self, dtype, copy: bool = True): @@ -2165,6 +2166,14 @@ def __init__( if len(unique_vals) != 1 or unique_vals[0] != self._freq: raise ValueError() + @_cudf_nvtx_annotate + def _copy_type_metadata( + self: DatetimeIndex, other: DatetimeIndex, *, override_dtypes=None + ) -> GenericIndex: + super()._copy_type_metadata(other, override_dtypes=override_dtypes) + self._freq = other._freq + return self + @classmethod def _from_data( cls, data: MutableMapping, name: Any = no_default, freq: Any = None From 03378408afa33c6a4544782c153e1fa0227839ec Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 7 Dec 2023 20:29:13 +0000 Subject: [PATCH 06/14] Fix more cases --- python/cudf/cudf/core/index.py | 8 ++++++-- python/cudf/cudf/core/tools/datetimes.py | 4 +++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 7d852206a87..dc263512dcc 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1228,8 +1228,7 @@ def copy(self, name=None, deep=False, dtype=None, names=None): name = self.name if name is None else name col = self._values.astype(dtype) - idx_copy = _index_from_data({name: col.copy(True) if deep else col}) - return idx_copy._copy_type_metadata(self) + return _index_from_data({name: col.copy(True) if deep else col}) @_cudf_nvtx_annotate def astype(self, dtype, copy: bool = True): @@ -2190,6 +2189,11 @@ def __getitem__(self, index): return pd.Timestamp(value) return value + @_cudf_nvtx_annotate + def copy(self, name=None, deep=False, dtype=None, names=None): + idx_copy = super().copy(name=name, deep=deep, dtype=dtype, names=names) + return idx_copy._copy_type_metadata(self) + def searchsorted( self, value, diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 9030beeea3f..6ec9dcb5f44 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -463,7 +463,9 @@ class DateOffset: } _CODES_TO_UNITS = { + "N": "nanoseconds", "ns": "nanoseconds", + "U": "microseconds", "us": "microseconds", "ms": "milliseconds", "L": "milliseconds", @@ -491,7 +493,7 @@ class DateOffset: pd_offset.Nano: "nanoseconds", } - _FREQSTR_REGEX = re.compile("([0-9]*)([a-zA-Z]+)") + _FREQSTR_REGEX = re.compile("([-+]?[0-9]*)([a-zA-Z]+)") def __init__(self, n=1, normalize=False, **kwds): if normalize: From 957c7c5ea7b57f03f9dfe13126b465989c9efc75 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 7 Dec 2023 14:30:56 -0600 Subject: [PATCH 07/14] Apply suggestions from code review Co-authored-by: Bradley Dice --- python/cudf/cudf/core/index.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index dc263512dcc..9d50215a5bc 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -3663,7 +3663,6 @@ def _extended_gcd(a: int, b: int) -> Tuple[int, int, int]: def _validate_freq(freq: Any) -> cudf.DateOffset: if isinstance(freq, str): return cudf.DateOffset._from_freqstr(freq) - elif freq is not None: - if not isinstance(freq, cudf.DateOffset): - raise ValueError(f"Invalid frequency: {freq}") + elif freq is not None and not isinstance(freq, cudf.DateOffset): + raise ValueError(f"Invalid frequency: {freq}") return freq From ed3ba3ff17cf686d1e6e38f01073d27b1be64799 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 7 Dec 2023 21:51:59 +0000 Subject: [PATCH 08/14] fix more cases --- python/cudf/cudf/core/index.py | 9 ++- python/cudf/cudf/tests/test_datetime.py | 86 ++++++++++++++++++++++++- 2 files changed, 90 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 9d50215a5bc..15103d827ef 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -17,6 +17,7 @@ Tuple, Type, Union, + cast, ) import cupy @@ -1434,7 +1435,11 @@ def __repr__(self): lines[-1] = lines[-1] + ", name='%s'" % self.name if "length" in tmp_meta: lines[-1] = lines[-1] + ", length=%d" % len(self) - if "freq" in tmp_meta and self._freq is not None: + if ( + "freq" in tmp_meta + and isinstance(self, DatetimeIndex) + and self._freq is not None + ): lines[-1] = lines[-1] + f", freq={self._freq}" lines[-1] = lines[-1] + ")" @@ -3665,4 +3670,4 @@ def _validate_freq(freq: Any) -> cudf.DateOffset: return cudf.DateOffset._from_freqstr(freq) elif freq is not None and not isinstance(freq, cudf.DateOffset): raise ValueError(f"Invalid frequency: {freq}") - return freq + return cast(cudf.DateOffset, freq) diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index ee0985a54dd..5b509e7ba86 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -1571,6 +1571,44 @@ def test_date_range_start_end_freq(request, start, end, freq): reason="https://github.com/rapidsai/cudf/issues/12133", ) ) + request.applymarker( + pytest.mark.xfail( + condition=( + isinstance(freq, dict) + and freq.get("hours", None) == 10 + and freq.get("days", None) == 57 + and freq.get("nanoseconds", None) == 3 + and ( + ( + start == "1996-11-21 04:05:30" + and end == "2000-02-13 08:41:06" + ) + or ( + start == "1970-01-01 00:00:00" + and end == "2000-02-13 08:41:06" + ) + or ( + start == "1970-01-01 00:00:00" + and end == "1996-11-21 04:05:30" + ) + or ( + start == "1831-05-08 15:23:21" + and end == "2000-02-13 08:41:06" + ) + or ( + start == "1831-05-08 15:23:21" + and end == "1996-11-21 04:05:30" + ) + or ( + start == "1831-05-08 15:23:21" + and end == "1970-01-01 00:00:00" + ) + ) + ), + reason="Nanosecond offsets being dropped by pandas, which is " + "fixed in pandas-2.0+", + ) + ) if isinstance(freq, str): _gfreq = _pfreq = freq else: @@ -1586,7 +1624,29 @@ def test_date_range_start_end_freq(request, start, end, freq): ) -def test_date_range_start_freq_periods(start, freq, periods): +def test_date_range_start_freq_periods(request, start, freq, periods): + request.applymarker( + pytest.mark.xfail( + condition=( + isinstance(freq, dict) + and freq.get("hours", None) == 10 + and freq.get("days", None) == 57 + and freq.get("nanoseconds", None) == 3 + and periods in (10, 100) + and ( + start + in { + "2000-02-13 08:41:06", + "1996-11-21 04:05:30", + "1970-01-01 00:00:00", + "1831-05-08 15:23:21", + } + ) + ), + reason="Nanosecond offsets being dropped by pandas, which is " + "fixed in pandas-2.0+", + ) + ) if isinstance(freq, str): _gfreq = _pfreq = freq else: @@ -1613,6 +1673,28 @@ def test_date_range_end_freq_periods(request, end, freq, periods): reason="https://github.com/pandas-dev/pandas/issues/46877", ) ) + request.applymarker( + pytest.mark.xfail( + condition=( + isinstance(freq, dict) + and freq.get("hours", None) == 10 + and freq.get("days", None) == 57 + and freq.get("nanoseconds", None) == 3 + and periods in (10, 100) + and ( + end + in { + "2000-02-13 08:41:06", + "1996-11-21 04:05:30", + "1970-01-01 00:00:00", + "1831-05-08 15:23:21", + } + ) + ), + reason="Nanosecond offsets being dropped by pandas, which is " + "fixed in pandas-2.0+", + ) + ) if isinstance(freq, str): _gfreq = _pfreq = freq else: @@ -2163,8 +2245,6 @@ def test_datetime_getitem_na(): def test_daterange_pandas_compatibility(): with cudf.option_context("mode.pandas_compatible", True): - with pytest.raises(NotImplementedError): - cudf.date_range("20010101", "20020215", freq="400h", name="times") expected = pd.date_range( "2010-01-01", "2010-02-01", periods=10, name="times" ) From ce4f3bdffb3eb6400d415aa8e16bdf0a0ccc11a3 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 8 Dec 2023 05:17:10 +0000 Subject: [PATCH 09/14] address reviews --- python/cudf/cudf/core/index.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 15103d827ef..3f791cb6b47 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2166,7 +2166,7 @@ def __init__( super().__init__(data, **kwargs) if self._freq is not None: - unique_vals = self[1:] - self[:-1] + unique_vals = self.diff().unique() if len(unique_vals) != 1 or unique_vals[0] != self._freq: raise ValueError() From cd00345fb6a995c42e0c7827a59f29224f73bc2d Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 8 Dec 2023 08:37:52 -0600 Subject: [PATCH 10/14] Apply suggestions from code review Co-authored-by: Bradley Dice --- python/cudf/cudf/core/index.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index d5a59b92f6e..c8883e01b5d 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1431,16 +1431,16 @@ def __repr__(self): lines = lines[:-1] lines.append(prior_to_dtype + " dtype='%s'" % self.dtype) if self.name is not None: - lines[-1] = lines[-1] + ", name='%s'" % self.name + lines[-1] += f", name='{self.name}'" if "length" in tmp_meta: - lines[-1] = lines[-1] + ", length=%d" % len(self) + lines[-1] += f", length={len(self)}" if ( "freq" in tmp_meta and isinstance(self, DatetimeIndex) and self._freq is not None ): - lines[-1] = lines[-1] + f", freq={self._freq}" - lines[-1] = lines[-1] + ")" + lines[-1] += f", freq={self._freq}" + lines[-1] += ")" return "\n".join(lines) From 6d00347a78af681e9e73fc8fbdc92acc4da02799 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 8 Dec 2023 15:56:03 +0000 Subject: [PATCH 11/14] fix freq calculations --- python/cudf/cudf/core/index.py | 9 ++++-- python/cudf/cudf/tests/test_datetime.py | 42 ++++++++++++++++++++++++- 2 files changed, 47 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 3f791cb6b47..23ab41a55a1 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2166,9 +2166,12 @@ def __init__( super().__init__(data, **kwargs) if self._freq is not None: - unique_vals = self.diff().unique() - if len(unique_vals) != 1 or unique_vals[0] != self._freq: - raise ValueError() + unique_vals = self.to_series().diff().unique() + if len(unique_vals) > 2 or ( + len(unique_vals) == 2 + and unique_vals[1] != self._freq._maybe_as_fast_pandas_offset() + ): + raise ValueError("No unique frequency found") @_cudf_nvtx_annotate def _copy_type_metadata( diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 5b509e7ba86..07c8c407ab9 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -12,7 +12,7 @@ import cudf import cudf.testing.dataset_generator as dataset_generator from cudf import DataFrame, Series -from cudf.core._compat import PANDAS_GE_150, PANDAS_LT_140 +from cudf.core._compat import PANDAS_GE_150, PANDAS_GE_200, PANDAS_LT_140 from cudf.core.index import DatetimeIndex from cudf.testing._utils import ( DATETIME_TYPES, @@ -2254,6 +2254,46 @@ def test_daterange_pandas_compatibility(): assert_eq(expected, actual) +@pytest.mark.parametrize( + "data,dtype,freq", + [ + ([10], "datetime64[ns]", "2N"), + ([10, 12, 14, 16], "datetime64[ns]", "2N"), + ([10, 11, 12, 13], "datetime64[ns]", "1N"), + ([100, 200, 300, 400], "datetime64[s]", "100s"), + ([101, 201, 301, 401], "datetime64[ms]", "100ms"), + ], +) +def test_datetime_index_with_freq(request, data, dtype, freq): + request.applymarker( + pytest.mark.xfail( + condition=(not PANDAS_GE_200 and dtype != "datetime64[ns]"), + reason="Pandas < 2.0 lacks non-nano-second dtype support.", + ) + ) + actual = cudf.DatetimeIndex(data, dtype=dtype, freq=freq) + expected = pd.DatetimeIndex(data, dtype=dtype, freq=freq) + assert_eq(actual, expected) + + +@pytest.mark.parametrize( + "data,dtype,freq", + [ + ([10, 1232, 13244, 13426], "datetime64[ns]", "2N"), + ([10, 11, 12, 13], "datetime64[ns]", "1s"), + ([10000, 200, 300, 400], "datetime64[s]", "100s"), + ([107871, 201, 301, 401], "datetime64[ms]", "100ns"), + ], +) +def test_datetime_index_freq_error(data, dtype, freq): + assert_exceptions_equal( + pd.DatetimeIndex, + cudf.DatetimeIndex, + ([data], {"dtype": dtype, "freq": freq}), + ([data], {"dtype": dtype, "freq": freq}), + ) + + def test_strings_with_utc_offset_not_implemented(): with pytest.warns(DeprecationWarning, match="parsing timezone"): # cupy with pytest.raises(NotImplementedError): From 55266cdfa91c6022a720d4c7cbeb292cc3cda004 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 8 Dec 2023 16:01:15 +0000 Subject: [PATCH 12/14] Add validation --- python/cudf/cudf/core/index.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 95d1c762184..931cab0e20a 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2177,7 +2177,7 @@ def _copy_type_metadata( self: DatetimeIndex, other: DatetimeIndex, *, override_dtypes=None ) -> GenericIndex: super()._copy_type_metadata(other, override_dtypes=override_dtypes) - self._freq = other._freq + self._freq = _validate_freq(other._freq) return self @classmethod From e1b697f92de3ba5d0f062875f5c314463cefe3fc Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 8 Dec 2023 16:24:49 +0000 Subject: [PATCH 13/14] Simplify repr --- python/cudf/cudf/core/index.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 931cab0e20a..8fb617cd5b8 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1429,19 +1429,19 @@ def __repr__(self): dtype_index = tmp_meta.rfind(" dtype=") prior_to_dtype = tmp_meta[:dtype_index] lines = lines[:-1] - lines.append(prior_to_dtype + " dtype='%s'" % self.dtype) + keywords = [f"dtype='{self.dtype}'"] if self.name is not None: - lines[-1] += f", name='{self.name}'" + keywords.append(f"name={self.name!r}") if "length" in tmp_meta: - lines[-1] += f", length={len(self)}" + keywords.append(f"length={len(self)}") if ( "freq" in tmp_meta and isinstance(self, DatetimeIndex) and self._freq is not None ): - lines[-1] += f", freq={self._freq}" - lines[-1] += ")" - + keywords.append(f"freq={self._freq}") + keywords = ", ".join(keywords) + lines.append(f"{prior_to_dtype} {keywords})") return "\n".join(lines) @_cudf_nvtx_annotate From 0d5c452a6bc84b9f63aa1390449cbcf0d0ba3aad Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 12 Dec 2023 02:26:23 +0000 Subject: [PATCH 14/14] Handle freq in groupby ops --- python/cudf/cudf/core/index.py | 22 ++++++++++++---------- python/cudf/cudf/core/resample.py | 14 ++++++++++++++ 2 files changed, 26 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 8fb617cd5b8..9b14c4b0143 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1439,7 +1439,9 @@ def __repr__(self): and isinstance(self, DatetimeIndex) and self._freq is not None ): - keywords.append(f"freq={self._freq}") + keywords.append( + f"freq={self._freq._maybe_as_fast_pandas_offset().freqstr!r}" + ) keywords = ", ".join(keywords) lines.append(f"{prior_to_dtype} {keywords})") return "\n".join(lines) @@ -2705,10 +2707,9 @@ def tz_localize(self, tz, ambiguous="NaT", nonexistent="NaT"): >>> tz_naive = cudf.date_range('2018-03-01 09:00', periods=3, freq='D') >>> tz_aware = tz_naive.tz_localize("America/New_York") >>> tz_aware - DatetimeIndex(['2018-03-01 09:00:00-05:00', - '2018-03-02 09:00:00-05:00', + DatetimeIndex(['2018-03-01 09:00:00-05:00', '2018-03-02 09:00:00-05:00', '2018-03-03 09:00:00-05:00'], - dtype='datetime64[ns, America/New_York]') + dtype='datetime64[ns, America/New_York]', freq='D') Ambiguous or nonexistent datetimes are converted to NaT. @@ -2727,14 +2728,16 @@ def tz_localize(self, tz, ambiguous="NaT", nonexistent="NaT"): ``ambiguous`` and ``nonexistent`` arguments. Any ambiguous or nonexistent timestamps are converted to 'NaT'. - """ + """ # noqa: E501 from cudf.core._internals.timezones import delocalize, localize if tz is None: result_col = delocalize(self._column) else: result_col = localize(self._column, tz, ambiguous, nonexistent) - return DatetimeIndex._from_data({self.name: result_col}) + return DatetimeIndex._from_data( + {self.name: result_col}, freq=self._freq + ) def tz_convert(self, tz): """ @@ -2759,16 +2762,15 @@ def tz_convert(self, tz): >>> dti = cudf.date_range('2018-03-01 09:00', periods=3, freq='D') >>> dti = dti.tz_localize("America/New_York") >>> dti - DatetimeIndex(['2018-03-01 09:00:00-05:00', - '2018-03-02 09:00:00-05:00', + DatetimeIndex(['2018-03-01 09:00:00-05:00', '2018-03-02 09:00:00-05:00', '2018-03-03 09:00:00-05:00'], - dtype='datetime64[ns, America/New_York]') + dtype='datetime64[ns, America/New_York]', freq='D') >>> dti.tz_convert("Europe/London") DatetimeIndex(['2018-03-01 14:00:00+00:00', '2018-03-02 14:00:00+00:00', '2018-03-03 14:00:00+00:00'], dtype='datetime64[ns, Europe/London]') - """ + """ # noqa: E501 from cudf.core._internals.timezones import convert if tz is None: diff --git a/python/cudf/cudf/core/resample.py b/python/cudf/cudf/core/resample.py index eb59cf83926..fbf25104303 100644 --- a/python/cudf/cudf/core/resample.py +++ b/python/cudf/cudf/core/resample.py @@ -121,6 +121,10 @@ class _ResampleGrouping(_Grouping): bin_labels: cudf.core.index.Index + def __init__(self, obj, by=None, level=None): + self._freq = getattr(by, "freq", None) + super().__init__(obj, by, level) + def copy(self, deep=True): out = super().copy(deep=deep) result = _ResampleGrouping.__new__(_ResampleGrouping) @@ -128,13 +132,22 @@ def copy(self, deep=True): result._named_columns = out._named_columns result._key_columns = out._key_columns result.bin_labels = self.bin_labels.copy(deep=deep) + result._freq = self._freq return result + @property + def keys(self): + index = super().keys + if self._freq is not None and isinstance(index, cudf.DatetimeIndex): + return cudf.DatetimeIndex._from_data(index._data, freq=self._freq) + return index + def serialize(self): header, frames = super().serialize() labels_head, labels_frames = self.bin_labels.serialize() header["__bin_labels"] = labels_head header["__bin_labels_count"] = len(labels_frames) + header["_freq"] = self._freq frames.extend(labels_frames) return header, frames @@ -152,6 +165,7 @@ def deserialize(cls, header, frames): out.bin_labels = cudf.core.index.Index.deserialize( header["__bin_labels"], frames[-header["__bin_labels_count"] :] ) + out._freq = header["_freq"] return out def _handle_frequency_grouper(self, by):