From fd9e620a84389170138cc014ee5a0213718beb78 Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Sat, 30 May 2020 14:08:26 -0400 Subject: [PATCH] xr.infer_freq (#4033) * xr.infer_freq and related code * Formatting and comments * Rewrite _CFTimeFrequencyInferer independently of pandas * Syntax and add frequency.py file * Fix tests and month_deltas * Require cftime 1.1.0 for the test * Apply suggestions from code review Co-authored-by: Spencer Clark * Changes following code review * Docs * Docs * Black * Fix tests for requiring cftime 1.1.0 * Update whats-new * Apply suggestions from code review Co-authored-by: Spencer Clark Co-authored-by: Mathias Hauser * Add invalid input tests for better coverage * Fix link in whats-new.rst Co-authored-by: Spencer Clark Co-authored-by: Mathias Hauser --- doc/api.rst | 1 + doc/weather-climate.rst | 9 + doc/whats-new.rst | 3 +- xarray/__init__.py | 2 + xarray/coding/cftimeindex.py | 3 +- xarray/coding/frequencies.py | 272 +++++++++++++++++++++++++++++++ xarray/tests/test_cftimeindex.py | 70 ++++++++ 7 files changed, 358 insertions(+), 2 deletions(-) create mode 100644 xarray/coding/frequencies.py diff --git a/doc/api.rst b/doc/api.rst index c9f24e8c3f1..3f25ac1a070 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -26,6 +26,7 @@ Top-level functions combine_nested where set_options + infer_freq full_like zeros_like ones_like diff --git a/doc/weather-climate.rst b/doc/weather-climate.rst index 1eb63d24630..f03dfd14c73 100644 --- a/doc/weather-climate.rst +++ b/doc/weather-climate.rst @@ -74,6 +74,15 @@ instance, we can create the same dates and DataArray we created above using: dates = xr.cftime_range(start="0001", periods=24, freq="MS", calendar="noleap") da = xr.DataArray(np.arange(24), coords=[dates], dims=["time"], name="foo") +Mirroring pandas' method with the same name, :py:meth:`~xarray.infer_freq` allows one to +infer the sampling frequency of a :py:class:`~xarray.CFTimeIndex` or a 1-D +:py:class:`~xarray.DataArray` containing 
cftime objects. It also works transparently with +``np.datetime64[ns]`` and ``np.timedelta64[ns]`` data. + +.. ipython:: python + + xr.infer_freq(dates) + With :py:meth:`~xarray.CFTimeIndex.strftime` we can also easily generate formatted strings from the datetime values of a :py:class:`~xarray.CFTimeIndex` directly or through the :py:meth:`~xarray.DataArray.dt` accessor for a :py:class:`~xarray.DataArray` diff --git a/doc/whats-new.rst b/doc/whats-new.rst index e06ed5be897..e8e30917cff 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -43,7 +43,8 @@ Enhancements New Features ~~~~~~~~~~~~ - +- Added :py:meth:`xarray.infer_freq`, which extends frequency inference to CFTime indexes and data (:pull:`4033`). + By `Pascal Bourgault <https://github.com/aulemahal>`_. - ``chunks='auto'`` is now supported in the ``chunks`` argument of :py:meth:`Dataset.chunk`. (:issue:`4055`) By `Andrew Williams `_ diff --git a/xarray/__init__.py b/xarray/__init__.py index e8274d13ffe..cb4824d188d 100644 --- a/xarray/__init__.py +++ b/xarray/__init__.py @@ -13,6 +13,7 @@ from .backends.zarr import open_zarr from .coding.cftime_offsets import cftime_range from .coding.cftimeindex import CFTimeIndex +from .coding.frequencies import infer_freq from .conventions import SerializationWarning, decode_cf from .core.alignment import align, broadcast from .core.combine import auto_combine, combine_by_coords, combine_nested @@ -57,6 +58,7 @@ "cov", "corr", "full_like", + "infer_freq", "load_dataarray", "load_dataset", "map_blocks", diff --git a/xarray/coding/cftimeindex.py b/xarray/coding/cftimeindex.py index 6fc28d213dd..2a7eaa99edb 100644 --- a/xarray/coding/cftimeindex.py +++ b/xarray/coding/cftimeindex.py @@ -578,7 +578,8 @@ def asi8(self): [ _total_microseconds(exact_cftime_datetime_difference(epoch, date)) for date in self.values - ] + ], + dtype=np.int64, ) def _round_via_method(self, freq, method): diff --git a/xarray/coding/frequencies.py b/xarray/coding/frequencies.py new file mode 100644 index 
00000000000..86f84ba5fbd --- /dev/null +++ b/xarray/coding/frequencies.py @@ -0,0 +1,272 @@ +"""FrequencyInferer analog for cftime.datetime objects""" +# The infer_freq function and the _CFTimeFrequencyInferer +# class defined here were copied and adapted for +# use with cftime.datetime objects based on the source code in +# pandas.tseries.frequencies._FrequencyInferer + +# For reference, here is a copy of the pandas copyright notice: + +# (c) 2011-2012, Lambda Foundry, Inc. and PyData Development Team +# All rights reserved. + +# Copyright (c) 2008-2011 AQR Capital Management, LLC +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: + +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. + +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. + +# * Neither the name of the copyright holder nor the names of any +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import pandas as pd + +from ..core.common import _contains_datetime_like_objects +from .cftime_offsets import _MONTH_ABBREVIATIONS +from .cftimeindex import CFTimeIndex + +_ONE_MICRO = 1 +_ONE_MILLI = _ONE_MICRO * 1000 +_ONE_SECOND = _ONE_MILLI * 1000 +_ONE_MINUTE = 60 * _ONE_SECOND +_ONE_HOUR = 60 * _ONE_MINUTE +_ONE_DAY = 24 * _ONE_HOUR + + +def infer_freq(index): + """ + Infer the most likely frequency given the input index. + + Parameters + ---------- + index : CFTimeIndex, DataArray, pd.DatetimeIndex, pd.TimedeltaIndex, pd.Series + If not passed a CFTimeIndex, this simply calls `pandas.infer_freq`. + If passed a Series or a DataArray will use the values of the series (NOT THE INDEX). + + Returns + ------- + str or None + None if no discernible frequency. + + Raises + ------ + TypeError + If the index is not datetime-like. + ValueError + If there are fewer than three values or the index is not 1D. 
+ """ + from xarray.core.dataarray import DataArray + + if isinstance(index, (DataArray, pd.Series)): + if index.ndim != 1: + raise ValueError("'index' must be 1D") + elif not _contains_datetime_like_objects(DataArray(index)): + raise ValueError("'index' must contain datetime-like objects") + dtype = np.asarray(index).dtype + if dtype == "datetime64[ns]": + index = pd.DatetimeIndex(index.values) + elif dtype == "timedelta64[ns]": + index = pd.TimedeltaIndex(index.values) + else: + index = CFTimeIndex(index.values) + + if isinstance(index, CFTimeIndex): + inferer = _CFTimeFrequencyInferer(index) + return inferer.get_freq() + + return pd.infer_freq(index) + + +class _CFTimeFrequencyInferer: # (pd.tseries.frequencies._FrequencyInferer): + def __init__(self, index): + self.index = index + self.values = index.asi8 + + if len(index) < 3: + raise ValueError("Need at least 3 dates to infer frequency") + + self.is_monotonic = ( + self.index.is_monotonic_decreasing or self.index.is_monotonic_increasing + ) + + self._deltas = None + self._year_deltas = None + self._month_deltas = None + + def get_freq(self): + """Find the appropriate frequency string to describe the inferred frequency of self.index + + Adapted from `pandas.tsseries.frequencies._FrequencyInferer.get_freq` for CFTimeIndexes. 
+ + Returns + ------- + str or None + """ + if not self.is_monotonic or not self.index.is_unique: + return None + + delta = self.deltas[0] # Smallest delta + if _is_multiple(delta, _ONE_DAY): + return self._infer_daily_rule() + # There is no possible intraday frequency with a non-unique delta + # Different from pandas: we don't need to manage DST and business offsets in cftime + elif not len(self.deltas) == 1: + return None + + if _is_multiple(delta, _ONE_HOUR): + return _maybe_add_count("H", delta / _ONE_HOUR) + elif _is_multiple(delta, _ONE_MINUTE): + return _maybe_add_count("T", delta / _ONE_MINUTE) + elif _is_multiple(delta, _ONE_SECOND): + return _maybe_add_count("S", delta / _ONE_SECOND) + elif _is_multiple(delta, _ONE_MILLI): + return _maybe_add_count("L", delta / _ONE_MILLI) + else: + return _maybe_add_count("U", delta / _ONE_MICRO) + + def _infer_daily_rule(self): + annual_rule = self._get_annual_rule() + if annual_rule: + nyears = self.year_deltas[0] + month = _MONTH_ABBREVIATIONS[self.index[0].month] + alias = f"{annual_rule}-{month}" + return _maybe_add_count(alias, nyears) + + quartely_rule = self._get_quartely_rule() + if quartely_rule: + nquarters = self.month_deltas[0] / 3 + mod_dict = {0: 12, 2: 11, 1: 10} + month = _MONTH_ABBREVIATIONS[mod_dict[self.index[0].month % 3]] + alias = f"{quartely_rule}-{month}" + return _maybe_add_count(alias, nquarters) + + monthly_rule = self._get_monthly_rule() + if monthly_rule: + return _maybe_add_count(monthly_rule, self.month_deltas[0]) + + if len(self.deltas) == 1: + # Daily as there is no "Weekly" offsets with CFTime + days = self.deltas[0] / _ONE_DAY + return _maybe_add_count("D", days) + + # CFTime has no business freq and no "week of month" (WOM) + return None + + def _get_annual_rule(self): + if len(self.year_deltas) > 1: + return None + + if len(np.unique(self.index.month)) > 1: + return None + + return {"cs": "AS", "ce": "A"}.get(month_anchor_check(self.index)) + + def _get_quartely_rule(self): + if 
len(self.month_deltas) > 1: + return None + + if not self.month_deltas[0] % 3 == 0: + return None + + return {"cs": "QS", "ce": "Q"}.get(month_anchor_check(self.index)) + + def _get_monthly_rule(self): + if len(self.month_deltas) > 1: + return None + + return {"cs": "MS", "ce": "M"}.get(month_anchor_check(self.index)) + + @property + def deltas(self): + """Sorted unique timedeltas as microseconds.""" + if self._deltas is None: + self._deltas = _unique_deltas(self.values) + return self._deltas + + @property + def year_deltas(self): + """Sorted unique year deltas.""" + if self._year_deltas is None: + self._year_deltas = _unique_deltas(self.index.year) + return self._year_deltas + + @property + def month_deltas(self): + """Sorted unique month deltas.""" + if self._month_deltas is None: + self._month_deltas = _unique_deltas(self.index.year * 12 + self.index.month) + return self._month_deltas + + +def _unique_deltas(arr): + """Sorted unique deltas of numpy array""" + return np.sort(np.unique(np.diff(arr))) + + +def _is_multiple(us, mult: int): + """Whether us is a multiple of mult""" + return us % mult == 0 + + +def _maybe_add_count(base: str, count: float): + """If count is greater than 1, add it to the base offset string""" + if count != 1: + assert count == int(count) + count = int(count) + return f"{count}{base}" + else: + return base + + +def month_anchor_check(dates): + """Return the monthly offset string. + + Return "cs" if all dates are the first days of the month, + "ce" if all dates are the last day of the month, + None otherwise. + + Replicated pandas._libs.tslibs.resolution.month_position_check + but without business offset handling. 
+ """ + calendar_end = True + calendar_start = True + + for date in dates: + if calendar_start: + calendar_start &= date.day == 1 + + if calendar_end: + cal = date.day == date.daysinmonth + if calendar_end: + calendar_end &= cal + elif not calendar_start: + break + + if calendar_end: + return "ce" + elif calendar_start: + return "cs" + else: + return None diff --git a/xarray/tests/test_cftimeindex.py b/xarray/tests/test_cftimeindex.py index b30e32c92ad..745ae341370 100644 --- a/xarray/tests/test_cftimeindex.py +++ b/xarray/tests/test_cftimeindex.py @@ -1046,3 +1046,73 @@ def test_asi8_distant_date(): result = index.asi8 expected = np.array([1000000 * 86400 * 400 * 8000 + 12345 * 1000000 + 123456]) np.testing.assert_array_equal(result, expected) + + +@requires_cftime_1_1_0 +def test_infer_freq_valid_types(): + cf_indx = xr.cftime_range("2000-01-01", periods=3, freq="D") + assert xr.infer_freq(cf_indx) == "D" + assert xr.infer_freq(xr.DataArray(cf_indx)) == "D" + + pd_indx = pd.date_range("2000-01-01", periods=3, freq="D") + assert xr.infer_freq(pd_indx) == "D" + assert xr.infer_freq(xr.DataArray(pd_indx)) == "D" + + pd_td_indx = pd.timedelta_range(start="1D", periods=3, freq="D") + assert xr.infer_freq(pd_td_indx) == "D" + assert xr.infer_freq(xr.DataArray(pd_td_indx)) == "D" + + +@requires_cftime_1_1_0 +def test_infer_freq_invalid_inputs(): + # Non-datetime DataArray + with pytest.raises(ValueError, match="must contain datetime-like objects"): + xr.infer_freq(xr.DataArray([0, 1, 2])) + + indx = xr.cftime_range("1990-02-03", periods=4, freq="MS") + # 2D DataArray + with pytest.raises(ValueError, match="must be 1D"): + xr.infer_freq(xr.DataArray([indx, indx])) + + # CFTimeIndex too short + with pytest.raises(ValueError, match="Need at least 3 dates to infer frequency"): + xr.infer_freq(indx[:2]) + + # Non-monotonic input + assert xr.infer_freq(indx[np.array([0, 2, 1, 3])]) is None + + # Non-unique input + assert xr.infer_freq(indx[np.array([0, 1, 1, 2])]) is None + + 
# No unique frequency (here 1st step is MS, second is 2MS) + assert xr.infer_freq(indx[np.array([0, 1, 3])]) is None + + # Same, but for QS + indx = xr.cftime_range("1990-02-03", periods=4, freq="QS") + assert xr.infer_freq(indx[np.array([0, 1, 3])]) is None + + +@requires_cftime_1_1_0 +@pytest.mark.parametrize( + "freq", + [ + "300AS-JAN", + "A-DEC", + "AS-JUL", + "2AS-FEB", + "Q-NOV", + "3QS-DEC", + "MS", + "4M", + "7D", + "D", + "30H", + "5T", + "40S", + ], +) +@pytest.mark.parametrize("calendar", _CFTIME_CALENDARS) +def test_infer_freq(freq, calendar): + indx = xr.cftime_range("2000-01-01", periods=3, freq=freq, calendar=calendar) + out = xr.infer_freq(indx) + assert out == freq