Add support for CFTimeIndex in get_clean_interp_index (#3631)

* add support for CFTimeIndex in get_clean_interp_index * black * added test comparing cftime index with standard index * added comment * index in ns instead of days * pep8 * datetime_to_numeric: convert timedelta objects using np.timedelta64 type conversion. add overflow tests * added interp test * switched clean_interp_index resolution to us. Fixed interpolate_na and added support for CFTimeIndex. * Error message to explain overflow problem. * switched timedelta64 units from ms to us * reverted default user-visible resolution to ns. Converts to float, possibly lossy. * pep8 * black * special case for older numpy versions * black * added xfail for overflow error with numpy < 1.17 * changes following PR comments from spencerclark * bypass pandas to convert timedeltas to floats. avoids overflow errors. * black * removed numpy conversion. added docstrings. renamed tests. * pep8 * updated whats new * Update doc/whats-new.rst Co-Authored-By: Spencer Clark <spencerkclark@gmail.com> * update interpolate_na docstrings * black * dt conflicts with accessor * replaced assert_equal by assert_allclose * Update xarray/core/duck_array_ops.py Co-Authored-By: Spencer Clark <spencerkclark@gmail.com> * Update xarray/core/duck_array_ops.py Co-Authored-By: Spencer Clark <spencerkclark@gmail.com> * renamed array to value in timedelta_to_numeric. Added tests * removed support for TimedeltaIndex in timedelta_to_numeric * added tests for np_timedelta64_to_float and pd_timedelta_to_float. renamed array to value for pd_timedelta_to_float. removed pd_timedeltaindex_to_float. * black * Fix flake8 error * black Co-authored-by: Spencer Clark <spencerkclark@gmail.com>
pydata · Jan 26, 2020 · 8772355 · 8772355
1 parent cc142f4
commit 8772355
Show file tree

Hide file tree

Showing 9 changed files with 352 additions and 95 deletions.
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
@@ -29,8 +29,8 @@ Breaking changes
   - scipy 1.3
 
 - Remove ``compat`` and ``encoding`` kwargs from ``DataArray``, which
-  have been deprecated since 0.12. (:pull:`3650`). 
-  Instead, specify the encoding when writing to disk or set 
+  have been deprecated since 0.12. (:pull:`3650`).
+  Instead, specify the encoding when writing to disk or set
   the ``encoding`` attribute directly.
   By `Maximilian Roos <https://github.com/max-sixty>`_
 - :py:func:`xarray.dot`, :py:meth:`DataArray.dot`, and the ``@`` operator now
@@ -67,10 +67,15 @@ New Features
 - :py:meth:`Dataset.swap_dims` and :py:meth:`DataArray.swap_dims`
   now allow swapping to dimension names that don't exist yet. (:pull:`3636`)
   By `Justus Magin <https://github.com/keewis>`_.
-- Extend :py:class:`core.accessor_dt.DatetimeAccessor` properties 
-  and support `.dt` accessor for timedelta 
+- Extend :py:class:`core.accessor_dt.DatetimeAccessor` properties
+  and support `.dt` accessor for timedelta
   via :py:class:`core.accessor_dt.TimedeltaAccessor` (:pull:`3612`)
   By `Anderson Banihirwe <https://github.com/andersy005>`_.
+- Support CFTimeIndex in :py:meth:`DataArray.interpolate_na`. The interpolation
+  index for both DatetimeIndex and CFTimeIndex now uses 1970-01-01 as its
+  default offset, and timedelta objects are converted to floats via microseconds
+  to avoid overflow errors (:issue:`3641`, :pull:`3631`).
+  By `David Huard <https://github.com/huard>`_.
 
 Bug fixes
 ~~~~~~~~~

diff --git a/xarray/coding/cftimeindex.py b/xarray/coding/cftimeindex.py
@@ -430,7 +430,14 @@ def __sub__(self, other):
         import cftime
 
         if isinstance(other, (CFTimeIndex, cftime.datetime)):
-            return pd.TimedeltaIndex(np.array(self) - np.array(other))
+            try:
+                return pd.TimedeltaIndex(np.array(self) - np.array(other))
+            except OverflowError:
+                raise ValueError(
+                    "The time difference exceeds the range of values "
+                    "that can be expressed at the nanosecond resolution."
+                )
+
         elif isinstance(other, pd.TimedeltaIndex):
             return CFTimeIndex(np.array(self) - other.to_pytimedelta())
         else:

diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
@@ -18,6 +18,7 @@
     cast,
 )
 
+import datetime
 import numpy as np
 import pandas as pd
 
@@ -2041,7 +2042,9 @@ def interpolate_na(
         method: str = "linear",
         limit: int = None,
         use_coordinate: Union[bool, str] = True,
-        max_gap: Union[int, float, str, pd.Timedelta, np.timedelta64] = None,
+        max_gap: Union[
+            int, float, str, pd.Timedelta, np.timedelta64, datetime.timedelta
+        ] = None,
         **kwargs: Any,
     ) -> "DataArray":
         """Fill in NaNs by interpolating according to different methods.
@@ -2073,14 +2076,15 @@ def interpolate_na(
             or None for no limit. This filling is done regardless of the size of
             the gap in the data. To only interpolate over gaps less than a given length,
             see ``max_gap``.
-        max_gap: int, float, str, pandas.Timedelta, numpy.timedelta64, default None.
+        max_gap: int, float, str, pandas.Timedelta, numpy.timedelta64, datetime.timedelta, default None.
             Maximum size of gap, a continuous sequence of NaNs, that will be filled.
             Use None for no limit. When interpolating along a datetime64 dimension
             and ``use_coordinate=True``, ``max_gap`` can be one of the following:
 
             - a string that is valid input for pandas.to_timedelta
             - a :py:class:`numpy.timedelta64` object
             - a :py:class:`pandas.Timedelta` object
+            - a :py:class:`datetime.timedelta` object
 
             Otherwise, ``max_gap`` must be an int or a float. Use of ``max_gap`` with unlabeled
             dimensions has not been implemented yet. Gap length is defined as the difference

diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
@@ -27,6 +27,7 @@
     cast,
 )
 
+import datetime
 import numpy as np
 import pandas as pd
 
@@ -3995,7 +3996,9 @@ def interpolate_na(
         method: str = "linear",
         limit: int = None,
         use_coordinate: Union[bool, Hashable] = True,
-        max_gap: Union[int, float, str, pd.Timedelta, np.timedelta64] = None,
+        max_gap: Union[
+            int, float, str, pd.Timedelta, np.timedelta64, datetime.timedelta
+        ] = None,
         **kwargs: Any,
     ) -> "Dataset":
         """Fill in NaNs by interpolating according to different methods.
@@ -4028,14 +4031,15 @@ def interpolate_na(
             or None for no limit. This filling is done regardless of the size of
             the gap in the data. To only interpolate over gaps less than a given length,
             see ``max_gap``.
-        max_gap: int, float, str, pandas.Timedelta, numpy.timedelta64, default None.
+        max_gap: int, float, str, pandas.Timedelta, numpy.timedelta64, datetime.timedelta, default None.
             Maximum size of gap, a continuous sequence of NaNs, that will be filled.
             Use None for no limit. When interpolating along a datetime64 dimension
             and ``use_coordinate=True``, ``max_gap`` can be one of the following:
 
             - a string that is valid input for pandas.to_timedelta
             - a :py:class:`numpy.timedelta64` object
             - a :py:class:`pandas.Timedelta` object
+            - a :py:class:`datetime.timedelta` object
 
             Otherwise, ``max_gap`` must be an int or a float. Use of ``max_gap`` with unlabeled
             dimensions has not been implemented yet. Gap length is defined as the difference

diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py
@@ -372,51 +372,141 @@ def _datetime_nanmin(array):
 
 
 def datetime_to_numeric(array, offset=None, datetime_unit=None, dtype=float):
-    """Convert an array containing datetime-like data to an array of floats.
+    """Convert an array containing datetime-like data to numerical values.
+
+    Convert the datetime array to a timedelta relative to an offset.
 
     Parameters
     ----------
-    da : np.array
-        Input data
-    offset: Scalar with the same type of array or None
-        If None, subtract minimum values to reduce round off error
-    datetime_unit: None or any of {'Y', 'M', 'W', 'D', 'h', 'm', 's', 'ms',
-        'us', 'ns', 'ps', 'fs', 'as'}
-    dtype: target dtype
+    array : array-like
+      Input data
+    offset: None, datetime or cftime.datetime
+      Datetime offset. If None, this is set by default to the array's minimum
+      value to reduce round off errors.
+    datetime_unit: {None, Y, M, W, D, h, m, s, ms, us, ns, ps, fs, as}
+      If not None, convert output to a given datetime unit. Note that some
+      conversions are not allowed due to non-linear relationships between units.
+    dtype: dtype
+      Output dtype.
 
     Returns
     -------
     array
+      Numerical representation of datetime object relative to an offset.
+
+    Notes
+    -----
+    Some datetime unit conversions won't work, for example from days to years, even
+    though some calendars would allow for them (e.g. no_leap). This is because there
+    is no `cftime.timedelta` object.
     """
     # TODO: make this function dask-compatible?
+    # Set offset to minimum if not given
     if offset is None:
         if array.dtype.kind in "Mm":
             offset = _datetime_nanmin(array)
         else:
             offset = min(array)
+
+    # Compute timedelta object.
+    # For np.datetime64, this can silently yield garbage due to overflow.
+    # One option is to enforce 1970-01-01 as the universal offset.
     array = array - offset
 
-    if not hasattr(array, "dtype"):  # scalar is converted to 0d-array
+    # Scalar is converted to 0d-array
+    if not hasattr(array, "dtype"):
         array = np.array(array)
 
+    # Convert timedelta objects to float by first converting to microseconds.
     if array.dtype.kind in "O":
-        # possibly convert object array containing datetime.timedelta
-        array = np.asarray(pd.Series(array.ravel())).reshape(array.shape)
+        return py_timedelta_to_float(array, datetime_unit or "ns").astype(dtype)
 
-    if datetime_unit:
-        array = array / np.timedelta64(1, datetime_unit)
+    # NumPy datetime-like arrays: optionally rescale, then map np.NaT to np.nan
+    elif array.dtype.kind in "mM":
 
-    # convert np.NaT to np.nan
-    if array.dtype.kind in "mM":
+        # Convert to specified timedelta units.
+        if datetime_unit:
+            array = array / np.timedelta64(1, datetime_unit)
         return np.where(isnull(array), np.nan, array.astype(dtype))
-    return array.astype(dtype)
+    # NOTE(review): an array whose dtype kind is neither "O" nor "m"/"M" falls
+    # through to here and the function implicitly returns None -- confirm.
+
+
+def timedelta_to_numeric(value, datetime_unit="ns", dtype=float):
+    """Convert a timedelta-like object to numerical values.
+
+    Parameters
+    ----------
+    value : datetime.timedelta, numpy.timedelta64, pandas.Timedelta, str
+      Time delta representation. Strings are parsed with pandas.to_timedelta.
+    datetime_unit : {Y, M, W, D, h, m, s, ms, us, ns, ps, fs, as}
+      The time units of the output values. Note that some conversions are not
+      allowed due to non-linear relationships between units.
+    dtype : type
+      The output data type.
+
+    Returns
+    -------
+    numpy scalar or array
+      ``value`` expressed in ``datetime_unit`` and cast to ``dtype``.
+
+    Raises
+    ------
+    ValueError
+      If ``value`` is a string that pandas.to_timedelta cannot parse.
+    TypeError
+      If ``value`` is not one of the supported types.
+    """
+    import datetime as dt
+
+    # pandas.Timedelta subclasses datetime.timedelta, so it has to be tested
+    # first; otherwise the dedicated pandas branch can never be reached.
+    if isinstance(value, pd.Timedelta):
+        out = pd_timedelta_to_float(value, datetime_unit)
+    elif isinstance(value, dt.timedelta):
+        out = py_timedelta_to_float(value, datetime_unit)
+    elif isinstance(value, np.timedelta64):
+        out = np_timedelta64_to_float(value, datetime_unit)
+    elif isinstance(value, str):
+        try:
+            parsed = pd.to_timedelta(value)
+        except ValueError as err:
+            raise ValueError(
+                f"Could not convert {value!r} to timedelta64 using pandas.to_timedelta"
+            ) from err
+        # Do not return early: the common cast below must honour ``dtype`` for
+        # string input as well.
+        out = py_timedelta_to_float(parsed, datetime_unit)
+    else:
+        raise TypeError(
+            "Expected value of type str, pandas.Timedelta, datetime.timedelta "
+            f"or numpy.timedelta64, but received {type(value).__name__}"
+        )
+    return out.astype(dtype)
 
 
 def _to_pytimedelta(array, unit="us"):
+    """Convert ``array`` to datetime.timedelta objects, keeping its shape.
+
+    The flattened input is wrapped in a pandas.TimedeltaIndex built with
+    ``unit`` and converted with ``to_pytimedelta``; the result is reshaped
+    back to ``array.shape``.
+    """
+    # assumes ``array`` is a numpy array (needs .ravel() and .shape) -- TODO confirm
     index = pd.TimedeltaIndex(array.ravel(), unit=unit)
     return index.to_pytimedelta().reshape(array.shape)
 
 
+def np_timedelta64_to_float(array, datetime_unit):
+    """Convert numpy.timedelta64 to float.
+
+    Parameters
+    ----------
+    array : numpy.timedelta64 or numpy.ndarray of timedelta64
+      Time delta value(s) to convert.
+    datetime_unit : str
+      Time unit of the output, as accepted by numpy.timedelta64.
+
+    Returns
+    -------
+    numpy.float64 or numpy.ndarray
+      The time delta(s) expressed in ``datetime_unit``.
+
+    Notes
+    -----
+    The input is first cast to nanosecond resolution and then to float64.
+    Values outside the range of timedelta64[ns] may overflow in that cast.
+    """
+    nanoseconds = array.astype("timedelta64[ns]").astype(np.float64)
+    # Number of ``datetime_unit`` steps contained in one nanosecond.
+    units_per_ns = np.timedelta64(1, "ns") / np.timedelta64(1, datetime_unit)
+    return units_per_ns * nanoseconds
+
+
+def pd_timedelta_to_float(value, datetime_unit):
+    """Convert pandas.Timedelta to float.
+
+    The value is turned into a numpy.timedelta64 with ``to_timedelta64`` and
+    the conversion is delegated to ``np_timedelta64_to_float``.
+
+    Notes
+    -----
+    Built on the assumption that pandas timedelta values are in nanoseconds.
+    """
+    return np_timedelta64_to_float(value.to_timedelta64(), datetime_unit)
+
+
+def py_timedelta_to_float(array, datetime_unit):
+    """Convert timedelta object(s) to float, possibly at a loss of resolution.
+
+    Each element is read through ``total_seconds()``, scaled to microseconds
+    and then rescaled to ``datetime_unit``. The input shape is preserved.
+    """
+    deltas = np.asarray(array)
+    seconds = [delta.total_seconds() for delta in deltas.ravel()]
+    microseconds = np.reshape(seconds, deltas.shape) * 1e6
+    # Number of ``datetime_unit`` steps contained in one microsecond.
+    scale = np.timedelta64(1, "us") / np.timedelta64(1, datetime_unit)
+    return scale * microseconds
+
+
 def mean(array, axis=None, skipna=None, **kwargs):
     """inhouse mean that can handle np.datetime64 or cftime.datetime
     dtypes"""