Implement to_datetime(..., utc=True) #14749

Merged · 3 commits · Jan 22, 2024
100 changes: 50 additions & 50 deletions python/cudf/cudf/core/tools/datetimes.py
@@ -1,9 +1,9 @@
# Copyright (c) 2019-2023, NVIDIA CORPORATION.
# Copyright (c) 2019-2024, NVIDIA CORPORATION.

import math
import re
import warnings
from typing import Sequence, Union
from typing import Literal, Optional, Sequence, Union

import cupy as cp
import numpy as np
@@ -49,16 +49,16 @@

def to_datetime(
arg,
errors="raise",
dayfirst=False,
yearfirst=False,
utc=None,
format=None,
exact=True,
unit="ns",
infer_datetime_format=False,
errors: Literal["raise", "coerce", "warn", "ignore"] = "raise",
dayfirst: bool = False,
yearfirst: bool = False,
utc: bool = False,
format: Optional[str] = None,
exact: bool = True,
unit: str = "ns",
infer_datetime_format: bool = False,
origin="unix",
cache=True,
cache: bool = True,
):
"""
Convert argument to datetime.
@@ -80,6 +80,8 @@ def to_datetime(
2012-11-10.
Warning: dayfirst=True is not strict, but will prefer to parse
with day first (this is a known bug, based on dateutil behavior).
utc : bool, default False
Whether the result should have a UTC timezone.
format : str, default None
The strftime to parse time, eg "%d/%m/%Y", note that "%f" will parse
all the way up to nanoseconds.
@@ -148,9 +150,6 @@ def to_datetime(
if yearfirst:
raise NotImplementedError("yearfirst support is not yet implemented")

if utc:
raise NotImplementedError("utc is not yet implemented")

if format is not None:
if "%Z" in format or "%z" in format:
raise NotImplementedError(
@@ -165,24 +164,24 @@
required = ["year", "month", "day"]
req = list(set(required) - set(arg._data.names))
if len(req):
req = ",".join(req)
err_req = ",".join(req)
raise ValueError(
f"to assemble mappings requires at least that "
f"[year, month, day] be specified: [{req}] "
f"[year, month, day] be specified: [{err_req}] "
f"is missing"
)

# replace passed column name with values in _unit_map
unit = {k: get_units(k) for k in arg._data.names}
unit_rev = {v: k for k, v in unit.items()}
got_units = {k: get_units(k) for k in arg._data.names}
unit_rev = {v: k for k, v in got_units.items()}

# keys we don't recognize
excess = set(unit_rev.keys()) - set(_unit_map.values())
if len(excess):
excess = ",".join(excess)
err_excess = ",".join(excess)
raise ValueError(
f"extra keys have been passed to the "
f"datetime assemblage: [{excess}]"
f"datetime assemblage: [{err_excess}]"
)

new_series = (
@@ -245,38 +244,29 @@ def to_datetime(
col = (col.astype(dtype="int64") + times_column).astype(
dtype=col.dtype
)
return cudf.Series(col, index=arg.index)
elif isinstance(arg, cudf.BaseIndex):
col = arg._values
col = _process_col(
col=col,
unit=unit,
dayfirst=dayfirst,
infer_datetime_format=infer_datetime_format,
format=format,
)
return as_index(col, name=arg.name)
elif isinstance(arg, (cudf.Series, pd.Series)):
col = column.as_column(arg)
col = _process_col(
col=col,
unit=unit,
dayfirst=dayfirst,
infer_datetime_format=infer_datetime_format,
format=format,
utc=utc,
)
return cudf.Series(col, index=arg.index, name=arg.name)
return cudf.Series(col, index=arg.index)
else:
col = column.as_column(arg)
col = _process_col(
col=col,
col=column.as_column(arg),
unit=unit,
dayfirst=dayfirst,
infer_datetime_format=infer_datetime_format,
format=format,
utc=utc,
)

if is_scalar(arg):
if isinstance(arg, (cudf.BaseIndex, pd.Index)):
return as_index(col, name=arg.name)
elif isinstance(arg, (cudf.Series, pd.Series)):
return cudf.Series(col, index=arg.index, name=arg.name)
elif is_scalar(arg):
return col.element_indexing(0)
else:
return as_index(col)
@@ -295,11 +285,18 @@ def to_datetime(
return arg


def _process_col(col, unit, dayfirst, infer_datetime_format, format):
if col.dtype.kind == "M":
return col
def _process_col(
col,
unit: str,
dayfirst: bool,
infer_datetime_format: bool,
format: Optional[str],
utc: bool,
):
# Causes circular import
from cudf.core._internals.timezones import localize

elif col.dtype.kind in ("f"):
if col.dtype.kind == "f":
if unit not in (None, "ns"):
factor = cudf.Scalar(
column.datetime._unit_to_nanoseconds_conversion[unit]
@@ -325,9 +322,8 @@ def _process_col(col, unit, dayfirst, infer_datetime_format, format):
)
else:
col = col.as_datetime_column(dtype="datetime64[ns]")
return col

elif col.dtype.kind in ("i"):
elif col.dtype.kind in "iu":
if unit in ("D", "h", "m"):
factor = cudf.Scalar(
column.datetime._unit_to_nanoseconds_conversion[unit]
@@ -341,9 +337,8 @@ def _process_col(col, unit, dayfirst, infer_datetime_format, format):
)
else:
col = col.as_datetime_column(dtype=_unit_dtype_map[unit])
return col

elif col.dtype.kind in ("O"):
elif col.dtype.kind == "O":
if unit not in (None, "ns") or col.null_count == len(col):
try:
col = col.astype(dtype="int64")
@@ -355,6 +350,7 @@ def _process_col(col, unit, dayfirst, infer_datetime_format, format):
dayfirst=dayfirst,
infer_datetime_format=infer_datetime_format,
format=format,
utc=utc,
)
else:
if format is None:
@@ -367,13 +363,17 @@ def _process_col(col, unit, dayfirst, infer_datetime_format, format):
element=col.element_indexing(0),
dayfirst=dayfirst,
)
return col.as_datetime_column(
col = col.as_datetime_column(
dtype=_unit_dtype_map[unit],
format=format,
)
raise TypeError(
f"dtype {col.dtype} cannot be converted to {_unit_dtype_map[unit]}"
)
elif col.dtype.kind != "M":
raise TypeError(
f"dtype {col.dtype} cannot be converted to {_unit_dtype_map[unit]}"
)
if utc and not isinstance(col.dtype, pd.DatetimeTZDtype):
return localize(col, "UTC", ambiguous="NaT", nonexistent="NaT")
return col


def get_units(value):
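For context, a minimal usage sketch of the behavior this diff enables, adapted from the tests added below. The commented reprs are illustrative only and should not be read as verbatim output.

```python
import cudf
import pandas as pd

# Tz-naive input parsed with utc=True is now localized to UTC instead of
# raising NotImplementedError, matching pandas semantics.
result = cudf.to_datetime(cudf.Series([1, 2]), utc=True)
expected = pd.to_datetime(pd.Series([1, 2]), utc=True)
print(result.dtype)    # datetime64[ns, UTC]
print(expected.dtype)  # datetime64[ns, UTC]

# The DataFrame-assembly path is covered as well: year/month/day columns are
# combined into timestamps and then localized to UTC.
df = cudf.DataFrame({"year": [2020], "month": [1], "day": [1]})
print(cudf.to_datetime(df, utc=True))  # 2020-01-01 00:00:00+00:00
```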
35 changes: 35 additions & 0 deletions python/cudf/cudf/tests/test_datetime.py
@@ -2431,6 +2431,41 @@ def test_to_datetime_errors_non_scalar_not_implemented(errors):
cudf.to_datetime([1, ""], unit="s", errors=errors)


@pytest.mark.parametrize(
"box", [list, pd.Index, cudf.Index, pd.Series, cudf.Series]
)
@pytest.mark.parametrize("dtype", [np.int64, np.uint64, np.float64])
def test_to_datetime_arraylike_utc_true(box, dtype):
pd_data = [1, 2]
cudf_data = box(pd_data)
if box is not list:
cudf_data = cudf_data.astype(dtype)
if box is cudf.Series or box is pd.Series:
pd_data = pd.Series(pd_data)
result = cudf.to_datetime(cudf_data, utc=True)
expected = pd.to_datetime(pd_data, utc=True)
assert_eq(result, expected)


@pytest.mark.xfail(
raises=TypeError,
reason="libcudf.copying.get_element doesn't understand pd.DatetimeTZDtype",
)
def test_to_datetime_scalar_utc_true():
data = pd.Timestamp(2020, 1, 1)
with cudf.option_context("mode.pandas_compatible", True):
result = cudf.to_datetime(data, utc=True)
expected = pd.Timestamp(year=2020, month=1, day=1, tz="UTC")
assert_eq(result, expected)


def test_to_datetime_dataframe_utc_true():
data = cudf.DataFrame([[2020, 1, 1]], columns=["year", "month", "day"])
result = cudf.to_datetime(data, utc=True)
expected = pd.Series([datetime.datetime(2020, 1, 1)]).dt.tz_localize("UTC")
assert_eq(result, expected)


def test_datetimeindex_dtype_np_dtype():
dtype = np.dtype("datetime64[ns]")
data = [1]
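One further hedged sketch: `_process_col` only localizes when the column is not already tz-aware, and the tz-aware dtype it produces is pandas' `DatetimeTZDtype`, so a quick dtype check (not part of this PR's test suite, and assuming the default nanosecond unit) would look like:

```python
import cudf
import pandas as pd

# utc=True should yield a column whose dtype compares equal to pandas'
# DatetimeTZDtype; assumes the default "ns" unit.
result = cudf.to_datetime(cudf.Series([1, 2]), utc=True)
assert result.dtype == pd.DatetimeTZDtype(unit="ns", tz="UTC")
```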