Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement _most_ of the EA interface for DTA/TDA #23643

Merged
merged 11 commits into from
Nov 14, 2018
70 changes: 69 additions & 1 deletion pandas/core/arrays/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
from pandas.core.dtypes.missing import isna

import pandas.core.common as com
from pandas.core.algorithms import checked_add_with_arr
from pandas.core.algorithms import checked_add_with_arr, take, unique1d

from .base import ExtensionOpsMixin
from pandas.util._decorators import deprecate_kwarg
Expand Down Expand Up @@ -196,6 +196,74 @@ def astype(self, dtype, copy=True):
return self._box_values(self.asi8)
return super(DatetimeLikeArrayMixin, self).astype(dtype, copy)

# ------------------------------------------------------------------
# ExtensionArray Interface
# TODO:
# * _from_sequence
# * argsort / _values_for_argsort
# * _reduce

def unique(self):
    """
    Return an array of the same type holding the de-duplicated values.

    De-duplication is done on the i8 representation; the result is
    re-wrapped using this array's dtype.
    """
    distinct_i8 = unique1d(self.asi8)
    klass = type(self)
    return klass(distinct_i8, dtype=self.dtype)

def _validate_fill_value(self, fill_value):
"""
If a fill_value is passed to `take` convert it to an i8 representation,
raising ValueError if this is not possible.

Parameters
----------
fill_value : object

Returns
-------
fill_value : np.int64

Raises
------
ValueError
"""
# Abstract hook: this base implementation performs no conversion and
# only raises; the conversion itself has to come from an override.
raise AbstractMethodError(self)

def take(self, indices, allow_fill=False, fill_value=None):
    """
    Take elements at the given positions from this array.

    Parameters
    ----------
    indices : sequence of int
    allow_fill : bool, default False
        When True, `fill_value` is first converted through
        ``_validate_fill_value``.
    fill_value : object, optional

    Returns
    -------
    Array of the same type and dtype as ``self``.
    """
    # The fill value is only validated/converted when filling is
    # requested; otherwise it is forwarded exactly as given.
    if allow_fill:
        i8_fill = self._validate_fill_value(fill_value)
    else:
        i8_fill = fill_value

    taken = take(self.asi8, indices,
                 allow_fill=allow_fill, fill_value=i8_fill)
    klass = type(self)
    return klass(taken, dtype=self.dtype)

@classmethod
def _concat_same_type(cls, to_concat):
    """
    Concatenate several arrays of this type into a single array.

    Parameters
    ----------
    to_concat : sequence of arrays of this type
        All elements must share a single dtype. Their ``freq`` values
        are allowed to differ.

    Returns
    -------
    Instance of ``cls`` holding the i8 values of every input, in order.

    Notes
    -----
    A ``freq`` shared by the pieces says nothing about the spacing of
    the concatenated values (e.g. joining an array with itself), so it
    is only kept in the trivial single-input case. Otherwise ``None``
    is passed.
    """
    # dtype captures tz for datetime64tz case
    # NOTE(review): for period arrays the freq is presumably carried by
    # the dtype as well, as `unique`/`take` rebuild from dtype alone --
    # confirm.
    dtypes = {x.dtype for x in to_concat}
    assert len(dtypes) == 1, "to_concat must share a single dtype"
    dtype = next(iter(dtypes))

    # With exactly one input the values are unchanged, so its freq
    # still describes them.
    freq = to_concat[0].freq if len(to_concat) == 1 else None

    values = np.concatenate([x.asi8 for x in to_concat])
    return cls(values, dtype=dtype, freq=freq)

def copy(self, deep=False):
    """
    Return a new array of the same type, dtype and freq.

    Parameters
    ----------
    deep : bool, default False
        When True the underlying i8 values are copied; otherwise the
        new array is built from the very same values object.
    """
    i8_values = self.asi8.copy() if deep else self.asi8
    klass = type(self)
    return klass(i8_values, dtype=self.dtype, freq=self.freq)

def _values_for_factorize(self):
    """
    Return the values to factorize together with their NA marker.

    Returns
    -------
    tuple
        ``(self.asi8, iNaT)``
    """
    na_marker = iNaT
    return self.asi8, na_marker

@classmethod
def _from_factorized(cls, values, original):
    """
    Rebuild an array of this type from factorized (unique) i8 values.

    Parameters
    ----------
    values : ndarray of i8
        The uniques, in the representation handed out by
        ``_values_for_factorize``.
    original : array of this type
        The array that was factorized; supplies the dtype.

    Returns
    -------
    Instance of ``cls`` with ``original.dtype``.

    Notes
    -----
    ``original.freq`` is only forwarded when `values` are exactly the
    original values in their original order. A reordered result, such
    as a sorted factorize of an array with a negative freq, no longer
    conforms to that freq, so ``None`` is passed instead.
    """
    # NOTE(review): for period arrays the freq is presumably recoverable
    # from the dtype (as in `unique`/`take`, which pass dtype only) --
    # confirm the constructor accepts freq=None with a period dtype.
    same_order = np.array_equal(values, original.asi8)
    freq = original.freq if same_order else None
    return cls(values, dtype=original.dtype, freq=freq)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does it make sense to pass original.freq here?
It seems a bit strange as this is creating a new array which does not necessarily have the same order as the original one.

Although in practice, if you have a freq, that means you have a regular and unique array to start with, so the factorization is kind of a no-op and the result will still have the same freq? (But I might be missing corner cases.)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thinking of a possible corner case, which is currently actually broken: a sorted factorize of a DatetimeIndex with a negative freq:

In [57]: idx = pd.date_range("2012-01-01", periods=3)

In [58]: idx
Out[58]: DatetimeIndex(['2012-01-01', '2012-01-02', '2012-01-03'], dtype='datetime64[ns]', freq='D')

In [59]: pd.factorize(idx)
Out[59]: 
(array([0, 1, 2]),
 DatetimeIndex(['2012-01-01', '2012-01-02', '2012-01-03'], dtype='datetime64[ns]', freq='D'))

In [60]: pd.factorize(idx[::-1])
Out[60]: 
(array([0, 1, 2]),
 DatetimeIndex(['2012-01-03', '2012-01-02', '2012-01-01'], dtype='datetime64[ns]', freq='-1D'))

In [61]: pd.factorize(idx[::-1], sort=True)
Out[61]: 
(array([2, 1, 0]),
 DatetimeIndex(['2012-01-01', '2012-01-02', '2012-01-03'], dtype='datetime64[ns]', freq='-1D'))


# ------------------------------------------------------------------
# Null Handling

Expand Down
28 changes: 23 additions & 5 deletions pandas/core/arrays/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
conversion, fields, timezones,
resolution as libresolution)

from pandas.util._decorators import cache_readonly
from pandas.util._decorators import cache_readonly, Appender
from pandas.errors import PerformanceWarning
from pandas import compat

Expand All @@ -21,8 +21,7 @@
is_object_dtype,
is_int64_dtype,
is_datetime64tz_dtype,
is_datetime64_dtype,
ensure_int64)
is_datetime64_dtype)
from pandas.core.dtypes.dtypes import DatetimeTZDtype
from pandas.core.dtypes.missing import isna
from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
Expand Down Expand Up @@ -294,7 +293,7 @@ def _generate_range(cls, start, end, periods, freq, tz=None,

if tz is not None and index.tz is None:
arr = conversion.tz_localize_to_utc(
ensure_int64(index.values),
index.asi8,
tz, ambiguous=ambiguous)

index = cls(arr)
Expand All @@ -317,7 +316,7 @@ def _generate_range(cls, start, end, periods, freq, tz=None,
if not right_closed and len(index) and index[-1] == end:
index = index[:-1]

return cls._simple_new(index.values, freq=freq, tz=tz)
return cls._simple_new(index.asi8, freq=freq, tz=tz)

# -----------------------------------------------------------------
# Descriptive Properties
Expand Down Expand Up @@ -419,6 +418,25 @@ def __iter__(self):
for v in converted:
yield v

# ----------------------------------------------------------------
# ExtensionArray Interface

@property
def _ndarray_values(self):
    """Return the object stored in ``self._data``."""
    backing = self._data
    return backing

@Appender(dtl.DatetimeLikeArrayMixin._validate_fill_value.__doc__)
def _validate_fill_value(self, fill_value):
    # NA-like fill values map straight to the i8 NaT marker.
    if isna(fill_value):
        return iNaT

    # Anything that is neither NA nor datetime-like is rejected.
    if not isinstance(fill_value, (datetime, np.datetime64)):
        raise ValueError("'fill_value' should be a Timestamp. "
                         "Got '{got}'.".format(got=fill_value))

    # NOTE(review): presumably raises when the tz-awareness of the fill
    # value does not match this array's -- confirm in
    # _assert_tzawareness_compat.
    self._assert_tzawareness_compat(fill_value)
    as_timestamp = Timestamp(fill_value)
    return as_timestamp.value

# -----------------------------------------------------------------
# Comparison Methods

Expand Down
53 changes: 14 additions & 39 deletions pandas/core/arrays/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,14 +216,6 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
ordinals = libperiod.extract_ordinals(periods, freq)
return cls(ordinals, freq=freq)

def _values_for_factorize(self):
    """
    Return the values to factorize together with their NA marker.

    Returns
    -------
    tuple
        ``(self.asi8, iNaT)``
    """
    i8_values = self.asi8
    return i8_values, iNaT

@classmethod
def _from_factorized(cls, values, original):
    # type: (Sequence[Optional[Period]], PeriodArray) -> PeriodArray
    """Build a new array from `values`, reusing the freq of `original`."""
    period_freq = original.freq
    return cls(values, freq=period_freq)

@classmethod
def _from_datetime64(cls, data, freq, tz=None):
"""Construct a PeriodArray from a datetime64 array
Expand Down Expand Up @@ -262,14 +254,6 @@ def _generate_range(cls, start, end, periods, freq, fields):

return subarr, freq

@classmethod
def _concat_same_type(cls, to_concat):
    """
    Concatenate arrays that all share one freq.

    The ``_data`` of every input is joined in order and wrapped in a new
    ``cls`` instance carrying the common freq.
    """
    distinct_freqs = set(arr.freq for arr in to_concat)
    assert len(distinct_freqs) == 1
    (shared_freq,) = distinct_freqs

    pieces = [arr._data for arr in to_concat]
    return cls(np.concatenate(pieces), freq=shared_freq)

# --------------------------------------------------------------------
# Data / Attributes

Expand Down Expand Up @@ -415,29 +399,20 @@ def __setitem__(
raise TypeError(msg)
self._data[key] = value

def take(self, indices, allow_fill=False, fill_value=None):
    """
    Take elements at the given positions from this array.

    When `allow_fill` is True, `fill_value` must be NA-like or a Period
    whose freq equals this array's freq; it is converted to its i8 form
    before being handed to ``algos.take``.
    """
    if allow_fill:
        if isna(fill_value):
            fill_value = iNaT
        elif not isinstance(fill_value, Period):
            msg = "'fill_value' should be a Period. Got '{}'."
            raise ValueError(msg.format(fill_value))
        elif self.freq != fill_value.freq:
            msg = DIFFERENT_FREQ_INDEX.format(
                self.freq.freqstr,
                fill_value.freqstr
            )
            raise IncompatibleFrequency(msg)
        else:
            fill_value = fill_value.ordinal

    taken = algos.take(self._data, indices,
                       allow_fill=allow_fill, fill_value=fill_value)
    klass = type(self)
    return klass(taken, self.freq)
@Appender(dtl.DatetimeLikeArrayMixin._validate_fill_value.__doc__)
def _validate_fill_value(self, fill_value):
    # NA-like fill values map straight to the i8 NaT marker.
    if isna(fill_value):
        return iNaT

    # Only Period scalars are accepted beyond NA.
    if not isinstance(fill_value, Period):
        raise ValueError("'fill_value' should be a Period. "
                         "Got '{got}'.".format(got=fill_value))

    # The fill value has to carry the same freq as this array.
    if fill_value.freq != self.freq:
        msg = DIFFERENT_FREQ_INDEX.format(self.freq.freqstr,
                                          fill_value.freqstr)
        raise IncompatibleFrequency(msg)

    return fill_value.ordinal

def fillna(self, value=None, method=None, limit=None):
# TODO(#20300)
Expand Down
14 changes: 13 additions & 1 deletion pandas/core/arrays/timedeltas.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from pandas._libs.tslibs.fields import get_timedelta_field
from pandas._libs.tslibs.timedeltas import (
array_to_timedelta64, parse_timedelta_unit)
from pandas.util._decorators import Appender

from pandas import compat

Expand Down Expand Up @@ -139,7 +140,7 @@ def _simple_new(cls, values, freq=None, dtype=_TD_DTYPE):
result._freq = freq
return result

def __new__(cls, values, freq=None):
def __new__(cls, values, freq=None, dtype=_TD_DTYPE):
TomAugspurger marked this conversation as resolved.
Show resolved Hide resolved

freq, freq_infer = dtl.maybe_infer_freq(freq)

Expand Down Expand Up @@ -193,6 +194,17 @@ def _generate_range(cls, start, end, periods, freq, closed=None):
# ----------------------------------------------------------------
# Array-Like / EA-Interface Methods

@Appender(dtl.DatetimeLikeArrayMixin._validate_fill_value.__doc__)
def _validate_fill_value(self, fill_value):
    # NA-like fill values map straight to the i8 NaT marker.
    if isna(fill_value):
        return iNaT

    # Anything that is neither NA nor timedelta-like is rejected.
    accepted = (timedelta, np.timedelta64, Tick)
    if not isinstance(fill_value, accepted):
        raise ValueError("'fill_value' should be a Timedelta. "
                         "Got '{got}'.".format(got=fill_value))

    as_timedelta = Timedelta(fill_value)
    return as_timedelta.value

# ----------------------------------------------------------------
# Arithmetic Methods

Expand Down
7 changes: 6 additions & 1 deletion pandas/core/indexes/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,11 @@ def ceil(self, freq, ambiguous='raise', nonexistent='raise'):
class DatetimeIndexOpsMixin(DatetimeLikeArrayMixin):
""" common ops mixin to support a unified interface datetimelike Index """

# override DatetimeLikeArrayMixin method
copy = Index.copy
unique = Index.unique
take = Index.take

# DatetimeLikeArrayMixin assumes subclasses are mutable, so these are
# properties there. They can be made into cache_readonly for Index
# subclasses bc they are immutable
Expand Down Expand Up @@ -760,7 +765,7 @@ def _ensure_datetimelike_to_i8(other, to_utc=False):
try:
return np.array(other, copy=False).view('i8')
except TypeError:
# period array cannot be coerces to int
# period array cannot be coerced to int
other = Index(other)
return other.asi8

Expand Down
17 changes: 7 additions & 10 deletions pandas/core/indexes/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -551,16 +551,13 @@ def snap(self, freq='S'):
# TODO: what about self.name? if so, use shallow_copy?

def unique(self, level=None):
# Override here since IndexOpsMixin.unique uses self._values.unique
# For DatetimeIndex with TZ, that's a DatetimeIndex -> recursion error
# So we extract the tz-naive DatetimeIndex, unique that, and wrap the
# result with our TZ.
if self.tz is not None:
naive = type(self)(self._ndarray_values, copy=False)
else:
naive = self
result = super(DatetimeIndex, naive).unique(level=level)
return self._shallow_copy(result.values)
if level is not None:
self._validate_index_level(level)

# TODO(DatetimeArray): change dispatch once inheritance is removed
# call DatetimeArray method
jbrockmendel marked this conversation as resolved.
Show resolved Hide resolved
result = DatetimeArray.unique(self)
return self._shallow_copy(result._data)

def union(self, other):
"""
Expand Down
Loading