From 8654a9ed3cc2246ef9eaf2fe8725369a2e885d35 Mon Sep 17 00:00:00 2001 From: Chris Date: Wed, 31 Aug 2016 09:12:52 -0400 Subject: [PATCH] API: Expanded resample closes #13500 Author: Chris Closes #13961 from chris-b1/resample-api and squashes the following commits: b8dd114 [Chris] make _from_selection a property 10c7280 [Chris] NotImp -> ValueError e203fcf [Chris] doc updates 384026b [Chris] remove PeriodIndex workaround c7b299e [Chris] cleanup debugging 5fd97d9 [Chris] add from_selection bookkeeping 7f9add4 [Chris] more wip b55309a [Chris] wip c4db0e7 [Chris] move error handling; doc fixups def74de [Chris] API: Expanded resample --- doc/source/timeseries.rst | 24 ++++ doc/source/whatsnew/v0.19.0.txt | 14 ++ pandas/core/generic.py | 21 ++- pandas/core/groupby.py | 3 +- pandas/tseries/resample.py | 33 ++++- pandas/tseries/tests/test_resample.py | 180 +++++++++++++++++++------- 6 files changed, 221 insertions(+), 54 deletions(-) diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index 6f44ee0c87945..36e492df29983 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -1473,6 +1473,30 @@ Furthermore, you can also specify multiple aggregation functions for each column r.agg({'A' : ['sum','std'], 'B' : ['mean','std'] }) +If a ``DataFrame`` does not have a datetimelike index, but instead you want +to resample based on a datetimelike column in the frame, it can be passed to the +``on`` keyword. + +.. ipython:: python + + df = pd.DataFrame({'date': pd.date_range('2015-01-01', freq='W', periods=5), + 'a': np.arange(5)}, + index=pd.MultiIndex.from_arrays([ + [1,2,3,4,5], + pd.date_range('2015-01-01', freq='W', periods=5)], + names=['v','d'])) + df + df.resample('M', on='date').sum() + +Similarly, if you instead want to resample by a datetimelike +level of a ``MultiIndex``, its name or location can be passed to the +``level`` keyword. + +.. ipython:: python + + df.resample('M', level='d').sum() + + .. 
_timeseries.periods: Time Span Representation diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 29971f4419ae1..9c4010f8f024a 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -397,6 +397,20 @@ Other enhancements pd.Timestamp(year=2012, month=1, day=1, hour=8, minute=30) +- The ``.resample()`` function now accepts an ``on=`` or ``level=`` parameter for resampling on a datetimelike column or ``MultiIndex`` level (:issue:`13500`) + + .. ipython:: python + + df = pd.DataFrame({'date': pd.date_range('2015-01-01', freq='W', periods=5), + 'a': np.arange(5)}, + index=pd.MultiIndex.from_arrays([ + [1,2,3,4,5], + pd.date_range('2015-01-01', freq='W', periods=5)], + names=['v','d'])) + df + df.resample('M', on='date').sum() + df.resample('M', level='d').sum() + - The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``decimal`` option (:issue:`12933`) - The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``na_filter`` option (:issue:`13321`) - The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``memory_map`` option (:issue:`13381`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 2a6f00c65c7fb..5a17401ea67b1 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4047,10 +4047,12 @@ def between_time(self, start_time, end_time, include_start=True, def resample(self, rule, how=None, axis=0, fill_method=None, closed=None, label=None, convention='start', kind=None, loffset=None, - limit=None, base=0): + limit=None, base=0, on=None, level=None): """ - Convenience method for frequency conversion and resampling of regular - time-series data. + Convenience method for frequency conversion and resampling of time + series. Object must have a datetime-like index (DatetimeIndex, + PeriodIndex, or TimedeltaIndex), or pass datetime-like values + to the on or level keyword. 
Parameters ---------- @@ -4068,7 +4070,17 @@ def resample(self, rule, how=None, axis=0, fill_method=None, closed=None, For frequencies that evenly subdivide 1 day, the "origin" of the aggregated intervals. For example, for '5min' frequency, base could range from 0 through 4. Defaults to 0 + on : string, optional + For a DataFrame, column to use instead of index for resampling. + Column must be datetime-like. + .. versionadded:: 0.19.0 + + level : string or int, optional + For a MultiIndex, level (name or number) to use for + resampling. Level must be datetime-like. + + .. versionadded:: 0.19.0 To learn more about the offset strings, please see `this link `__. @@ -4173,12 +4185,11 @@ def resample(self, rule, how=None, axis=0, fill_method=None, closed=None, """ from pandas.tseries.resample import (resample, _maybe_process_deprecations) - axis = self._get_axis_number(axis) r = resample(self, freq=rule, label=label, closed=closed, axis=axis, kind=kind, loffset=loffset, convention=convention, - base=base) + base=base, key=on, level=level) return _maybe_process_deprecations(r, how=how, fill_method=fill_method, diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 9436257b88941..66e30229cd52b 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -255,7 +255,8 @@ def _set_grouper(self, obj, sort=False): Parameters ---------- obj : the subject object - + sort : bool, default False + whether the resulting grouper should be sorted """ if self.key is not None and self.level is not None: diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py index 5c4bfe5360fac..f1a209053445a 100644 --- a/pandas/tseries/resample.py +++ b/pandas/tseries/resample.py @@ -112,6 +112,15 @@ def _typ(self): return 'series' return 'dataframe' + @property + def _from_selection(self): + """ is the resampling from a DataFrame column or MultiIndex level """ + # upsampling and PeriodIndex resampling do not work + # with selection, this state used to catch and raise 
an error + return (self.groupby is not None and + (self.groupby.key is not None or + self.groupby.level is not None)) + def _deprecated(self, op): warnings.warn(("\n.resample() is now a deferred operation\n" "You called {op}(...) on this deferred object " @@ -207,6 +216,10 @@ def _convert_obj(self, obj): Parameters ---------- obj : the object to be resampled + + Returns + ------- + obj : converted object """ obj = obj.consolidate() return obj @@ -706,6 +719,11 @@ def _upsample(self, method, limit=None): self._set_binner() if self.axis: raise AssertionError('axis must be 0') + if self._from_selection: + raise ValueError("Upsampling from level= or on= selection" + " is not supported, use .set_index(...)" + " to explicitly set index to" + " datetime-like") ax = self.ax obj = self._selected_obj @@ -763,7 +781,15 @@ def _convert_obj(self, obj): # convert to timestamp if not (self.kind is None or self.kind == 'period'): - obj = obj.to_timestamp(how=self.convention) + if self._from_selection: + # see GH 14008, GH 12871 + msg = ("Resampling from level= or on= selection" + " with a PeriodIndex is not currently supported," + " use .set_index(...) 
to explicitly set index") + raise NotImplementedError(msg) + else: + obj = obj.to_timestamp(how=self.convention) + return obj def aggregate(self, arg, *args, **kwargs): @@ -841,6 +867,11 @@ def _upsample(self, method, limit=None): .fillna """ + if self._from_selection: + raise ValueError("Upsampling from level= or on= selection" + " is not supported, use .set_index(...)" + " to explicitly set index to" + " datetime-like") # we may need to actually resample as if we are timestamps if self.kind == 'timestamp': return super(PeriodIndexResampler, self)._upsample(method, diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py index 49802ba640d70..2ebcdc999a797 100644 --- a/pandas/tseries/tests/test_resample.py +++ b/pandas/tseries/tests/test_resample.py @@ -371,18 +371,44 @@ def test_apply_without_aggregation(self): result = t.apply(lambda x: x) assert_series_equal(result, self.series) + def test_agg_consistency(self): + + # make sure that we are consistent across + # similar aggregations with and w/o selection list + df = DataFrame(np.random.randn(1000, 3), + index=pd.date_range('1/1/2012', freq='S', periods=1000), + columns=['A', 'B', 'C']) + + r = df.resample('3T') + + expected = r[['A', 'B', 'C']].agg({'r1': 'mean', 'r2': 'sum'}) + result = r.agg({'r1': 'mean', 'r2': 'sum'}) + assert_frame_equal(result, expected) + + # TODO: once GH 14008 is fixed, move these tests into + # `Base` test class def test_agg(self): - # test with both a Resampler and a TimeGrouper + # test with all three Resampler apis and TimeGrouper np.random.seed(1234) + index = date_range(datetime(2005, 1, 1), + datetime(2005, 1, 10), freq='D') + index.name = 'date' df = pd.DataFrame(np.random.rand(10, 2), columns=list('AB'), - index=pd.date_range('2010-01-01 09:00:00', - periods=10, - freq='s')) + index=index) + df_col = df.reset_index() + df_mult = df_col.copy() + df_mult.index = pd.MultiIndex.from_arrays([range(10), df.index], + names=['index', 'date']) + r = 
df.resample('2D') + cases = [ + r, + df_col.resample('2D', on='date'), + df_mult.resample('2D', level='date'), + df.groupby(pd.Grouper(freq='2D')) + ] - r = df.resample('2s') - g = df.groupby(pd.Grouper(freq='2s')) a_mean = r['A'].mean() a_std = r['A'].std() a_sum = r['A'].sum() @@ -393,12 +419,12 @@ def test_agg(self): expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1) expected.columns = pd.MultiIndex.from_product([['A', 'B'], ['mean', 'std']]) - for t in [r, g]: + for t in cases: result = t.aggregate([np.mean, np.std]) assert_frame_equal(result, expected) expected = pd.concat([a_mean, b_std], axis=1) - for t in [r, g]: + for t in cases: result = t.aggregate({'A': np.mean, 'B': np.std}) assert_frame_equal(result, expected, check_like=True) @@ -406,20 +432,20 @@ def test_agg(self): expected = pd.concat([a_mean, a_std], axis=1) expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'), ('A', 'std')]) - for t in [r, g]: + for t in cases: result = t.aggregate({'A': ['mean', 'std']}) assert_frame_equal(result, expected) expected = pd.concat([a_mean, a_sum], axis=1) expected.columns = ['mean', 'sum'] - for t in [r, g]: + for t in cases: result = t['A'].aggregate(['mean', 'sum']) assert_frame_equal(result, expected) expected = pd.concat([a_mean, a_sum], axis=1) expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'), ('A', 'sum')]) - for t in [r, g]: + for t in cases: result = t.aggregate({'A': {'mean': 'mean', 'sum': 'sum'}}) assert_frame_equal(result, expected, check_like=True) @@ -428,7 +454,7 @@ def test_agg(self): ('A', 'sum'), ('B', 'mean2'), ('B', 'sum2')]) - for t in [r, g]: + for t in cases: result = t.aggregate({'A': {'mean': 'mean', 'sum': 'sum'}, 'B': {'mean2': 'mean', 'sum2': 'sum'}}) assert_frame_equal(result, expected, check_like=True) @@ -438,7 +464,7 @@ def test_agg(self): ('A', 'std'), ('B', 'mean'), ('B', 'std')]) - for t in [r, g]: + for t in cases: result = t.aggregate({'A': ['mean', 'std'], 'B': ['mean', 'std']}) 
assert_frame_equal(result, expected, check_like=True) @@ -450,20 +476,30 @@ def test_agg(self): ('r2', 'B', 'sum')]) def test_agg_misc(self): - # test with both a Resampler and a TimeGrouper + # test with all three Resampler apis and TimeGrouper np.random.seed(1234) + index = date_range(datetime(2005, 1, 1), + datetime(2005, 1, 10), freq='D') + index.name = 'date' df = pd.DataFrame(np.random.rand(10, 2), columns=list('AB'), - index=pd.date_range('2010-01-01 09:00:00', - periods=10, - freq='s')) - - r = df.resample('2s') - g = df.groupby(pd.Grouper(freq='2s')) + index=index) + df_col = df.reset_index() + df_mult = df_col.copy() + df_mult.index = pd.MultiIndex.from_arrays([range(10), df.index], + names=['index', 'date']) + + r = df.resample('2D') + cases = [ + r, + df_col.resample('2D', on='date'), + df_mult.resample('2D', level='date'), + df.groupby(pd.Grouper(freq='2D')) + ] # passed lambda - for t in [r, g]: + for t in cases: result = t.agg({'A': np.sum, 'B': lambda x: np.std(x, ddof=1)}) rcustom = t['B'].apply(lambda x: np.std(x, ddof=1)) @@ -480,7 +516,7 @@ def test_agg_misc(self): ('result1', 'B'), ('result2', 'A'), ('result2', 'B')]) - for t in [r, g]: + for t in cases: result = t[['A', 'B']].agg(OrderedDict([('result1', np.sum), ('result2', np.mean)])) assert_frame_equal(result, expected, check_like=True) @@ -495,19 +531,19 @@ def test_agg_misc(self): ('A', 'std'), ('B', 'mean'), ('B', 'std')]) - for t in [r, g]: + for t in cases: result = t.agg(OrderedDict([('A', ['sum', 'std']), ('B', ['mean', 'std'])])) assert_frame_equal(result, expected, check_like=True) # equivalent of using a selection list / or not - for t in [r, g]: - result = g[['A', 'B']].agg({'A': ['sum', 'std'], + for t in cases: + result = t[['A', 'B']].agg({'A': ['sum', 'std'], 'B': ['mean', 'std']}) assert_frame_equal(result, expected, check_like=True) # series like aggs - for t in [r, g]: + for t in cases: result = t['A'].agg({'A': ['sum', 'std']}) expected = pd.concat([t['A'].sum(), 
t['A'].std()], @@ -528,9 +564,9 @@ def test_agg_misc(self): # errors # invalid names in the agg specification - for t in [r, g]: + for t in cases: def f(): - r[['A']].agg({'A': ['sum', 'std'], + t[['A']].agg({'A': ['sum', 'std'], 'B': ['mean', 'std']}) self.assertRaises(SpecificationError, f) @@ -538,22 +574,31 @@ def f(): def test_agg_nested_dicts(self): np.random.seed(1234) + index = date_range(datetime(2005, 1, 1), + datetime(2005, 1, 10), freq='D') + index.name = 'date' df = pd.DataFrame(np.random.rand(10, 2), columns=list('AB'), - index=pd.date_range('2010-01-01 09:00:00', - periods=10, - freq='s')) - - r = df.resample('2s') - g = df.groupby(pd.Grouper(freq='2s')) - - for t in [r, g]: + index=index) + df_col = df.reset_index() + df_mult = df_col.copy() + df_mult.index = pd.MultiIndex.from_arrays([range(10), df.index], + names=['index', 'date']) + r = df.resample('2D') + cases = [ + r, + df_col.resample('2D', on='date'), + df_mult.resample('2D', level='date'), + df.groupby(pd.Grouper(freq='2D')) + ] + + for t in cases: def f(): t.aggregate({'r1': {'A': ['mean', 'sum']}, 'r2': {'B': ['mean', 'sum']}}) self.assertRaises(ValueError, f) - for t in [r, g]: + for t in cases: expected = pd.concat([t['A'].mean(), t['A'].std(), t['B'].mean(), t['B'].std()], axis=1) expected.columns = pd.MultiIndex.from_tuples([('ra', 'mean'), ( @@ -567,19 +612,44 @@ def f(): 'B': {'rb': ['mean', 'std']}}) assert_frame_equal(result, expected, check_like=True) - def test_agg_consistency(self): + def test_selection_api_validation(self): + # GH 13500 + index = date_range(datetime(2005, 1, 1), + datetime(2005, 1, 10), freq='D') + df = pd.DataFrame({'date': index, + 'a': np.arange(len(index), dtype=np.int64)}, + index=pd.MultiIndex.from_arrays([ + np.arange(len(index), dtype=np.int64), + index], names=['v', 'd'])) + df_exp = pd.DataFrame({'a': np.arange(len(index), dtype=np.int64)}, + index=index) - # make sure that we are consistent across - # similar aggregations with and w/o selection list 
- df = DataFrame(np.random.randn(1000, 3), - index=pd.date_range('1/1/2012', freq='S', periods=1000), - columns=['A', 'B', 'C']) + # non DatetimeIndex + with tm.assertRaises(TypeError): + df.resample('2D', level='v') - r = df.resample('3T') + with tm.assertRaises(ValueError): + df.resample('2D', on='date', level='d') - expected = r[['A', 'B', 'C']].agg({'r1': 'mean', 'r2': 'sum'}) - result = r.agg({'r1': 'mean', 'r2': 'sum'}) - assert_frame_equal(result, expected) + with tm.assertRaises(TypeError): + df.resample('2D', on=['a', 'date']) + + with tm.assertRaises(KeyError): + df.resample('2D', level=['a', 'date']) + + # upsampling not allowed + with tm.assertRaises(ValueError): + df.resample('2D', level='d').asfreq() + + with tm.assertRaises(ValueError): + df.resample('2D', on='date').asfreq() + + exp = df_exp.resample('2D').sum() + exp.index.name = 'date' + assert_frame_equal(exp, df.resample('2D', on='date').sum()) + + exp.index.name = 'd' + assert_frame_equal(exp, df.resample('2D', level='d').sum()) class Base(object): @@ -2009,6 +2079,22 @@ def test_asfreq_upsample(self): result = frame.resample('1H').asfreq() assert_frame_equal(result, expected) + def test_selection(self): + index = self.create_series().index + # This is a bug, these should be implemented + # GH 14008 + df = pd.DataFrame({'date': index, + 'a': np.arange(len(index), dtype=np.int64)}, + index=pd.MultiIndex.from_arrays([ + np.arange(len(index), dtype=np.int64), + index], names=['v', 'd'])) + + with tm.assertRaises(NotImplementedError): + df.resample('2D', on='date') + + with tm.assertRaises(NotImplementedError): + df.resample('2D', level='d') + def test_annual_upsample_D_s_f(self): self._check_annual_upsample_cases('D', 'start', 'ffill')