Skip to content

Commit

Permalink
SNOW-1707602, SNOW-1707624: Implement Resampler.nunique and `Resamp…
Browse files Browse the repository at this point in the history
…ler.quantile` (#2391)

SNOW-1707602
SNOW-1707624

This PR leverages the groupby implementations of `nunique` and
`quantile` to implement them for `Resampler` and adds the associated
tests.

---------

Signed-off-by: Naren Krishna <naren.krishna@snowflake.com>
  • Loading branch information
sfc-gh-nkrishna authored Oct 3, 2024
1 parent 30ce834 commit cb05b4a
Show file tree
Hide file tree
Showing 8 changed files with 161 additions and 22 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
- Added support for `by`, `left_by`, `right_by`, `left_index`, and `right_index` for `pd.merge_asof`.
- Added support for passing parameter `include_describe` to `Session.query_history`.
- Added support for `DatetimeIndex.mean` and `DatetimeIndex.std` methods.
- Added support for `Resampler.asfreq`.
- Added support for `Resampler.asfreq`, `Resampler.nunique`, and `Resampler.quantile`.
- Added support for `resample` frequency `W`, `ME`, `YE` with `closed = "left"`.
- Added support for `DataFrame.rolling.corr` and `Series.rolling.corr` for `pairwise = False` and int `window`.
- Added support for string time-based `window` and `min_periods = None` for `Rolling`.
Expand Down
2 changes: 1 addition & 1 deletion docs/source/modin/supported/groupby_supported.rst
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ Computations/descriptive stats
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``prod`` | N | |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``quantile`` | Y | See ``count`` |
| ``quantile`` | P | ``N`` for list-like ``q`` |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``rank`` | Y | |
+-----------------------------+---------------------------------+----------------------------------------------------+
Expand Down
6 changes: 2 additions & 4 deletions docs/source/modin/supported/resampling_supported.rst
Original file line number Diff line number Diff line change
Expand Up @@ -76,15 +76,13 @@ Computations / descriptive stats
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``min`` | Y | |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``nunique`` | N | |
| ``nunique`` | Y | |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``ohlc`` | N | |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``pad`` | N | |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``prod`` | N | |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``quantile`` | N | |
| ``quantile`` | P | ``N`` for list-like ``q`` |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``sem`` | N | |
+-----------------------------+---------------------------------+----------------------------------------------------+
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@
"size",
"first",
"last",
"quantile",
"nunique",
]
IMPLEMENTED_MISC_METHODS = ["ffill"]
SUPPORTED_RESAMPLE_RULES = ("second", "minute", "hour", "day", "week", "month", "year")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12398,7 +12398,7 @@ def resample(
frame = fill_missing_resample_bins_for_frame(
qc._modin_frame, rule, start_date, end_date
)
if resample_method in ("sum", "count", "size"):
if resample_method in ("sum", "count", "size", "nunique"):
values_arg: Union[int, dict]
if resample_method == "sum":
# For sum(), we need to fill NaN values as Timedelta(0)
Expand Down
106 changes: 104 additions & 2 deletions src/snowflake/snowpark/modin/plugin/docstrings/resample.py
Original file line number Diff line number Diff line change
Expand Up @@ -509,7 +509,50 @@ def count():
"""

def nunique():
    """
    Return number of unique elements in the group.

    Returns
    -------
    :class:`~modin.pandas.Series` or :class:`~modin.pandas.DataFrame`
        Number of unique values within each group. Resampling a Series
        yields a Series; resampling a DataFrame yields a DataFrame with
        one count per column (see the examples below).

    Examples
    --------
    For Series:

    >>> lst1 = pd.date_range('2020-01-01', periods=4, freq='1D')
    >>> ser1 = pd.Series([1, 1, 2, 2], index=lst1)
    >>> ser1
    2020-01-01    1
    2020-01-02    1
    2020-01-03    2
    2020-01-04    2
    Freq: None, dtype: int64

    >>> ser1.resample('2D').nunique()
    2020-01-01    1
    2020-01-03    1
    Freq: None, dtype: int64

    For DataFrame:

    >>> data = [[1, 8, 2], [1, 2, 5], [2, 5, 8], [2, 6, 9]]
    >>> df = pd.DataFrame(data,
    ...     columns=["a", "b", "c"],
    ...     index=pd.date_range('2020-01-01', periods=4, freq='1D'))
    >>> df
                a  b  c
    2020-01-01  1  8  2
    2020-01-02  1  2  5
    2020-01-03  2  5  8
    2020-01-04  2  6  9

    >>> df.resample('2D').nunique()
                a  b  c
    2020-01-01  1  2  2
    2020-01-03  1  2  2
    """

def first():
"""
Expand Down Expand Up @@ -1295,4 +1338,63 @@ def var():
"""

def quantile():
    """
    Return value at the given quantile.

    Parameters
    ----------
    q : float or array-like, default 0.5 (50% quantile)
        Value between 0 and 1 giving the quantile to compute within each
        resample bin. Only a scalar ``q`` is currently supported (see Notes).

    Returns
    -------
    :class:`~modin.pandas.Series` or :class:`~modin.pandas.DataFrame`
        Quantile of values within each group.

    See Also
    --------
    Series.quantile : Return a series, where the index is q and the values are the quantiles.
    DataFrame.quantile : Return a DataFrame, where the columns are the columns of self,
        and the values are the quantiles.
    DataFrameGroupBy.quantile : Return a DataFrame, where the columns are groupby columns,
        and the values are its quantiles.

    Notes
    -----
    List-like ``q`` is not yet supported.

    Examples
    --------
    For Series:

    >>> lst1 = pd.date_range('2020-01-01', periods=4, freq='1D')
    >>> ser1 = pd.Series([1, 2, 3, 4], index=lst1)
    >>> ser1
    2020-01-01    1
    2020-01-02    2
    2020-01-03    3
    2020-01-04    4
    Freq: None, dtype: int64

    >>> ser1.resample('2D').quantile()
    2020-01-01    1.5
    2020-01-03    3.5
    Freq: None, dtype: float64

    For DataFrame:

    >>> data = [[1, 8, 2], [1, 2, 5], [2, 5, 8], [2, 6, 9]]
    >>> df = pd.DataFrame(data,
    ...     columns=["a", "b", "c"],
    ...     index=pd.date_range('2020-01-01', periods=4, freq='1D'))
    >>> df
                a  b  c
    2020-01-01  1  8  2
    2020-01-02  1  2  5
    2020-01-03  2  5  8
    2020-01-04  2  6  9

    >>> df.resample('2D').quantile(q=0.2)
                  a      b    c
    2020-01-01  1.0  3.199  2.6
    2020-01-03  2.0  5.200  8.2
    """
Original file line number Diff line number Diff line change
Expand Up @@ -250,10 +250,10 @@ def ffill(self, limit: Optional[int] = None) -> Union[pd.DataFrame, pd.Series]:
)
)

def backfill(self, limit: Optional[int] = None) -> Union[pd.DataFrame, pd.Series]:
    # Alias for bfill: forward the call unchanged, passing `limit` by keyword.
    backfilled = self.bfill(limit=limit)
    return backfilled

def bfill(self, limit: Optional[int] = None):
def bfill(self, limit: Optional[int] = None) -> Union[pd.DataFrame, pd.Series]:
is_series = not self._dataframe._is_dataframe

if limit is not None:
Expand All @@ -271,13 +271,15 @@ def bfill(self, limit: Optional[int] = None):
)
)

def pad(self, limit: Optional[int] = None) -> Union[pd.DataFrame, pd.Series]:
    # Alias for ffill: forward the call unchanged, passing `limit` by keyword.
    forward_filled = self.ffill(limit=limit)
    return forward_filled

def nearest(self, limit: Optional[int] = None): # pragma: no cover
    # Unsupported operation: hands the method name "nearest" to
    # self._method_not_implemented. `limit` is accepted but not used here.
    # NOTE(review): presumably _method_not_implemented raises — confirm.
    self._method_not_implemented("nearest")

def fillna(self, method: str, limit: Optional[int] = None):
def fillna(
self, method: str, limit: Optional[int] = None
) -> Union[pd.DataFrame, pd.Series]:
if not isinstance(method, str) or method not in (
"pad",
"ffill",
Expand All @@ -290,7 +292,9 @@ def fillna(self, method: str, limit: Optional[int] = None):
)
return getattr(self, method)(limit=limit)

def asfreq(self, fill_value: Optional[Any] = None):
def asfreq(
self, fill_value: Optional[Any] = None
) -> Union[pd.DataFrame, pd.Series]:
is_series = not self._dataframe._is_dataframe

if fill_value is not None:
Expand Down Expand Up @@ -341,9 +345,17 @@ def count(self) -> Union[pd.DataFrame, pd.Series]:
)
)

def nunique(
    self, *args: Any, **kwargs: Any
) -> Union[pd.DataFrame, pd.Series]:
    # TODO: SNOW-1063368: Modin upgrade - modin.pandas.resample.Resample
    # Runs the "nunique" resample aggregation on the query compiler and wraps
    # the resulting compiler with the same constructor as the input object,
    # so the result may be a DataFrame or a Series (hence the Union return
    # annotation, matching the other aggregations in this file).
    # *args / **kwargs are accepted but not forwarded: an empty tuple and an
    # empty dict are passed as the aggregation arguments.
    # NOTE(review): the final positional flag is the literal True, while other
    # methods in this file compute `is_series = not self._dataframe._is_dataframe`
    # — confirm the hard-coded value is intended for DataFrame input.
    return self._dataframe.__constructor__(
        query_compiler=self._query_compiler.resample(
            self.resample_kwargs,
            "nunique",
            tuple(),
            dict(),
            True,
        )
    )

def first(
self,
Expand All @@ -352,7 +364,7 @@ def first(
skipna: bool = True,
*args: Any,
**kwargs: Any,
):
) -> Union[pd.DataFrame, pd.Series]:
# TODO: SNOW-1063368: Modin upgrade - modin.pandas.resample.Resample
self._validate_numeric_only_for_aggregate_methods(numeric_only)

Expand All @@ -376,7 +388,7 @@ def last(
skipna: bool = True,
*args: Any,
**kwargs: Any,
):
) -> Union[pd.DataFrame, pd.Series]:
# TODO: SNOW-1063368: Modin upgrade - modin.pandas.resample.Resample
self._validate_numeric_only_for_aggregate_methods(numeric_only)

Expand Down Expand Up @@ -500,7 +512,7 @@ def prod(
# TODO: SNOW-1063368: Modin upgrade - modin.pandas.resample.Resample
self._method_not_implemented("prod")

def size(self):
def size(self) -> Union[pd.DataFrame, pd.Series]:
# TODO: SNOW-1063368: Modin upgrade - modin.pandas.resample.Resample
is_series = not self._dataframe._is_dataframe

Expand Down Expand Up @@ -601,7 +613,18 @@ def var(
)

def quantile(
    self,
    q: Union[float, AnyArrayLike] = 0.5,
    **kwargs: Any,
) -> Union[pd.DataFrame, pd.Series]:
    # TODO: SNOW-1063368: Modin upgrade - modin.pandas.resample.Resample
    # Runs the "quantile" resample aggregation on the query compiler, passing
    # `q` as the only aggregation keyword; **kwargs is accepted but not
    # forwarded. The result is wrapped with the input object's constructor.
    build_result = self._dataframe.__constructor__
    quantile_compiler = self._query_compiler.resample(
        self.resample_kwargs,
        "quantile",
        tuple(),
        {"q": q},
        False,
    )
    return build_result(query_compiler=quantile_compiler)
14 changes: 14 additions & 0 deletions tests/integ/modin/resample/test_resample.py
Original file line number Diff line number Diff line change
Expand Up @@ -357,3 +357,17 @@ def test_resample_date_trunc_hour():
lambda df: df.resample("2H").min(),
check_freq=False,
)


# One extra query to convert index to native pandas for dataframe constructor
@pytest.mark.parametrize("q", [0.1, 0.7])
@sql_count_checker(query_count=3, join_count=1)
def test_resample_quantile_various_q(q):
    # Build the test frames that create_test_dfs returns from 15 random
    # values on a one-second index, then compare quantile(q) over 3-second
    # bins between them.
    random_column = {"A": np.random.randn(15)}
    second_index = native_pd.date_range("2020-01-01", periods=15, freq="1s")
    test_frames = create_test_dfs(random_column, index=second_index)

    def resample_quantile(df):
        return df.resample(rule="3s").quantile(q=q)

    eval_snowpark_pandas_result(
        *test_frames,
        resample_quantile,
        check_freq=False,
    )

0 comments on commit cb05b4a

Please sign in to comment.