Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SNOW-1707602, SNOW-1707624: Implement Resampler.nunique and Resampler.quantile #2391

Merged
merged 7 commits into from
Oct 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
- Added support for `by`, `left_by`, `right_by`, `left_index`, and `right_index` for `pd.merge_asof`.
- Added support for passing parameter `include_describe` to `Session.query_history`.
- Added support for `DatetimeIndex.mean` and `DatetimeIndex.std` methods.
- Added support for `Resampler.asfreq`.
- Added support for `Resampler.asfreq`, `Resampler.nunique`, and `Resampler.quantile`.
- Added support for `resample` frequency `W`, `ME`, `YE` with `closed = "left"`.
- Added support for `DataFrame.rolling.corr` and `Series.rolling.corr` for `pairwise = False` and int `window`.
- Added support for string time-based `window` and `min_periods = None` for `Rolling`.
Expand Down
2 changes: 1 addition & 1 deletion docs/source/modin/supported/groupby_supported.rst
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ Computations/descriptive stats
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``prod`` | N | |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``quantile`` | Y | See ``count`` |
| ``quantile`` | P | ``N`` for list-like ``q`` |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``rank`` | Y | |
+-----------------------------+---------------------------------+----------------------------------------------------+
Expand Down
6 changes: 2 additions & 4 deletions docs/source/modin/supported/resampling_supported.rst
Original file line number Diff line number Diff line change
Expand Up @@ -76,15 +76,13 @@ Computations / descriptive stats
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``min`` | Y | |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``nunique`` | N | |
| ``nunique`` | Y | |
sfc-gh-nkrishna marked this conversation as resolved.
Show resolved Hide resolved
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``ohlc`` | N | |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``pad`` | N | |
sfc-gh-helmeleegy marked this conversation as resolved.
Show resolved Hide resolved
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``prod`` | N | |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``quantile`` | N | |
| ``quantile`` | P | ``N`` for list-like ``q`` |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``sem`` | N | |
+-----------------------------+---------------------------------+----------------------------------------------------+
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@
"size",
"first",
"last",
"quantile",
"nunique",
]
IMPLEMENTED_MISC_METHODS = ["ffill"]
SUPPORTED_RESAMPLE_RULES = ("second", "minute", "hour", "day", "week", "month", "year")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12402,7 +12402,7 @@ def resample(
frame = fill_missing_resample_bins_for_frame(
qc._modin_frame, rule, start_date, end_date
)
if resample_method in ("sum", "count", "size"):
if resample_method in ("sum", "count", "size", "nunique"):
values_arg: Union[int, dict]
if resample_method == "sum":
# For sum(), we need to fill NaN values as Timedelta(0)
Expand Down
106 changes: 104 additions & 2 deletions src/snowflake/snowpark/modin/plugin/docstrings/resample.py
Original file line number Diff line number Diff line change
Expand Up @@ -509,7 +509,50 @@ def count():
"""

def nunique():
pass
"""
Return number of unique elements in the group.

Returns
-------
:class:`~modin.pandas.Series` or :class:`~modin.pandas.DataFrame`
Number of unique values within each group.

Examples
--------
For Series:

>>> lst1 = pd.date_range('2020-01-01', periods=4, freq='1D')
>>> ser1 = pd.Series([1, 1, 2, 2], index=lst1)
>>> ser1
2020-01-01 1
2020-01-02 1
2020-01-03 2
2020-01-04 2
Freq: None, dtype: int64

>>> ser1.resample('2D').nunique()
2020-01-01 1
2020-01-03 1
Freq: None, dtype: int64

For DataFrame:

>>> data = [[1, 8, 2], [1, 2, 5], [2, 5, 8], [2, 6, 9]]
>>> df = pd.DataFrame(data,
... columns=["a", "b", "c"],
... index=pd.date_range('2020-01-01', periods=4, freq='1D'))
>>> df
a b c
2020-01-01 1 8 2
2020-01-02 1 2 5
2020-01-03 2 5 8
2020-01-04 2 6 9

>>> df.resample('2D').nunique()
a b c
2020-01-01 1 2 2
2020-01-03 1 2 2
"""

def first():
"""
Expand Down Expand Up @@ -1295,4 +1338,63 @@ def var():
"""

def quantile():
pass
"""
Return value at the given quantile.

Parameters
----------
q : float or array-like, default 0.5 (50% quantile)

Returns
-------
:class:`~modin.pandas.Series` or :class:`~modin.pandas.DataFrame`
Quantile of values within each group.

See Also
--------
Series.quantile : Return a series, where the index is q and the values are the quantiles.
DataFrame.quantile : Return a DataFrame, where the columns are the columns of self,
and the values are the quantiles.
DataFrameGroupBy.quantile : Return a DataFrame, where the columns are groupby columns,
and the values are its quantiles.

Notes
-----
List-like ``q`` is not yet supported.

Examples
--------
For Series:

>>> lst1 = pd.date_range('2020-01-01', periods=4, freq='1D')
>>> ser1 = pd.Series([1, 2, 3, 4], index=lst1)
>>> ser1
2020-01-01 1
2020-01-02 2
2020-01-03 3
2020-01-04 4
Freq: None, dtype: int64

>>> ser1.resample('2D').quantile()
2020-01-01 1.5
2020-01-03 3.5
Freq: None, dtype: float64

For DataFrame:

>>> data = [[1, 8, 2], [1, 2, 5], [2, 5, 8], [2, 6, 9]]
>>> df = pd.DataFrame(data,
... columns=["a", "b", "c"],
... index=pd.date_range('2020-01-01', periods=4, freq='1D'))
>>> df
a b c
2020-01-01 1 8 2
2020-01-02 1 2 5
2020-01-03 2 5 8
2020-01-04 2 6 9

>>> df.resample('2D').quantile(q=0.2)
a b c
2020-01-01 1.0 3.199 2.6
2020-01-03 2.0 5.200 8.2
"""
Original file line number Diff line number Diff line change
Expand Up @@ -250,10 +250,10 @@ def ffill(self, limit: Optional[int] = None) -> Union[pd.DataFrame, pd.Series]:
)
)

def backfill(self, limit: Optional[int] = None):
def backfill(self, limit: Optional[int] = None) -> Union[pd.DataFrame, pd.Series]:
return self.bfill(limit=limit)

def bfill(self, limit: Optional[int] = None):
def bfill(self, limit: Optional[int] = None) -> Union[pd.DataFrame, pd.Series]:
is_series = not self._dataframe._is_dataframe

if limit is not None:
Expand All @@ -271,13 +271,15 @@ def bfill(self, limit: Optional[int] = None):
)
)

def pad(self, limit: Optional[int] = None):
def pad(self, limit: Optional[int] = None) -> Union[pd.DataFrame, pd.Series]:
return self.ffill(limit=limit)

def nearest(self, limit: Optional[int] = None): # pragma: no cover
self._method_not_implemented("nearest")

def fillna(self, method: str, limit: Optional[int] = None):
def fillna(
self, method: str, limit: Optional[int] = None
) -> Union[pd.DataFrame, pd.Series]:
if not isinstance(method, str) or method not in (
"pad",
"ffill",
Expand All @@ -290,7 +292,9 @@ def fillna(self, method: str, limit: Optional[int] = None):
)
return getattr(self, method)(limit=limit)

def asfreq(self, fill_value: Optional[Any] = None):
def asfreq(
self, fill_value: Optional[Any] = None
) -> Union[pd.DataFrame, pd.Series]:
is_series = not self._dataframe._is_dataframe

if fill_value is not None:
Expand Down Expand Up @@ -341,9 +345,17 @@ def count(self) -> Union[pd.DataFrame, pd.Series]:
)
)

def nunique(self, *args: Any, **kwargs: Any): # pragma: no cover
def nunique(self, *args: Any, **kwargs: Any) -> pd.Series:
# TODO: SNOW-1063368: Modin upgrade - modin.pandas.resample.Resample
self._method_not_implemented("nunique")
return self._dataframe.__constructor__(
query_compiler=self._query_compiler.resample(
self.resample_kwargs,
"nunique",
tuple(),
dict(),
True,
)
)

def first(
self,
Expand All @@ -352,7 +364,7 @@ def first(
skipna: bool = True,
*args: Any,
**kwargs: Any,
):
) -> Union[pd.DataFrame, pd.Series]:
# TODO: SNOW-1063368: Modin upgrade - modin.pandas.resample.Resample
self._validate_numeric_only_for_aggregate_methods(numeric_only)

Expand All @@ -376,7 +388,7 @@ def last(
skipna: bool = True,
*args: Any,
**kwargs: Any,
):
) -> Union[pd.DataFrame, pd.Series]:
# TODO: SNOW-1063368: Modin upgrade - modin.pandas.resample.Resample
self._validate_numeric_only_for_aggregate_methods(numeric_only)

Expand Down Expand Up @@ -500,7 +512,7 @@ def prod(
# TODO: SNOW-1063368: Modin upgrade - modin.pandas.resample.Resample
self._method_not_implemented("prod")

def size(self):
def size(self) -> Union[pd.DataFrame, pd.Series]:
# TODO: SNOW-1063368: Modin upgrade - modin.pandas.resample.Resample
is_series = not self._dataframe._is_dataframe

Expand Down Expand Up @@ -601,7 +613,18 @@ def var(
)

def quantile(
self, q: Union[float, AnyArrayLike] = 0.5, **kwargs: Any
): # pragma: no cover
self,
q: Union[float, AnyArrayLike] = 0.5,
**kwargs: Any,
) -> Union[pd.DataFrame, pd.Series]:
# TODO: SNOW-1063368: Modin upgrade - modin.pandas.resample.Resample
self._method_not_implemented("quantile")
agg_kwargs = dict(q=q)
return self._dataframe.__constructor__(
query_compiler=self._query_compiler.resample(
self.resample_kwargs,
"quantile",
tuple(),
agg_kwargs,
False,
)
)
14 changes: 14 additions & 0 deletions tests/integ/modin/resample/test_resample.py
sfc-gh-mvashishtha marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -357,3 +357,17 @@ def test_resample_date_trunc_hour():
lambda df: df.resample("2H").min(),
check_freq=False,
)


# One extra query to convert index to native pandas for dataframe constructor
@pytest.mark.parametrize("q", [0.1, 0.7])
sfc-gh-mvashishtha marked this conversation as resolved.
Show resolved Hide resolved
@sql_count_checker(query_count=3, join_count=1)
def test_resample_quantile_various_q(q):
eval_snowpark_pandas_result(
*create_test_dfs(
{"A": np.random.randn(15)},
index=native_pd.date_range("2020-01-01", periods=15, freq="1s"),
),
lambda df: df.resample(rule="3s").quantile(q=q),
check_freq=False,
)
Loading