diff --git a/CHANGELOG.md b/CHANGELOG.md index 73804f28f7..8535071981 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,7 +28,7 @@ - Added support for `by`, `left_by`, `right_by`, `left_index`, and `right_index` for `pd.merge_asof`. - Added support for passing parameter `include_describe` to `Session.query_history`. - Added support for `DatetimeIndex.mean` and `DatetimeIndex.std` methods. -- Added support for `Resampler.asfreq`. +- Added support for `Resampler.asfreq`, `Resampler.nunique`, and `Resampler.quantile`. - Added support for `resample` frequency `W`, `ME`, `YE` with `closed = "left"`. - Added support for `DataFrame.rolling.corr` and `Series.rolling.corr` for `pairwise = False` and int `window`. - Added support for string time-based `window` and `min_periods = None` for `Rolling`. diff --git a/docs/source/modin/supported/groupby_supported.rst b/docs/source/modin/supported/groupby_supported.rst index 1345d1b10b..66e768ba67 100644 --- a/docs/source/modin/supported/groupby_supported.rst +++ b/docs/source/modin/supported/groupby_supported.rst @@ -140,7 +140,7 @@ Computations/descriptive stats +-----------------------------+---------------------------------+----------------------------------------------------+ | ``prod`` | N | | +-----------------------------+---------------------------------+----------------------------------------------------+ -| ``quantile`` | Y | See ``count`` | +| ``quantile`` | P | ``N`` for list-like ``q`` | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``rank`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ diff --git a/docs/source/modin/supported/resampling_supported.rst b/docs/source/modin/supported/resampling_supported.rst index 7c00f30d45..e5d920318d 100644 --- a/docs/source/modin/supported/resampling_supported.rst +++ b/docs/source/modin/supported/resampling_supported.rst @@ 
-76,15 +76,13 @@ Computations / descriptive stats +-----------------------------+---------------------------------+----------------------------------------------------+ | ``min`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ -| ``nunique`` | N | | +| ``nunique`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``ohlc`` | N | | +-----------------------------+---------------------------------+----------------------------------------------------+ -| ``pad`` | N | | -+-----------------------------+---------------------------------+----------------------------------------------------+ | ``prod`` | N | | +-----------------------------+---------------------------------+----------------------------------------------------+ -| ``quantile`` | N | | +| ``quantile`` | P | ``N`` for list-like ``q`` | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``sem`` | N | | +-----------------------------+---------------------------------+----------------------------------------------------+ diff --git a/src/snowflake/snowpark/modin/plugin/_internal/resample_utils.py b/src/snowflake/snowpark/modin/plugin/_internal/resample_utils.py index 8e2bc5eeee..ee4151ea4e 100644 --- a/src/snowflake/snowpark/modin/plugin/_internal/resample_utils.py +++ b/src/snowflake/snowpark/modin/plugin/_internal/resample_utils.py @@ -45,6 +45,8 @@ "size", "first", "last", + "quantile", + "nunique", ] IMPLEMENTED_MISC_METHODS = ["ffill"] SUPPORTED_RESAMPLE_RULES = ("second", "minute", "hour", "day", "week", "month", "year") diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index 2ec0453177..fb20fe436d 100644 --- 
a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -12402,7 +12402,7 @@ def resample( frame = fill_missing_resample_bins_for_frame( qc._modin_frame, rule, start_date, end_date ) - if resample_method in ("sum", "count", "size"): + if resample_method in ("sum", "count", "size", "nunique"): values_arg: Union[int, dict] if resample_method == "sum": # For sum(), we need to fill NaN values as Timedelta(0) diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/resample.py b/src/snowflake/snowpark/modin/plugin/docstrings/resample.py index 3bf90e5de6..735d6a6141 100644 --- a/src/snowflake/snowpark/modin/plugin/docstrings/resample.py +++ b/src/snowflake/snowpark/modin/plugin/docstrings/resample.py @@ -509,7 +509,50 @@ def count(): """ def nunique(): - pass + """ + Return number of unique elements in the group. + + Returns + ------- + :class:`~modin.pandas.Series` or :class:`~modin.pandas.DataFrame` + Number of unique values within each group. + + Examples + -------- + For Series: + + >>> lst1 = pd.date_range('2020-01-01', periods=4, freq='1D') + >>> ser1 = pd.Series([1, 1, 2, 2], index=lst1) + >>> ser1 + 2020-01-01 1 + 2020-01-02 1 + 2020-01-03 2 + 2020-01-04 2 + Freq: None, dtype: int64 + + >>> ser1.resample('2D').nunique() + 2020-01-01 1 + 2020-01-03 1 + Freq: None, dtype: int64 + + For DataFrame: + + >>> data = [[1, 8, 2], [1, 2, 5], [2, 5, 8], [2, 6, 9]] + >>> df = pd.DataFrame(data, + ... columns=["a", "b", "c"], + ... index=pd.date_range('2020-01-01', periods=4, freq='1D')) + >>> df + a b c + 2020-01-01 1 8 2 + 2020-01-02 1 2 5 + 2020-01-03 2 5 8 + 2020-01-04 2 6 9 + + >>> df.resample('2D').nunique() + a b c + 2020-01-01 1 2 2 + 2020-01-03 1 2 2 + """ def first(): """ @@ -1295,4 +1338,63 @@ def var(): """ def quantile(): - pass + """ + Return value at the given quantile. 
+ + Parameters + ---------- + q : float or array-like, default 0.5 (50% quantile) + + Returns + ------- + :class:`~modin.pandas.Series` or :class:`~modin.pandas.DataFrame` + Quantile of values within each group. + + See Also + -------- + Series.quantile : Return a series, where the index is q and the values are the quantiles. + DataFrame.quantile : Return a DataFrame, where the columns are the columns of self, + and the values are the quantiles. + DataFrameGroupBy.quantile : Return a DataFrame, where the columns are groupby columns, + and the values are its quantiles. + + Notes + ----- + List-like ``q`` is not yet supported. + + Examples + -------- + For Series: + + >>> lst1 = pd.date_range('2020-01-01', periods=4, freq='1D') + >>> ser1 = pd.Series([1, 2, 3, 4], index=lst1) + >>> ser1 + 2020-01-01 1 + 2020-01-02 2 + 2020-01-03 3 + 2020-01-04 4 + Freq: None, dtype: int64 + + >>> ser1.resample('2D').quantile() + 2020-01-01 1.5 + 2020-01-03 3.5 + Freq: None, dtype: float64 + + For DataFrame: + + >>> data = [[1, 8, 2], [1, 2, 5], [2, 5, 8], [2, 6, 9]] + >>> df = pd.DataFrame(data, + ... columns=["a", "b", "c"], + ... 
index=pd.date_range('2020-01-01', periods=4, freq='1D')) + >>> df + a b c + 2020-01-01 1 8 2 + 2020-01-02 1 2 5 + 2020-01-03 2 5 8 + 2020-01-04 2 6 9 + + >>> df.resample('2D').quantile(q=0.2) + a b c + 2020-01-01 1.0 3.199 2.6 + 2020-01-03 2.0 5.200 8.2 + """ diff --git a/src/snowflake/snowpark/modin/plugin/extensions/resample_overrides.py b/src/snowflake/snowpark/modin/plugin/extensions/resample_overrides.py index 5fbec8e48a..74df8e6f46 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/resample_overrides.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/resample_overrides.py @@ -250,10 +250,10 @@ def ffill(self, limit: Optional[int] = None) -> Union[pd.DataFrame, pd.Series]: ) ) - def backfill(self, limit: Optional[int] = None): + def backfill(self, limit: Optional[int] = None) -> Union[pd.DataFrame, pd.Series]: return self.bfill(limit=limit) - def bfill(self, limit: Optional[int] = None): + def bfill(self, limit: Optional[int] = None) -> Union[pd.DataFrame, pd.Series]: is_series = not self._dataframe._is_dataframe if limit is not None: @@ -271,13 +271,15 @@ def bfill(self, limit: Optional[int] = None): ) ) - def pad(self, limit: Optional[int] = None): + def pad(self, limit: Optional[int] = None) -> Union[pd.DataFrame, pd.Series]: return self.ffill(limit=limit) def nearest(self, limit: Optional[int] = None): # pragma: no cover self._method_not_implemented("nearest") - def fillna(self, method: str, limit: Optional[int] = None): + def fillna( + self, method: str, limit: Optional[int] = None + ) -> Union[pd.DataFrame, pd.Series]: if not isinstance(method, str) or method not in ( "pad", "ffill", @@ -290,7 +292,9 @@ def fillna(self, method: str, limit: Optional[int] = None): ) return getattr(self, method)(limit=limit) - def asfreq(self, fill_value: Optional[Any] = None): + def asfreq( + self, fill_value: Optional[Any] = None + ) -> Union[pd.DataFrame, pd.Series]: is_series = not self._dataframe._is_dataframe if fill_value is not None: @@ -341,9 +345,17 
@@ def count(self) -> Union[pd.DataFrame, pd.Series]: ) ) - def nunique(self, *args: Any, **kwargs: Any): # pragma: no cover + def nunique(self, *args: Any, **kwargs: Any) -> Union[pd.DataFrame, pd.Series]: # TODO: SNOW-1063368: Modin upgrade - modin.pandas.resample.Resample - self._method_not_implemented("nunique") + return self._dataframe.__constructor__( + query_compiler=self._query_compiler.resample( + self.resample_kwargs, + "nunique", + tuple(), + dict(), + True, + ) + ) def first( self, @@ -352,7 +364,7 @@ def first( skipna: bool = True, *args: Any, **kwargs: Any, - ): + ) -> Union[pd.DataFrame, pd.Series]: # TODO: SNOW-1063368: Modin upgrade - modin.pandas.resample.Resample self._validate_numeric_only_for_aggregate_methods(numeric_only) @@ -376,7 +388,7 @@ def last( skipna: bool = True, *args: Any, **kwargs: Any, - ): + ) -> Union[pd.DataFrame, pd.Series]: # TODO: SNOW-1063368: Modin upgrade - modin.pandas.resample.Resample self._validate_numeric_only_for_aggregate_methods(numeric_only) @@ -500,7 +512,7 @@ def prod( # TODO: SNOW-1063368: Modin upgrade - modin.pandas.resample.Resample self._method_not_implemented("prod") - def size(self): + def size(self) -> Union[pd.DataFrame, pd.Series]: # TODO: SNOW-1063368: Modin upgrade - modin.pandas.resample.Resample is_series = not self._dataframe._is_dataframe @@ -601,7 +613,18 @@ def var( ) def quantile( - self, q: Union[float, AnyArrayLike] = 0.5, **kwargs: Any - ): # pragma: no cover + self, + q: Union[float, AnyArrayLike] = 0.5, + **kwargs: Any, + ) -> Union[pd.DataFrame, pd.Series]: # TODO: SNOW-1063368: Modin upgrade - modin.pandas.resample.Resample - self._method_not_implemented("quantile") + agg_kwargs = dict(q=q) + return self._dataframe.__constructor__( + query_compiler=self._query_compiler.resample( + self.resample_kwargs, + "quantile", + tuple(), + agg_kwargs, + False, + ) + ) diff --git a/tests/integ/modin/resample/test_resample.py b/tests/integ/modin/resample/test_resample.py index c27bdc38f0..b70854e45f 100644 --- 
a/tests/integ/modin/resample/test_resample.py +++ b/tests/integ/modin/resample/test_resample.py @@ -357,3 +357,17 @@ def test_resample_date_trunc_hour(): lambda df: df.resample("2H").min(), check_freq=False, ) + + +# One extra query to convert index to native pandas for dataframe constructor +@pytest.mark.parametrize("q", [0.1, 0.7]) +@sql_count_checker(query_count=3, join_count=1) +def test_resample_quantile_various_q(q): + eval_snowpark_pandas_result( + *create_test_dfs( + {"A": np.random.randn(15)}, + index=native_pd.date_range("2020-01-01", periods=15, freq="1s"), + ), + lambda df: df.resample(rule="3s").quantile(q=q), + check_freq=False, + )