SNOW-1707602, SNOW-1707624: Implement Resampler.nunique and `Resampler.quantile` (#2391)

SNOW-1707602 SNOW-1707624

This PR leverages the groupby implementations of `nunique` and `quantile` to implement them for `Resampler` and adds the associated tests.

---------

Signed-off-by: Naren Krishna <naren.krishna@snowflake.com>
snowflakedb · Oct 3, 2024 · cb05b4a · cb05b4a
1 parent 30ce834
commit cb05b4a
Show file tree

Hide file tree

Showing 8 changed files with 161 additions and 22 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -28,7 +28,7 @@
 - Added support for `by`, `left_by`, `right_by`, `left_index`, and `right_index` for `pd.merge_asof`.
 - Added support for passing parameter `include_describe` to `Session.query_history`.
 - Added support for `DatetimeIndex.mean` and `DatetimeIndex.std` methods.
-- Added support for `Resampler.asfreq`.
+- Added support for `Resampler.asfreq`, `Resampler.nunique`, and `Resampler.quantile`.
 - Added support for `resample` frequency `W`, `ME`, `YE` with `closed = "left"`.
 - Added support for `DataFrame.rolling.corr` and `Series.rolling.corr` for `pairwise = False` and int `window`.
 - Added support for string time-based `window` and `min_periods = None` for `Rolling`.

diff --git a/docs/source/modin/supported/groupby_supported.rst b/docs/source/modin/supported/groupby_supported.rst
@@ -140,7 +140,7 @@ Computations/descriptive stats
 +-----------------------------+---------------------------------+----------------------------------------------------+
 | ``prod``                    | N                               |                                                    |
 +-----------------------------+---------------------------------+----------------------------------------------------+
-| ``quantile``                | Y                               | See ``count``                                      |
+| ``quantile``                | P                               | ``N`` for list-like ``q``                          |
 +-----------------------------+---------------------------------+----------------------------------------------------+
 | ``rank``                    | Y                               |                                                    |
 +-----------------------------+---------------------------------+----------------------------------------------------+

diff --git a/docs/source/modin/supported/resampling_supported.rst b/docs/source/modin/supported/resampling_supported.rst
@@ -76,15 +76,13 @@ Computations / descriptive stats
 +-----------------------------+---------------------------------+----------------------------------------------------+
 | ``min``                     | Y                               |                                                    |
 +-----------------------------+---------------------------------+----------------------------------------------------+
-| ``nunique``                 | N                               |                                                    |
+| ``nunique``                 | Y                               |                                                    |
 +-----------------------------+---------------------------------+----------------------------------------------------+
 | ``ohlc``                    | N                               |                                                    |
 +-----------------------------+---------------------------------+----------------------------------------------------+
-| ``pad``                     | N                               |                                                    |
-+-----------------------------+---------------------------------+----------------------------------------------------+
 | ``prod``                    | N                               |                                                    |
 +-----------------------------+---------------------------------+----------------------------------------------------+
-| ``quantile``                | N                               |                                                    |
+| ``quantile``                | P                               | ``N`` for list-like ``q``                          |
 +-----------------------------+---------------------------------+----------------------------------------------------+
 | ``sem``                     | N                               |                                                    |
 +-----------------------------+---------------------------------+----------------------------------------------------+

diff --git a/src/snowflake/snowpark/modin/plugin/_internal/resample_utils.py b/src/snowflake/snowpark/modin/plugin/_internal/resample_utils.py
@@ -45,6 +45,8 @@
     "size",
     "first",
     "last",
+    "quantile",
+    "nunique",
 ]
 IMPLEMENTED_MISC_METHODS = ["ffill"]
 SUPPORTED_RESAMPLE_RULES = ("second", "minute", "hour", "day", "week", "month", "year")

diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py
@@ -12398,7 +12398,7 @@ def resample(
             frame = fill_missing_resample_bins_for_frame(
                 qc._modin_frame, rule, start_date, end_date
             )
-            if resample_method in ("sum", "count", "size"):
+            if resample_method in ("sum", "count", "size", "nunique"):
                 values_arg: Union[int, dict]
                 if resample_method == "sum":
                     # For sum(), we need to fill NaN values as Timedelta(0)

diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/resample.py b/src/snowflake/snowpark/modin/plugin/docstrings/resample.py
@@ -509,7 +509,50 @@ def count():
         """
 
     def nunique():
-        pass
+        """
+        Return the number of unique elements in each resampled group.
+
+        Returns
+        -------
+        :class:`~modin.pandas.Series` or :class:`~modin.pandas.DataFrame`
+            Number of unique values within each group.
+
+        Examples
+        --------
+        For Series:
+
+        >>> lst1 = pd.date_range('2020-01-01', periods=4, freq='1D')
+        >>> ser1 = pd.Series([1, 1, 2, 2], index=lst1)
+        >>> ser1
+        2020-01-01    1
+        2020-01-02    1
+        2020-01-03    2
+        2020-01-04    2
+        Freq: None, dtype: int64
+
+        >>> ser1.resample('2D').nunique()
+        2020-01-01    1
+        2020-01-03    1
+        Freq: None, dtype: int64
+
+        For DataFrame:
+
+        >>> data = [[1, 8, 2], [1, 2, 5], [2, 5, 8], [2, 6, 9]]
+        >>> df = pd.DataFrame(data,
+        ...      columns=["a", "b", "c"],
+        ...      index=pd.date_range('2020-01-01', periods=4, freq='1D'))
+        >>> df
+                    a  b  c
+        2020-01-01  1  8  2
+        2020-01-02  1  2  5
+        2020-01-03  2  5  8
+        2020-01-04  2  6  9
+
+        >>> df.resample('2D').nunique()
+                    a  b  c
+        2020-01-01  1  2  2
+        2020-01-03  1  2  2
+        """
 
     def first():
         """
@@ -1295,4 +1338,63 @@ def var():
         """
 
     def quantile():
-        pass
+        """
+        Return value at the given quantile.
+
+        Parameters
+        ----------
+        q : float or array-like, default 0.5 (50% quantile)
+            Quantile to compute for each group. Only a scalar ``q`` is
+            currently supported; see Notes.
+
+        Returns
+        -------
+        :class:`~modin.pandas.Series` or :class:`~modin.pandas.DataFrame`
+            Quantile of values within each group.
+
+        See Also
+        --------
+        Series.quantile : Return a series, where the index is q and the values are the quantiles.
+        DataFrame.quantile : Return a DataFrame, where the columns are the columns of self,
+            and the values are the quantiles.
+        DataFrameGroupBy.quantile : Return a DataFrame, where the columns are groupby columns,
+            and the values are its quantiles.
+
+        Notes
+        -----
+        List-like ``q`` is not yet supported.
+
+        Examples
+        --------
+        For Series:
+
+        >>> lst1 = pd.date_range('2020-01-01', periods=4, freq='1D')
+        >>> ser1 = pd.Series([1, 2, 3, 4], index=lst1)
+        >>> ser1
+        2020-01-01    1
+        2020-01-02    2
+        2020-01-03    3
+        2020-01-04    4
+        Freq: None, dtype: int64
+
+        >>> ser1.resample('2D').quantile()
+        2020-01-01    1.5
+        2020-01-03    3.5
+        Freq: None, dtype: float64
+
+        For DataFrame:
+
+        >>> data = [[1, 8, 2], [1, 2, 5], [2, 5, 8], [2, 6, 9]]
+        >>> df = pd.DataFrame(data,
+        ...      columns=["a", "b", "c"],
+        ...      index=pd.date_range('2020-01-01', periods=4, freq='1D'))
+        >>> df
+                    a  b  c
+        2020-01-01  1  8  2
+        2020-01-02  1  2  5
+        2020-01-03  2  5  8
+        2020-01-04  2  6  9
+
+        >>> df.resample('2D').quantile(q=0.2)
+                      a      b    c
+        2020-01-01  1.0  3.199  2.6
+        2020-01-03  2.0  5.200  8.2
+        """
diff --git a/src/snowflake/snowpark/modin/plugin/extensions/resample_overrides.py b/src/snowflake/snowpark/modin/plugin/extensions/resample_overrides.py
@@ -250,10 +250,10 @@ def ffill(self, limit: Optional[int] = None) -> Union[pd.DataFrame, pd.Series]:
             )
         )
 
-    def backfill(self, limit: Optional[int] = None):
+    def backfill(self, limit: Optional[int] = None) -> Union[pd.DataFrame, pd.Series]:
         return self.bfill(limit=limit)
 
-    def bfill(self, limit: Optional[int] = None):
+    def bfill(self, limit: Optional[int] = None) -> Union[pd.DataFrame, pd.Series]:
         is_series = not self._dataframe._is_dataframe
 
         if limit is not None:
@@ -271,13 +271,15 @@ def bfill(self, limit: Optional[int] = None):
             )
         )
 
-    def pad(self, limit: Optional[int] = None):
+    def pad(self, limit: Optional[int] = None) -> Union[pd.DataFrame, pd.Series]:
         return self.ffill(limit=limit)
 
     def nearest(self, limit: Optional[int] = None):  # pragma: no cover
         self._method_not_implemented("nearest")
 
-    def fillna(self, method: str, limit: Optional[int] = None):
+    def fillna(
+        self, method: str, limit: Optional[int] = None
+    ) -> Union[pd.DataFrame, pd.Series]:
         if not isinstance(method, str) or method not in (
             "pad",
             "ffill",
@@ -290,7 +292,9 @@ def fillna(self, method: str, limit: Optional[int] = None):
             )
         return getattr(self, method)(limit=limit)
 
-    def asfreq(self, fill_value: Optional[Any] = None):
+    def asfreq(
+        self, fill_value: Optional[Any] = None
+    ) -> Union[pd.DataFrame, pd.Series]:
         is_series = not self._dataframe._is_dataframe
 
         if fill_value is not None:
@@ -341,9 +345,17 @@ def count(self) -> Union[pd.DataFrame, pd.Series]:
             )
         )
 
-    def nunique(self, *args: Any, **kwargs: Any):  # pragma: no cover
+    def nunique(self, *args: Any, **kwargs: Any) -> Union[pd.DataFrame, pd.Series]:
         # TODO: SNOW-1063368: Modin upgrade - modin.pandas.resample.Resample
-        self._method_not_implemented("nunique")
+        # The result is built via ``self._dataframe.__constructor__``, so resampling
+        # a DataFrame returns a DataFrame (see the DataFrame example in the
+        # ``nunique`` docstring), not only a Series.
+        # ``*args`` / ``**kwargs`` are accepted but not forwarded to the query
+        # compiler: empty args and kwargs are passed below.
+        # NOTE(review): the last positional argument is hard-coded to ``True``,
+        # whereas sibling methods (``bfill``, ``asfreq``, ``size``) compute
+        # ``is_series = not self._dataframe._is_dataframe`` -- confirm whether
+        # this flag should use that value instead of a literal.
+        return self._dataframe.__constructor__(
+            query_compiler=self._query_compiler.resample(
+                self.resample_kwargs,
+                "nunique",
+                tuple(),
+                dict(),
+                True,
+            )
+        )
 
     def first(
         self,
@@ -352,7 +364,7 @@ def first(
         skipna: bool = True,
         *args: Any,
         **kwargs: Any,
-    ):
+    ) -> Union[pd.DataFrame, pd.Series]:
         # TODO: SNOW-1063368: Modin upgrade - modin.pandas.resample.Resample
         self._validate_numeric_only_for_aggregate_methods(numeric_only)
 
@@ -376,7 +388,7 @@ def last(
         skipna: bool = True,
         *args: Any,
         **kwargs: Any,
-    ):
+    ) -> Union[pd.DataFrame, pd.Series]:
         # TODO: SNOW-1063368: Modin upgrade - modin.pandas.resample.Resample
         self._validate_numeric_only_for_aggregate_methods(numeric_only)
 
@@ -500,7 +512,7 @@ def prod(
         # TODO: SNOW-1063368: Modin upgrade - modin.pandas.resample.Resample
         self._method_not_implemented("prod")
 
-    def size(self):
+    def size(self) -> Union[pd.DataFrame, pd.Series]:
         # TODO: SNOW-1063368: Modin upgrade - modin.pandas.resample.Resample
         is_series = not self._dataframe._is_dataframe
 
@@ -601,7 +613,18 @@ def var(
         )
 
     def quantile(
-        self, q: Union[float, AnyArrayLike] = 0.5, **kwargs: Any
-    ):  # pragma: no cover
+        self,
+        q: Union[float, AnyArrayLike] = 0.5,
+        **kwargs: Any,
+    ) -> Union[pd.DataFrame, pd.Series]:
         # TODO: SNOW-1063368: Modin upgrade - modin.pandas.resample.Resample
-        self._method_not_implemented("quantile")
+        # Only ``q`` is forwarded as an aggregation keyword; ``**kwargs`` is
+        # accepted but not passed on to the query compiler.
+        # NOTE(review): the last positional argument is hard-coded to ``False``,
+        # whereas sibling methods (``bfill``, ``asfreq``, ``size``) compute
+        # ``is_series = not self._dataframe._is_dataframe`` -- confirm the literal
+        # is intended for Series input as well.
+        agg_kwargs = dict(q=q)
+        return self._dataframe.__constructor__(
+            query_compiler=self._query_compiler.resample(
+                self.resample_kwargs,
+                "quantile",
+                tuple(),
+                agg_kwargs,
+                False,
+            )
+        )
diff --git a/tests/integ/modin/resample/test_resample.py b/tests/integ/modin/resample/test_resample.py
@@ -357,3 +357,17 @@ def test_resample_date_trunc_hour():
         lambda df: df.resample("2H").min(),
         check_freq=False,
     )
+
+
+# One extra query to convert index to native pandas for dataframe constructor
+@pytest.mark.parametrize("q", [0.1, 0.7])
+@sql_count_checker(query_count=3, join_count=1)
+def test_resample_quantile_various_q(q):
+    # Resample 15 one-second rows of column "A" into 3-second bins and take the
+    # ``q``-quantile of each bin, once per parametrized scalar ``q``.
+    # NOTE(review): the input comes from an unseeded ``np.random.randn`` call, so
+    # the data differs between runs -- consider fixing a seed if a failure ever
+    # needs to be reproduced.
+    eval_snowpark_pandas_result(
+        *create_test_dfs(
+            {"A": np.random.randn(15)},
+            index=native_pd.date_range("2020-01-01", periods=15, freq="1s"),
+        ),
+        lambda df: df.resample(rule="3s").quantile(q=q),
+        check_freq=False,
+    )