Skip to content

Commit

Permalink
Merge branch 'main' into helmeleegy-SNOW-1445842
Browse files Browse the repository at this point in the history
  • Loading branch information
sfc-gh-helmeleegy committed Jul 12, 2024
2 parents a498c5f + 7c854cb commit c43345e
Show file tree
Hide file tree
Showing 16 changed files with 257 additions and 145 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
- Added support for function `arrays_zip`.
- Allow `df.plot()` and `series.plot()` to be called, materializing the data into the local client
- Improves performance for binary column expression and df._in by avoiding unnecessary cast for numeric values. This optimization can be enabled through session.eliminate_numeric_sql_value_cast_enabled = True.
- Improved error message for `write_pandas` when the target table does not exist and `auto_create_table=False`.

#### Bug Fixes

Expand Down Expand Up @@ -125,6 +126,7 @@
- Added support for Index APIs: `dtype`, `values`, `item()`, `tolist()`, `to_series()` and `to_frame()`
- Expand support for DataFrames with no rows in `pd.pivot_table` and `DataFrame.pivot_table`.
- Added support for `inplace` parameter in `DataFrame.sort_index` and `Series.sort_index`.
- Added support for `Index.unique` and `Index.nunique`.

## 1.18.0 (2024-05-28)

Expand Down
4 changes: 2 additions & 2 deletions docs/source/modin/supported/index_supported.rst
Original file line number Diff line number Diff line change
Expand Up @@ -121,9 +121,9 @@ Methods
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``putmask`` | N | | |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``unique`` | N | | |
| ``unique`` | Y | | |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``nunique`` | N | | |
| ``nunique`` | Y | | |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``value_counts`` | N | | |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
Expand Down
2 changes: 1 addition & 1 deletion src/snowflake/snowpark/_internal/error_message.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ def DF_PANDAS_TABLE_DOES_NOT_EXIST_EXCEPTION(
return SnowparkPandasException(
f"Cannot write pandas DataFrame to table {location} "
f"because it does not exist. Create table before "
f"trying to write a pandas DataFrame",
f"trying to write a pandas DataFrame or set auto_create_table=True.",
error_code="1114",
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7396,14 +7396,18 @@ def get_axis_len(
"""
return self._modin_frame.num_rows if axis == 0 else len(self.columns)

def _nunique_columns(self, dropna: bool) -> "SnowflakeQueryCompiler":
def _nunique_columns(
self, dropna: bool, include_index: bool = False
) -> "SnowflakeQueryCompiler":
"""
Helper function to compute the number of unique elements in each column.

Parameters
----------
dropna: bool
When true, does not consider NULL values as elements.
include_index: bool, default False
When true, include index columns when counting the number of unique elements.

Returns
-------
Expand All @@ -7419,7 +7423,7 @@ def _nunique_columns(self, dropna: bool) -> "SnowflakeQueryCompiler":
)[0]
)

if len(self.columns) == 0:
if not include_index and len(self.columns) == 0:
return SnowflakeQueryCompiler.from_pandas(
native_pd.DataFrame([], index=["unique"], dtype=float)
)
Expand All @@ -7437,9 +7441,19 @@ def make_nunique(identifier: str, dropna: bool) -> SnowparkColumn:
)

# get a new ordered df with nunique columns
snowflake_quoted_identifiers = (
internal_frame.data_column_snowflake_quoted_identifiers
)
pandas_labels = internal_frame.data_column_pandas_labels
if include_index:
snowflake_quoted_identifiers = (
internal_frame.index_column_snowflake_quoted_identifiers
+ snowflake_quoted_identifiers
)
pandas_labels = ["unique_index"] + internal_frame.data_column_pandas_labels
nunique_columns = [
make_nunique(identifier, dropna).as_(identifier)
for identifier in internal_frame.data_column_snowflake_quoted_identifiers
for identifier in snowflake_quoted_identifiers
]

# since we don't compute count on the index, we need to add a column for it
Expand All @@ -7452,14 +7466,28 @@ def make_nunique(identifier: str, dropna: bool) -> SnowparkColumn:
# get a new internal frame
frame = InternalFrame.create(
ordered_dataframe=ordered_dataframe,
data_column_pandas_labels=internal_frame.data_column_pandas_labels,
data_column_snowflake_quoted_identifiers=internal_frame.data_column_snowflake_quoted_identifiers,
data_column_pandas_labels=pandas_labels,
data_column_snowflake_quoted_identifiers=snowflake_quoted_identifiers,
data_column_pandas_index_names=internal_frame.data_column_pandas_index_names,
index_column_pandas_labels=[INDEX_LABEL],
index_column_snowflake_quoted_identifiers=[new_index_identifier],
)
return SnowflakeQueryCompiler(frame)

def nunique_index(self, dropna: bool) -> int:
    """
    Return the number of unique elements in an Index object.

    Parameters
    ----------
    dropna : bool
        When True, NULL values are not counted among the unique elements.

    Returns
    -------
    int
        The number of unique elements.
    """
    # Reuse the column-wise nunique helper with the index included; the
    # resulting frame holds the index's distinct count in its first cell.
    counts_qc = self._nunique_columns(dropna=dropna, include_index=True)
    return counts_qc.to_pandas().iloc[0, 0]

def nunique(
self, axis: Axis, dropna: bool, **kwargs: Any
) -> "SnowflakeQueryCompiler":
Expand Down
38 changes: 19 additions & 19 deletions src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1237,23 +1237,23 @@ def bfill():
>>> df = pd.DataFrame({'A': [1, None, None, 4], 'B': [None, 5, None, 7]})
>>> df
A B
0 1.0 NaN
1 NaN 5.0
2 NaN NaN
3 4.0 7.0
A B
0 1.0 NaN
1 NaN 5.0
2 NaN NaN
3 4.0 7.0
>>> df.bfill()
A B
0 1.0 5.0
1 4.0 5.0
2 4.0 7.0
3 4.0 7.0
A B
0 1.0 5.0
1 4.0 5.0
2 4.0 7.0
3 4.0 7.0
>>> df.bfill(limit=1)
A B
0 1.0 5.0
1 NaN 5.0
2 4.0 7.0
3 4.0 7.0
A B
0 1.0 5.0
1 NaN 5.0
2 4.0 7.0
3 4.0 7.0
"""

def boxplot():
Expand Down Expand Up @@ -1412,10 +1412,10 @@ def ffill():
>>> ser = pd.Series([1, np.nan, 2, 3])
>>> ser.ffill()
0 1.0
1 1.0
2 2.0
3 3.0
0 1.0
1 1.0
2 2.0
3 3.0
dtype: float64
"""

Expand Down
38 changes: 19 additions & 19 deletions src/snowflake/snowpark/modin/plugin/docstrings/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -780,23 +780,23 @@ def bfill():
>>> df = pd.DataFrame({'A': [1, None, None, 4], 'B': [None, 5, None, 7]})
>>> df
A B
0 1.0 NaN
1 NaN 5.0
2 NaN NaN
3 4.0 7.0
A B
0 1.0 NaN
1 NaN 5.0
2 NaN NaN
3 4.0 7.0
>>> df.bfill()
A B
0 1.0 5.0
1 4.0 5.0
2 4.0 7.0
3 4.0 7.0
A B
0 1.0 5.0
1 4.0 5.0
2 4.0 7.0
3 4.0 7.0
>>> df.bfill(limit=1)
A B
0 1.0 5.0
1 NaN 5.0
2 4.0 7.0
3 4.0 7.0
A B
0 1.0 5.0
1 NaN 5.0
2 4.0 7.0
3 4.0 7.0
"""

def compare():
Expand Down Expand Up @@ -1282,10 +1282,10 @@ def ffill():
>>> ser = pd.Series([1, np.nan, 2, 3])
>>> ser.ffill()
0 1.0
1 1.0
2 2.0
3 3.0
0 1.0
1 1.0
2 2.0
3 3.0
dtype: float64
"""

Expand Down
39 changes: 32 additions & 7 deletions src/snowflake/snowpark/modin/plugin/extensions/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -415,17 +415,28 @@ def unique(self, level: Hashable | None = None) -> Index:
See Also
--------
unique : Numpy array of unique values in that column.
Series.unique : Return unique values of Series object.
Series.unique : Return unique values of a Series object.
Examples
--------
>>> idx = pd.Index([1, 1, 2, 3, 3])
>>> idx.unique()
Index([1, 2, 3], dtype='int64')
"""
# TODO: SNOW-1458132 implement unique
WarningMessage.index_to_pandas_warning("unique")
return Index(self.to_pandas().unique(level=level))
if level not in [None, 0, -1]:
raise IndexError(
f"Too many levels: Index has only 1 level, {level} is not a valid level number."
)
return Index(
data=self._query_compiler.groupby_agg(
by=self._query_compiler.get_index_names(axis=0),
agg_func={},
axis=0,
groupby_kwargs={"sort": False, "as_index": True, "dropna": False},
agg_args=[],
agg_kwargs={},
)
)

@property
@is_lazy_check
Expand Down Expand Up @@ -1372,8 +1383,8 @@ def rename(self) -> None:
"""
# TODO: SNOW-1458122 implement rename

@index_not_implemented()
def nunique(self) -> None:
@is_lazy_check
def nunique(self, dropna: bool = True) -> int:
"""
Return number of unique elements in the object.
Expand All @@ -1392,8 +1403,22 @@ def nunique(self) -> None:
--------
DataFrame.nunique: Method nunique for DataFrame.
Series.count: Count non-NA/null observations in the Series.
Examples
--------
>>> s = pd.Series([1, 3, 5, 7, 7])
>>> s
0 1
1 3
2 5
3 7
4 7
dtype: int64
>>> s.nunique()
4
"""
# TODO: SNOW-1458132 implement nunique
return self._query_compiler.nunique_index(dropna=dropna)

@is_lazy_check
def value_counts(
Expand Down
46 changes: 4 additions & 42 deletions tests/integ/modin/frame/test_bfill_ffill.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,16 @@
import modin.pandas as pd
import numpy as np
import pandas as native_pd
import pytest

import snowflake.snowpark.modin.plugin # noqa: F401
from tests.integ.modin.sql_counter import sql_count_checker
from tests.integ.modin.utils import eval_snowpark_pandas_result


@pytest.mark.parametrize("func", ["backfill", "bfill", "ffill", "pad"])
@sql_count_checker(query_count=1)
def test_df_ffill():
def test_df_func(func):
native_df = native_pd.DataFrame(
[
[np.nan, 2, np.nan, 0],
Expand All @@ -27,45 +29,5 @@ def test_df_ffill():
eval_snowpark_pandas_result(
snow_df,
native_df,
lambda df: df.ffill(),
)


@sql_count_checker(query_count=1)
def test_df_bfill():
native_df = native_pd.DataFrame(
[
[np.nan, 2, np.nan, 0],
[3, 4, np.nan, 1],
[np.nan, np.nan, np.nan, np.nan],
[np.nan, 3, np.nan, 4],
[3, np.nan, 4, np.nan],
],
columns=list("ABCD"),
)
snow_df = pd.DataFrame(native_df)
eval_snowpark_pandas_result(
snow_df,
native_df,
lambda df: df.bfill(),
)


@sql_count_checker(query_count=1)
def test_df_pad():
native_df = native_pd.DataFrame(
[
[np.nan, 2, np.nan, 0],
[3, 4, np.nan, 1],
[np.nan, np.nan, np.nan, np.nan],
[np.nan, 3, np.nan, 4],
[3, np.nan, 4, np.nan],
],
columns=list("ABCD"),
)
snow_df = pd.DataFrame(native_df)
eval_snowpark_pandas_result(
snow_df,
native_df,
lambda df: df.pad(),
lambda df: getattr(df, func)(),
)
4 changes: 2 additions & 2 deletions tests/integ/modin/frame/test_set_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,8 +351,8 @@ def test_set_index_pass_multiindex(drop, append, native_df):
@pytest.mark.parametrize(
"keys, expected_query_count",
[
(["a"], 5),
([[1, 6, 6]], 7),
(["a"], 4),
([[1, 6, 6]], 6),
],
)
def test_set_index_verify_integrity_negative(native_df, keys, expected_query_count):
Expand Down
Loading

0 comments on commit c43345e

Please sign in to comment.