Skip to content

Commit

Permalink
Merge branch 'main' into helmeleegy-SNOW-1445842
Browse files Browse the repository at this point in the history
  • Loading branch information
sfc-gh-helmeleegy committed Jul 12, 2024
2 parents a498c5f + 7c854cb commit c43345e
Show file tree
Hide file tree
Showing 16 changed files with 257 additions and 145 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
- Added support for function `arrays_zip`.
- Allow `df.plot()` and `series.plot()` to be called, materializing the data into the local client
- Improves performance for binary column expression and df._in by avoiding unnecessary cast for numeric values. This optimization can be enabled through session.eliminate_numeric_sql_value_cast_enabled = True.
- Improved error message for `write_pandas` when the target table does not exist and `auto_create_table=False`.

#### Bug Fixes

Expand Down Expand Up @@ -125,6 +126,7 @@
- Added support for Index APIs: `dtype`, `values`, `item()`, `tolist()`, `to_series()` and `to_frame()`
- Expand support for DataFrames with no rows in `pd.pivot_table` and `DataFrame.pivot_table`.
- Added support for `inplace` parameter in `DataFrame.sort_index` and `Series.sort_index`.
- Added support for `Index.unique` and `Index.nunique`.

## 1.18.0 (2024-05-28)

Expand Down
4 changes: 2 additions & 2 deletions docs/source/modin/supported/index_supported.rst
Original file line number Diff line number Diff line change
Expand Up @@ -121,9 +121,9 @@ Methods
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``putmask`` | N | | |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``unique`` | N | | |
| ``unique`` | Y | | |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``nunique`` | N | | |
| ``nunique`` | Y | | |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``value_counts`` | N | | |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
Expand Down
2 changes: 1 addition & 1 deletion src/snowflake/snowpark/_internal/error_message.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ def DF_PANDAS_TABLE_DOES_NOT_EXIST_EXCEPTION(
return SnowparkPandasException(
f"Cannot write pandas DataFrame to table {location} "
f"because it does not exist. Create table before "
f"trying to write a pandas DataFrame",
f"trying to write a pandas DataFrame or set auto_create_table=True.",
error_code="1114",
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7396,14 +7396,18 @@ def get_axis_len(
"""
return self._modin_frame.num_rows if axis == 0 else len(self.columns)

def _nunique_columns(self, dropna: bool) -> "SnowflakeQueryCompiler":
def _nunique_columns(
self, dropna: bool, include_index: bool = False
) -> "SnowflakeQueryCompiler":
"""
Helper function to compute the number of unique elements in each column.

Parameters
----------
dropna: bool
When true, does not consider NULL values as elements.
include_index: bool, default False
When true, include index columns when counting the number of unique elements.

Returns
-------
Expand All @@ -7419,7 +7423,7 @@ def _nunique_columns(self, dropna: bool) -> "SnowflakeQueryCompiler":
)[0]
)

if len(self.columns) == 0:
if not include_index and len(self.columns) == 0:
return SnowflakeQueryCompiler.from_pandas(
native_pd.DataFrame([], index=["unique"], dtype=float)
)
Expand All @@ -7437,9 +7441,19 @@ def make_nunique(identifier: str, dropna: bool) -> SnowparkColumn:
)

# get a new ordered df with nunique columns
snowflake_quoted_identifiers = (
internal_frame.data_column_snowflake_quoted_identifiers
)
pandas_labels = internal_frame.data_column_pandas_labels
if include_index:
snowflake_quoted_identifiers = (
internal_frame.index_column_snowflake_quoted_identifiers
+ snowflake_quoted_identifiers
)
pandas_labels = ["unique_index"] + internal_frame.data_column_pandas_labels
nunique_columns = [
make_nunique(identifier, dropna).as_(identifier)
for identifier in internal_frame.data_column_snowflake_quoted_identifiers
for identifier in snowflake_quoted_identifiers
]

# since we don't compute count on the index, we need to add a column for it
Expand All @@ -7452,14 +7466,28 @@ def make_nunique(identifier: str, dropna: bool) -> SnowparkColumn:
# get a new internal frame
frame = InternalFrame.create(
ordered_dataframe=ordered_dataframe,
data_column_pandas_labels=internal_frame.data_column_pandas_labels,
data_column_snowflake_quoted_identifiers=internal_frame.data_column_snowflake_quoted_identifiers,
data_column_pandas_labels=pandas_labels,
data_column_snowflake_quoted_identifiers=snowflake_quoted_identifiers,
data_column_pandas_index_names=internal_frame.data_column_pandas_index_names,
index_column_pandas_labels=[INDEX_LABEL],
index_column_snowflake_quoted_identifiers=[new_index_identifier],
)
return SnowflakeQueryCompiler(frame)

def nunique_index(self, dropna: bool) -> int:
    """
    Return the number of unique elements in an Index object.

    Parameters
    ----------
    dropna : bool
        When True, NULL values are not counted among the unique elements.

    Returns
    -------
    int
        The number of unique elements.
    """
    # Reuse the column-wise nunique helper with the index included; the
    # resulting frame holds the index's distinct count in its first cell.
    counts_qc = self._nunique_columns(dropna=dropna, include_index=True)
    return counts_qc.to_pandas().iloc[0, 0]

def nunique(
self, axis: Axis, dropna: bool, **kwargs: Any
) -> "SnowflakeQueryCompiler":
Expand Down
38 changes: 19 additions & 19 deletions src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1237,23 +1237,23 @@ def bfill():
>>> df = pd.DataFrame({'A': [1, None, None, 4], 'B': [None, 5, None, 7]})
>>> df
A B
0 1.0 NaN
1 NaN 5.0
2 NaN NaN
3 4.0 7.0
A B
0 1.0 NaN
1 NaN 5.0
2 NaN NaN
3 4.0 7.0
>>> df.bfill()
A B
0 1.0 5.0
1 4.0 5.0
2 4.0 7.0
3 4.0 7.0
A B
0 1.0 5.0
1 4.0 5.0
2 4.0 7.0
3 4.0 7.0
>>> df.bfill(limit=1)
A B
0 1.0 5.0
1 NaN 5.0
2 4.0 7.0
3 4.0 7.0
A B
0 1.0 5.0
1 NaN 5.0
2 4.0 7.0
3 4.0 7.0
"""

def boxplot():
Expand Down Expand Up @@ -1412,10 +1412,10 @@ def ffill():
>>> ser = pd.Series([1, np.nan, 2, 3])
>>> ser.ffill()
0 1.0
1 1.0
2 2.0
3 3.0
0 1.0
1 1.0
2 2.0
3 3.0
dtype: float64
"""

Expand Down
38 changes: 19 additions & 19 deletions src/snowflake/snowpark/modin/plugin/docstrings/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -780,23 +780,23 @@ def bfill():
>>> df = pd.DataFrame({'A': [1, None, None, 4], 'B': [None, 5, None, 7]})
>>> df
A B
0 1.0 NaN
1 NaN 5.0
2 NaN NaN
3 4.0 7.0
A B
0 1.0 NaN
1 NaN 5.0
2 NaN NaN
3 4.0 7.0
>>> df.bfill()
A B
0 1.0 5.0
1 4.0 5.0
2 4.0 7.0
3 4.0 7.0
A B
0 1.0 5.0
1 4.0 5.0
2 4.0 7.0
3 4.0 7.0
>>> df.bfill(limit=1)
A B
0 1.0 5.0
1 NaN 5.0
2 4.0 7.0
3 4.0 7.0
A B
0 1.0 5.0
1 NaN 5.0
2 4.0 7.0
3 4.0 7.0
"""

def compare():
Expand Down Expand Up @@ -1282,10 +1282,10 @@ def ffill():
>>> ser = pd.Series([1, np.nan, 2, 3])
>>> ser.ffill()
0 1.0
1 1.0
2 2.0
3 3.0
0 1.0
1 1.0
2 2.0
3 3.0
dtype: float64
"""

Expand Down
39 changes: 32 additions & 7 deletions src/snowflake/snowpark/modin/plugin/extensions/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -415,17 +415,28 @@ def unique(self, level: Hashable | None = None) -> Index:
See Also
--------
unique : Numpy array of unique values in that column.
Series.unique : Return unique values of Series object.
Series.unique : Return unique values of a Series object.
Examples
--------
>>> idx = pd.Index([1, 1, 2, 3, 3])
>>> idx.unique()
Index([1, 2, 3], dtype='int64')
"""
# TODO: SNOW-1458132 implement unique
WarningMessage.index_to_pandas_warning("unique")
return Index(self.to_pandas().unique(level=level))
if level not in [None, 0, -1]:
raise IndexError(
f"Too many levels: Index has only 1 level, {level} is not a valid level number."
)
return Index(
data=self._query_compiler.groupby_agg(
by=self._query_compiler.get_index_names(axis=0),
agg_func={},
axis=0,
groupby_kwargs={"sort": False, "as_index": True, "dropna": False},
agg_args=[],
agg_kwargs={},
)
)

@property
@is_lazy_check
Expand Down Expand Up @@ -1372,8 +1383,8 @@ def rename(self) -> None:
"""
# TODO: SNOW-1458122 implement rename

@index_not_implemented()
def nunique(self) -> None:
@is_lazy_check
def nunique(self, dropna: bool = True) -> int:
"""
Return number of unique elements in the object.
Expand All @@ -1392,8 +1403,22 @@ def nunique(self) -> None:
--------
DataFrame.nunique: Method nunique for DataFrame.
Series.count: Count non-NA/null observations in the Series.
Examples
--------
>>> s = pd.Series([1, 3, 5, 7, 7])
>>> s
0 1
1 3
2 5
3 7
4 7
dtype: int64
>>> s.nunique()
4
"""
# TODO: SNOW-1458132 implement nunique
return self._query_compiler.nunique_index(dropna=dropna)

@is_lazy_check
def value_counts(
Expand Down
46 changes: 4 additions & 42 deletions tests/integ/modin/frame/test_bfill_ffill.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,16 @@
import modin.pandas as pd
import numpy as np
import pandas as native_pd
import pytest

import snowflake.snowpark.modin.plugin # noqa: F401
from tests.integ.modin.sql_counter import sql_count_checker
from tests.integ.modin.utils import eval_snowpark_pandas_result


@pytest.mark.parametrize("func", ["backfill", "bfill", "ffill", "pad"])
@sql_count_checker(query_count=1)
def test_df_ffill():
def test_df_func(func):
native_df = native_pd.DataFrame(
[
[np.nan, 2, np.nan, 0],
Expand All @@ -27,45 +29,5 @@ def test_df_ffill():
eval_snowpark_pandas_result(
snow_df,
native_df,
lambda df: df.ffill(),
)


@sql_count_checker(query_count=1)
def test_df_bfill():
native_df = native_pd.DataFrame(
[
[np.nan, 2, np.nan, 0],
[3, 4, np.nan, 1],
[np.nan, np.nan, np.nan, np.nan],
[np.nan, 3, np.nan, 4],
[3, np.nan, 4, np.nan],
],
columns=list("ABCD"),
)
snow_df = pd.DataFrame(native_df)
eval_snowpark_pandas_result(
snow_df,
native_df,
lambda df: df.bfill(),
)


@sql_count_checker(query_count=1)
def test_df_pad():
native_df = native_pd.DataFrame(
[
[np.nan, 2, np.nan, 0],
[3, 4, np.nan, 1],
[np.nan, np.nan, np.nan, np.nan],
[np.nan, 3, np.nan, 4],
[3, np.nan, 4, np.nan],
],
columns=list("ABCD"),
)
snow_df = pd.DataFrame(native_df)
eval_snowpark_pandas_result(
snow_df,
native_df,
lambda df: df.pad(),
lambda df: getattr(df, func)(),
)
4 changes: 2 additions & 2 deletions tests/integ/modin/frame/test_set_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,8 +351,8 @@ def test_set_index_pass_multiindex(drop, append, native_df):
@pytest.mark.parametrize(
"keys, expected_query_count",
[
(["a"], 5),
([[1, 6, 6]], 7),
(["a"], 4),
([[1, 6, 6]], 6),
],
)
def test_set_index_verify_integrity_negative(native_df, keys, expected_query_count):
Expand Down
Loading

0 comments on commit c43345e

Please sign in to comment.