Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FEAT: Add how="any","all" to df.dropna, dropinf, etc #2104

Merged
merged 5 commits into from
Jun 27, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 30 additions & 11 deletions packages/vaex-core/vaex/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -5036,42 +5036,61 @@ def create(current):
return selections.SelectionDropNa(drop_nan, drop_masked, column_names, current, mode)
self._selection(create, name)

def dropmissing(self, column_names=None, how="any"):
    """Create a shallow copy of a DataFrame, with filtering set using ismissing.

    :param column_names: The columns to consider, default: all (real, non-virtual) columns
    :param str how: One of ("any", "all").
        If "any", then drop rows where any of the columns are missing.
        If "all", then drop rows where all of the columns are missing.
    :raises ValueError: If `how` is not "any" or "all" (raised by `_filter_all`).
    :rtype: DataFrame
    """
    # All drop* methods share one implementation; only the per-column predicate differs.
    return self._filter_all(self.func.ismissing, column_names, how=how)

def dropnan(self, column_names=None, how="any"):
    """Create a shallow copy of a DataFrame, with filtering set using isnan.

    :param column_names: The columns to consider, default: all (real, non-virtual) columns
    :param str how: One of ("any", "all").
        If "any", then drop rows where any of the columns are nan.
        If "all", then drop rows where all of the columns are nan.
    :raises ValueError: If `how` is not "any" or "all" (raised by `_filter_all`).
    :rtype: DataFrame
    """
    # All drop* methods share one implementation; only the per-column predicate differs.
    return self._filter_all(self.func.isnan, column_names, how=how)

def dropna(self, column_names=None, how="any"):
    """Create a shallow copy of a DataFrame, with filtering set using isna.

    :param column_names: The columns to consider, default: all (real, non-virtual) columns
    :param str how: One of ("any", "all").
        If "any", then drop rows where any of the columns are na.
        If "all", then drop rows where all of the columns are na.
    :raises ValueError: If `how` is not "any" or "all" (raised by `_filter_all`).
    :rtype: DataFrame
    """
    # All drop* methods share one implementation; only the per-column predicate differs.
    return self._filter_all(self.func.isna, column_names, how=how)

def dropinf(self, column_names=None, how="any"):
    """Create a shallow copy of a DataFrame, with filtering set using isinf.

    :param column_names: The columns to consider, default: all (real, non-virtual) columns
    :param str how: One of ("any", "all").
        If "any", then drop rows where any of the columns are inf.
        If "all", then drop rows where all of the columns are inf.
    :raises ValueError: If `how` is not "any" or "all" (raised by `_filter_all`).
    :rtype: DataFrame
    """
    # All drop* methods share one implementation; only the per-column predicate differs.
    return self._filter_all(self.func.isinf, column_names, how=how)

def _filter_all(self, f, column_names=None):
column_names = column_names or self.get_column_names(virtual=False)
def _filter_all(self, f, column_names=None, how="any"):
if column_names is None:
column_names = self.get_column_names(virtual=False)
if how not in ("any", "all"):
raise ValueError("`how` must be either 'any' or 'all'")
expression = f(self[column_names[0]])
for column in column_names[1:]:
expression = expression | f(self[column])
if how == "any":
expression = expression | f(self[column])
else:
expression = expression & f(self[column])
return self.filter(~expression, mode='and')

def select_nothing(self, name="default"):
Expand Down
32 changes: 22 additions & 10 deletions tests/dropna_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,16 +112,28 @@ def test_dropna():
assert (df.s.dropna().tolist() == ["aap", "noot", "mies"])
assert (df.o.dropna().tolist() == ["aap", "noot"])


@pytest.fixture
def df_with_missings(array_factory1, array_factory2):
    """DataFrame with a float column containing NaNs and a string column containing Nones."""
    # Create arrays separately so that the DF might have a mix of
    # numpy and arrow arrays.
    nan = array_factory1([1.1, np.nan, np.nan, 4.4, 5.5])
    na = array_factory2(['dog', 'dog', None, 'cat', None])
    return vaex.from_arrays(nan=nan, na=na)


def test_dropna_all_columns(df_with_missings):
    """dropna over all columns: default equals how="any"; how="all" only drops fully-na rows."""
    df = df_with_missings
    # These two should be equivalent
    for df_dropped in (df.dropna(), df.dropna(how="any")):
        assert df_dropped.nan.tolist() == [1.1, 4.4]
        assert df_dropped.na.tolist() == ['dog', 'cat']

    # Only the row where both columns are na (index 2) is removed.
    df_dropped = df.dropna(how="all")
    # NaN != NaN, so replace the remaining NaN before comparing lists.
    assert df_dropped.nan.fillna(99).tolist() == [1.1, 99, 4.4, 5.5]
    assert df_dropped.na.tolist() == ['dog', 'dog', 'cat', None]

    with pytest.raises(ValueError):
        df.dropna(how="invalid")

def test_dropna_string_columns():
data_dict = {'10': [1, 2, np.nan],
Expand Down