From 3892fb1b6cc3e3feb0444f09b5e8a01ae3c34a0b Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Fri, 24 Jun 2022 12:45:50 -0600 Subject: [PATCH 1/5] Make df.dropna and friends safer --- packages/vaex-core/vaex/dataframe.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/packages/vaex-core/vaex/dataframe.py b/packages/vaex-core/vaex/dataframe.py index 09199cba74..f9e4df7723 100644 --- a/packages/vaex-core/vaex/dataframe.py +++ b/packages/vaex-core/vaex/dataframe.py @@ -5068,7 +5068,8 @@ def dropinf(self, column_names=None): return self._filter_all(self.func.isinf, column_names) def _filter_all(self, f, column_names=None): - column_names = column_names or self.get_column_names(virtual=False) + if column_names is None: + column_names = self.get_column_names(virtual=False) expression = f(self[column_names[0]]) for column in column_names[1:]: expression = expression | f(self[column]) From a9882bc43a5d7beb6e08c2e1e6b3a090da3602dc Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Fri, 24 Jun 2022 13:24:24 -0600 Subject: [PATCH 2/5] Support how="any","all" in df.dropna, dropinf, etc See https://github.com/vaexio/vaex/issues/2084 --- packages/vaex-core/vaex/dataframe.py | 38 ++++++++++++++++++++-------- tests/dropna_test.py | 24 +++++++++++++----- 2 files changed, 45 insertions(+), 17 deletions(-) diff --git a/packages/vaex-core/vaex/dataframe.py b/packages/vaex-core/vaex/dataframe.py index f9e4df7723..2322c92ced 100644 --- a/packages/vaex-core/vaex/dataframe.py +++ b/packages/vaex-core/vaex/dataframe.py @@ -5036,43 +5036,61 @@ def create(current): return selections.SelectionDropNa(drop_nan, drop_masked, column_names, current, mode) self._selection(create, name) - def dropmissing(self, column_names=None): + def dropmissing(self, column_names=None, how="any"): """Create a shallow copy of a DataFrame, with filtering set using ismissing. :param column_names: The columns to consider, default: all (real, non-virtual) columns + :param str how: One of ("any", "all"). + If "any", then drop rows where any of the columns is missing. + If "all", then drop rows where all of the columns is missing. :rtype: DataFrame """ - return self._filter_all(self.func.ismissing, column_names) + return self._filter_all(self.func.ismissing, column_names, how=how) - def dropnan(self, column_names=None): + def dropnan(self, column_names=None, how="any"): """Create a shallow copy of a DataFrame, with filtering set using isnan. :param column_names: The columns to consider, default: all (real, non-virtual) columns + :param str how: One of ("any", "all"). + If "any", then drop rows where any of the columns is nan. + If "all", then drop rows where all of the columns is nan. :rtype: DataFrame """ - return self._filter_all(self.func.isnan, column_names) + return self._filter_all(self.func.isnan, column_names, how=how) - def dropna(self, column_names=None): + def dropna(self, column_names=None, how="any"): """Create a shallow copy of a DataFrame, with filtering set using isna. :param column_names: The columns to consider, default: all (real, non-virtual) columns + :param str how: One of ("any", "all"). + If "any", then drop rows where any of the columns is na. + If "all", then drop rows where all of the columns is na. :rtype: DataFrame """ - return self._filter_all(self.func.isna, column_names) + return self._filter_all(self.func.isna, column_names, how=how) - def dropinf(self, column_names=None): + def dropinf(self, column_names=None, how="any"): """ Create a shallow copy of a DataFrame, with filtering set using isinf. + :param column_names: The columns to consider, default: all (real, non-virtual) columns + :param str how: One of ("any", "all"). + If "any", then drop rows where any of the columns is inf. + If "all", then drop rows where all of the columns is inf. :rtype: DataFrame """ - return self._filter_all(self.func.isinf, column_names) + return self._filter_all(self.func.isinf, column_names, how=how) - def _filter_all(self, f, column_names=None): + def _filter_all(self, f, column_names=None, how="any"): if column_names is None: column_names = self.get_column_names(virtual=False) + if how not in ("any", "all"): + raise ValueError("`how` must be either 'any' or 'all'") expression = f(self[column_names[0]]) for column in column_names[1:]: - expression = expression | f(self[column]) + if how == "any": + expression = expression | f(self[column]) + else: + expression = expression & f(self[column]) return self.filter(~expression, mode='and') def select_nothing(self, name="default"): diff --git a/tests/dropna_test.py b/tests/dropna_test.py index c82d53b1e7..27620e2834 100644 --- a/tests/dropna_test.py +++ b/tests/dropna_test.py @@ -112,16 +112,26 @@ def test_dropna(): assert (df.s.dropna().tolist() == ["aap", "noot", "mies"]) assert (df.o.dropna().tolist() == ["aap", "noot"]) - -def test_dropna_all_columns(): - x = [1, 2, 3, 4, 5] - y = ['dog', 'dog', None, 'cat', None] +@pytest.fixture +def df_with_missings(): + x = [1.1, np.nan, np.nan, 4.4, 5.5] + y = ["dog", "dog", None, "cat", None] df = vaex.from_arrays(x=x, y=y) + return df - df_dropped = df.dropna() - assert df_dropped.x.tolist() == [1, 2, 4] - assert df_dropped.y.tolist() == ['dog', 'dog', 'cat'] +def test_dropna_all_columns(df_with_missings): + df = df_with_missings + # These two should be equivalent + for df_dropped in (df.dropna(), df.dropna(how="any")): + assert df_dropped.x.tolist() == [1.1, 4.4] + assert df_dropped.y.tolist() == ["dog", "cat"] + + df_dropped = df.dropna(how="all") + assert df_dropped.x.fillna(99).tolist() == [1.1, 99, 4.4, 5.5] + assert df_dropped.y.tolist() == ["dog", "dog", "cat", None] + with pytest.raises(ValueError): + df_dropped = df.dropna(how="invalid") def test_dropna_string_columns(): data_dict = {'10': [1, 2, np.nan], From 79c8bbe68eb3654ba34b5201f05ee08de9891426 Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Sat, 25 Jun 2022 10:40:11 -0600 Subject: [PATCH 3/5] Fixup grammar Co-authored-by: Jovan Veljanoski --- packages/vaex-core/vaex/dataframe.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/packages/vaex-core/vaex/dataframe.py b/packages/vaex-core/vaex/dataframe.py index 2322c92ced..f61bf5dfcd 100644 --- a/packages/vaex-core/vaex/dataframe.py +++ b/packages/vaex-core/vaex/dataframe.py @@ -5041,8 +5041,8 @@ def dropmissing(self, column_names=None, how="any"): :param column_names: The columns to consider, default: all (real, non-virtual) columns :param str how: One of ("any", "all"). - If "any", then drop rows where any of the columns is missing. - If "all", then drop rows where all of the columns is missing. + If "any", then drop rows where any of the columns are missing. + If "all", then drop rows where all of the columns are missing. :rtype: DataFrame """ return self._filter_all(self.func.ismissing, column_names, how=how) @@ -5052,8 +5052,8 @@ def dropnan(self, column_names=None, how="any"): :param column_names: The columns to consider, default: all (real, non-virtual) columns :param str how: One of ("any", "all"). - If "any", then drop rows where any of the columns is nan. - If "all", then drop rows where all of the columns is nan. + If "any", then drop rows where any of the columns are nan. + If "all", then drop rows where all of the columns are nan. :rtype: DataFrame """ return self._filter_all(self.func.isnan, column_names, how=how) @@ -5063,8 +5063,8 @@ def dropna(self, column_names=None, how="any"): :param column_names: The columns to consider, default: all (real, non-virtual) columns :param str how: One of ("any", "all"). - If "any", then drop rows where any of the columns is na. - If "all", then drop rows where all of the columns is na. + If "any", then drop rows where any of the columns are na. + If "all", then drop rows where all of the columns are na. :rtype: DataFrame """ return self._filter_all(self.func.isna, column_names, how=how) @@ -5074,8 +5074,8 @@ def dropinf(self, column_names=None, how="any"): :param column_names: The columns to consider, default: all (real, non-virtual) columns :param str how: One of ("any", "all"). - If "any", then drop rows where any of the columns is inf. - If "all", then drop rows where all of the columns is inf. + If "any", then drop rows where any of the columns are inf. + If "all", then drop rows where all of the columns are inf. :rtype: DataFrame """ return self._filter_all(self.func.isinf, column_names, how=how) From 349a939abaf18df7072a647a6735d42f6b678c05 Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Sat, 25 Jun 2022 10:44:18 -0600 Subject: [PATCH 4/5] Fixup: Apply suggestions from review --- tests/dropna_test.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/dropna_test.py b/tests/dropna_test.py index 27620e2834..f3aa2ac167 100644 --- a/tests/dropna_test.py +++ b/tests/dropna_test.py @@ -114,21 +114,21 @@ def test_dropna(): @pytest.fixture def df_with_missings(): - x = [1.1, np.nan, np.nan, 4.4, 5.5] - y = ["dog", "dog", None, "cat", None] - df = vaex.from_arrays(x=x, y=y) + nan = [1.1, np.nan, np.nan, 4.4, 5.5] + na = ['dog', 'dog', None, 'cat', None] + df = vaex.from_arrays(nan=nan, na=na) return df def test_dropna_all_columns(df_with_missings): df = df_with_missings # These two should be equivalent for df_dropped in (df.dropna(), df.dropna(how="any")): - assert df_dropped.x.tolist() == [1.1, 4.4] - assert df_dropped.y.tolist() == ["dog", "cat"] + assert df_dropped.nan.tolist() == [1.1, 4.4] + assert df_dropped.na.tolist() == ['dog', 'cat'] df_dropped = df.dropna(how="all") - assert df_dropped.x.fillna(99).tolist() == [1.1, 99, 4.4, 5.5] - assert df_dropped.y.tolist() == ["dog", "dog", "cat", None] + assert df_dropped.nan.fillna(99).tolist() == [1.1, 99, 4.4, 5.5] + assert df_dropped.na.tolist() == ['dog', 'dog', 'cat', None] with pytest.raises(ValueError): df_dropped = df.dropna(how="invalid") From 900643101c8dd3d5844e928bf58784cf9f59d489 Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Sun, 26 Jun 2022 18:29:40 -0600 Subject: [PATCH 5/5] Use array_factory in dropna tests Now we test against numpy, arrow, and chunked arrow arrays. --- tests/dropna_test.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/dropna_test.py b/tests/dropna_test.py index f3aa2ac167..80c4098836 100644 --- a/tests/dropna_test.py +++ b/tests/dropna_test.py @@ -113,9 +113,11 @@ def test_dropna(): assert (df.o.dropna().tolist() == ["aap", "noot"]) @pytest.fixture -def df_with_missings(): - nan = [1.1, np.nan, np.nan, 4.4, 5.5] - na = ['dog', 'dog', None, 'cat', None] +def df_with_missings(array_factory1, array_factory2): + # Create arrays separately so that the DF might have a mix of + # numpy and arrow arrays. + nan = array_factory1([1.1, np.nan, np.nan, 4.4, 5.5]) + na = array_factory2(['dog', 'dog', None, 'cat', None]) df = vaex.from_arrays(nan=nan, na=na) return df