From e2f86edd3713f19b89ac9ee6ed0567f47c3b43e2 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 27 Nov 2019 10:58:44 +0100 Subject: [PATCH 1/2] Use NA scalar in BooleanArray --- pandas/_libs/missing.pyx | 3 +- pandas/core/arrays/boolean.py | 79 ++++++++++++++++---------- pandas/tests/arrays/test_boolean.py | 58 +++++++++++++++---- pandas/tests/extension/test_boolean.py | 14 ++++- pandas/tests/scalar/test_na_scalar.py | 4 +- 5 files changed, 112 insertions(+), 46 deletions(-) diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index 9bf955ad369e7..30832a8e4daab 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -289,7 +289,8 @@ cdef inline bint is_null_period(v): def _create_binary_propagating_op(name, divmod=False): def method(self, other): - if other is C_NA or isinstance(other, str) or isinstance(other, numbers.Number): + if (other is C_NA or isinstance(other, str) + or isinstance(other, (numbers.Number, np.bool_))): if divmod: return NA, NA else: diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index c118b6fe26549..57c4de9c669e8 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -1,10 +1,10 @@ import numbers -from typing import TYPE_CHECKING, Type +from typing import TYPE_CHECKING, Any, Tuple, Type import warnings import numpy as np -from pandas._libs import lib +from pandas._libs import lib, missing as libmissing from pandas.compat import set_function_name from pandas.core.dtypes.base import ExtensionDtype @@ -61,13 +61,13 @@ class BooleanDtype(ExtensionDtype): @property def na_value(self) -> "Scalar": """ - BooleanDtype uses :attr:`numpy.nan` as the missing NA value. + BooleanDtype uses :attr:`pd.NA` as the missing NA value. .. warning:: `na_value` may change in a future release. """ - return np.nan + return libmissing.NA @property def type(self) -> Type: @@ -223,7 +223,7 @@ class BooleanArray(ExtensionArray, ExtensionOpsMixin): >>> pd.array([True, False, None], dtype="boolean") - [True, False, NaN] + [True, False, NA] Length: 3, dtype: boolean """ @@ -262,17 +262,17 @@ def _from_sequence(cls, scalars, dtype=None, copy: bool = False): values, mask = coerce_to_array(scalars, copy=copy) return BooleanArray(values, mask) + def _values_for_factorize(self) -> Tuple[np.ndarray, Any]: + data = self._data.astype("int8") + data[self._mask] = -1 + return data, -1 + @classmethod def _from_factorized(cls, values, original: "BooleanArray"): return cls._from_sequence(values, dtype=original.dtype) def _formatter(self, boxed=False): - def fmt(x): - if isna(x): - return "NaN" - return str(x) - - return fmt + return str def __getitem__(self, item): if is_integer(item): @@ -281,7 +281,9 @@ def __getitem__(self, item): return self._data[item] return type(self)(self._data[item], self._mask[item]) - def _coerce_to_ndarray(self, force_bool: bool = False): + def _coerce_to_ndarray( + self, force_bool: bool = False, na_value: "Scalar" = lib._no_default + ): """ Coerce to an ndarary of object dtype or bool dtype (if force_bool=True). @@ -290,6 +292,9 @@ def _coerce_to_ndarray(self, force_bool: bool = False): force_bool : bool, default False If True, return bool array or raise error if not possible (in presence of missing values) + na_value : scalar, optional + Scalar missing value indicator to use in numpy array. Defaults + to the native missing value indicator of this array (pd.NA). """ if force_bool: if not self.isna().any(): @@ -298,8 +303,10 @@ def _coerce_to_ndarray(self, force_bool: bool = False): raise ValueError( "cannot convert to bool numpy array in presence of missing values" ) + if na_value is lib._no_default: + na_value = self._na_value data = self._data.astype(object) - data[self._mask] = self._na_value + data[self._mask] = na_value return data __array_priority__ = 1000 # higher than ndarray so ops dispatch to us @@ -483,8 +490,17 @@ def astype(self, dtype, copy=True): return IntegerArray( self._data.astype(dtype.numpy_dtype), self._mask.copy(), copy=False ) + # for integer, error if there are missing values + if is_integer_dtype(dtype): + if self.isna().any(): + raise ValueError("cannot convert NA to integer") + # for float dtype, ensure we use np.nan before casting (numpy cannot + # deal with pd.NA) + na_value = lib._no_default + if is_float_dtype(dtype): + na_value = np.nan # coerce - data = self._coerce_to_ndarray() + data = self._coerce_to_ndarray(na_value=na_value) return astype_nansafe(data, dtype, copy=None) def value_counts(self, dropna=True): @@ -594,8 +610,6 @@ def logical_method(self, other): @classmethod def _create_comparison_method(cls, op): - op_name = op.__name__ - def cmp_method(self, other): if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): @@ -617,21 +631,26 @@ def cmp_method(self, other): if len(self) != len(other): raise ValueError("Lengths must match to compare") - # numpy will show a DeprecationWarning on invalid elementwise - # comparisons, this will raise in the future - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", "elementwise", FutureWarning) - with np.errstate(all="ignore"): - result = op(self._data, other) - - # nans propagate - if mask is None: - mask = self._mask + if other is libmissing.NA: + # numpy does not handle pd.NA well as "other" scalar (it returns + # a scalar False instead of an array) + result = np.zeros_like(self._data) + mask = np.ones_like(self._data) else: - mask = self._mask | mask + # numpy will show a DeprecationWarning on invalid elementwise + # comparisons, this will raise in the future + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", "elementwise", FutureWarning) + with np.errstate(all="ignore"): + result = op(self._data, other) + + # nans propagate + if mask is None: + mask = self._mask.copy() + else: + mask = self._mask | mask - result[mask] = op_name == "ne" - return BooleanArray(result, np.zeros(len(result), dtype=bool), copy=False) + return BooleanArray(result, mask, copy=False) name = "__{name}__".format(name=op.__name__) return set_function_name(cmp_method, name, cls) @@ -643,7 +662,7 @@ def _reduce(self, name, skipna=True, **kwargs): # coerce to a nan-aware float if needed if mask.any(): data = self._data.astype("float64") - data[mask] = self._na_value + data[mask] = np.nan op = getattr(nanops, "nan" + name) result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs) diff --git a/pandas/tests/arrays/test_boolean.py b/pandas/tests/arrays/test_boolean.py index 5cfc7c3837875..1c7303598aaf7 100644 --- a/pandas/tests/arrays/test_boolean.py +++ b/pandas/tests/arrays/test_boolean.py @@ -216,7 +216,7 @@ def test_coerce_to_numpy_array(): # with missing values -> object dtype arr = pd.array([True, False, None], dtype="boolean") result = np.array(arr) - expected = np.array([True, False, None], dtype="object") + expected = np.array([True, False, pd.NA], dtype="object") tm.assert_numpy_array_equal(result, expected) # also with no missing values -> object dtype @@ -238,12 +238,11 @@ def test_coerce_to_numpy_array(): def test_astype(): # with missing values arr = pd.array([True, False, None], dtype="boolean") - msg = "cannot convert float NaN to" - with pytest.raises(ValueError, match=msg): + with pytest.raises(ValueError, match="cannot convert NA to integer"): arr.astype("int64") - with pytest.raises(ValueError, match=msg): + with pytest.raises(ValueError, match="cannot convert float NaN to"): arr.astype("bool") result = arr.astype("float64") @@ -406,9 +405,8 @@ def _compare_other(self, data, op_name, other): # array result = pd.Series(op(data, other)) expected = pd.Series(op(data._data, other), dtype="boolean") - - # fill the nan locations - expected[data._mask] = op_name == "__ne__" + # propagate NAs + expected[data._mask] = pd.NA tm.assert_series_equal(result, expected) @@ -419,9 +417,8 @@ def _compare_other(self, data, op_name, other): expected = pd.Series(data._data) expected = op(expected, other) expected = expected.astype("boolean") - - # fill the nan locations - expected[data._mask] = op_name == "__ne__" + # propagate NAs + expected[data._mask] = pd.NA tm.assert_series_equal(result, expected) @@ -438,6 +435,47 @@ def test_compare_array(self, data, all_compare_operators): other = pd.Series([True] * len(data)) self._compare_other(data, op_name, other) + @pytest.mark.parametrize("other", [True, False, pd.NA]) + def test_scalar(self, other, all_compare_operators): + op = self.get_op_from_name(all_compare_operators) + a = pd.array([True, False, None], dtype="boolean") + + result = op(a, other) + + if other is pd.NA: + expected = pd.array([None, None, None], dtype="boolean") + else: + values = op(a._data, other) + expected = BooleanArray(values, a._mask, copy=True) + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + result[0] = None + tm.assert_extension_array_equal( + a, pd.array([True, False, None], dtype="boolean") + ) + + def test_array(self, all_compare_operators): + op = self.get_op_from_name(all_compare_operators) + a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + b = pd.array([True, False, None] * 3, dtype="boolean") + + result = op(a, b) + + values = op(a._data, b._data) + mask = a._mask | b._mask + expected = BooleanArray(values, mask) + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + result[0] = None + tm.assert_extension_array_equal( + a, pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + ) + tm.assert_extension_array_equal( + b, pd.array([True, False, None] * 3, dtype="boolean") + ) + class TestArithmeticOps(BaseOpsUtil): def test_error(self, data, all_arithmetic_operators): diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index 089dd798b2512..a02433da2da12 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -60,13 +60,13 @@ def data_missing_for_sorting(dtype): @pytest.fixture def na_cmp(): - # we are np.nan - return lambda x, y: np.isnan(x) and np.isnan(y) + # we are pd.NA + return lambda x, y: x is pd.NA and y is pd.NA @pytest.fixture def na_value(): - return np.nan + return pd.NA @pytest.fixture @@ -160,6 +160,14 @@ def check_opname(self, s, op_name, other, exc=None): def _compare_other(self, s, data, op_name, other): self.check_opname(s, op_name, other) + @pytest.mark.skip(reason="Tested in tests/arrays/test_boolean.py") + def test_compare_scalar(self, data, all_compare_operators): + pass + + @pytest.mark.skip(reason="Tested in tests/arrays/test_boolean.py") + def test_compare_array(self, data, all_compare_operators): + pass + class TestReshaping(base.BaseReshapingTests): pass diff --git a/pandas/tests/scalar/test_na_scalar.py b/pandas/tests/scalar/test_na_scalar.py index e68e49814245f..586433698a587 100644 --- a/pandas/tests/scalar/test_na_scalar.py +++ b/pandas/tests/scalar/test_na_scalar.py @@ -48,7 +48,7 @@ def test_arithmetic_ops(all_arithmetic_functions): def test_comparison_ops(): - for other in [NA, 1, 1.0, "a", np.int64(1), np.nan]: + for other in [NA, 1, 1.0, "a", np.int64(1), np.nan, np.bool_(True)]: assert (NA == other) is NA assert (NA != other) is NA assert (NA > other) is NA @@ -56,7 +56,7 @@ def test_comparison_ops(): assert (NA < other) is NA assert (NA <= other) is NA - if isinstance(other, np.int64): + if isinstance(other, (np.int64, np.bool_)): # for numpy scalars we get a deprecation warning and False as result # for equality or error for larger/lesser than continue From d083c88db6cb50ced1d81e12dd510164b8101056 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 4 Dec 2019 08:28:41 +0100 Subject: [PATCH 2/2] updates --- pandas/core/arrays/boolean.py | 30 ++++++++++------------------- pandas/tests/arrays/test_boolean.py | 19 +++++++++++++----- 2 files changed, 24 insertions(+), 25 deletions(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 57c4de9c669e8..aec3397bddd16 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -61,7 +61,7 @@ class BooleanDtype(ExtensionDtype): @property def na_value(self) -> "Scalar": """ - BooleanDtype uses :attr:`pd.NA` as the missing NA value. + BooleanDtype uses :attr:`pandas.NA` as the missing NA value. .. warning:: @@ -281,31 +281,28 @@ def __getitem__(self, item): return self._data[item] return type(self)(self._data[item], self._mask[item]) - def _coerce_to_ndarray( - self, force_bool: bool = False, na_value: "Scalar" = lib._no_default - ): + def _coerce_to_ndarray(self, dtype=None, na_value: "Scalar" = libmissing.NA): """ Coerce to an ndarary of object dtype or bool dtype (if force_bool=True). Parameters ---------- - force_bool : bool, default False - If True, return bool array or raise error if not possible (in - presence of missing values) + dtype : dtype, default object + The numpy dtype to convert to na_value : scalar, optional Scalar missing value indicator to use in numpy array. Defaults to the native missing value indicator of this array (pd.NA). """ - if force_bool: + if dtype is None: + dtype = object + if is_bool_dtype(dtype): if not self.isna().any(): return self._data else: raise ValueError( "cannot convert to bool numpy array in presence of missing values" ) - if na_value is lib._no_default: - na_value = self._na_value - data = self._data.astype(object) + data = self._data.astype(dtype) data[self._mask] = na_value return data @@ -316,15 +313,8 @@ def __array__(self, dtype=None): the array interface, return my values We return an object array here to preserve our scalar values """ - if dtype is not None: - if is_bool_dtype(dtype): - return self._coerce_to_ndarray(force_bool=True) - # TODO can optimize this to not go through object dtype for - # numeric dtypes - arr = self._coerce_to_ndarray() - return arr.astype(dtype, copy=False) # by default (no dtype specified), return an object array - return self._coerce_to_ndarray() + return self._coerce_to_ndarray(dtype=dtype) def __arrow_array__(self, type=None): """ @@ -496,7 +486,7 @@ def astype(self, dtype, copy=True): raise ValueError("cannot convert NA to integer") # for float dtype, ensure we use np.nan before casting (numpy cannot # deal with pd.NA) - na_value = lib._no_default + na_value = self._na_value if is_float_dtype(dtype): na_value = np.nan # coerce diff --git a/pandas/tests/arrays/test_boolean.py b/pandas/tests/arrays/test_boolean.py index 1c7303598aaf7..d9cbf3f5b4172 100644 --- a/pandas/tests/arrays/test_boolean.py +++ b/pandas/tests/arrays/test_boolean.py @@ -101,13 +101,14 @@ def test_to_boolean_array_all_none(): @pytest.mark.parametrize( "a, b", [ - ([True, None], [True, np.nan]), - ([None], [np.nan]), - ([None, np.nan], [np.nan, np.nan]), - ([np.nan, np.nan], [np.nan, np.nan]), + ([True, False, None, np.nan, pd.NA], [True, False, None, None, None]), + ([True, np.nan], [True, None]), + ([True, pd.NA], [True, None]), + ([np.nan, np.nan], [None, None]), + (np.array([np.nan, np.nan], dtype=float), [None, None]), ], ) -def test_to_boolean_array_none_is_nan(a, b): +def test_to_boolean_array_missing_indicators(a, b): result = pd.array(a, dtype="boolean") expected = pd.array(b, dtype="boolean") tm.assert_extension_array_equal(result, expected) @@ -279,6 +280,14 @@ def test_astype_to_integer_array(): tm.assert_extension_array_equal(result, expected) +@pytest.mark.parametrize("na", [None, np.nan, pd.NA]) +def test_setitem_missing_values(na): + arr = pd.array([True, False, None], dtype="boolean") + expected = pd.array([True, None, None], dtype="boolean") + arr[1] = na + tm.assert_extension_array_equal(arr, expected) + + @pytest.mark.parametrize( "ufunc", [np.add, np.logical_or, np.logical_and, np.logical_xor] )