diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py
index c2ce799c64aac..c065fdeba2177 100644
--- a/pandas/core/arrays/boolean.py
+++ b/pandas/core/arrays/boolean.py
@@ -15,13 +15,10 @@
     is_extension_array_dtype,
     is_float,
     is_float_dtype,
-    is_integer,
     is_integer_dtype,
     is_list_like,
     is_numeric_dtype,
-    is_object_dtype,
     is_scalar,
-    is_string_dtype,
     pandas_dtype,
 )
 from pandas.core.dtypes.dtypes import register_extension_dtype
@@ -29,10 +26,8 @@
 from pandas.core.dtypes.missing import isna, notna

 from pandas.core import nanops, ops
-from pandas.core.algorithms import take
-from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
-import pandas.core.common as com
-from pandas.core.indexers import check_bool_array_indexer
+
+from .masked import BaseMaskedArray

 if TYPE_CHECKING:
     from pandas._typing import Scalar
@@ -199,7 +194,7 @@ def coerce_to_array(values, mask=None, copy: bool = False):
     return values, mask


-class BooleanArray(ExtensionArray, ExtensionOpsMixin):
+class BooleanArray(BaseMaskedArray):
     """
     Array of boolean (True/False) data with missing values.

@@ -253,6 +248,9 @@ class BooleanArray(ExtensionArray, ExtensionOpsMixin):
     Length: 3, dtype: boolean
     """

+    # The value used to fill '_data' to avoid upcasting
+    _internal_fill_value = False
+
     def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False):
         if not (isinstance(values, np.ndarray) and values.dtype == np.bool_):
             raise TypeError(
@@ -297,127 +295,6 @@ def _values_for_factorize(self) -> Tuple[np.ndarray, Any]:
     def _from_factorized(cls, values, original: "BooleanArray"):
         return cls._from_sequence(values, dtype=original.dtype)

-    def _formatter(self, boxed=False):
-        return str
-
-    @property
-    def _hasna(self) -> bool:
-        # Note: this is expensive right now! The hope is that we can
-        # make this faster by having an optional mask, but not have to change
-        # source code using it..
-        return self._mask.any()
-
-    def __getitem__(self, item):
-        if is_integer(item):
-            if self._mask[item]:
-                return self.dtype.na_value
-            return self._data[item]
-
-        elif com.is_bool_indexer(item):
-            item = check_bool_array_indexer(self, item)
-
-        return type(self)(self._data[item], self._mask[item])
-
-    def to_numpy(
-        self, dtype=None, copy=False, na_value: "Scalar" = lib.no_default,
-    ):
-        """
-        Convert to a NumPy Array.
-
-        By default converts to an object-dtype NumPy array. Specify the `dtype` and
-        `na_value` keywords to customize the conversion.
-
-        Parameters
-        ----------
-        dtype : dtype, default object
-            The numpy dtype to convert to.
-        copy : bool, default False
-            Whether to ensure that the returned value is a not a view on
-            the array. Note that ``copy=False`` does not *ensure* that
-            ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensure that
-            a copy is made, even if not strictly necessary. This is typically
-            only possible when no missing values are present and `dtype`
-            is a boolean dtype.
-        na_value : scalar, optional
-            Scalar missing value indicator to use in numpy array. Defaults
-            to the native missing value indicator of this array (pd.NA).
-
-        Returns
-        -------
-        numpy.ndarray
-
-        Examples
-        --------
-        An object-dtype is the default result
-
-        >>> a = pd.array([True, False], dtype="boolean")
-        >>> a.to_numpy()
-        array([True, False], dtype=object)
-
-        When no missing values are present, a boolean dtype can be used.
-
-        >>> a.to_numpy(dtype="bool")
-        array([ True, False])
-
-        However, requesting a bool dtype will raise a ValueError if
-        missing values are present and the default missing value :attr:`NA`
-        is used.
-
-        >>> a = pd.array([True, False, pd.NA], dtype="boolean")
-        >>> a
-        <BooleanArray>
-        [True, False, <NA>]
-        Length: 3, dtype: boolean
-
-        >>> a.to_numpy(dtype="bool")
-        Traceback (most recent call last):
-        ...
-        ValueError: cannot convert to bool numpy array in presence of missing values
-
-        Specify a valid `na_value` instead
-
-        >>> a.to_numpy(dtype="bool", na_value=False)
-        array([ True, False, False])
-        """
-        if na_value is lib.no_default:
-            na_value = libmissing.NA
-        if dtype is None:
-            dtype = object
-        if self._hasna:
-            if (
-                not (is_object_dtype(dtype) or is_string_dtype(dtype))
-                and na_value is libmissing.NA
-            ):
-                raise ValueError(
-                    f"cannot convert to '{dtype}'-dtype NumPy array "
-                    "with missing values. Specify an appropriate 'na_value' "
-                    "for this dtype."
-                )
-            # don't pass copy to astype -> always need a copy since we are mutating
-            data = self._data.astype(dtype)
-            data[self._mask] = na_value
-        else:
-            data = self._data.astype(dtype, copy=copy)
-        return data
-
-    __array_priority__ = 1000  # higher than ndarray so ops dispatch to us
-
-    def __array__(self, dtype=None):
-        """
-        the array interface, return my values
-        We return an object array here to preserve our scalar values
-        """
-        # by default (no dtype specified), return an object array
-        return self.to_numpy(dtype=dtype)
-
-    def __arrow_array__(self, type=None):
-        """
-        Convert myself into a pyarrow Array.
-        """
-        import pyarrow as pa
-
-        return pa.array(self._data, mask=self._mask, type=type)
-
     _HANDLED_TYPES = (np.ndarray, numbers.Number, bool, np.bool_)

     def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
@@ -465,40 +342,6 @@ def reconstruct(x):
         else:
             return reconstruct(result)

-    def __iter__(self):
-        for i in range(len(self)):
-            if self._mask[i]:
-                yield self.dtype.na_value
-            else:
-                yield self._data[i]
-
-    def take(self, indexer, allow_fill=False, fill_value=None):
-        # we always fill with False internally
-        # to avoid upcasting
-        data_fill_value = False if isna(fill_value) else fill_value
-        result = take(
-            self._data, indexer, fill_value=data_fill_value, allow_fill=allow_fill
-        )
-
-        mask = take(self._mask, indexer, fill_value=True, allow_fill=allow_fill)
-
-        # if we are filling
-        # we only fill where the indexer is null
-        # not existing missing values
-        # TODO(jreback) what if we have a non-na float as a fill value?
-        if allow_fill and notna(fill_value):
-            fill_mask = np.asarray(indexer) == -1
-            result[fill_mask] = fill_value
-            mask = mask ^ fill_mask
-
-        return type(self)(result, mask, copy=False)
-
-    def copy(self):
-        data, mask = self._data, self._mask
-        data = data.copy()
-        mask = mask.copy()
-        return type(self)(data, mask, copy=False)
-
     def __setitem__(self, key, value):
         _is_scalar = is_scalar(value)
         if _is_scalar:
@@ -512,26 +355,6 @@ def __setitem__(self, key, value):
         self._data[key] = value
         self._mask[key] = mask

-    def __len__(self):
-        return len(self._data)
-
-    @property
-    def nbytes(self):
-        return self._data.nbytes + self._mask.nbytes
-
-    def isna(self):
-        return self._mask
-
-    @property
-    def _na_value(self):
-        return self._dtype.na_value
-
-    @classmethod
-    def _concat_same_type(cls, to_concat):
-        data = np.concatenate([x._data for x in to_concat])
-        mask = np.concatenate([x._mask for x in to_concat])
-        return cls(data, mask)
-
     def astype(self, dtype, copy=True):
         """
         Cast to a NumPy array or ExtensionArray with 'dtype'.
diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
index d63692c5ba972..91b334a6654e3 100644
--- a/pandas/core/arrays/integer.py
+++ b/pandas/core/arrays/integer.py
@@ -5,7 +5,6 @@
 import numpy as np

 from pandas._libs import lib, missing as libmissing
-from pandas._typing import Scalar
 from pandas.compat import set_function_name
 from pandas.util._decorators import cache_readonly

@@ -20,20 +19,17 @@
     is_list_like,
     is_object_dtype,
     is_scalar,
-    is_string_dtype,
 )
 from pandas.core.dtypes.dtypes import register_extension_dtype
 from pandas.core.dtypes.missing import isna, notna

 from pandas.core import nanops, ops
-from pandas.core.algorithms import take
-from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
-import pandas.core.common as com
-from pandas.core.indexers import check_bool_array_indexer
 from pandas.core.ops import invalid_comparison
 from pandas.core.ops.common import unpack_zerodim_and_defer
 from pandas.core.tools.numeric import to_numeric

+from .masked import BaseMaskedArray
+

 class _IntegerDtype(ExtensionDtype):
     """
@@ -261,7 +257,7 @@ def coerce_to_array(values, dtype, mask=None, copy=False):
     return values, mask


-class IntegerArray(ExtensionArray, ExtensionOpsMixin):
+class IntegerArray(BaseMaskedArray):
     """
     Array of integer (optional missing) values.

@@ -331,6 +327,9 @@ class IntegerArray(ExtensionArray, ExtensionOpsMixin):
     Length: 3, dtype: UInt16
     """

+    # The value used to fill '_data' to avoid upcasting
+    _internal_fill_value = 1
+
     @cache_readonly
     def dtype(self):
         return _dtypes[str(self._data.dtype)]
@@ -367,65 +366,6 @@ def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
     def _from_factorized(cls, values, original):
         return integer_array(values, dtype=original.dtype)

-    def __getitem__(self, item):
-        if is_integer(item):
-            if self._mask[item]:
-                return self.dtype.na_value
-            return self._data[item]
-
-        elif com.is_bool_indexer(item):
-            item = check_bool_array_indexer(self, item)
-
-        return type(self)(self._data[item], self._mask[item])
-
-    @property
-    def _hasna(self) -> bool:
-        # Note: this is expensive right now! The hope is that we can
-        # make this faster by having an optional mask, but not have to change
-        # source code using it..
-        return self._mask.any()
-
-    def to_numpy(
-        self, dtype=None, copy=False, na_value: "Scalar" = lib.no_default,
-    ):
-        if na_value is lib.no_default:
-            na_value = libmissing.NA
-        if dtype is None:
-            dtype = object
-        if self._hasna:
-            if (
-                not (is_object_dtype(dtype) or is_string_dtype(dtype))
-                and na_value is libmissing.NA
-            ):
-                raise ValueError(
-                    f"cannot convert to '{dtype}'-dtype NumPy array "
-                    "with missing values. Specify an appropriate 'na_value' "
-                    "for this dtype."
-                )
-            # don't pass copy to astype -> always need a copy since we are mutating
-            data = self._data.astype(dtype)
-            data[self._mask] = na_value
-        else:
-            data = self._data.astype(dtype, copy=copy)
-        return data
-
-    __array_priority__ = 1000  # higher than ndarray so ops dispatch to us
-
-    def __array__(self, dtype=None):
-        """
-        the array interface, return my values
-        We return an object array here to preserve our scalar values
-        """
-        return self.to_numpy(dtype=dtype)
-
-    def __arrow_array__(self, type=None):
-        """
-        Convert myself into a pyarrow Array.
-        """
-        import pyarrow as pa
-
-        return pa.array(self._data, mask=self._mask, type=type)
-
     _HANDLED_TYPES = (np.ndarray, numbers.Number)

     def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
@@ -473,40 +413,6 @@ def reconstruct(x):
         else:
             return reconstruct(result)

-    def __iter__(self):
-        for i in range(len(self)):
-            if self._mask[i]:
-                yield self.dtype.na_value
-            else:
-                yield self._data[i]
-
-    def take(self, indexer, allow_fill=False, fill_value=None):
-        # we always fill with 1 internally
-        # to avoid upcasting
-        data_fill_value = 1 if isna(fill_value) else fill_value
-        result = take(
-            self._data, indexer, fill_value=data_fill_value, allow_fill=allow_fill
-        )
-
-        mask = take(self._mask, indexer, fill_value=True, allow_fill=allow_fill)
-
-        # if we are filling
-        # we only fill where the indexer is null
-        # not existing missing values
-        # TODO(jreback) what if we have a non-na float as a fill value?
-        if allow_fill and notna(fill_value):
-            fill_mask = np.asarray(indexer) == -1
-            result[fill_mask] = fill_value
-            mask = mask ^ fill_mask
-
-        return type(self)(result, mask, copy=False)
-
-    def copy(self):
-        data, mask = self._data, self._mask
-        data = data.copy()
-        mask = mask.copy()
-        return type(self)(data, mask, copy=False)
-
     def __setitem__(self, key, value):
         _is_scalar = is_scalar(value)
         if _is_scalar:
@@ -520,26 +426,6 @@ def __setitem__(self, key, value):
         self._data[key] = value
         self._mask[key] = mask

-    def __len__(self) -> int:
-        return len(self._data)
-
-    @property
-    def nbytes(self):
-        return self._data.nbytes + self._mask.nbytes
-
-    def isna(self):
-        return self._mask
-
-    @property
-    def _na_value(self):
-        return self.dtype.na_value
-
-    @classmethod
-    def _concat_same_type(cls, to_concat):
-        data = np.concatenate([x._data for x in to_concat])
-        mask = np.concatenate([x._mask for x in to_concat])
-        return cls(data, mask)
-
     def astype(self, dtype, copy=True):
         """
         Cast to a NumPy array or IntegerArray with 'dtype'.
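Aside (not part of the patch): a deliberately simplified, hypothetical sketch of the pattern the two file diffs above set up and the new masked.py below implements. Each concrete array class only declares `_internal_fill_value` (False for booleans, 1 for integers); the fill-and-mask logic lives once on the shared base. `ToyMasked`, `ToyBooleanArray` and `ToyIntegerArray` are illustrative names, not pandas classes.

# Toy illustration only -- not pandas code. A subclass supplies just the value
# used to fill the *data* buffer; the shared take() marks filled positions in
# the mask, so the placeholder value is never visible to users.
import numpy as np


class ToyMasked:
    """Toy stand-in for BaseMaskedArray: a data buffer plus a boolean missing-mask."""

    _internal_fill_value = None  # each subclass overrides this

    def __init__(self, data, mask):
        self._data = np.asarray(data)
        self._mask = np.asarray(mask, dtype=bool)

    def take(self, indexer):
        # -1 means "fill with missing", as in ExtensionArray.take(allow_fill=True).
        indexer = np.asarray(indexer)
        fill = indexer == -1
        # Fill the data buffer with a value of the same dtype so it never upcasts;
        # the mask, not the placeholder value, records that the slot is missing.
        result = np.where(fill, self._internal_fill_value, self._data[indexer])
        mask = self._mask[indexer] | fill
        return type(self)(result, mask)


class ToyBooleanArray(ToyMasked):
    _internal_fill_value = False  # keeps _data np.bool_


class ToyIntegerArray(ToyMasked):
    _internal_fill_value = 1  # keeps _data integer instead of upcasting to float


arr = ToyIntegerArray([10, 20, 30], [False, False, False])
out = arr.take([0, -1, 2])
print(out._data, out._mask)  # [10  1 30] [False  True False]

In the real implementation below, the internal fill value is only used when the caller did not pass a `fill_value` of their own; see `BaseMaskedArray.take` in the new file.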
diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
new file mode 100644
index 0000000000000..6fd9f1efbb408
--- /dev/null
+++ b/pandas/core/arrays/masked.py
@@ -0,0 +1,203 @@
+from typing import TYPE_CHECKING
+
+import numpy as np
+
+from pandas._libs import lib, missing as libmissing
+
+from pandas.core.dtypes.common import is_integer, is_object_dtype, is_string_dtype
+from pandas.core.dtypes.missing import isna, notna
+
+from pandas.core.algorithms import take
+from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
+import pandas.core.common as com
+from pandas.core.indexers import check_bool_array_indexer
+
+if TYPE_CHECKING:
+    from pandas._typing import Scalar
+
+
+class BaseMaskedArray(ExtensionArray, ExtensionOpsMixin):
+    """
+    Base class for masked arrays (which use _data and _mask to store the data).
+
+    numpy based
+    """
+
+    _data: np.ndarray
+    _mask: np.ndarray
+
+    # The value used to fill '_data' to avoid upcasting
+    _internal_fill_value: "Scalar"
+
+    def __getitem__(self, item):
+        if is_integer(item):
+            if self._mask[item]:
+                return self.dtype.na_value
+            return self._data[item]
+
+        elif com.is_bool_indexer(item):
+            item = check_bool_array_indexer(self, item)
+
+        return type(self)(self._data[item], self._mask[item])
+
+    def __iter__(self):
+        for i in range(len(self)):
+            if self._mask[i]:
+                yield self.dtype.na_value
+            else:
+                yield self._data[i]
+
+    def __len__(self) -> int:
+        return len(self._data)
+
+    def to_numpy(
+        self, dtype=None, copy=False, na_value: "Scalar" = lib.no_default,
+    ):
+        """
+        Convert to a NumPy Array.
+
+        By default converts to an object-dtype NumPy array. Specify the `dtype` and
+        `na_value` keywords to customize the conversion.
+
+        Parameters
+        ----------
+        dtype : dtype, default object
+            The numpy dtype to convert to.
+        copy : bool, default False
+            Whether to ensure that the returned value is not a view on
+            the array. Note that ``copy=False`` does not *ensure* that
+            ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
+            a copy is made, even if not strictly necessary. This is typically
+            only possible when no missing values are present and `dtype`
+            is the equivalent numpy dtype.
+        na_value : scalar, optional
+            Scalar missing value indicator to use in numpy array. Defaults
+            to the native missing value indicator of this array (pd.NA).
+
+        Returns
+        -------
+        numpy.ndarray
+
+        Examples
+        --------
+        An object-dtype is the default result
+
+        >>> a = pd.array([True, False, pd.NA], dtype="boolean")
+        >>> a.to_numpy()
+        array([True, False, <NA>], dtype=object)
+
+        When no missing values are present, an equivalent dtype can be used.
+
+        >>> pd.array([True, False], dtype="boolean").to_numpy(dtype="bool")
+        array([ True, False])
+        >>> pd.array([1, 2], dtype="Int64").to_numpy("int64")
+        array([1, 2])
+
+        However, requesting such a dtype will raise a ValueError if
+        missing values are present and the default missing value :attr:`NA`
+        is used.
+
+        >>> a = pd.array([True, False, pd.NA], dtype="boolean")
+        >>> a
+        <BooleanArray>
+        [True, False, <NA>]
+        Length: 3, dtype: boolean
+
+        >>> a.to_numpy(dtype="bool")
+        Traceback (most recent call last):
+        ...
+        ValueError: cannot convert to 'bool'-dtype NumPy array with missing values. Specify an appropriate 'na_value' for this dtype.
+
+        Specify a valid `na_value` instead
+
+        >>> a.to_numpy(dtype="bool", na_value=False)
+        array([ True, False, False])
+        """
+        if na_value is lib.no_default:
+            na_value = libmissing.NA
+        if dtype is None:
+            dtype = object
+        if self._hasna:
+            if (
+                not (is_object_dtype(dtype) or is_string_dtype(dtype))
+                and na_value is libmissing.NA
+            ):
+                raise ValueError(
+                    f"cannot convert to '{dtype}'-dtype NumPy array "
+                    "with missing values. Specify an appropriate 'na_value' "
+                    "for this dtype."
+                )
+            # don't pass copy to astype -> always need a copy since we are mutating
+            data = self._data.astype(dtype)
+            data[self._mask] = na_value
+        else:
+            data = self._data.astype(dtype, copy=copy)
+        return data
+
+    __array_priority__ = 1000  # higher than ndarray so ops dispatch to us
+
+    def __array__(self, dtype=None):
+        """
+        the array interface, return my values
+        We return an object array here to preserve our scalar values
+        """
+        return self.to_numpy(dtype=dtype)
+
+    def __arrow_array__(self, type=None):
+        """
+        Convert myself into a pyarrow Array.
+        """
+        import pyarrow as pa
+
+        return pa.array(self._data, mask=self._mask, type=type)
+
+    @property
+    def _hasna(self) -> bool:
+        # Note: this is expensive right now! The hope is that we can
+        # make this faster by having an optional mask, but not have to change
+        # source code using it..
+        return self._mask.any()
+
+    def isna(self):
+        return self._mask
+
+    @property
+    def _na_value(self):
+        return self.dtype.na_value
+
+    @property
+    def nbytes(self):
+        return self._data.nbytes + self._mask.nbytes
+
+    @classmethod
+    def _concat_same_type(cls, to_concat):
+        data = np.concatenate([x._data for x in to_concat])
+        mask = np.concatenate([x._mask for x in to_concat])
+        return cls(data, mask)
+
+    def take(self, indexer, allow_fill=False, fill_value=None):
+        # we always fill with the class-specific '_internal_fill_value' internally
+        # to avoid upcasting
+        data_fill_value = self._internal_fill_value if isna(fill_value) else fill_value
+        result = take(
+            self._data, indexer, fill_value=data_fill_value, allow_fill=allow_fill
+        )
+
+        mask = take(self._mask, indexer, fill_value=True, allow_fill=allow_fill)
+
+        # if we are filling
+        # we only fill where the indexer is null
+        # not existing missing values
+        # TODO(jreback) what if we have a non-na float as a fill value?
+        if allow_fill and notna(fill_value):
+            fill_mask = np.asarray(indexer) == -1
+            result[fill_mask] = fill_value
+            mask = mask ^ fill_mask
+
+        return type(self)(result, mask, copy=False)
+
+    def copy(self):
+        data, mask = self._data, self._mask
+        data = data.copy()
+        mask = mask.copy()
+        return type(self)(data, mask, copy=False)
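Not part of the patch: a small usage sketch, assuming a pandas build that includes this refactor (1.0-era or later), exercising the behaviour that BooleanArray and IntegerArray now inherit from BaseMaskedArray instead of duplicating -- `to_numpy` with an explicit `na_value`, `take` with `allow_fill`, and `copy`. Exact reprs may differ slightly between versions.

# Illustrative only -- not part of the patch. Assumes a pandas version that
# ships BaseMaskedArray (pandas >= 1.0).
import numpy as np
import pandas as pd

bool_arr = pd.array([True, False, pd.NA], dtype="boolean")  # BooleanArray
int_arr = pd.array([1, 2, None], dtype="Int64")             # IntegerArray

# to_numpy is now defined once on BaseMaskedArray: object dtype by default,
# or a concrete NumPy dtype when an explicit na_value is supplied.
print(bool_arr.to_numpy())                                  # object array, NA preserved
print(bool_arr.to_numpy(dtype="bool", na_value=False))      # [ True False False]
print(int_arr.to_numpy(dtype="float64", na_value=np.nan))   # [ 1.  2. nan]

# take is shared too: -1 positions are filled internally with the per-class
# _internal_fill_value (False / 1) and reported as missing via the mask.
print(bool_arr.take([0, -1], allow_fill=True))              # True and NA
print(int_arr.take([1, -1], allow_fill=True))               # 2 and NA

# copy returns a new array backed by copies of both _data and _mask.
copied = int_arr.copy()
copied[0] = 10
print(int_arr[0], copied[0])                                # 1 10

The point of `_internal_fill_value` is only to keep the underlying `_data` buffer from upcasting while filling; the mask is what actually records which positions are missing.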