Skip to content

Commit

Permalink
REF: Implement BaseMaskedArray class for integer/boolean ExtensionArr…
Browse files Browse the repository at this point in the history
…ays (#30789)
  • Loading branch information
jorisvandenbossche authored and jreback committed Jan 9, 2020
1 parent a73ce98 commit 89bc0aa
Show file tree
Hide file tree
Showing 3 changed files with 215 additions and 303 deletions.
189 changes: 6 additions & 183 deletions pandas/core/arrays/boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,24 +15,19 @@
is_extension_array_dtype,
is_float,
is_float_dtype,
is_integer,
is_integer_dtype,
is_list_like,
is_numeric_dtype,
is_object_dtype,
is_scalar,
is_string_dtype,
pandas_dtype,
)
from pandas.core.dtypes.dtypes import register_extension_dtype
from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
from pandas.core.dtypes.missing import isna, notna

from pandas.core import nanops, ops
from pandas.core.algorithms import take
from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
import pandas.core.common as com
from pandas.core.indexers import check_bool_array_indexer

from .masked import BaseMaskedArray

if TYPE_CHECKING:
from pandas._typing import Scalar
Expand Down Expand Up @@ -199,7 +194,7 @@ def coerce_to_array(values, mask=None, copy: bool = False):
return values, mask


class BooleanArray(ExtensionArray, ExtensionOpsMixin):
class BooleanArray(BaseMaskedArray):
"""
Array of boolean (True/False) data with missing values.
Expand Down Expand Up @@ -253,6 +248,9 @@ class BooleanArray(ExtensionArray, ExtensionOpsMixin):
Length: 3, dtype: boolean
"""

# The value used to fill '_data' to avoid upcasting
_internal_fill_value = False

def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False):
if not (isinstance(values, np.ndarray) and values.dtype == np.bool_):
raise TypeError(
Expand Down Expand Up @@ -297,127 +295,6 @@ def _values_for_factorize(self) -> Tuple[np.ndarray, Any]:
def _from_factorized(cls, values, original: "BooleanArray"):
return cls._from_sequence(values, dtype=original.dtype)

def _formatter(self, boxed=False):
return str

@property
def _hasna(self) -> bool:
# Note: this is expensive right now! The hope is that we can
# make this faster by having an optional mask, but not have to change
# source code using it..
return self._mask.any()

def __getitem__(self, item):
if is_integer(item):
if self._mask[item]:
return self.dtype.na_value
return self._data[item]

elif com.is_bool_indexer(item):
item = check_bool_array_indexer(self, item)

return type(self)(self._data[item], self._mask[item])

def to_numpy(
    self, dtype=None, copy=False, na_value: "Scalar" = lib.no_default,
):
    """
    Convert to a NumPy Array.

    By default converts to an object-dtype NumPy array. Specify the `dtype` and
    `na_value` keywords to customize the conversion.

    Parameters
    ----------
    dtype : dtype, default object
        The numpy dtype to convert to.
    copy : bool, default False
        Whether to ensure that the returned value is not a view on
        the array. Note that ``copy=False`` does not *ensure* that
        ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
        a copy is made, even if not strictly necessary. A no-copy result
        is typically only possible when no missing values are present
        and `dtype` is a boolean dtype.
    na_value : scalar, optional
        Scalar missing value indicator to use in numpy array. Defaults
        to the native missing value indicator of this array (pd.NA).

    Returns
    -------
    numpy.ndarray

    Raises
    ------
    ValueError
        If missing values are present, `dtype` is neither an object nor a
        string dtype, and `na_value` is pd.NA (the default).

    Examples
    --------
    An object-dtype is the default result

    >>> a = pd.array([True, False], dtype="boolean")
    >>> a.to_numpy()
    array([True, False], dtype=object)

    When no missing values are present, a boolean dtype can be used.

    >>> a.to_numpy(dtype="bool")
    array([ True, False])

    However, requesting a bool dtype will raise a ValueError if
    missing values are present and the default missing value :attr:`NA`
    is used.

    >>> a = pd.array([True, False, pd.NA], dtype="boolean")
    >>> a
    <BooleanArray>
    [True, False, NA]
    Length: 3, dtype: boolean

    >>> a.to_numpy(dtype="bool")
    Traceback (most recent call last):
    ...
    ValueError: cannot convert to 'bool'-dtype NumPy array with missing values.
    Specify an appropriate 'na_value' for this dtype.

    Specify a valid `na_value` instead

    >>> a.to_numpy(dtype="bool", na_value=False)
    array([ True, False, False])
    """
    if na_value is lib.no_default:
        na_value = libmissing.NA
    if dtype is None:
        dtype = object
    if self._hasna:
        # pd.NA can only be stored in an object or string array, so any other
        # target dtype needs an explicit replacement value.
        if (
            not (is_object_dtype(dtype) or is_string_dtype(dtype))
            and na_value is libmissing.NA
        ):
            raise ValueError(
                f"cannot convert to '{dtype}'-dtype NumPy array "
                "with missing values. Specify an appropriate 'na_value' "
                "for this dtype."
            )
        # don't pass copy to astype -> always need a copy since we are mutating
        data = self._data.astype(dtype)
        data[self._mask] = na_value
    else:
        data = self._data.astype(dtype, copy=copy)
    return data

__array_priority__ = 1000 # higher than ndarray so ops dispatch to us

def __array__(self, dtype=None):
"""
the array interface, return my values
We return an object array here to preserve our scalar values
"""
# by default (no dtype specified), return an object array
return self.to_numpy(dtype=dtype)

def __arrow_array__(self, type=None):
    """
    Build a pyarrow Array from this array.

    The data buffer supplies the values and the mask buffer is passed as
    pyarrow's ``mask``; `type` is forwarded unchanged.
    """
    # Imported here rather than at module level, as in the original code.
    import pyarrow

    return pyarrow.array(self._data, mask=self._mask, type=type)

_HANDLED_TYPES = (np.ndarray, numbers.Number, bool, np.bool_)

def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
Expand Down Expand Up @@ -465,40 +342,6 @@ def reconstruct(x):
else:
return reconstruct(result)

def __iter__(self):
for i in range(len(self)):
if self._mask[i]:
yield self.dtype.na_value
else:
yield self._data[i]

def take(self, indexer, allow_fill=False, fill_value=None):
    """
    Take elements at the positions given by `indexer`.

    Data and mask are taken separately and wrapped in a new array of the
    same type. With ``allow_fill=True`` and a non-missing `fill_value`, the
    positions where `indexer` is -1 receive `fill_value` and are unmasked.
    """
    # A missing fill value is stored as False in the data buffer so that
    # the underlying ndarray is not upcast.
    if isna(fill_value):
        data_fill_value = False
    else:
        data_fill_value = fill_value

    values = take(
        self._data, indexer, fill_value=data_fill_value, allow_fill=allow_fill
    )
    na_mask = take(self._mask, indexer, fill_value=True, allow_fill=allow_fill)

    # When filling, only the positions where the indexer is null (-1) are
    # filled, not values that were already missing.
    # TODO(jreback) what if we have a non-na float as a fill value?
    if allow_fill and notna(fill_value):
        filled = np.asarray(indexer) == -1
        values[filled] = fill_value
        na_mask = na_mask ^ filled

    return type(self)(values, na_mask, copy=False)

def copy(self):
    """
    Return a new array of the same type backed by copies of both buffers.

    The data and mask are copied here, so the constructor is called with
    ``copy=False``.
    """
    return type(self)(self._data.copy(), self._mask.copy(), copy=False)

def __setitem__(self, key, value):
_is_scalar = is_scalar(value)
if _is_scalar:
Expand All @@ -512,26 +355,6 @@ def __setitem__(self, key, value):
self._data[key] = value
self._mask[key] = mask

def __len__(self):
return len(self._data)

@property
def nbytes(self):
    """Total bytes used by the data buffer and the mask buffer together."""
    data_bytes = self._data.nbytes
    mask_bytes = self._mask.nbytes
    return data_bytes + mask_bytes

def isna(self):
    """
    Return the boolean mask marking which elements are missing.

    The mask object itself is returned, not a copy.
    """
    return self._mask

@property
def _na_value(self):
return self._dtype.na_value

@classmethod
def _concat_same_type(cls, to_concat):
data = np.concatenate([x._data for x in to_concat])
mask = np.concatenate([x._mask for x in to_concat])
return cls(data, mask)

def astype(self, dtype, copy=True):
"""
Cast to a NumPy array or ExtensionArray with 'dtype'.
Expand Down
Loading

0 comments on commit 89bc0aa

Please sign in to comment.