Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: astype fill_value for SparseArray.astype #23547

Merged
merged 11 commits into from
Nov 12, 2018
105 changes: 91 additions & 14 deletions pandas/core/arrays/sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,83 @@ def is_dtype(cls, dtype):
return True
return isinstance(dtype, np.dtype) or dtype == 'Sparse'

def update_dtype(self, dtype):
    """
    Return the SparseDtype obtained by switching this dtype to ``dtype``.

    The ``fill_value`` is carried over and cast along with the subtype.

    Parameters
    ----------
    dtype : Union[str, numpy.dtype, SparseDtype]
        The dtype to switch to.

        * A SparseDtype is returned as-is, keeping its own fill value.
        * A NumPy dtype (or str) becomes the subtype of a new
          SparseDtype whose fill value is the current fill value
          cast to that dtype.

    Returns
    -------
    SparseDtype
        A SparseDtype with the correct `dtype` and a fill value
        valid for that `dtype`.

    Raises
    ------
    ValueError
        When the current fill value cannot be cast to the new
        `dtype` (e.g. ``np.nan`` to an integer dtype).

    Examples
    --------
    >>> SparseDtype(int, 0).update_dtype(float)
    Sparse[float64, 0.0]

    >>> SparseDtype(int, 1).update_dtype(SparseDtype(float, np.nan))
    Sparse[float64, nan]
    """
    sparse_cls = type(self)
    target = pandas_dtype(dtype)

    if isinstance(target, sparse_cls):
        # Already sparse: it brings its own fill value along.
        return target

    # Cast the fill value as a 0-d array, then unbox it to a scalar.
    boxed_fill = astype_nansafe(np.array(self.fill_value), target)
    return sparse_cls(target, fill_value=boxed_fill.item())

@property
def _subtype_with_str(self):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess this is only for Sparse which is ok

"""
The subtype to cast values to, resolving string data to its actual type.

Typically, pandas will store string data in an object-dtype array.
When converting values to a dtype, e.g. in ``.astype``, we need to
be more specific, we need the actual underlying type.

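Returns
-------
type or numpy.dtype
``type(self.fill_value)`` when the fill value is a string,
otherwise ``self.subtype``.

Examples
--------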
>>> SparseDtype(int, 1)._subtype_with_str
dtype('int64')

>>> SparseDtype(object, 1)._subtype_with_str
dtype('O')

>>> dtype = SparseDtype(str, '')
>>> dtype.subtype
dtype('O')

>>> dtype._subtype_with_str
str
"""
if isinstance(self.fill_value, compat.string_types):
return type(self.fill_value)
return self.subtype


# ----------------------------------------------------------------------------
# Array

Expand Down Expand Up @@ -614,7 +691,7 @@ def __array__(self, dtype=None, copy=True):
# Can't put pd.NaT in a datetime64[ns]
fill_value = np.datetime64('NaT')
try:
dtype = np.result_type(self.sp_values.dtype, fill_value)
dtype = np.result_type(self.sp_values.dtype, type(fill_value))
Copy link
Contributor Author

@TomAugspurger TomAugspurger Nov 7, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was having trouble with string fill values.

except TypeError:
dtype = object

Expand Down Expand Up @@ -996,7 +1073,7 @@ def _take_with_fill(self, indices, fill_value=None):
if len(self) == 0:
# Empty... Allow taking only if all empty
if (indices == -1).all():
dtype = np.result_type(self.sp_values, fill_value)
dtype = np.result_type(self.sp_values, type(fill_value))
taken = np.empty_like(indices, dtype=dtype)
taken.fill(fill_value)
return taken
Expand All @@ -1009,7 +1086,7 @@ def _take_with_fill(self, indices, fill_value=None):
if self.sp_index.npoints == 0:
# Avoid taking from the empty self.sp_values
taken = np.full(sp_indexer.shape, fill_value=fill_value,
dtype=np.result_type(fill_value))
dtype=np.result_type(type(fill_value)))
else:
taken = self.sp_values.take(sp_indexer)

Expand All @@ -1030,12 +1107,13 @@ def _take_with_fill(self, indices, fill_value=None):
result_type = taken.dtype

if m0.any():
result_type = np.result_type(result_type, self.fill_value)
result_type = np.result_type(result_type,
type(self.fill_value))
taken = taken.astype(result_type)
taken[old_fill_indices] = self.fill_value

if m1.any():
result_type = np.result_type(result_type, fill_value)
result_type = np.result_type(result_type, type(fill_value))
taken = taken.astype(result_type)
taken[new_fill_indices] = fill_value

Expand All @@ -1061,7 +1139,7 @@ def _take_without_fill(self, indices):
# edge case in take...
# I think just return
out = np.full(indices.shape, self.fill_value,
dtype=np.result_type(self.fill_value))
dtype=np.result_type(type(self.fill_value)))
arr, sp_index, fill_value = make_sparse(out,
fill_value=self.fill_value)
return type(self)(arr, sparse_index=sp_index,
Expand All @@ -1073,7 +1151,7 @@ def _take_without_fill(self, indices):

if fillable.any():
# TODO: may need to coerce array to fill value
result_type = np.result_type(taken, self.fill_value)
result_type = np.result_type(taken, type(self.fill_value))
taken = taken.astype(result_type)
taken[fillable] = self.fill_value

Expand All @@ -1093,7 +1171,9 @@ def _concat_same_type(cls, to_concat):

fill_value = fill_values[0]

if len(set(fill_values)) > 1:
# np.nan isn't a singleton, so we may end up with multiple
# NaNs here, so we ignore the all-NA case too.
if not (len(set(fill_values)) == 1 or isna(fill_values).all()):
warnings.warn("Concatenating sparse arrays with multiple fill "
"values: '{}'. Picking the first and "
"converting the rest.".format(fill_values),
Expand Down Expand Up @@ -1212,13 +1292,10 @@ def astype(self, dtype=None, copy=True):
IntIndex
Indices: array([2, 3], dtype=int32)
"""
dtype = pandas_dtype(dtype)

if not isinstance(dtype, SparseDtype):
dtype = SparseDtype(dtype, fill_value=self.fill_value)

dtype = self.dtype.update_dtype(dtype)
subtype = dtype._subtype_with_str
sp_values = astype_nansafe(self.sp_values,
dtype.subtype,
subtype,
copy=copy)
if sp_values is self.sp_values and copy:
sp_values = sp_values.copy()
Expand Down
28 changes: 28 additions & 0 deletions pandas/tests/arrays/sparse/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -477,6 +477,34 @@ def test_astype_all(self, any_real_dtype):
tm.assert_numpy_array_equal(np.asarray(res.values),
vals.astype(typ))

@pytest.mark.parametrize('array, dtype, expected', [
    (
        SparseArray([0, 1]),
        'float',
        SparseArray([0., 1.], dtype=SparseDtype(float, 0.0)),
    ),
    (SparseArray([0, 1]), bool, SparseArray([False, True])),
    (
        SparseArray([0, 1], fill_value=1),
        bool,
        SparseArray([False, True], dtype=SparseDtype(bool, True)),
    ),
    pytest.param(
        SparseArray([0, 1]),
        'datetime64[ns]',
        SparseArray(
            np.array([0, 1], dtype='datetime64[ns]'),
            dtype=SparseDtype('datetime64[ns]', pd.Timestamp('1970')),
        ),
        marks=[pytest.mark.xfail(reason="NumPy-7619", strict=True)],
    ),
    (
        SparseArray([0, 1, 10]),
        str,
        SparseArray(['0', '1', '10'], dtype=SparseDtype(str, '0')),
    ),
    (SparseArray(['10', '20']), float, SparseArray([10.0, 20.0])),
    (
        SparseArray([0, 1, 0]),
        object,
        SparseArray([0, 1, 0], dtype=SparseDtype(object, 0)),
    ),
])
def test_astype_more(self, array, dtype, expected):
    """Check that ``array.astype(dtype)`` matches ``expected`` per case."""
    converted = array.astype(dtype)
    tm.assert_sp_array_equal(converted, expected)

def test_astype_nan_raises(self):
    """Casting a sparse float array that holds NaN to int must raise."""
    sparse_floats = SparseArray([1.0, np.nan])
    with pytest.raises(ValueError, match='Cannot convert non-finite'):
        sparse_floats.astype(int)

def test_set_fill_value(self):
arr = SparseArray([1., np.nan, 2.], fill_value=np.nan)
arr.fill_value = 2
Expand Down
20 changes: 20 additions & 0 deletions pandas/tests/arrays/sparse/test_dtype.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,3 +139,23 @@ def test_parse_subtype(string, expected):
def test_construct_from_string_fill_value_raises(string):
with pytest.raises(TypeError, match='fill_value in the string is not'):
SparseDtype.construct_from_string(string)


@pytest.mark.parametrize(
    'original, dtype, expected',
    [
        (SparseDtype(int, 0), float, SparseDtype(float, 0.0)),
        (SparseDtype(int, 1), float, SparseDtype(float, 1.0)),
        (SparseDtype(int, 1), str, SparseDtype(object, '1')),
        (SparseDtype(float, 1.5), int, SparseDtype(int, 1)),
    ],
)
def test_update_dtype(original, dtype, expected):
    """Check ``update_dtype`` returns the expected SparseDtype per case."""
    assert original.update_dtype(dtype) == expected


@pytest.mark.parametrize(
    "original, dtype",
    [
        (SparseDtype(float, np.nan), int),
        (SparseDtype(str, 'abc'), int),
    ],
)
def test_update_dtype_raises(original, dtype):
    """``update_dtype`` must raise ValueError for each of these cases."""
    with pytest.raises(ValueError):
        original.update_dtype(dtype)