Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: astype fill_value for SparseArray.astype #23547

Merged
merged 11 commits into from
Nov 12, 2018
34 changes: 25 additions & 9 deletions pandas/core/arrays/sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -614,7 +614,7 @@ def __array__(self, dtype=None, copy=True):
# Can't put pd.NaT in a datetime64[ns]
fill_value = np.datetime64('NaT')
try:
dtype = np.result_type(self.sp_values.dtype, fill_value)
dtype = np.result_type(self.sp_values.dtype, type(fill_value))
Copy link
Contributor Author

@TomAugspurger TomAugspurger Nov 7, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was having trouble with string fill values.

except TypeError:
dtype = object

Expand Down Expand Up @@ -996,7 +996,7 @@ def _take_with_fill(self, indices, fill_value=None):
if len(self) == 0:
# Empty... Allow taking only if all empty
if (indices == -1).all():
dtype = np.result_type(self.sp_values, fill_value)
dtype = np.result_type(self.sp_values, type(fill_value))
taken = np.empty_like(indices, dtype=dtype)
taken.fill(fill_value)
return taken
Expand All @@ -1009,7 +1009,7 @@ def _take_with_fill(self, indices, fill_value=None):
if self.sp_index.npoints == 0:
# Avoid taking from the empty self.sp_values
taken = np.full(sp_indexer.shape, fill_value=fill_value,
dtype=np.result_type(fill_value))
dtype=np.result_type(type(fill_value)))
else:
taken = self.sp_values.take(sp_indexer)

Expand All @@ -1030,12 +1030,12 @@ def _take_with_fill(self, indices, fill_value=None):
result_type = taken.dtype

if m0.any():
result_type = np.result_type(result_type, self.fill_value)
result_type = np.result_type(result_type, type(self.fill_value))
taken = taken.astype(result_type)
taken[old_fill_indices] = self.fill_value

if m1.any():
result_type = np.result_type(result_type, fill_value)
result_type = np.result_type(result_type, type(fill_value))
taken = taken.astype(result_type)
taken[new_fill_indices] = fill_value

Expand All @@ -1061,7 +1061,7 @@ def _take_without_fill(self, indices):
# edge case in take...
# I think just return
out = np.full(indices.shape, self.fill_value,
dtype=np.result_type(self.fill_value))
dtype=np.result_type(type(self.fill_value)))
arr, sp_index, fill_value = make_sparse(out,
fill_value=self.fill_value)
return type(self)(arr, sparse_index=sp_index,
Expand All @@ -1073,7 +1073,7 @@ def _take_without_fill(self, indices):

if fillable.any():
# TODO: may need to coerce array to fill value
result_type = np.result_type(taken, self.fill_value)
result_type = np.result_type(taken, type(self.fill_value))
taken = taken.astype(result_type)
taken[fillable] = self.fill_value

Expand Down Expand Up @@ -1215,10 +1215,26 @@ def astype(self, dtype=None, copy=True):
dtype = pandas_dtype(dtype)

if not isinstance(dtype, SparseDtype):
dtype = SparseDtype(dtype, fill_value=self.fill_value)
fill_value = astype_nansafe(np.array(self.fill_value),
dtype).item()
dtype = SparseDtype(dtype, fill_value=fill_value)

# Typically we'll just astype the sp_values to dtype.subtype,
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is kind of ugly, but it's backwards compatible, consistent with the rest of pandas, and does what we need.

Basically, unless we want to support actual numpy string dtypes (which we probably don't), we need a way of differentiating between array.astype(object) and array.astype(str).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you make this a method on the Dtype itself to avoid cluttering this up here? maybe dtype.astype_type
alternatively . we could actually add .astype_nansafe(value, copy=False) as a Dtype method (kind of makes sense actually)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was playing around with that earlier (didn't push it though). I called it SparseDtype.astype. I'll give it another shot and see what it de-duplicates.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, can you clarify what you had in mind for astype_nansafe(value, copy=False)? What would value here be? An array or a scalar?

I'll have a followup PR soon (hopefully today) for ensuring that the dtype of SparseArray.sp_values is consistent with the type of SparseArray.dtype.fill_value. I think my SparseDtype.astype is more useful there. It wouldn't make sense for using here, since we're astyping the actual array of values.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right, I think adding a method that returns the dtype of the .astype values on the Dtype itself is what I am looking for. The conversion still happens in the Array. Basically the code you added here should be on the Dtype object.

Copy link
Contributor Author

@TomAugspurger TomAugspurger Nov 11, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated to add two methods

  1. SparseDtype.astype: convert from a SparseDtype to a new dtype, taking care to astype self.fill_value if needed.
  2. SparseDtype._subtype_with_str to hold the logic for determining what the "real" subtype is, if we actually want str.

SparseDtype.astype seems reasonably useful to users, so I made it public.

# but SparseDtype follows the pandas convention of storing strings
# as object dtype. So SparseDtype(str) immediately becomes
# SparseDtype(object), and at this point we don't know whether object
# means string or something else. We *cannot* just pass object to
# astype_nansafe below, since that won't convert to string. So
# we rely on the assumption that "string fill_value" means strings
# which is close enough to being true.
if (is_object_dtype(dtype.subtype) and
isinstance(dtype.fill_value, compat.text_type)):
subtype = str
else:
subtype = dtype.subtype

sp_values = astype_nansafe(self.sp_values,
dtype.subtype,
subtype,
copy=copy)
if sp_values is self.sp_values and copy:
sp_values = sp_values.copy()
Expand Down
26 changes: 26 additions & 0 deletions pandas/tests/arrays/sparse/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -468,6 +468,32 @@ def test_astype_all(self, any_real_dtype):
tm.assert_numpy_array_equal(np.asarray(res.values),
vals.astype(typ))

@pytest.mark.parametrize('array, dtype, expected', [
(SparseArray([0, 1]), 'float',
SparseArray([0., 1.], dtype=SparseDtype(float, 0.0))),
(SparseArray([0, 1]), bool, SparseArray([False, True])),
(SparseArray([0, 1], fill_value=1), bool,
SparseArray([False, True], dtype=SparseDtype(bool, True))),
pytest.param(
SparseArray([0, 1]), 'datetime64[ns]',
SparseArray(np.array([0, 1], dtype='datetime64[ns]'),
dtype=SparseDtype('datetime64[ns]',
pd.Timestamp('1970'))),
marks=[pytest.mark.xfail(reason="NumPy-7619", strict=True)],
),
(SparseArray([0, 1, 10]), str,
SparseArray(['0', '1', '10'], dtype=SparseDtype(str, '0'))),
(SparseArray(['10', '20']), float, SparseArray([10.0, 20.0])),
])
def test_astype_more(self, array, dtype, expected):
result = array.astype(dtype)
tm.assert_sp_array_equal(result, expected)

def test_astype_nan_raises(self):
arr = SparseArray([1.0, np.nan])
with tm.assert_raises_regex(ValueError, 'Cannot convert non-finite'):
arr.astype(int)

def test_set_fill_value(self):
arr = SparseArray([1., np.nan, 2.], fill_value=np.nan)
arr.fill_value = 2
Expand Down