Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Drop support for NaN categories in Categorical #15806

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions doc/source/categorical.rst
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,15 @@ Categories must be unique or a `ValueError` is raised:
except ValueError as e:
print("ValueError: " + str(e))

Categories also cannot contain ``NaN``; otherwise a `ValueError` is raised:

.. ipython:: python

try:
s.cat.categories = [1,2,np.nan]
except ValueError as e:
print("ValueError: " + str(e))

Appending new categories
~~~~~~~~~~~~~~~~~~~~~~~~

Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -816,6 +816,7 @@ Removal of prior version deprecations/changes
in favor of ``iloc`` and ``iat`` as explained :ref:`here <whatsnew_0170.deprecations>` (:issue:`10711`).
- The deprecated ``DataFrame.iterkv()`` has been removed in favor of ``DataFrame.iteritems()`` (:issue:`10711`)
- The ``Categorical`` constructor has dropped the ``name`` parameter (:issue:`10632`)
- ``Categorical`` has dropped support for ``NaN`` categories (:issue:`10748`)
- The ``take_last`` parameter has been dropped from ``duplicated()``, ``drop_duplicates()``, ``nlargest()``, and ``nsmallest()`` methods (:issue:`10236`, :issue:`10792`, :issue:`10920`)
- ``Series``, ``Index``, and ``DataFrame`` have dropped the ``sort`` and ``order`` methods (:issue:`10726`)
- Where clauses in ``pytables`` are only accepted as strings and expressions types and not other data-types (:issue:`12027`)
Expand Down
13 changes: 3 additions & 10 deletions pandas/core/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -545,18 +545,11 @@ def _validate_categories(cls, categories, fastpath=False):

if not fastpath:

# check properties of the categories
# we don't allow NaNs in the categories themselves

# Categories cannot contain NaN.
if categories.hasnans:
# NaNs in cats deprecated in 0.17
# GH 10748
msg = ('\nSetting NaNs in `categories` is deprecated and '
'will be removed in a future version of pandas.')
warn(msg, FutureWarning, stacklevel=3)

# categories must be unique
raise ValueError('Categorical categories cannot be null')

# Categories must be unique.
if not categories.is_unique:
raise ValueError('Categorical categories must be unique')

Expand Down
14 changes: 0 additions & 14 deletions pandas/tests/indexes/test_category.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,11 +183,6 @@ def test_contains(self):
self.assertFalse(0 in ci)
self.assertFalse(1 in ci)

with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
ci = CategoricalIndex(
list('aabbca'), categories=list('cabdef') + [np.nan])
self.assertFalse(np.nan in ci)

ci = CategoricalIndex(
list('aabbca') + [np.nan], categories=list('cabdef'))
self.assertTrue(np.nan in ci)
Expand Down Expand Up @@ -541,7 +536,6 @@ def test_ensure_copied_data(self):
self.assertIs(_base(index.values), _base(result.values))

def test_equals_categorical(self):

ci1 = CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=True)
ci2 = CategoricalIndex(['a', 'b'], categories=['a', 'b', 'c'],
ordered=True)
Expand Down Expand Up @@ -579,14 +573,6 @@ def test_equals_categorical(self):
self.assertFalse(ci.equals(CategoricalIndex(list('aabca'))))
self.assertTrue(ci.equals(ci.copy()))

with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
ci = CategoricalIndex(list('aabca'),
categories=['c', 'a', 'b', np.nan])
self.assertFalse(ci.equals(list('aabca')))
self.assertFalse(ci.equals(CategoricalIndex(list('aabca'))))
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
self.assertTrue(ci.equals(ci.copy()))

ci = CategoricalIndex(list('aabca') + [np.nan],
categories=['c', 'a', 'b'])
self.assertFalse(ci.equals(list('aabca')))
Expand Down
207 changes: 28 additions & 179 deletions pandas/tests/test_categorical.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
# pylint: disable=E1101,E1103,W0232

import pytest
import sys
from datetime import datetime
from distutils.version import LooseVersion
Expand All @@ -17,7 +18,8 @@
import pandas.compat as compat
import pandas.util.testing as tm
from pandas import (Categorical, Index, Series, DataFrame, PeriodIndex,
Timestamp, CategoricalIndex, isnull)
Timestamp, CategoricalIndex, DatetimeIndex,
isnull, NaT)
from pandas.compat import range, lrange, u, PY3
from pandas.core.config import option_context

Expand Down Expand Up @@ -160,12 +162,6 @@ def f():

self.assertRaises(ValueError, f)

def f():
with tm.assert_produces_warning(FutureWarning):
Categorical([1, 2], [1, 2, np.nan, np.nan])

self.assertRaises(ValueError, f)

# The default should be unordered
c1 = Categorical(["a", "b", "c", "a"])
self.assertFalse(c1.ordered)
Expand Down Expand Up @@ -222,29 +218,12 @@ def f():
cat = pd.Categorical([np.nan, 1., 2., 3.])
self.assertTrue(is_float_dtype(cat.categories))

# Deprecating NaNs in categoires (GH #10748)
# preserve int as far as possible by converting to object if NaN is in
# categories
with tm.assert_produces_warning(FutureWarning):
cat = pd.Categorical([np.nan, 1, 2, 3],
categories=[np.nan, 1, 2, 3])
self.assertTrue(is_object_dtype(cat.categories))

# This doesn't work -> this would probably need some kind of "remember
# the original type" feature to try to cast the array interface result
# to...

# vals = np.asarray(cat[cat.notnull()])
# self.assertTrue(is_integer_dtype(vals))
with tm.assert_produces_warning(FutureWarning):
cat = pd.Categorical([np.nan, "a", "b", "c"],
categories=[np.nan, "a", "b", "c"])
self.assertTrue(is_object_dtype(cat.categories))
# but don't do it for floats
with tm.assert_produces_warning(FutureWarning):
cat = pd.Categorical([np.nan, 1., 2., 3.],
categories=[np.nan, 1., 2., 3.])
self.assertTrue(is_float_dtype(cat.categories))

# corner cases
cat = pd.Categorical([1])
Expand Down Expand Up @@ -295,6 +274,22 @@ def f():
c = Categorical(np.array([], dtype='int64'), # noqa
categories=[3, 2, 1], ordered=True)

def test_constructor_with_null(self):
    # Null markers (NaN, None, NaT) must be rejected as categories:
    # each case below passes a ``categories`` list containing one of them
    # and expects the ``Categorical`` constructor to raise ValueError.

    # Build the inputs before entering ``pytest.raises`` so that an error
    # raised while constructing them (e.g. while parsing the 'nat' string)
    # cannot be mistaken for the rejection under test.
    cases = [
        ([np.nan, "a", "b", "c"], [np.nan, "a", "b", "c"]),
        ([None, "a", "b", "c"], [None, "a", "b", "c"]),
        (DatetimeIndex(['nat', '20160101']),
         [NaT, Timestamp('20160101')]),
    ]

    for values, categories in cases:
        with pytest.raises(ValueError) as excinfo:
            pd.Categorical(values, categories=categories)

        # Pin the failure to the null check rather than accepting any
        # ValueError (for instance the uniqueness check on categories).
        assert 'cannot be null' in str(excinfo.value)


def test_constructor_with_index(self):
ci = CategoricalIndex(list('aabbca'), categories=list('cab'))
tm.assert_categorical_equal(ci.values, Categorical(ci))
Expand Down Expand Up @@ -418,6 +413,12 @@ def f():

self.assertRaises(ValueError, f)

# NaN categories included
def f():
Categorical.from_codes([0, 1, 2], ["a", "b", np.nan])

self.assertRaises(ValueError, f)

# too negative
def f():
Categorical.from_codes([-2, 1, 2], ["a", "b", "c"])
Expand Down Expand Up @@ -649,30 +650,6 @@ def test_describe(self):
name='categories'))
tm.assert_frame_equal(desc, expected)

# NA as a category
with tm.assert_produces_warning(FutureWarning):
cat = pd.Categorical(["a", "c", "c", np.nan],
categories=["b", "a", "c", np.nan])
result = cat.describe()

expected = DataFrame([[0, 0], [1, 0.25], [2, 0.5], [1, 0.25]],
columns=['counts', 'freqs'],
index=pd.CategoricalIndex(['b', 'a', 'c', np.nan],
name='categories'))
tm.assert_frame_equal(result, expected, check_categorical=False)

# NA as an unused category
with tm.assert_produces_warning(FutureWarning):
cat = pd.Categorical(["a", "c", "c"],
categories=["b", "a", "c", np.nan])
result = cat.describe()

exp_idx = pd.CategoricalIndex(
['b', 'a', 'c', np.nan], name='categories')
expected = DataFrame([[0, 0], [1, 1 / 3.], [2, 2 / 3.], [0, 0]],
columns=['counts', 'freqs'], index=exp_idx)
tm.assert_frame_equal(result, expected, check_categorical=False)

def test_print(self):
expected = ["[a, b, b, a, a, c, c, c]",
"Categories (3, object): [a < b < c]"]
Expand Down Expand Up @@ -1119,90 +1096,18 @@ def test_nan_handling(self):
self.assert_numpy_array_equal(c._codes,
np.array([0, -1, -1, 0], dtype=np.int8))

# If categories have nan included, the code should point to that
# instead
with tm.assert_produces_warning(FutureWarning):
c = Categorical(["a", "b", np.nan, "a"],
categories=["a", "b", np.nan])
self.assert_index_equal(c.categories, Index(["a", "b", np.nan]))
self.assert_numpy_array_equal(c._codes,
np.array([0, 1, 2, 0], dtype=np.int8))
c[1] = np.nan
self.assert_index_equal(c.categories, Index(["a", "b", np.nan]))
self.assert_numpy_array_equal(c._codes,
np.array([0, 2, 2, 0], dtype=np.int8))

# Changing categories should also make the replaced category np.nan
c = Categorical(["a", "b", "c", "a"])
with tm.assert_produces_warning(FutureWarning):
c.categories = ["a", "b", np.nan] # noqa

self.assert_index_equal(c.categories, Index(["a", "b", np.nan]))
self.assert_numpy_array_equal(c._codes,
np.array([0, 1, 2, 0], dtype=np.int8))

# Adding nan to categories should make assigned nan point to the
# category!
c = Categorical(["a", "b", np.nan, "a"])
self.assert_index_equal(c.categories, Index(["a", "b"]))
self.assert_numpy_array_equal(c._codes,
np.array([0, 1, -1, 0], dtype=np.int8))
with tm.assert_produces_warning(FutureWarning):
c.set_categories(["a", "b", np.nan], rename=True, inplace=True)

self.assert_index_equal(c.categories, Index(["a", "b", np.nan]))
self.assert_numpy_array_equal(c._codes,
np.array([0, 1, -1, 0], dtype=np.int8))
c[1] = np.nan
self.assert_index_equal(c.categories, Index(["a", "b", np.nan]))
self.assert_numpy_array_equal(c._codes,
np.array([0, 2, -1, 0], dtype=np.int8))

# Remove null categories (GH 10156)
cases = [([1.0, 2.0, np.nan], [1.0, 2.0]),
(['a', 'b', None], ['a', 'b']),
([pd.Timestamp('2012-05-01'), pd.NaT],
[pd.Timestamp('2012-05-01')])]

null_values = [np.nan, None, pd.NaT]

for with_null, without in cases:
with tm.assert_produces_warning(FutureWarning):
base = Categorical([], with_null)
expected = Categorical([], without)

for nullval in null_values:
result = base.remove_categories(nullval)
self.assert_categorical_equal(result, expected)

# Different null values are indistinguishable
for i, j in [(0, 1), (0, 2), (1, 2)]:
nulls = [null_values[i], null_values[j]]

def f():
with tm.assert_produces_warning(FutureWarning):
Categorical([], categories=nulls)

self.assertRaises(ValueError, f)

def test_isnull(self):
exp = np.array([False, False, True])
c = Categorical(["a", "b", np.nan])
res = c.isnull()
self.assert_numpy_array_equal(res, exp)

with tm.assert_produces_warning(FutureWarning):
c = Categorical(["a", "b", np.nan], categories=["a", "b", np.nan])
res = c.isnull()
self.assert_numpy_array_equal(res, exp)

# test both nan in categories and as -1
exp = np.array([True, False, True])
c = Categorical(["a", "b", np.nan])
with tm.assert_produces_warning(FutureWarning):
c.set_categories(["a", "b", np.nan], rename=True, inplace=True)
c[0] = np.nan
res = c.isnull()
self.assert_numpy_array_equal(res, exp)

def test_codes_immutable(self):
Expand Down Expand Up @@ -1487,45 +1392,10 @@ def test_slicing_directly(self):

def test_set_item_nan(self):
cat = pd.Categorical([1, 2, 3])
exp = pd.Categorical([1, np.nan, 3], categories=[1, 2, 3])
cat[1] = np.nan
tm.assert_categorical_equal(cat, exp)

# if nan in categories, the proper code should be set!
cat = pd.Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
with tm.assert_produces_warning(FutureWarning):
cat.set_categories([1, 2, 3, np.nan], rename=True, inplace=True)
cat[1] = np.nan
exp = np.array([0, 3, 2, -1], dtype=np.int8)
self.assert_numpy_array_equal(cat.codes, exp)

cat = pd.Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
with tm.assert_produces_warning(FutureWarning):
cat.set_categories([1, 2, 3, np.nan], rename=True, inplace=True)
cat[1:3] = np.nan
exp = np.array([0, 3, 3, -1], dtype=np.int8)
self.assert_numpy_array_equal(cat.codes, exp)

cat = pd.Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
with tm.assert_produces_warning(FutureWarning):
cat.set_categories([1, 2, 3, np.nan], rename=True, inplace=True)
cat[1:3] = [np.nan, 1]
exp = np.array([0, 3, 0, -1], dtype=np.int8)
self.assert_numpy_array_equal(cat.codes, exp)

cat = pd.Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
with tm.assert_produces_warning(FutureWarning):
cat.set_categories([1, 2, 3, np.nan], rename=True, inplace=True)
cat[1:3] = [np.nan, np.nan]
exp = np.array([0, 3, 3, -1], dtype=np.int8)
self.assert_numpy_array_equal(cat.codes, exp)

cat = pd.Categorical([1, 2, np.nan, 3], categories=[1, 2, 3])
with tm.assert_produces_warning(FutureWarning):
cat.set_categories([1, 2, 3, np.nan], rename=True, inplace=True)
cat[pd.isnull(cat)] = np.nan
exp = np.array([0, 1, 3, 2], dtype=np.int8)
self.assert_numpy_array_equal(cat.codes, exp)
exp = pd.Categorical([1, np.nan, 3], categories=[1, 2, 3])
tm.assert_categorical_equal(cat, exp)

def test_shift(self):
# GH 9416
Expand Down Expand Up @@ -2026,33 +1896,12 @@ def test_sideeffects_free(self):

def test_nan_handling(self):

# Nans are represented as -1 in labels
# NaNs are represented as -1 in labels
s = Series(Categorical(["a", "b", np.nan, "a"]))
self.assert_index_equal(s.cat.categories, Index(["a", "b"]))
self.assert_numpy_array_equal(s.values.codes,
np.array([0, 1, -1, 0], dtype=np.int8))

# If categories have nan included, the label should point to that
# instead
with tm.assert_produces_warning(FutureWarning):
s2 = Series(Categorical(["a", "b", np.nan, "a"],
categories=["a", "b", np.nan]))

exp_cat = Index(["a", "b", np.nan])
self.assert_index_equal(s2.cat.categories, exp_cat)
self.assert_numpy_array_equal(s2.values.codes,
np.array([0, 1, 2, 0], dtype=np.int8))

# Changing categories should also make the replaced category np.nan
s3 = Series(Categorical(["a", "b", "c", "a"]))
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
s3.cat.categories = ["a", "b", np.nan]

exp_cat = Index(["a", "b", np.nan])
self.assert_index_equal(s3.cat.categories, exp_cat)
self.assert_numpy_array_equal(s3.values.codes,
np.array([0, 1, 2, 0], dtype=np.int8))

def test_cat_accessor(self):
s = Series(Categorical(["a", "b", np.nan, "a"]))
self.assert_index_equal(s.cat.categories, Index(["a", "b"]))
Expand Down