diff --git a/doc/source/basics.rst b/doc/source/basics.rst index fe3fc42992468..3e11552be3612 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -1286,14 +1286,14 @@ The ``by`` argument can take a list of column names, e.g.: Series has the method ``order`` (analogous to `R's order function `__) which -sorts by value, with special treatment of NA values via the ``na_last`` +sorts by value, with special treatment of NA values via the ``na_position`` argument: .. ipython:: python s[2] = np.nan s.order() - s.order(na_last=False) + s.order(na_position='first') Some other sorting notes / nuances: diff --git a/doc/source/release.rst b/doc/source/release.rst index 5134130ba7865..ea5af9165b483 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -147,6 +147,8 @@ API Changes - Define and document the order of column vs index names in query/eval (:issue:`6676`) +- ``DataFrame.sort`` now places NaNs at the beginning or end of the sort according to the ``na_position`` parameter. (:issue:`3917`) + Deprecations ~~~~~~~~~~~~ diff --git a/pandas/core/common.py b/pandas/core/common.py index dadd21f8fc128..daeb43c7e76ac 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -316,9 +316,9 @@ def array_equivalent(left, right): # NaNs occur only in object arrays, float or complex arrays. if issubclass(left.dtype.type, np.object_): return ((left == right) | (pd.isnull(left) & pd.isnull(right))).all() - if not issubclass(left.dtype.type, (np.floating, np.complexfloating)): - return np.array_equal(left, right) - return ((left == right) | (np.isnan(left) & np.isnan(right))).all() + if issubclass(left.dtype.type, (np.floating, np.complexfloating)): + return ((left == right) | (np.isnan(left) & np.isnan(right))).all() + return np.array_equal(left, right) def _iterable_not_string(x): return (isinstance(x, collections.Iterable) and diff --git a/pandas/core/frame.py b/pandas/core/frame.py old mode 100644 new mode 100755 index a410bb8be8c52..430b309260f8c --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2522,7 +2522,7 @@ def _m8_to_i8(x): # Sorting def sort(self, columns=None, axis=0, ascending=True, - inplace=False): + inplace=False, kind='quicksort', na_position='last'): """ Sort DataFrame either by labels (along either axis) or by the values in column(s) @@ -2540,6 +2540,11 @@ def sort(self, columns=None, axis=0, ascending=True, Sort index/rows versus columns inplace : boolean, default False Sort the DataFrame without creating a new instance + kind : {'quicksort', 'mergesort', 'heapsort'}, optional + This option is only applied when sorting on a single column or label. + na_position : {'first', 'last'} (optional, default='last') + 'first' puts NaNs at the beginning + 'last' puts NaNs at the end Examples -------- @@ -2550,10 +2555,10 @@ def sort(self, columns=None, axis=0, ascending=True, sorted : DataFrame """ return self.sort_index(by=columns, axis=axis, ascending=ascending, - inplace=inplace) + inplace=inplace, kind=kind, na_position=na_position) def sort_index(self, axis=0, by=None, ascending=True, inplace=False, - kind='quicksort'): + kind='quicksort', na_position='last'): """ Sort DataFrame either by labels (along either axis) or by the values in a column @@ -2571,6 +2576,11 @@ def sort_index(self, axis=0, by=None, ascending=True, inplace=False, orders inplace : boolean, default False Sort the DataFrame without creating a new instance + na_position : {'first', 'last'} (optional, default='last') + 'first' puts NaNs at the beginning + 'last' puts NaNs at the end + kind : {'quicksort', 'mergesort', 'heapsort'}, optional + This option is only applied when sorting on a single column or label. Examples -------- @@ -2580,8 +2590,8 @@ def sort_index(self, axis=0, by=None, ascending=True, inplace=False, ------- sorted : DataFrame """ - from pandas.core.groupby import _lexsort_indexer - + + from pandas.core.groupby import _lexsort_indexer, _nargsort axis = self._get_axis_number(axis) if axis not in [0, 1]: # pragma: no cover raise AssertionError('Axis must be 0 or 1, got %s' % str(axis)) @@ -2597,23 +2607,19 @@ def sort_index(self, axis=0, by=None, ascending=True, inplace=False, if com._is_sequence(ascending) and len(by) != len(ascending): raise ValueError('Length of ascending (%d) != length of by' ' (%d)' % (len(ascending), len(by))) - if len(by) > 1: - keys = [] - for x in by: - k = self[x].values - if k.ndim == 2: - raise ValueError('Cannot sort by duplicate column %s' - % str(x)) - keys.append(k) - def trans(v): if com.needs_i8_conversion(v): return v.view('i8') return v - - keys = [trans(self[x].values) for x in by] - indexer = _lexsort_indexer(keys, orders=ascending) + keys = [] + for x in by: + k = self[x].values + if k.ndim == 2: + raise ValueError('Cannot sort by duplicate column %s' % str(x)) + keys.append(trans(k)) + indexer = _lexsort_indexer(keys, orders=ascending, + na_position=na_position) indexer = com._ensure_platform_int(indexer) else: by = by[0] @@ -2630,20 +2636,17 @@ def trans(v): % str(by)) if isinstance(ascending, (tuple, list)): ascending = ascending[0] + indexer = _nargsort(k, kind=kind, ascending=ascending, + na_position=na_position) - if not ascending: - k = k[::-1] - indexer = k.argsort(kind=kind) - if not ascending: - indexer = indexer.max() - indexer[::-1] elif isinstance(labels, MultiIndex): - indexer = _lexsort_indexer(labels.labels, orders=ascending) + indexer = _lexsort_indexer(labels.labels, orders=ascending, + na_position=na_position) indexer = com._ensure_platform_int(indexer) else: - indexer = labels.argsort(kind=kind) - if not ascending: - indexer = indexer[::-1] - + indexer = _nargsort(labels, kind=kind, ascending=ascending, + na_position=na_position) + if inplace: if axis == 1: new_data = self._data.reindex_items( diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index c6ecba7d11836..208f9f1a8e19a 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -3145,33 +3145,72 @@ def _indexer_from_factorized(labels, shape, compress=True): return indexer -def _lexsort_indexer(keys, orders=None): +def _lexsort_indexer(keys, orders=None, na_position='last'): labels = [] shape = [] - if isinstance(orders, bool): orders = [orders] * len(keys) elif orders is None: orders = [True] * len(keys) for key, order in zip(keys, orders): + key = np.asanyarray(key) rizer = _hash.Factorizer(len(key)) if not key.dtype == np.object_: key = key.astype('O') + # factorize maps nans to na_sentinel=-1 ids = rizer.factorize(key, sort=True) - n = len(rizer.uniques) + mask = (ids == -1) + if order: # ascending + if na_position == 'last': + ids = np.where(mask, n, ids) + elif na_position == 'first': + ids += 1 + else: + raise ValueError('invalid na_position: {!r}'.format(na_position)) + else: # not order means descending + if na_position == 'last': + ids = np.where(mask, n, n-ids-1) + elif na_position == 'first': + ids = np.where(mask, 0, n-ids) + else: + raise ValueError('invalid na_position: {!r}'.format(na_position)) + if mask.any(): + n += 1 shape.append(n) - if not order: - mask = ids == -1 - ids = np.where(mask, -1, n - ids) - labels.append(ids) - return _indexer_from_factorized(labels, shape) +def _nargsort(items, kind='quicksort', ascending=True, na_position='last'): + """ + This is intended to be a drop-in replacement for np.argsort which handles NaNs + It adds ascending and na_position parameters. + GH #6399, #5231 + """ + items = np.asanyarray(items) + idx = np.arange(len(items)) + mask = isnull(items) + non_nans = items[~mask] + non_nan_idx = idx[~mask] + nan_idx = np.nonzero(mask)[0] + if not ascending: + non_nans = non_nans[::-1] + non_nan_idx = non_nan_idx[::-1] + indexer = non_nan_idx[non_nans.argsort(kind=kind)] + if not ascending: + indexer = indexer[::-1] + # Finally, place the NaNs at the end or the beginning according to na_position + if na_position == 'last': + indexer = np.concatenate([indexer, nan_idx]) + elif na_position == 'first': + indexer = np.concatenate([nan_idx, indexer]) + else: + raise ValueError('invalid na_position: {!r}'.format(na_position)) + return indexer + class _KeyMapper(object): diff --git a/pandas/core/index.py b/pandas/core/index.py index 3eab4d0339082..32c1672566da0 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -13,7 +13,7 @@ from pandas.core.base import FrozenList, FrozenNDArray, IndexOpsMixin from pandas.util.decorators import cache_readonly, deprecate -from pandas.core.common import isnull +from pandas.core.common import isnull, array_equivalent import pandas.core.common as com from pandas.core.common import _values_from_object, is_float, is_integer, ABCSeries from pandas.core.config import get_option @@ -800,7 +800,7 @@ def equals(self, other): if type(other) != Index: return other.equals(self) - return np.array_equal(self, other) + return array_equivalent(self, other) def identical(self, other): """Similar to equals, but check that other comparable attributes are @@ -1872,7 +1872,7 @@ def equals(self, other): # return False try: - return np.array_equal(self, other) + return array_equivalent(self, other) except TypeError: # e.g. fails in numpy 1.6 with DatetimeIndex #1681 return False @@ -3533,7 +3533,7 @@ def equals(self, other): return True if not isinstance(other, MultiIndex): - return np.array_equal(self.values, _ensure_index(other)) + return array_equivalent(self.values, _ensure_index(other)) if self.nlevels != other.nlevels: return False @@ -3546,7 +3546,7 @@ def equals(self, other): allow_fill=False) ovalues = com.take_nd(other.levels[i].values, other.labels[i], allow_fill=False) - if not np.array_equal(svalues, ovalues): + if not array_equivalent(svalues, ovalues): return False return True diff --git a/pandas/core/series.py b/pandas/core/series.py index 60429630eb7d3..14b4e084cdeae 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1743,24 +1743,32 @@ def rank(self, method='average', na_option='keep', ascending=True, ascending=ascending, pct=pct) return self._constructor(ranks, index=self.index).__finalize__(self) - def order(self, na_last=True, ascending=True, kind='mergesort'): + def order(self, na_last=None, ascending=True, kind='mergesort', na_position='last'): """ Sorts Series object, by value, maintaining index-value link Parameters ---------- - na_last : boolean (optional, default=True) + na_last : boolean (optional, default=True) (DEPRECATED; use na_position) Put NaN's at beginning or end ascending : boolean, default True Sort ascending. Passing False sorts descending kind : {'mergesort', 'quicksort', 'heapsort'}, default 'mergesort' Choice of sorting algorithm. See np.sort for more information. 'mergesort' is the only stable algorithm + na_position : {'first', 'last'} (optional, default='last') + 'first' puts NaNs at the beginning + 'last' puts NaNs at the end Returns ------- y : Series """ + if na_last is not None: + warnings.warn(("na_last is deprecated. Please use na_position instead"), + FutureWarning) + na_position = 'last' if na_last else 'first' + def _try_kind_sort(arr): # easier to ask forgiveness than permission try: @@ -1784,15 +1792,16 @@ def _try_kind_sort(arr): if not ascending: argsorted = argsorted[::-1] - if na_last: + if na_position == 'last': n = good.sum() sortedIdx[:n] = idx[good][argsorted] sortedIdx[n:] = idx[bad] - else: + elif na_position == 'first': n = bad.sum() sortedIdx[n:] = idx[good][argsorted] sortedIdx[:n] = idx[bad] - + else: + raise ValueError('invalid na_position: {!r}'.format(na_position)) return self._constructor(arr[sortedIdx], index=self.index[sortedIdx])\ .__finalize__(self) diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx index 10c43478a5352..d4ed7fac5d6b7 100644 --- a/pandas/hashtable.pyx +++ b/pandas/hashtable.pyx @@ -835,20 +835,23 @@ cdef class Factorizer: return self.count def factorize(self, ndarray[object] values, sort=False, na_sentinel=-1): + """ + Factorize values with nans replaced by na_sentinel + >>> factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20) + array([ 0, 1, 20]) + """ labels = self.table.get_labels(values, self.uniques, self.count, na_sentinel) - + mask = (labels == na_sentinel) # sort on if sort: if labels.dtype != np.int_: labels = labels.astype(np.int_) - sorter = self.uniques.to_array().argsort() reverse_indexer = np.empty(len(sorter), dtype=np.int_) reverse_indexer.put(sorter, np.arange(len(sorter))) - - labels = reverse_indexer.take(labels) - + labels = reverse_indexer.take(labels, mode='clip') + labels[mask] = na_sentinel self.count = len(self.uniques) return labels diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 2101c732893e3..c1862c4ff91ab 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -9770,6 +9770,121 @@ def test_sort_index(self): with assertRaisesRegexp(ValueError, msg): frame.sort_index(by=['A', 'B'], axis=0, ascending=[True] * 5) + def test_sort_nan(self): + # GH3917 + nan = np.nan + df = DataFrame({'A': [1, 2, nan, 1, 6, 8, 4], + 'B': [9, nan, 5, 2, 5, 4, 5]}) + + # sort one column only + expected = DataFrame( + {'A': [nan, 1, 1, 2, 4, 6, 8], + 'B': [5, 9, 2, nan, 5, 5, 4]}, + index=[2, 0, 3, 1, 6, 4, 5]) + sorted_df = df.sort(['A'], na_position='first') + assert_frame_equal(sorted_df, expected) + + expected = DataFrame( + {'A': [nan, 8, 6, 4, 2, 1, 1], + 'B': [5, 4, 5, 5, nan, 9, 2]}, + index=[2, 5, 4, 6, 1, 0, 3]) + sorted_df = df.sort(['A'], na_position='first', ascending=False) + assert_frame_equal(sorted_df, expected) + + # na_position='last', order + expected = DataFrame( + {'A': [1, 1, 2, 4, 6, 8, nan], + 'B': [2, 9, nan, 5, 5, 4, 5]}, + index=[3, 0, 1, 6, 4, 5, 2]) + sorted_df = df.sort(['A','B']) + assert_frame_equal(sorted_df, expected) + + # na_position='first', order + expected = DataFrame( + {'A': [nan, 1, 1, 2, 4, 6, 8], + 'B': [5, 2, 9, nan, 5, 5, 4]}, + index=[2, 3, 0, 1, 6, 4, 5]) + sorted_df = df.sort(['A','B'], na_position='first') + assert_frame_equal(sorted_df, expected) + + # na_position='first', not order + expected = DataFrame( + {'A': [nan, 1, 1, 2, 4, 6, 8], + 'B': [5, 9, 2, nan, 5, 5, 4]}, + index=[2, 0, 3, 1, 6, 4, 5]) + sorted_df = df.sort(['A','B'], ascending=[1,0], na_position='first') + assert_frame_equal(sorted_df, expected) + + # na_position='last', not order + expected = DataFrame( + {'A': [8, 6, 4, 2, 1, 1, nan], + 'B': [4, 5, 5, nan, 2, 9, 5]}, + index=[5, 4, 6, 1, 3, 0, 2]) + sorted_df = df.sort(['A','B'], ascending=[0,1], na_position='last') + assert_frame_equal(sorted_df, expected) + + # Test DataFrame with nan label + df = DataFrame({'A': [1, 2, nan, 1, 6, 8, 4], + 'B': [9, nan, 5, 2, 5, 4, 5]}, + index = [1, 2, 3, 4, 5, 6, nan]) + + # NaN label, ascending=True, na_position='last' + sorted_df = df.sort(kind='quicksort', ascending=True, na_position='last') + expected = DataFrame({'A': [1, 2, nan, 1, 6, 8, 4], + 'B': [9, nan, 5, 2, 5, 4, 5]}, + index = [1, 2, 3, 4, 5, 6, nan]) + assert_frame_equal(sorted_df, expected) + + # NaN label, ascending=True, na_position='first' + sorted_df = df.sort(na_position='first') + expected = DataFrame({'A': [4, 1, 2, nan, 1, 6, 8], + 'B': [5, 9, nan, 5, 2, 5, 4]}, + index = [nan, 1, 2, 3, 4, 5, 6]) + assert_frame_equal(sorted_df, expected) + + # NaN label, ascending=False, na_position='last' + sorted_df = df.sort(kind='quicksort', ascending=False) + expected = DataFrame({'A': [8, 6, 1, nan, 2, 1, 4], + 'B': [4, 5, 2, 5, nan, 9, 5]}, + index = [6, 5, 4, 3, 2, 1, nan]) + assert_frame_equal(sorted_df, expected) + + # NaN label, ascending=False, na_position='first' + sorted_df = df.sort(kind='quicksort', ascending=False, na_position='first') + expected = DataFrame({'A': [4, 8, 6, 1, nan, 2, 1], + 'B': [5, 4, 5, 2, 5, nan, 9]}, + index = [nan, 6, 5, 4, 3, 2, 1]) + assert_frame_equal(sorted_df, expected) + + def test_stable_descending_sort(self): + # GH #6399 + df = DataFrame([[2, 'first'], [2, 'second'], [1, 'a'], [1, 'b']], + columns=['sort_col', 'order']) + sorted_df = df.sort_index(by='sort_col', kind='mergesort', + ascending=False) + assert_frame_equal(df, sorted_df) + + def test_stable_descending_multicolumn_sort(self): + nan = np.nan + df = DataFrame({'A': [1, 2, nan, 1, 6, 8, 4], + 'B': [9, nan, 5, 2, 5, 4, 5]}) + # test stable mergesort + expected = DataFrame( + {'A': [nan, 8, 6, 4, 2, 1, 1], + 'B': [5, 4, 5, 5, nan, 2, 9]}, + index=[2, 5, 4, 6, 1, 3, 0]) + sorted_df = df.sort(['A','B'], ascending=[0,1], na_position='first', + kind='mergesort') + assert_frame_equal(sorted_df, expected) + + expected = DataFrame( + {'A': [nan, 8, 6, 4, 2, 1, 1], + 'B': [5, 4, 5, 5, nan, 9, 2]}, + index=[2, 5, 4, 6, 1, 0, 3]) + sorted_df = df.sort(['A','B'], ascending=[0,0], na_position='first', + kind='mergesort') + assert_frame_equal(sorted_df, expected) + def test_sort_index_multicolumn(self): import random A = np.arange(5).repeat(20) @@ -9926,13 +10041,6 @@ def test_frame_column_inplace_sort_exception(self): cp = s.copy() cp.sort() # it works! - def test_stable_descending_sort(self): - df = DataFrame([[2, 'first'], [2, 'second'], [1, 'a'], [1, 'b']], - columns=['sort_col', 'order']) - sorted = df.sort_index(by='sort_col', kind='mergesort', - ascending=False) - assert_frame_equal(df, sorted) - def test_combine_first(self): # disjoint head, tail = self.frame[:5], self.frame[5:] diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 506eb348a8113..b14c355f44a1c 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -10,7 +10,8 @@ from pandas.core.index import Index, MultiIndex, Int64Index from pandas.core.common import rands from pandas.core.api import Categorical, DataFrame -from pandas.core.groupby import SpecificationError, DataError +from pandas.core.groupby import (SpecificationError, DataError, + _nargsort, _lexsort_indexer) from pandas.core.series import Series from pandas.util.testing import (assert_panel_equal, assert_frame_equal, assert_series_equal, assert_almost_equal, @@ -29,6 +30,7 @@ import pandas.util.testing as tm import pandas as pd +from numpy.testing import assert_equal def commonSetUp(self): self.dateRange = bdate_range('1/1/2005', periods=250) @@ -3831,6 +3833,97 @@ def test_tab_completion(self): ]) self.assertEqual(results, expected) + def test_lexsort_indexer(self): + keys = [[nan]*5 + list(range(100)) + [nan]*5] + # orders=True, na_position='last' + result = _lexsort_indexer(keys, orders=True, na_position='last') + expected = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) + assert_equal(result, expected) + + # orders=True, na_position='first' + result = _lexsort_indexer(keys, orders=True, na_position='first') + expected = list(range(5)) + list(range(105, 110)) + list(range(5, 105)) + assert_equal(result, expected) + + # orders=False, na_position='last' + result = _lexsort_indexer(keys, orders=False, na_position='last') + expected = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)) + assert_equal(result, expected) + + # orders=False, na_position='first' + result = _lexsort_indexer(keys, orders=False, na_position='first') + expected = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) + assert_equal(result, expected) + + def test_nargsort(self): + # np.argsort(items) places NaNs last + items = [nan]*5 + list(range(100)) + [nan]*5 + # np.argsort(items2) may not place NaNs first + items2 = np.array(items, dtype='O') + + try: + # GH 2785; due to a regression in NumPy1.6.2 + np.argsort(np.array([[1, 2], [1, 3], [1, 2]], dtype='i')) + np.argsort(items2, kind='mergesort') + except TypeError as err: + raise nose.SkipTest('requested sort not available for type') + + # mergesort is the most difficult to get right because we want it to be stable. + + # According to numpy/core/tests/test_multiarray, """The number + # of sorted items must be greater than ~50 to check the actual algorithm + # because quick and merge sort fall over to insertion sort for small + # arrays.""" + + + # mergesort, ascending=True, na_position='last' + result = _nargsort( + items, kind='mergesort', ascending=True, na_position='last') + expected = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) + assert_equal(result, expected) + + # mergesort, ascending=True, na_position='first' + result = _nargsort( + items, kind='mergesort', ascending=True, na_position='first') + expected = list(range(5)) + list(range(105, 110)) + list(range(5, 105)) + assert_equal(result, expected) + + # mergesort, ascending=False, na_position='last' + result = _nargsort( + items, kind='mergesort', ascending=False, na_position='last') + expected = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)) + assert_equal(result, expected) + + # mergesort, ascending=False, na_position='first' + result = _nargsort( + items, kind='mergesort', ascending=False, na_position='first') + expected = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) + assert_equal(result, expected) + + # mergesort, ascending=True, na_position='last' + result = _nargsort( + items2, kind='mergesort', ascending=True, na_position='last') + expected = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) + assert_equal(result, expected) + + # mergesort, ascending=True, na_position='first' + result = _nargsort( + items2, kind='mergesort', ascending=True, na_position='first') + expected = list(range(5)) + list(range(105, 110)) + list(range(5, 105)) + assert_equal(result, expected) + + # mergesort, ascending=False, na_position='last' + result = _nargsort( + items2, kind='mergesort', ascending=False, na_position='last') + expected = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)) + assert_equal(result, expected) + + # mergesort, ascending=False, na_position='first' + result = _nargsort( + items2, kind='mergesort', ascending=False, na_position='first') + expected = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) + assert_equal(result, expected) + def assert_fp_equal(a, b): assert (np.abs(a - b) < 1e-12).all() diff --git a/pandas/tests/test_hashtable.py b/pandas/tests/test_hashtable.py new file mode 100644 index 0000000000000..be51d50ee6783 --- /dev/null +++ b/pandas/tests/test_hashtable.py @@ -0,0 +1,30 @@ +import numpy as np +import unittest +import nose +import pandas.hashtable as _hash +import pandas as pd + +class TestFactorizer(unittest.TestCase): + def test_factorize_nan(self): + # nan should map to na_sentinel, not reverse_indexer[na_sentinel] + # rizer.factorize should not raise an exception if na_sentinel indexes + # outside of reverse_indexer + key = np.array([1, 2, 1, np.nan], dtype='O') + rizer = _hash.Factorizer(len(key)) + for na_sentinel in (-1, 20): + ids = rizer.factorize(key, sort=True, na_sentinel=na_sentinel) + expected = np.array([0, 1, 0, na_sentinel], dtype='int32') + self.assertEqual(len(set(key)), len(set(expected))) + self.assert_(np.array_equal(pd.isnull(key), expected == na_sentinel)) + + # nan still maps to na_sentinel when sort=False + key = np.array([0, np.nan, 1], dtype='O') + na_sentinel = -1 + ids = rizer.factorize(key, sort=False, na_sentinel=na_sentinel) + expected = np.array([ 2, -1, 0], dtype='int32') + self.assertEqual(len(set(key)), len(set(expected))) + self.assert_(np.array_equal(pd.isnull(key), expected == na_sentinel)) + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index a94ca5dfc1075..95b7b6ace4e2d 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -4007,7 +4007,7 @@ def test_order(self): self.assert_(np.isnan(result[-5:]).all()) self.assert_numpy_array_equal(result[:-5], np.sort(vals[5:])) - result = ts.order(na_last=False) + result = ts.order(na_position='first') self.assert_(np.isnan(result[:5]).all()) self.assert_numpy_array_equal(result[5:], np.sort(vals[5:])) @@ -4020,7 +4020,7 @@ def test_order(self): ordered = ts.order(ascending=False) expected = np.sort(ts.valid().values)[::-1] assert_almost_equal(expected, ordered.valid().values) - ordered = ts.order(ascending=False, na_last=False) + ordered = ts.order(ascending=False, na_position='first') assert_almost_equal(expected, ordered.valid().values) def test_rank(self):