Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH/FIX: Add na_last parameter to DataFrame.sort. Fixes GH3917 #5231

Merged
merged 1 commit into from
Mar 27, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions doc/source/basics.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1286,14 +1286,14 @@ The ``by`` argument can take a list of column names, e.g.:

Series has the method ``order`` (analogous to `R's order function
<http://stat.ethz.ch/R-manual/R-patched/library/base/html/order.html>`__) which
sorts by value, with special treatment of NA values via the ``na_last``
sorts by value, with special treatment of NA values via the ``na_position``
argument:

.. ipython:: python

s[2] = np.nan
s.order()
s.order(na_last=False)
s.order(na_position='first')

Some other sorting notes / nuances:

Expand Down
2 changes: 2 additions & 0 deletions doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,8 @@ API Changes
- Define and document the order of column vs index names in query/eval
(:issue:`6676`)

- ``DataFrame.sort`` now places NaNs at the beginning or end of the sort according to the ``na_position`` parameter. (:issue:`3917`)

Deprecations
~~~~~~~~~~~~

Expand Down
6 changes: 3 additions & 3 deletions pandas/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,9 +316,9 @@ def array_equivalent(left, right):
# NaNs occur only in object arrays, float or complex arrays.
if issubclass(left.dtype.type, np.object_):
return ((left == right) | (pd.isnull(left) & pd.isnull(right))).all()
if not issubclass(left.dtype.type, (np.floating, np.complexfloating)):
return np.array_equal(left, right)
return ((left == right) | (np.isnan(left) & np.isnan(right))).all()
if issubclass(left.dtype.type, (np.floating, np.complexfloating)):
return ((left == right) | (np.isnan(left) & np.isnan(right))).all()
return np.array_equal(left, right)

def _iterable_not_string(x):
return (isinstance(x, collections.Iterable) and
Expand Down
57 changes: 30 additions & 27 deletions pandas/core/frame.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -2522,7 +2522,7 @@ def _m8_to_i8(x):
# Sorting

def sort(self, columns=None, axis=0, ascending=True,
inplace=False):
inplace=False, kind='quicksort', na_position='last'):
"""
Sort DataFrame either by labels (along either axis) or by the values in
column(s)
Expand All @@ -2540,6 +2540,11 @@ def sort(self, columns=None, axis=0, ascending=True,
Sort index/rows versus columns
inplace : boolean, default False
Sort the DataFrame without creating a new instance
kind : {'quicksort', 'mergesort', 'heapsort'}, optional
This option is only applied when sorting on a single column or label.
na_position : {'first', 'last'} (optional, default='last')
'first' puts NaNs at the beginning
'last' puts NaNs at the end

Examples
--------
Expand All @@ -2550,10 +2555,10 @@ def sort(self, columns=None, axis=0, ascending=True,
sorted : DataFrame
"""
return self.sort_index(by=columns, axis=axis, ascending=ascending,
inplace=inplace)
inplace=inplace, kind=kind, na_position=na_position)

def sort_index(self, axis=0, by=None, ascending=True, inplace=False,
kind='quicksort'):
kind='quicksort', na_position='last'):
"""
Sort DataFrame either by labels (along either axis) or by the values in
a column
Expand All @@ -2571,6 +2576,11 @@ def sort_index(self, axis=0, by=None, ascending=True, inplace=False,
orders
inplace : boolean, default False
Sort the DataFrame without creating a new instance
na_position : {'first', 'last'} (optional, default='last')
'first' puts NaNs at the beginning
'last' puts NaNs at the end
kind : {'quicksort', 'mergesort', 'heapsort'}, optional
This option is only applied when sorting on a single column or label.

Examples
--------
Expand All @@ -2580,8 +2590,8 @@ def sort_index(self, axis=0, by=None, ascending=True, inplace=False,
-------
sorted : DataFrame
"""
from pandas.core.groupby import _lexsort_indexer


from pandas.core.groupby import _lexsort_indexer, _nargsort
axis = self._get_axis_number(axis)
if axis not in [0, 1]: # pragma: no cover
raise AssertionError('Axis must be 0 or 1, got %s' % str(axis))
Expand All @@ -2597,23 +2607,19 @@ def sort_index(self, axis=0, by=None, ascending=True, inplace=False,
if com._is_sequence(ascending) and len(by) != len(ascending):
raise ValueError('Length of ascending (%d) != length of by'
' (%d)' % (len(ascending), len(by)))

if len(by) > 1:
keys = []
for x in by:
k = self[x].values
if k.ndim == 2:
raise ValueError('Cannot sort by duplicate column %s'
% str(x))
keys.append(k)

def trans(v):
if com.needs_i8_conversion(v):
return v.view('i8')
return v

keys = [trans(self[x].values) for x in by]
indexer = _lexsort_indexer(keys, orders=ascending)
keys = []
for x in by:
k = self[x].values
if k.ndim == 2:
raise ValueError('Cannot sort by duplicate column %s' % str(x))
keys.append(trans(k))
indexer = _lexsort_indexer(keys, orders=ascending,
na_position=na_position)
indexer = com._ensure_platform_int(indexer)
else:
by = by[0]
Expand All @@ -2630,20 +2636,17 @@ def trans(v):
% str(by))
if isinstance(ascending, (tuple, list)):
ascending = ascending[0]
indexer = _nargsort(k, kind=kind, ascending=ascending,
na_position=na_position)

if not ascending:
k = k[::-1]
indexer = k.argsort(kind=kind)
if not ascending:
indexer = indexer.max() - indexer[::-1]
elif isinstance(labels, MultiIndex):
indexer = _lexsort_indexer(labels.labels, orders=ascending)
indexer = _lexsort_indexer(labels.labels, orders=ascending,
na_position=na_position)
indexer = com._ensure_platform_int(indexer)
else:
indexer = labels.argsort(kind=kind)
if not ascending:
indexer = indexer[::-1]

indexer = _nargsort(labels, kind=kind, ascending=ascending,
na_position=na_position)

if inplace:
if axis == 1:
new_data = self._data.reindex_items(
Expand Down
55 changes: 47 additions & 8 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -3145,33 +3145,72 @@ def _indexer_from_factorized(labels, shape, compress=True):
return indexer


def _lexsort_indexer(keys, orders=None):
def _lexsort_indexer(keys, orders=None, na_position='last'):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure why _lexsort_indexer is even in groupby.py; it is only used in series/frame. Can you move it to algos.py?

labels = []
shape = []

if isinstance(orders, bool):
orders = [orders] * len(keys)
elif orders is None:
orders = [True] * len(keys)

for key, order in zip(keys, orders):
key = np.asanyarray(key)
rizer = _hash.Factorizer(len(key))

if not key.dtype == np.object_:
key = key.astype('O')

# factorize maps nans to na_sentinel=-1
ids = rizer.factorize(key, sort=True)

n = len(rizer.uniques)
mask = (ids == -1)
if order: # ascending
if na_position == 'last':
ids = np.where(mask, n, ids)
elif na_position == 'first':
ids += 1
else:
raise ValueError('invalid na_position: {!r}'.format(na_position))
else: # not order means descending
if na_position == 'last':
ids = np.where(mask, n, n-ids-1)
elif na_position == 'first':
ids = np.where(mask, 0, n-ids)
else:
raise ValueError('invalid na_position: {!r}'.format(na_position))
if mask.any():
n += 1
shape.append(n)
if not order:
mask = ids == -1
ids = np.where(mask, -1, n - ids)

labels.append(ids)

return _indexer_from_factorized(labels, shape)

def _nargsort(items, kind='quicksort', ascending=True, na_position='last'):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

move to algos.py as well

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you mean core/algorithms.py? or algos.pyx?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes...that's what I meant!

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wait, never mind... I realize this is being called internally in groupby.py

"""
This is intended to be a drop-in replacement for np.argsort which handles NaNs
It adds ascending and na_position parameters.
GH #6399, #5231
"""
items = np.asanyarray(items)
idx = np.arange(len(items))
mask = isnull(items)
non_nans = items[~mask]
non_nan_idx = idx[~mask]
nan_idx = np.nonzero(mask)[0]
if not ascending:
non_nans = non_nans[::-1]
non_nan_idx = non_nan_idx[::-1]
indexer = non_nan_idx[non_nans.argsort(kind=kind)]
if not ascending:
indexer = indexer[::-1]
# Finally, place the NaNs at the end or the beginning according to na_position
if na_position == 'last':
indexer = np.concatenate([indexer, nan_idx])
elif na_position == 'first':
indexer = np.concatenate([nan_idx, indexer])
else:
raise ValueError('invalid na_position: {!r}'.format(na_position))
return indexer


class _KeyMapper(object):

Expand Down
10 changes: 5 additions & 5 deletions pandas/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from pandas.core.base import FrozenList, FrozenNDArray, IndexOpsMixin

from pandas.util.decorators import cache_readonly, deprecate
from pandas.core.common import isnull
from pandas.core.common import isnull, array_equivalent
import pandas.core.common as com
from pandas.core.common import _values_from_object, is_float, is_integer, ABCSeries
from pandas.core.config import get_option
Expand Down Expand Up @@ -800,7 +800,7 @@ def equals(self, other):
if type(other) != Index:
return other.equals(self)

return np.array_equal(self, other)
return array_equivalent(self, other)

def identical(self, other):
"""Similar to equals, but check that other comparable attributes are
Expand Down Expand Up @@ -1872,7 +1872,7 @@ def equals(self, other):
# return False

try:
return np.array_equal(self, other)
return array_equivalent(self, other)
except TypeError:
# e.g. fails in numpy 1.6 with DatetimeIndex #1681
return False
Expand Down Expand Up @@ -3533,7 +3533,7 @@ def equals(self, other):
return True

if not isinstance(other, MultiIndex):
return np.array_equal(self.values, _ensure_index(other))
return array_equivalent(self.values, _ensure_index(other))

if self.nlevels != other.nlevels:
return False
Expand All @@ -3546,7 +3546,7 @@ def equals(self, other):
allow_fill=False)
ovalues = com.take_nd(other.levels[i].values, other.labels[i],
allow_fill=False)
if not np.array_equal(svalues, ovalues):
if not array_equivalent(svalues, ovalues):
return False

return True
Expand Down
19 changes: 14 additions & 5 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1743,24 +1743,32 @@ def rank(self, method='average', na_option='keep', ascending=True,
ascending=ascending, pct=pct)
return self._constructor(ranks, index=self.index).__finalize__(self)

def order(self, na_last=True, ascending=True, kind='mergesort'):
def order(self, na_last=None, ascending=True, kind='mergesort', na_position='last'):
"""
Sorts Series object, by value, maintaining index-value link

Parameters
----------
na_last : boolean (optional, default=True)
na_last : boolean (optional, default=True) (DEPRECATED; use na_position)
Put NaN's at beginning or end
ascending : boolean, default True
Sort ascending. Passing False sorts descending
kind : {'mergesort', 'quicksort', 'heapsort'}, default 'mergesort'
Choice of sorting algorithm. See np.sort for more
information. 'mergesort' is the only stable algorithm
na_position : {'first', 'last'} (optional, default='last')
'first' puts NaNs at the beginning
'last' puts NaNs at the end

Returns
-------
y : Series
"""
if na_last is not None:
warnings.warn(("na_last is deprecated. Please use na_position instead"),
FutureWarning)
na_position = 'last' if na_last else 'first'

def _try_kind_sort(arr):
# easier to ask forgiveness than permission
try:
Expand All @@ -1784,15 +1792,16 @@ def _try_kind_sort(arr):
if not ascending:
argsorted = argsorted[::-1]

if na_last:
if na_position == 'last':
n = good.sum()
sortedIdx[:n] = idx[good][argsorted]
sortedIdx[n:] = idx[bad]
else:
elif na_position == 'first':
n = bad.sum()
sortedIdx[n:] = idx[good][argsorted]
sortedIdx[:n] = idx[bad]

else:
raise ValueError('invalid na_position: {!r}'.format(na_position))
return self._constructor(arr[sortedIdx], index=self.index[sortedIdx])\
.__finalize__(self)

Expand Down
13 changes: 8 additions & 5 deletions pandas/hashtable.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -835,20 +835,23 @@ cdef class Factorizer:
return self.count

def factorize(self, ndarray[object] values, sort=False, na_sentinel=-1):
"""
Factorize values with nans replaced by na_sentinel
>>> factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20)
array([ 0, 1, 20])
"""
labels = self.table.get_labels(values, self.uniques,
self.count, na_sentinel)

mask = (labels == na_sentinel)
# sort on
if sort:
if labels.dtype != np.int_:
labels = labels.astype(np.int_)

sorter = self.uniques.to_array().argsort()
reverse_indexer = np.empty(len(sorter), dtype=np.int_)
reverse_indexer.put(sorter, np.arange(len(sorter)))

labels = reverse_indexer.take(labels)

labels = reverse_indexer.take(labels, mode='clip')
labels[mask] = na_sentinel
self.count = len(self.uniques)
return labels

Expand Down
Loading