diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index ed1bf0a4f83941..452ee3e805ba7d 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -779,7 +779,8 @@ Groupby/Resample/Rolling - Bug in :meth:`Series.resample` when passing ``numpy.timedelta64`` to ``loffset`` kwarg (:issue:`7687`). - Bug in :meth:`Resampler.asfreq` when frequency of ``TimedeltaIndex`` is a subperiod of a new frequency (:issue:`13022`). - Bug in :meth:`SeriesGroupBy.mean` when values were integral but could not fit inside of int64, overflowing instead. (:issue:`22487`) - +- Bug in :func:`pandas.core.groupby.GroupBy.nth` where column order was not always preserved (:issue:`20760`) + Sparse ^^^^^^ diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 61dadd833be35a..f2121b87e25761 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -497,7 +497,8 @@ def _set_group_selection(self): if len(groupers): # GH12839 clear selected obj cache when group selection changes - self._group_selection = ax.difference(Index(groupers)).tolist() + self._group_selection = ax.difference(Index(groupers), + sort=False).tolist() self._reset_cache('_selected_obj') def _set_result_index_ordered(self, result): diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index b42bbdafcab458..083fab5ac25963 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2915,17 +2915,20 @@ def intersection(self, other): taken.name = None return taken - def difference(self, other): + def difference(self, other, sort=True): """ Return a new Index with elements from the index that are not in `other`. This is the set difference of two Index objects. - It's sorted if sorting is possible. Parameters ---------- other : Index or array-like + sort : bool, default True + Sort the resulting index if possible + + .. versionadded:: 0.24.0 Returns ------- @@ -2934,10 +2937,12 @@ def difference(self, other): Examples -------- - >>> idx1 = pd.Index([1, 2, 3, 4]) + >>> idx1 = pd.Index([2, 1, 3, 4]) >>> idx2 = pd.Index([3, 4, 5, 6]) >>> idx1.difference(idx2) Int64Index([1, 2], dtype='int64') + >>> idx1.difference(idx2, sort=False) + Int64Index([2, 1], dtype='int64') """ self._assert_can_do_setop(other) @@ -2955,10 +2960,11 @@ def difference(self, other): label_diff = np.setdiff1d(np.arange(this.size), indexer, assume_unique=True) the_diff = this.values.take(label_diff) - try: - the_diff = sorting.safe_sort(the_diff) - except TypeError: - pass + if sort: + try: + the_diff = sorting.safe_sort(the_diff) + except TypeError: + pass return this._shallow_copy(the_diff, name=result_name, freq=None) diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index a1b748cd50e8f2..4ea4b580a2c3f8 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -390,3 +390,27 @@ def test_nth_empty(): names=['a', 'b']), columns=['c']) assert_frame_equal(result, expected) + + +def test_nth_column_order(): + # GH 20760 + # Check that nth preserves column order + df = DataFrame([[1, 'b', 100], + [1, 'a', 50], + [1, 'a', np.nan], + [2, 'c', 200], + [2, 'd', 150]], + columns=['A', 'C', 'B']) + result = df.groupby('A').nth(0) + expected = DataFrame([['b', 100.0], + ['c', 200.0]], + columns=['C', 'B'], + index=Index([1, 2], name='A')) + assert_frame_equal(result, expected) + + result = df.groupby('A').nth(-1, dropna='any') + expected = DataFrame([['a', 50.0], + ['d', 150.0]], + columns=['C', 'B'], + index=Index([1, 2], name='A')) + assert_frame_equal(result, expected)