BUG: Maintain column order with groupby.nth

pandas-dev · Sep 23, 2018 · dc2428c · dc2428c
1 parent fb784ca
commit dc2428c
Show file tree

Hide file tree

Showing 4 changed files with 41 additions and 9 deletions.
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -779,7 +779,8 @@ Groupby/Resample/Rolling
 - Bug in :meth:`Series.resample` when passing ``numpy.timedelta64`` to ``loffset`` kwarg (:issue:`7687`).
 - Bug in :meth:`Resampler.asfreq` when frequency of ``TimedeltaIndex`` is a subperiod of a new frequency (:issue:`13022`).
 - Bug in :meth:`SeriesGroupBy.mean` when values were integral but could not fit inside of int64, overflowing instead. (:issue:`22487`)
-
+- Bug in :func:`pandas.core.groupby.GroupBy.nth` where column order was not always preserved (:issue:`20760`)
+
 Sparse
 ^^^^^^
 

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -497,7 +497,8 @@ def _set_group_selection(self):
 
         if len(groupers):
             # GH12839 clear selected obj cache when group selection changes
-            self._group_selection = ax.difference(Index(groupers)).tolist()
+            self._group_selection = ax.difference(Index(groupers),
+                                                  sort=False).tolist()
             self._reset_cache('_selected_obj')
 
     def _set_result_index_ordered(self, result):

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -2915,17 +2915,20 @@ def intersection(self, other):
             taken.name = None
         return taken
 
-    def difference(self, other):
+    def difference(self, other, sort=True):
         """
         Return a new Index with elements from the index that are not in
         `other`.
 
         This is the set difference of two Index objects.
-        It's sorted if sorting is possible.
 
         Parameters
         ----------
         other : Index or array-like
+        sort : bool, default True
+            Sort the resulting index if possible
+
+            .. versionadded:: 0.24.0
 
         Returns
         -------
@@ -2934,10 +2937,12 @@ def difference(self, other):
         Examples
         --------
 
-        >>> idx1 = pd.Index([1, 2, 3, 4])
+        >>> idx1 = pd.Index([2, 1, 3, 4])
         >>> idx2 = pd.Index([3, 4, 5, 6])
         >>> idx1.difference(idx2)
         Int64Index([1, 2], dtype='int64')
+        >>> idx1.difference(idx2, sort=False)
+        Int64Index([2, 1], dtype='int64')
 
         """
         self._assert_can_do_setop(other)
@@ -2955,10 +2960,11 @@ def difference(self, other):
         label_diff = np.setdiff1d(np.arange(this.size), indexer,
                                   assume_unique=True)
         the_diff = this.values.take(label_diff)
-        try:
-            the_diff = sorting.safe_sort(the_diff)
-        except TypeError:
-            pass
+        if sort:
+            try:
+                the_diff = sorting.safe_sort(the_diff)
+            except TypeError:
+                pass
 
         return this._shallow_copy(the_diff, name=result_name, freq=None)
 

diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py
@@ -390,3 +390,27 @@ def test_nth_empty():
                                           names=['a', 'b']),
                          columns=['c'])
     assert_frame_equal(result, expected)
+
+
+def test_nth_column_order():
+    # GH 20760
+    # Check that nth preserves column order
+    df = DataFrame([[1, 'b', 100],
+                    [1, 'a', 50],
+                    [1, 'a', np.nan],
+                    [2, 'c', 200],
+                    [2, 'd', 150]],
+                   columns=['A', 'C', 'B'])
+    result = df.groupby('A').nth(0)
+    expected = DataFrame([['b', 100.0],
+                          ['c', 200.0]],
+                         columns=['C', 'B'],
+                         index=Index([1, 2], name='A'))
+    assert_frame_equal(result, expected)
+
+    result = df.groupby('A').nth(-1, dropna='any')
+    expected = DataFrame([['a', 50.0],
+                          ['d', 150.0]],
+                         columns=['C', 'B'],
+                         index=Index([1, 2], name='A'))
+    assert_frame_equal(result, expected)