Incorporate review feedback
h-vetinari committed Jun 28, 2018
1 parent 36b03a1 commit b08dc3d
Showing 4 changed files with 24 additions and 42 deletions.
15 changes: 3 additions & 12 deletions asv_bench/benchmarks/frame_methods.py
@@ -431,22 +431,13 @@ def setup(self, keep, return_inverse):
         self.df3 = df3

     def time_frame_duplicated(self, keep, return_inverse):
-        if return_inverse:
-            self.df.duplicated(keep=keep, return_inverse=return_inverse)
-        else:
-            self.df.duplicated(keep=keep)
+        self.df.duplicated(keep=keep, return_inverse=return_inverse)

     def time_frame_duplicated_wide(self, keep, return_inverse):
-        if return_inverse:
-            self.df2.duplicated(keep=keep, return_inverse=return_inverse)
-        else:
-            self.df2.duplicated(keep=keep)
+        self.df2.duplicated(keep=keep, return_inverse=return_inverse)

     def time_frame_duplicated_mixed(self, keep, return_inverse):
-        if return_inverse:
-            self.df3.duplicated(keep=keep, return_inverse=return_inverse)
-        else:
-            self.df3.duplicated(keep=keep)
+        self.df3.duplicated(keep=keep, return_inverse=return_inverse)


 class XS(object):
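Note on the change above: asv calls each time_* method with every combination of the class-level params, so return_inverse is always supplied explicitly, which is why the if/else branches could be collapsed into a single call per review feedback. Below is a minimal sketch of how such a parametrized asv benchmark class is typically laid out; the class name, frame contents, and setup body are assumptions (the real setup is collapsed in the diff), and return_inverse itself is the keyword proposed by this PR, not part of released pandas.

    import numpy as np
    import pandas as pd


    class Duplicated(object):
        # asv passes each combination of these values to setup() and to the
        # time_* methods; the values here are illustrative assumptions
        params = (['first', 'last', False], [True, False])
        param_names = ['keep', 'return_inverse']

        def setup(self, keep, return_inverse):
            if keep is False and return_inverse:
                # the proposed kwarg cannot be combined with keep=False;
                # raising NotImplementedError makes asv skip this combination
                raise NotImplementedError
            n = 100000
            self.df = pd.DataFrame({'a': np.random.randint(0, 1000, n),
                                    'b': np.random.randint(0, 1000, n)})

        def time_frame_duplicated(self, keep, return_inverse):
            # on the PR branch, passing return_inverse=False is equivalent
            # to not passing it at all, so no branching is needed here
            self.df.duplicated(keep=keep, return_inverse=return_inverse)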
4 changes: 2 additions & 2 deletions doc/source/whatsnew/v0.23.1.txt
@@ -1,7 +1,7 @@
 .. _whatsnew_0231:

-v0.23.1 (June 12, 2018)
------------------------
+v0.23.1
+-------

 This is a minor bug-fix release in the 0.23.x series and includes some small regression fixes
 and bug fixes. We recommend that all users upgrade to this version.
18 changes: 7 additions & 11 deletions doc/source/whatsnew/v0.24.0.txt
@@ -13,29 +13,25 @@ New features
 ``DataFrame.duplicated`` has gained the ``return_inverse`` kwarg
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-Previously, there was no way to determine how duplicate rows in a ``DataFrame`` got mapped to the deduplicated, unique subset. This made it hard to push back
-information that was calculated on the deduplicated values (e.g. aggregation) back to the original dataset. And while the ``numpy.unique``-method provides such a
-``return_inverse``-kwarg, it fails to work with ``object`` data.
-
-Therefore, the ``duplicated``-method has now gained a ``return_inverse`` keyword. Specifying ``return_inverse=True`` will change the output from a single Series
-to a tuple of two Series (in the following example, the index is not just a simple order, to illustrate that the inverse correctly takes it into account):
+The ``duplicated``-method has gained a ``return_inverse`` keyword. Specifying ``return_inverse=True`` will change the output from a single Series
+to a tuple of two Series, where the second Series contains the mapping from the indices of the deduplicated, unique subset back to the original index:

 .. ipython:: python

     df = pd.DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']},
                       index=[1, 4, 9, 16, 25])
     df
-    isdup, inv = df.duplicated(return_inverse=True)  # default: keep='first'
-    isdup
-    inv
+    isduplicate, inverse = df.duplicated(return_inverse=True)  # default: keep='first'
+    isduplicate
+    inverse

 This allows to reconstruct the original DataFrame as follows:

 .. ipython:: python

-    unique = df.loc[~isdup]  # same as df.drop_duplicates()
+    unique = df.loc[~isduplicate]  # same as df.drop_duplicates()
     unique
-    reconstruct = unique.reindex(inv.values).set_index(inv.index)
+    reconstruct = unique.reindex(inverse.values).set_index(inverse.index)
     reconstruct.equals(df)

 The keyword works as expected for ``keep='first'|'last'``, but cannot be used together with ``keep=False`` (since discarding all duplicates makes it impossible
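Note on the whatsnew example above: since return_inverse only exists on this PR branch and not in a released pandas version, the same inverse mapping can be illustrated with the public API alone. This is a minimal sketch, assuming the same example frame; the groupby(...).transform('first') route is an illustration chosen here, not the PR's implementation.

    import pandas as pd

    df = pd.DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']},
                      index=[1, 4, 9, 16, 25])

    isdup = df.duplicated(keep='first')
    unique = df.loc[~isdup]               # same as df.drop_duplicates()

    # for every row, look up the index label of the first occurrence of its
    # value combination; this plays the role of the "inverse" Series above
    inverse = df.index.to_series().groupby([df['A'], df['B']]).transform('first')

    reconstruct = unique.reindex(inverse.values).set_index(inverse.index)
    assert reconstruct.equals(df)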
29 changes: 12 additions & 17 deletions pandas/tests/frame/test_analytics.py
@@ -1585,35 +1585,30 @@ def test_duplicated_subset(self, subset, keep):

         if subset is None:
             subset = list(df.columns)
+        elif isinstance(subset, string_types):
+            # need to have a DataFrame, not a Series
+            # -> select columns with singleton list, not string
+            subset = [subset]

         expected = df[subset].duplicated(keep=keep).rename(name=None)
         result = df.duplicated(keep=keep, subset=subset)
         tm.assert_series_equal(result, expected)

-    def test_duplicated_inverse(self):
+    @pytest.mark.parametrize('keep, expected_inv_values', [
+        ('first', [1, 4, 4, 16, 1]),
+        ('last', [25, 9, 9, 16, 25])
+    ])
+    def test_duplicated_inverse(self, keep, expected_inv_values):
         # check that return_inverse kwarg does not affect outcome;
         # index of inverse must be correctly transformed as well
         idx = [1, 4, 9, 16, 25]
         df = DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']},
                        index=idx)

-        # keep = 'first'
-        expected_isdup = df.duplicated(keep='first')
-        expected_inv = Series([1, 4, 4, 16, 1], index=idx)
-        result_isdup, result_inv = df.duplicated(keep='first',
-                                                 return_inverse=True)
-        tm.assert_series_equal(result_isdup, expected_isdup)
-        tm.assert_series_equal(result_inv, expected_inv)
-
-        # test that inv works (and fits together with expected_isdup)
-        unique = df.loc[~expected_isdup]
-        reconstr = unique.reindex(result_inv).set_index(result_inv.index)
-        tm.assert_frame_equal(reconstr, df)
-
-        # keep = 'last'
-        expected_isdup = df.duplicated(keep='last')
-        expected_inv = Series([25, 9, 9, 16, 25], index=idx)
-        result_isdup, result_inv = df.duplicated(keep='last',
+        expected_isdup = df.duplicated(keep=keep)
+        expected_inv = Series(expected_inv_values, index=idx)
+        result_isdup, result_inv = df.duplicated(keep=keep,
                                                  return_inverse=True)
         tm.assert_series_equal(result_isdup, expected_isdup)
         tm.assert_series_equal(result_inv, expected_inv)
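Note on the parametrized test above: the expected_inv_values fixtures can be sanity-checked without the new keyword, since under keep='first' every row should map to the index label of its first occurrence and under keep='last' to that of its last. The short sketch below extends the earlier groupby/transform illustration to both cases; it is not the PR's code.

    import pandas as pd

    idx = [1, 4, 9, 16, 25]
    df = pd.DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']},
                      index=idx)
    labels = df.index.to_series()
    keys = [df['A'], df['B']]

    # index label of the first / last occurrence within each duplicate group
    print(labels.groupby(keys).transform('first').tolist())  # [1, 4, 4, 16, 1]
    print(labels.groupby(keys).transform('last').tolist())   # [25, 9, 9, 16, 25]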
