Skip to content

Commit

Permalink
Review feedback; refactor tests; add whatsnew
Browse files Browse the repository at this point in the history
  • Loading branch information
h-vetinari committed Jun 27, 2018
1 parent f1cff7f commit 740df13
Show file tree
Hide file tree
Showing 4 changed files with 176 additions and 140 deletions.
34 changes: 33 additions & 1 deletion doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,44 @@ v0.24.0
New features
~~~~~~~~~~~~

- ``ExcelWriter`` now accepts ``mode`` as a keyword argument, enabling append to existing workbooks when using the ``openpyxl`` engine (:issue:`3441`)
.. _whatsnew_0240.enhancements.duplicated_inverse:

``DataFrame.duplicated`` has gained the ``return_inverse`` kwarg
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Previously, there was no way to determine how duplicate rows in a ``DataFrame`` were mapped onto the deduplicated, unique subset. This made it hard to propagate
information that was calculated on the deduplicated values (e.g. aggregation) back to the original dataset. And while the ``numpy.unique`` method provides such a
``return_inverse`` keyword, it fails to work with ``object`` data.

The method has now gained a ``return_inverse`` keyword -- specifying ``return_inverse=True`` will change the output from a single Series to a tuple of two Series:

.. ipython:: python

df = pd.DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']},
index=[1, 4, 9, 16, 25])
df
isdup, inv = df.duplicated(return_inverse=True)
isdup
inv

This makes it possible to reconstruct the original DataFrame as follows:

.. ipython:: python

unique = df.loc[~isdup] # same as df.drop_duplicates()
unique
reconstruct = unique.reindex(inv.values).set_index(inv.index)
reconstruct.equals(df)

The keyword works as expected for ``keep='first'|'last'``, but cannot be used together with ``keep=False`` (since discarding all duplicates makes it impossible
to construct an inverse).

.. _whatsnew_0240.enhancements.other:

Other Enhancements
^^^^^^^^^^^^^^^^^^

- ``ExcelWriter`` now accepts ``mode`` as a keyword argument, enabling append to existing workbooks when using the ``openpyxl`` engine (:issue:`3441`)
- :func:`to_datetime` now supports the ``%Z`` and ``%z`` directive when passed into ``format`` (:issue:`13486`)
- :func:`Series.mode` and :func:`DataFrame.mode` now support the ``dropna`` parameter which can be used to specify whether NaN/NaT values should be considered (:issue:`17534`)
- :func:`to_csv` now supports ``compression`` keyword when a file handle is passed. (:issue:`21227`)
Expand Down
13 changes: 11 additions & 2 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4362,10 +4362,12 @@ def duplicated(self, subset=None, keep='first', return_inverse=False):
last occurrence.
- False : Mark all duplicates as ``True``. This option is not
compatible with ``return_inverse``.
return_inverse boolean, default False
return_inverse : boolean, default False
Determines whether the mapping from unique elements to the original
index should be returned. If true, the output is a tuple.
.. versionadded:: 0.24.0
Returns
-------
duplicated : Series or tuple of Series if return_inverse is True
Expand Down Expand Up @@ -4413,9 +4415,16 @@ def f(vals):
return_index=True)
inv = Series(self.index[o2u][u2o], index=self.index)
elif keep == 'last':
ids = ids[::-1] # np.unique takes first occurrence as unique value
# np.unique takes first occurrence as unique value,
# so we flip ids that first becomes last
ids = ids[::-1]
_, o2u, u2o = np.unique(ids, return_inverse=True,
return_index=True)
# the values in the ids-array correspond(ed) to self.index -
# by flipping ids around, we need to do the same for self.index,
# ___because o2u and u2o are relative to that order___.
# Finally, to fit with 'index=self.index' in the constructor,
# we need to flip the values around one last time
inv = Series(self.index[::-1][o2u][u2o][::-1], index=self.index)
return isdup, inv

Expand Down
163 changes: 132 additions & 31 deletions pandas/tests/frame/test_analytics.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from numpy.random import randn
import numpy as np

from pandas.compat import lrange, PY35
from pandas.compat import lrange, PY35, string_types
from pandas import (compat, isna, notna, DataFrame, Series,
MultiIndex, date_range, Timestamp, Categorical,
_np_version_under1p12,
Expand Down Expand Up @@ -1523,6 +1523,137 @@ def test_isin_empty_datetimelike(self):
# ----------------------------------------------------------------------
# Row deduplication

@pytest.mark.parametrize('subset', ['a', ['a'], ['a', 'B']])
def test_duplicated_with_misspelled_column_name(self, subset):
    # GH 19730
    # every parametrized subset contains the lowercase label 'a', which
    # is absent from the frame (its columns are uppercase), so both
    # deduplication entry points must raise KeyError
    df = pd.DataFrame({'A': [0, 0, 1],
                       'B': [0, 0, 1],
                       'C': [0, 0, 1]})

    for method in (df.duplicated, df.drop_duplicates):
        with pytest.raises(KeyError):
            method(subset)

@pytest.mark.slow
def test_duplicated_do_not_fail_on_wide_dataframes(self):
    # gh-21524
    # Given a wide dataframe with a lot of columns
    # with different (important!) values
    data = {'col_{0:02d}'.format(i): np.random.randint(0, 1000, 30000)
            for i in range(100)}
    df = pd.DataFrame(data).T
    result = df.duplicated()

    # Then duplicated() produces a bool pd.Series as a result
    # and doesn't fail during calculation.
    # Actual values don't matter here, though usually
    # it's all False in this case
    assert isinstance(result, pd.Series)
    # compare against np.bool_ (the numpy boolean scalar type) instead of
    # np.bool, which is just a deprecated alias for the builtin bool
    # (deprecated in numpy 1.20, removed in 1.24)
    assert result.dtype == np.bool_

@pytest.mark.parametrize('keep, expected', [
    ('first', Series([False, False, True, False, True])),
    ('last', Series([True, True, False, False, False])),
    (False, Series([True, True, True, False, True]))
])
def test_duplicated_keep(self, keep, expected):
    # rows 1/2 and rows 0/4 are duplicate pairs; which side of each
    # pair gets flagged depends on `keep`
    data = {'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']}
    result = DataFrame(data).duplicated(keep=keep)
    tm.assert_series_equal(result, expected)

@pytest.mark.parametrize('keep, expected', [
    ('first', Series([False, False, True, True, True])),
    ('last', Series([True, True, False, True, False])),
    (False, Series([True] * 5))
])
def test_duplicated_nan_none(self, keep, expected):
    # np.nan and None are considered equal, so every row here has a twin
    values = [np.nan, 3, 3, None, np.nan]
    result = DataFrame({'C': values}, dtype=object).duplicated(keep=keep)
    tm.assert_series_equal(result, expected)

@pytest.mark.parametrize('keep', ['first', 'last', False])
@pytest.mark.parametrize('subset', [None, ['A', 'B'], 'A'])
def test_duplicated_subset(self, subset, keep):
    # restricting duplicated() to a subset of columns must match
    # running duplicated() on the projection onto those columns
    df = DataFrame({'A': [0, 1, 1, 2, 0],
                    'B': ['a', 'b', 'b', 'c', 'a'],
                    'C': [np.nan, 3, 3, None, np.nan]})

    if subset is None:
        # subset=None means "all columns"; spell it out so df[subset]
        # below selects the whole frame
        subset = list(df.columns)

    # when subset is the string 'A', df[subset] is a Series whose
    # duplicated() result carries the name 'A'; clear the name so it
    # compares equal to the unnamed result of df.duplicated().
    # NOTE(review): `name` is not an accepted kwarg of Series.rename —
    # this works only because the kwarg is silently ignored and the
    # scalar index=None path clears the name; `.rename(None)` would be
    # the explicit spelling — TODO confirm and simplify
    expected = df[subset].duplicated(keep=keep).rename(name=None)
    result = df.duplicated(keep=keep, subset=subset)
    tm.assert_series_equal(result, expected)

def test_duplicated_inverse(self):
    # check that the return_inverse kwarg does not affect the isdup
    # outcome, and that the inverse carries the (non-default) index
    idx = [1, 4, 9, 16, 25]
    df = DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']},
                   index=idx)

    # expected inverse mappings (original index -> index of the kept
    # occurrence) for both keep modes
    cases = [('first', Series([1, 4, 4, 16, 1], index=idx)),
             ('last', Series([25, 9, 9, 16, 25], index=idx))]

    for keep, expected_inv in cases:
        expected_isdup = df.duplicated(keep=keep)
        result_isdup, result_inv = df.duplicated(keep=keep,
                                                 return_inverse=True)
        tm.assert_series_equal(result_isdup, expected_isdup)
        tm.assert_series_equal(result_inv, expected_inv)

        # the inverse must reconstruct the original frame from the
        # deduplicated rows (and fit together with expected_isdup)
        unique = df.loc[~expected_isdup]
        reconstr = unique.reindex(result_inv).set_index(result_inv.index)
        tm.assert_frame_equal(reconstr, df)

def test_duplicated_inverse_raises(self):
    df = DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']})

    # keep=False discards every duplicated row, so no inverse mapping
    # can exist and the combination must raise
    msg = 'The parameters return_inverse=True and keep=False cannot be.*'
    with tm.assert_raises_regex(ValueError, msg):
        df.duplicated(keep=False, return_inverse=True)

@pytest.mark.parametrize('keep', ['first', 'last'])
@pytest.mark.parametrize('subset', [None, ['A', 'B'], 'A'])
def test_duplicated_inverse_large(self, subset, keep):
    # sampling gives an unsorted index, which is important to exercise
    # the 'first'/'last' functionality on a large frame
    df = DataFrame(np.random.randint(0, 10, (10000, 3)),
                   columns=list('ABC')).sample(5000)

    expected_isdup = df.duplicated(keep=keep, subset=subset)
    result_isdup, inv = df.duplicated(keep=keep, subset=subset,
                                      return_inverse=True)
    tm.assert_series_equal(result_isdup, expected_isdup)

    # normalize subset to a list of column labels so df.loc[...] below
    # always yields a DataFrame (a bare string would give a Series)
    if subset is None:
        cols = list(df.columns)
    elif isinstance(subset, string_types):
        cols = [subset]
    else:
        cols = subset

    # the inverse must reconstruct the original (sub-)frame from the
    # deduplicated rows
    unique = df.loc[~expected_isdup, cols]
    reconstr = unique.reindex(inv.values).set_index(inv.index)
    tm.assert_frame_equal(reconstr, df[cols])

def test_drop_duplicates(self):
df = DataFrame({'AAA': ['foo', 'bar', 'foo', 'bar',
'foo', 'bar', 'bar', 'foo'],
Expand Down Expand Up @@ -1618,36 +1749,6 @@ def test_drop_duplicates(self):
for keep in ['first', 'last', False]:
assert df.duplicated(keep=keep).sum() == 0

@pytest.mark.parametrize('subset', ['a', ['a'], ['a', 'B']])
def test_duplicated_with_misspelled_column_name(self, subset):
    # GH 19730
    # every parametrized subset contains the lowercase label 'a', which
    # is absent from the frame (its columns are uppercase), so both
    # deduplication entry points must raise KeyError
    df = pd.DataFrame({'A': [0, 0, 1],
                       'B': [0, 0, 1],
                       'C': [0, 0, 1]})

    with pytest.raises(KeyError):
        df.duplicated(subset)

    with pytest.raises(KeyError):
        df.drop_duplicates(subset)

@pytest.mark.slow
def test_duplicated_do_not_fail_on_wide_dataframes(self):
    # gh-21524
    # Given a wide dataframe with a lot of columns
    # with different (important!) values
    data = {'col_{0:02d}'.format(i): np.random.randint(0, 1000, 30000)
            for i in range(100)}
    df = pd.DataFrame(data).T
    result = df.duplicated()

    # Then duplicated() produces a bool pd.Series as a result
    # and doesn't fail during calculation.
    # Actual values don't matter here, though usually
    # it's all False in this case
    assert isinstance(result, pd.Series)
    # compare against np.bool_ (the numpy boolean scalar type) instead of
    # np.bool, which is just a deprecated alias for the builtin bool
    # (deprecated in numpy 1.20, removed in 1.24)
    assert result.dtype == np.bool_

def test_drop_duplicates_with_duplicate_column_names(self):
# GH17836
df = DataFrame([
Expand Down
106 changes: 0 additions & 106 deletions pandas/tests/frame/test_duplicates.py

This file was deleted.

0 comments on commit 740df13

Please sign in to comment.