
ENH: add return_inverse to duplicated for DataFrame/Series/Index/MultiIndex #21645

Status: Closed (11 commits)
asv_bench/benchmarks/frame_methods.py (20 additions, 6 deletions)
@@ -412,21 +412,35 @@ def time_frame_nunique(self):
 class Duplicated(object):
 
     goal_time = 0.2
+    params = (['first', 'last', False], [True, False])
+    param_names = ['keep', 'return_inverse']
 
-    def setup(self):
+    def setup(self, keep, return_inverse):
+        if keep is False and return_inverse:
+            raise NotImplementedError
+
         n = (1 << 20)
         t = date_range('2015-01-01', freq='S', periods=(n // 64))
         xs = np.random.randn(n // 64).round(2)
         self.df = DataFrame({'a': np.random.randint(-1 << 8, 1 << 8, n),
                              'b': np.random.choice(t, n),
                              'c': np.random.choice(xs, n)})
-        self.df2 = DataFrame(np.random.randn(1000, 100).astype(str)).T
+        # df2 will not have any duplicates
+        self.df2 = DataFrame(np.random.randn(100, 1000).astype(str))
+
+        df3 = DataFrame(np.random.randint(0, 10, (2 ** 18, 5)),
+                        columns=list('ABCDE'))
+        df3.loc[:, 'F'] = Series('', index=df3.index).str.cat(df3.astype(str))
+        self.df3 = df3
 
-    def time_frame_duplicated(self):
-        self.df.duplicated()
+    def time_frame_duplicated(self, keep, return_inverse):
+        self.df.duplicated(keep=keep, return_inverse=return_inverse)
 
-    def time_frame_duplicated_wide(self):
-        self.df2.duplicated()
+    def time_frame_duplicated_wide(self, keep, return_inverse):
+        self.df2.duplicated(keep=keep, return_inverse=return_inverse)
+
+    def time_frame_duplicated_mixed(self, keep, return_inverse):
+        self.df3.duplicated(keep=keep, return_inverse=return_inverse)
 
 
 class XS(object):
asv_bench/benchmarks/index_object.py (18 additions, 0 deletions)
@@ -84,6 +84,24 @@ def time_modulo(self, dtype):
         self.index % 2
 
 
+class Duplicated(object):
+
+    goal_time = 0.2
+    params = (['first', 'last', False], [True, False])
+    param_names = ['keep', 'return_inverse']
+
+    def setup(self, keep, return_inverse):
+        if keep is False and return_inverse:
+            raise NotImplementedError
+
+        n, k = 200, 1000
+        base = tm.makeStringIndex(n)
+        self.idx = Index(base[np.random.choice(n, k * n)])
+
+    def time_duplicated(self, keep, return_inverse):
+        self.idx.duplicated(keep=keep, return_inverse=return_inverse)
+
+
 class Range(object):
 
     goal_time = 0.2
asv_bench/benchmarks/multiindex_object.py (9 additions, 4 deletions)
@@ -83,17 +83,22 @@ def time_is_monotonic(self):
 class Duplicated(object):
 
     goal_time = 0.2
+    params = (['first', 'last', False], [True, False])
+    param_names = ['keep', 'return_inverse']
 
-    def setup(self):
-        n, k = 200, 5000
+    def setup(self, keep, return_inverse):
+        if keep is False and return_inverse:
+            raise NotImplementedError
+
+        n, k = 200, 1000
         levels = [np.arange(n),
                   tm.makeStringIndex(n).values,
                   1000 + np.arange(n)]
         labels = [np.random.choice(n, (k * n)) for lev in levels]
         self.mi = MultiIndex(levels=levels, labels=labels)
 
-    def time_duplicated(self):
-        self.mi.duplicated()
+    def time_duplicated(self, keep, return_inverse):
+        self.mi.duplicated(keep=keep, return_inverse=return_inverse)
 
 
 class Sortlevel(object):
asv_bench/benchmarks/series_methods.py (18 additions, 0 deletions)
@@ -192,3 +192,21 @@ def setup(self):
 
     def time_series_datetimeindex_repr(self):
         getattr(self.s, 'a', None)
+
+
+class Duplicated(object):
+
+    goal_time = 0.2
+    params = (['first', 'last', False], [True, False])
+    param_names = ['keep', 'return_inverse']
+
+    def setup(self, keep, return_inverse):
+        if keep is False and return_inverse:
+            raise NotImplementedError
+
+        n, k = 200, 1000
+        base = tm.makeStringIndex(n)
+        self.s = Series(base[np.random.choice(n, k * n)])
+
+    def time_series_duplicated(self, keep, return_inverse):
+        self.s.duplicated(keep=keep, return_inverse=return_inverse)
doc/source/whatsnew/v0.24.0.txt (46 additions, 0 deletions)
@@ -159,6 +159,52 @@ This is the same behavior as ``Series.values`` for categorical data. See
 :ref:`whatsnew_0240.api_breaking.interval_values` for more.
 
 
+.. _whatsnew_0240.enhancements.duplicated_inverse:
+
+The `duplicated`-method has gained the `return_inverse` kwarg
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

    [Review comment, Member]
    Double backticks where you now use single backticks (also on the lines
    below); in rst, double backticks give code-styled text.

+
+The `duplicated`-method for `Series`, `DataFrame` and all flavors of `Index` has gained a `return_inverse` keyword,
+which is ``False`` by default. Specifying ``return_inverse=True`` will add an object to the output (which therefore becomes a tuple)
+that allows reconstructing the original object from the deduplicated, unique subset (:issue:`21357`).
+
+For ``Index`` objects, the inverse is an ``np.ndarray``:
+
+.. ipython:: python
+
+    idx = pd.Index(['a', 'b', 'b', 'c', 'a'])
+    isduplicate, inverse = idx.duplicated(return_inverse=True)  # default: keep='first'
+    isduplicate
+    inverse
+
+This allows reconstructing the original ``Index`` as follows:
+
+.. ipython:: python
+
+    unique = idx[~isduplicate]  # same as idx.drop_duplicates()
+    unique
+
+    reconstruct = unique[inverse]
+    reconstruct.equals(idx)
+
+For ``DataFrame`` and ``Series`` the inverse needs to take the original index into account as well,
+and is therefore a ``Series``, which contains the mapping from the index of the deduplicated,
+unique subset back to the original index.
+
+.. ipython:: python
+
+    df = pd.DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']},
+                      index=[1, 4, 9, 16, 25])
+    df
+    isduplicate, inverse = df.duplicated(keep='last', return_inverse=True)
+    isduplicate
+    inverse
+
+    unique = df.loc[~isduplicate]  # same as df.drop_duplicates(keep='last')
+    unique
+    reconstruct = unique.reindex(inverse.values).set_index(inverse.index)
+    reconstruct.equals(df)
+
+
 .. _whatsnew_0240.enhancements.other:
 
 Other Enhancements
pandas/core/algorithms.py (55 additions, 4 deletions)
@@ -770,7 +770,7 @@ def _value_counts_arraylike(values, dropna):
     return keys, counts
 
 
-def duplicated(values, keep='first'):
+def duplicated(values, keep='first', return_inverse=False):
     """
     Return boolean ndarray denoting duplicate values.
 
@@ -785,16 +785,67 @@ def duplicated(values, keep='first'):
         occurrence.
         - ``last`` : Mark duplicates as ``True`` except for the last
           occurrence.
-        - False : Mark all duplicates as ``True``.
+        - False : Mark all duplicates as ``True``. This option is not
+          compatible with ``return_inverse``.
+    return_inverse : boolean, default False
+        If True, also return the selection of (integer) indices from the array
+        of unique values (created e.g. by selecting the boolean complement of
+        the first output, or by using `.drop_duplicates` with the same
+        `keep`-parameter) that can be used to reconstruct "values".
+
+        .. versionadded:: 0.24.0
 
     Returns
     -------
-    duplicated : ndarray
+    duplicated : ndarray or tuple of ndarray if ``return_inverse`` is True
     """
 
+    if return_inverse and keep is False:
+        raise ValueError("The parameters return_inverse=True and "
+                         "keep=False cannot be used together (impossible "
+                         "to calculate an inverse when discarding all "
+                         "instances of a duplicate).")
+
     values, dtype, ndtype = _ensure_data(values)
     f = getattr(htable, "duplicated_{dtype}".format(dtype=ndtype))
-    return f(values, keep=keep)
+    isdup = f(values, keep=keep)
+    if not return_inverse:
+        return isdup
+    elif not isdup.any():
+        # no need to calculate inverse if no duplicates
+        inv = np.arange(len(values))
+        return isdup, inv

    [Review comment, Member]
    Is this always going to hold true? For example, if we work with a Series
    that is not sequentially indexed starting at 0 but doesn't contain
    duplicates, is this going to return the appropriate result?

    [Reply, Contributor Author]
    This is just the base version of duplicated, which always returns an
    np.ndarray. There's a wrapper in IndexOpsMixin.duplicated (i.e. in
    core/base.py) that takes care of adapting this for Series.

+
+    if keep == 'first':
+        # o2u: original indices to indices of ARRAY of unique values
+        # u2o: reduplication from array of unique values to original array

    [Review comment, Contributor (@jreback)]
    I'd rather you not use np.unique at all; it's not as performant as
    pd.unique, doesn't handle all dtypes, and sorts.

    Further, please use actual names here, and really avoid using
    abbreviations in any library code.

    [Reply, Contributor Author (@h-vetinari), Sep 23, 2018]
    > I'd rather you not use np.unique at all; it's not as performant as
    > pd.unique, doesn't handle all dtypes, and sorts.

    Yes, it would be nicer to have this implemented in the cython hashtable
    functions, but that performance improvement is for a follow-up. np.unique
    is an easy solution and is invoked only for return_inverse=True (and we're
    only calling it on a series of ints, not objects, because the
    factorization for that hasn't changed!).
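    For concreteness, a minimal standalone sketch (an editor's illustration,
    not part of the diff) of the sorting difference discussed here:

        import numpy as np
        import pandas as pd

        arr = np.array([3, 1, 3, 2])

        # np.unique returns the unique values in sorted order ...
        np.unique(arr)   # array([1, 2, 3])

        # ... while pd.unique preserves the order of first appearance
        pd.unique(arr)   # array([3, 1, 2])

    The reply continues below.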

    [Reply, Contributor Author, continued]
    > Further, please use actual names here, and really avoid using
    > abbreviations in any library code.

    There's a fully commented, thoroughly explained and very localized part
    where these appear. Not sure how this is unclear, but will adapt...

    [Reply, Contributor Author]
    Finally, the core changes are minimal (but I understand that it looks like
    a lot). TL;DR: the implementation moves to core.algorithms and everything
    else wraps around that (+ doc improvements). Tests are expanded, and ASVs
    added.

+        # this fits together in the way that values[o2u] are the unique values
+        # and values[o2u][u2o] == values
+        _, o2u, u2o = np.unique(values, return_index=True,
+                                return_inverse=True)
+    elif keep == 'last':
+        # np.unique takes the first occurrence as the unique value,
+        # so we flip values so that first becomes last
+        values = values[::-1]
+        _, o2u, u2o = np.unique(values, return_index=True,
+                                return_inverse=True)
+        # the values in "values" correspond(ed) to the index of "values",
+        # which is simply np.arange(len(values)).
+        # By flipping "values" around, we need to do the same for the index,
+        # ___because o2u and u2o are relative to that order___.
+        # Finally, to fit with the original order again, we need to flip the
+        # result around one last time.
+        o2u, u2o = np.arange(len(values))[::-1][o2u], u2o[::-1]
+
+    # np.unique yields a ___sorted___ list of uniques, and o2u/u2o are relative
+    # to this order. To restore the original order, we argsort o2u, because o2u
+    # would be ordered if np.unique had not sorted implicitly. The first
+    # argsort gives the permutation from o2u to its sorted form, but we need
+    # the inverse permutation (the map from the unsorted uniques to o2u, from
+    # which we can continue with u2o). This inversion (as a permutation) is
+    # achieved by the second argsort.
+    inv = np.argsort(np.argsort(o2u))[u2o]
+    return isdup, inv
 
 
 def mode(values, dropna=True):
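The double argsort at the end of this hunk is the subtle step. A minimal
standalone NumPy sketch (not part of the diff) of the keep='first' branch,
showing that the computed inverse indexes into the uniques in order of first
appearance:

    import numpy as np

    values = np.array([3, 1, 3, 2, 1])

    # sorted uniques, index of each unique's first occurrence (o2u),
    # and each element's position among the sorted uniques (u2o)
    _, o2u, u2o = np.unique(values, return_index=True, return_inverse=True)
    # o2u == [1, 3, 0], u2o == [2, 0, 2, 1, 0]

    # argsort of argsort inverts the implicit sorting permutation, so that
    # inv indexes into the uniques taken in order of first appearance
    inv = np.argsort(np.argsort(o2u))[u2o]
    # inv == [0, 1, 0, 2, 1]

    unique_in_order = values[np.sort(o2u)]   # array([3, 1, 2])
    assert (unique_in_order[inv] == values).all()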
pandas/core/base.py (27 additions, 4 deletions)
@@ -1246,16 +1246,39 @@ def drop_duplicates(self, keep='first', inplace=False):
         else:
             return result
 
-    def duplicated(self, keep='first'):
+    def duplicated(self, keep='first', return_inverse=False):
         from pandas.core.algorithms import duplicated
+
+        if return_inverse and keep is False:
+            raise ValueError("The parameters return_inverse=True and "
+                             "keep=False cannot be used together (impossible "
+                             "to calculate an inverse when discarding all "
+                             "instances of a duplicate).")
+
         if isinstance(self, ABCIndexClass):
             if self.is_unique:
-                return np.zeros(len(self), dtype=np.bool)
-            return duplicated(self, keep=keep)
-        else:
+                isdup = np.zeros(len(self), dtype=np.bool)
+                if not return_inverse:
+                    return isdup
+                return isdup, np.arange(len(self))
+            # core.algorithms.duplicated has the same output signature as
+            # Index.duplicated -> no need to distinguish cases here
+            return duplicated(self, keep=keep, return_inverse=return_inverse)
+
+        # Series case
+        if not return_inverse:
             return self._constructor(duplicated(self, keep=keep),
                                      index=self.index).__finalize__(self)
+
+        # return_inverse = True
+        isdup_array, inv_array = duplicated(self, keep=keep,
+                                            return_inverse=True)
+        isdup = self._constructor(isdup_array,
+                                  index=self.index).__finalize__(self)
+        inv = self._constructor(self.loc[~isdup_array].index[inv_array],
+                                index=self.index)
+        return isdup, inv
 
     # ----------------------------------------------------------------------
     # abstracts
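A usage sketch for the Series branch above (it assumes the PR's proposed
return_inverse API, which is not part of released pandas):

    import pandas as pd

    s = pd.Series(['a', 'b', 'b', 'a'], index=[10, 20, 30, 40])

    # proposed API: the inverse is a Series that maps each original index
    # label to the index label of the unique row carrying the same value
    isdup, inv = s.duplicated(return_inverse=True)  # keep='first'
    unique = s[~isdup]  # same as s.drop_duplicates()
    # inv.values == [10, 20, 20, 10], inv.index == [10, 20, 30, 40]

    # rebuild the original: fetch the unique rows in reconstruction order,
    # then restore the original index labels
    reconstruct = unique.reindex(inv.values)
    reconstruct.index = inv.index
    assert reconstruct.equals(s)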