ENH: add return_inverse to duplicated for DataFrame/Series/Index/MultiIndex #21645

Closed (unmerged) · 11 commits
26 changes: 20 additions & 6 deletions asv_bench/benchmarks/frame_methods.py
@@ -412,21 +412,35 @@ def time_frame_nunique(self):
 class Duplicated(object):

     goal_time = 0.2
+    params = (['first', 'last', False], [True, False])
+    param_names = ['keep', 'return_inverse']

-    def setup(self):
+    def setup(self, keep, return_inverse):
+        if keep is False and return_inverse:
+            raise NotImplementedError
+
         n = (1 << 20)
         t = date_range('2015-01-01', freq='S', periods=(n // 64))
         xs = np.random.randn(n // 64).round(2)
         self.df = DataFrame({'a': np.random.randint(-1 << 8, 1 << 8, n),
                              'b': np.random.choice(t, n),
                              'c': np.random.choice(xs, n)})
-        self.df2 = DataFrame(np.random.randn(1000, 100).astype(str)).T
+        # df2 will not have any duplicates
+        self.df2 = DataFrame(np.random.randn(100, 1000).astype(str))
+
+        df3 = DataFrame(np.random.randint(0, 10, (2 ** 18, 5)),
+                        columns=list('ABCDE'))
+        df3.loc[:, 'F'] = Series('', index=df3.index).str.cat(df3.astype(str))
+        self.df3 = df3

-    def time_frame_duplicated(self):
-        self.df.duplicated()
+    def time_frame_duplicated(self, keep, return_inverse):
+        self.df.duplicated(keep=keep, return_inverse=return_inverse)

-    def time_frame_duplicated_wide(self):
-        self.df2.duplicated()
+    def time_frame_duplicated_wide(self, keep, return_inverse):
+        self.df2.duplicated(keep=keep, return_inverse=return_inverse)
+
+    def time_frame_duplicated_mixed(self, keep, return_inverse):
+        self.df3.duplicated(keep=keep, return_inverse=return_inverse)


 class XS(object):
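(Note: asv skips a benchmark for any parameter combination whose ``setup`` raises ``NotImplementedError``; the benchmarks above, and the analogous ones in the files below, use this to skip the unsupported ``keep=False`` with ``return_inverse=True`` combination.)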
18 changes: 18 additions & 0 deletions asv_bench/benchmarks/index_object.py
@@ -84,6 +84,24 @@ def time_modulo(self, dtype):
         self.index % 2


+class Duplicated(object):
+
+    goal_time = 0.2
+    params = (['first', 'last', False], [True, False])
+    param_names = ['keep', 'return_inverse']
+
+    def setup(self, keep, return_inverse):
+        if keep is False and return_inverse:
+            raise NotImplementedError
+
+        n, k = 200, 1000
+        base = tm.makeStringIndex(n)
+        self.idx = Index(base[np.random.choice(n, k * n)])
+
+    def time_duplicated(self, keep, return_inverse):
+        self.idx.duplicated(keep=keep, return_inverse=return_inverse)
+
+
 class Range(object):

     goal_time = 0.2
13 changes: 9 additions & 4 deletions asv_bench/benchmarks/multiindex_object.py
@@ -83,17 +83,22 @@ def time_is_monotonic(self):
 class Duplicated(object):

     goal_time = 0.2
+    params = (['first', 'last', False], [True, False])
+    param_names = ['keep', 'return_inverse']

-    def setup(self):
-        n, k = 200, 5000
+    def setup(self, keep, return_inverse):
+        if keep is False and return_inverse:
+            raise NotImplementedError
+
+        n, k = 200, 1000
         levels = [np.arange(n),
                   tm.makeStringIndex(n).values,
                   1000 + np.arange(n)]
         labels = [np.random.choice(n, (k * n)) for lev in levels]
         self.mi = MultiIndex(levels=levels, labels=labels)

-    def time_duplicated(self):
-        self.mi.duplicated()
+    def time_duplicated(self, keep, return_inverse):
+        self.mi.duplicated(keep=keep, return_inverse=return_inverse)


 class Sortlevel(object):
18 changes: 18 additions & 0 deletions asv_bench/benchmarks/series_methods.py
@@ -192,3 +192,21 @@ def setup(self):

     def time_series_datetimeindex_repr(self):
         getattr(self.s, 'a', None)
+
+
+class Duplicated(object):
+
+    goal_time = 0.2
+    params = (['first', 'last', False], [True, False])
+    param_names = ['keep', 'return_inverse']
+
+    def setup(self, keep, return_inverse):
+        if keep is False and return_inverse:
+            raise NotImplementedError
+
+        n, k = 200, 1000
+        base = tm.makeStringIndex(n)
+        self.s = Series(base[np.random.choice(n, k * n)])
+
+    def time_series_duplicated(self, keep, return_inverse):
+        self.s.duplicated(keep=keep, return_inverse=return_inverse)
46 changes: 46 additions & 0 deletions doc/source/whatsnew/v0.24.0.txt
@@ -163,6 +163,52 @@ This is the same behavior as ``Series.values`` for categorical data. See
 :ref:`whatsnew_0240.api_breaking.interval_values` for more.


+.. _whatsnew_0240.enhancements.duplicated_inverse:
+
+The `duplicated`-method has gained the `return_inverse` kwarg
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

    [Review comment by a maintainer, on the heading above]
    Use double backticks where you now use single backticks (also on the
    lines below); in rst, double backticks give code-styled text.

+
+The :meth:`~DataFrame.duplicated` method for ``Series``, ``DataFrame`` and all flavors of ``Index`` has gained a ``return_inverse`` keyword,
+which is ``False`` by default. Specifying ``return_inverse=True`` will add an object to the output (which therefore becomes a tuple)
+that allows reconstructing the original object from the deduplicated, unique subset (:issue:`21357`).
+
+For ``Index`` objects, the inverse is an ``np.ndarray``:
+
+.. ipython:: python
+
+    idx = pd.Index(['a', 'b', 'b', 'c', 'a'])
+    isduplicate, inverse = idx.duplicated(return_inverse=True)  # default: keep='first'
+    isduplicate
+    inverse
+
+This allows reconstructing the original ``Index`` as follows:
+
+.. ipython:: python
+
+    unique = idx[~isduplicate]  # same as idx.drop_duplicates()
+    unique
+
+    reconstruct = unique[inverse]
+    reconstruct.equals(idx)
+
+For ``DataFrame`` and ``Series`` the inverse needs to take the original index into account as well, and is therefore a ``Series``,
+which contains the mapping from the index of the deduplicated, unique subset back to the original index.
+
+.. ipython:: python
+
+    df = pd.DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']},
+                      index=[1, 4, 9, 16, 25])
+    df
+    isduplicate, inverse = df.duplicated(keep='last', return_inverse=True)
+    isduplicate
+    inverse
+
+    unique = df.loc[~isduplicate]  # same as df.drop_duplicates(keep='last')
+    unique
+    reconstruct = unique.reindex(inverse.values).set_index(inverse.index)
+    reconstruct.equals(df)
+
+
 .. _whatsnew_0240.enhancements.other:

 Other Enhancements
61 changes: 57 additions & 4 deletions pandas/core/algorithms.py
@@ -771,7 +771,7 @@ def _value_counts_arraylike(values, dropna):
     return keys, counts


-def duplicated(values, keep='first'):
+def duplicated(values, keep='first', return_inverse=False):
     """
     Return boolean ndarray denoting duplicate values.

@@ -786,16 +786,69 @@ def duplicated(values, keep='first', return_inverse=False):
           occurrence.
         - ``last`` : Mark duplicates as ``True`` except for the last
          occurrence.
-        - False : Mark all duplicates as ``True``.
+        - False : Mark all duplicates as ``True``. This option is not
+          compatible with ``return_inverse``.
+    return_inverse : boolean, default False
+        If True, also return the selection of (integer) indices from the array
+        of unique values (created e.g. by selecting the boolean complement of
+        the first output, or by using `.drop_duplicates` with the same
+        `keep`-parameter) that can be used to reconstruct "values".
+
+        .. versionadded:: 0.24.0

     Returns
     -------
-    duplicated : ndarray
+    duplicated : ndarray or tuple of ndarray if ``return_inverse`` is True
     """

+    if return_inverse and keep is False:
+        raise ValueError("The parameters return_inverse=True and "
+                         "keep=False cannot be used together (impossible "
+                         "to calculate an inverse when discarding all "
+                         "instances of a duplicate).")
+
     values, dtype, ndtype = _ensure_data(values)
     f = getattr(htable, "duplicated_{dtype}".format(dtype=ndtype))
-    return f(values, keep=keep)
+    isduplicate = f(values, keep=keep)
+    if not return_inverse:
+        return isduplicate
+    elif not isduplicate.any():
+        # no need to calculate inverse if no duplicates
+        inverse = np.arange(len(values))
+        return isduplicate, inverse
+
+    if keep == 'first':
+        # values2unique: original indices to indices of ARRAY of unique values
+        # unique2values: reduplication from array of uniques to original array
+        # this fits together in the way that values[values2unique] are the
+        # unique values and values[values2unique][unique2values] == values
+        _, values2unique, unique2values = np.unique(values, return_index=True,
+                                                    return_inverse=True)
+    elif keep == 'last':
+        # np.unique takes the first occurrence per unique value, so we flip
+        # values so that first becomes last
+        values = values[::-1]
+        _, values2unique, unique2values = np.unique(values, return_index=True,
+                                                    return_inverse=True)
+        # the values in "values" correspond(ed) to the index of "values",
+        # which is simply np.arange(len(values)).
+        # By flipping "values" around, we need to do the same for the index,
+        # _because values2unique and unique2values are relative to that order_.
+        # Finally, to fit with the original order again, we need to flip the
+        # result around one last time.
+        values2unique = np.arange(len(values))[::-1][values2unique]
+        unique2values = unique2values[::-1]
+
+    # np.unique yields a ___sorted___ list of uniques, and values2unique resp.
+    # unique2values are relative to this order. To restore the original order,
+    # we argsort values2unique, because values2unique would be ordered if
+    # np.unique had not sorted implicitly.
+    # The first argsort gives the permutation from values2unique to its sorted
+    # form, but we need the inverse permutation (the map from the unsorted
+    # uniques to values2unique, from which we can continue with unique2values).
+    # This inversion (as a permutation) is achieved by the second argsort.
+    inverse = np.argsort(np.argsort(values2unique))[unique2values]
+    return isduplicate, inverse


 def mode(values, dropna=True):
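The argsort-of-argsort inversion is the subtle step in this function. The following standalone sketch (an editor's illustration, not part of the diff; the toy ``values`` array is made up for readability) traces the same construction for ``keep='first'``:

```python
import numpy as np

# toy data with duplicates of 30 and 10, keep='first'
values = np.array([30, 10, 30, 20, 10])

# np.unique sorts the uniques ([10, 20, 30]) and returns
#   values2unique: index of the first occurrence of each sorted unique
#   unique2values: position of each element of `values` among sorted uniques
_, values2unique, unique2values = np.unique(values, return_index=True,
                                            return_inverse=True)
# values2unique -> [1, 3, 0], unique2values -> [2, 0, 2, 1, 0]

# argsort(argsort(x)) gives the rank of each element of x, i.e. the inverse
# of the permutation that sorts x; this re-expresses unique2values relative
# to first-occurrence order instead of sorted order
inverse = np.argsort(np.argsort(values2unique))[unique2values]
# inverse -> [0, 1, 0, 2, 1]

# the uniques in first-occurrence order are values[np.sort(values2unique)];
# indexing them with `inverse` reconstructs the original array
unique_in_order = values[np.sort(values2unique)]  # [30, 10, 20]
assert (unique_in_order[inverse] == values).all()
```

Because ``values2unique`` is a permutation of the positions of the first occurrences, ranking it with the double argsort yields exactly the map from sorted-unique order back to occurrence order.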
32 changes: 28 additions & 4 deletions pandas/core/base.py
@@ -1259,16 +1259,40 @@ def drop_duplicates(self, keep='first', inplace=False):
         else:
             return result

-    def duplicated(self, keep='first'):
+    def duplicated(self, keep='first', return_inverse=False):
         from pandas.core.algorithms import duplicated
+
+        if return_inverse and keep is False:
+            raise ValueError("The parameters return_inverse=True and "
+                             "keep=False cannot be used together (impossible "
+                             "to calculate an inverse when discarding all "
+                             "instances of a duplicate).")
+
         if isinstance(self, ABCIndexClass):
             if self.is_unique:
-                return np.zeros(len(self), dtype=np.bool)
-            return duplicated(self, keep=keep)
-        else:
-            return self._constructor(duplicated(self, keep=keep),
-                                     index=self.index).__finalize__(self)
+                isduplicate = np.zeros(len(self), dtype=np.bool)
+                if not return_inverse:
+                    return isduplicate
+                return isduplicate, np.arange(len(self))
+            # core.algorithms.duplicated has the same output signature as
+            # Index.duplicated -> no need to distinguish cases here
+            return duplicated(self, keep=keep, return_inverse=return_inverse)
+
+        # Series case
+        if not return_inverse:
+            return self._constructor(duplicated(self, keep=keep),
+                                     index=self.index).__finalize__(self)
+
+        # return_inverse = True
+        isduplicate_array, inverse_array = duplicated(self, keep=keep,
+                                                      return_inverse=True)
+        isduplicate = self._constructor(isduplicate_array,
+                                        index=self.index).__finalize__(self)
+        inverse = self._constructor(
+            self.loc[~isduplicate_array].index[inverse_array],
+            index=self.index)
+        return isduplicate, inverse

     # ----------------------------------------------------------------------
     # abstracts
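For reference, a minimal usage sketch of the ``Series`` branch above. This assumes the PR branch is installed (``return_inverse`` is not part of released pandas, as the PR was closed unmerged), and the sample data is made up for illustration:

```python
import pandas as pd

s = pd.Series(['a', 'b', 'b', 'a'], index=[10, 20, 30, 40])
isduplicate, inverse = s.duplicated(keep='first', return_inverse=True)

# per the code above, `inverse` is a Series mapping each original index
# label to the index label of the unique row that represents it:
# {10: 10, 20: 20, 30: 20, 40: 10}
unique = s.loc[~isduplicate]  # same as s.drop_duplicates()

# reconstruct the original Series from the deduplicated subset
reconstruct = pd.Series(unique.reindex(inverse.values).values,
                        index=inverse.index)
assert reconstruct.equals(s)
```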