
ENH: add return_inverse to duplicated for DataFrame/Series/Index/MultiIndex #21645

Status: Closed (11 commits)
asv_bench/benchmarks/frame_methods.py (20 additions, 6 deletions)
@@ -412,21 +412,35 @@ def time_frame_nunique(self):
 class Duplicated(object):
 
     goal_time = 0.2
+    params = (['first', 'last', False], [True, False])
+    param_names = ['keep', 'return_inverse']
 
-    def setup(self):
+    def setup(self, keep, return_inverse):
+        if keep is False and return_inverse:
+            raise NotImplementedError
+
         n = (1 << 20)
         t = date_range('2015-01-01', freq='S', periods=(n // 64))
         xs = np.random.randn(n // 64).round(2)
         self.df = DataFrame({'a': np.random.randint(-1 << 8, 1 << 8, n),
                              'b': np.random.choice(t, n),
                              'c': np.random.choice(xs, n)})
-        self.df2 = DataFrame(np.random.randn(1000, 100).astype(str)).T
+        # df2 will not have any duplicates
+        self.df2 = DataFrame(np.random.randn(100, 1000).astype(str))
+
+        df3 = DataFrame(np.random.randint(0, 10, (2 ** 18, 5)),
+                        columns=list('ABCDE'))
+        df3.loc[:, 'F'] = Series('', index=df3.index).str.cat(df3.astype(str))
+        self.df3 = df3
 
-    def time_frame_duplicated(self):
-        self.df.duplicated()
+    def time_frame_duplicated(self, keep, return_inverse):
+        self.df.duplicated(keep=keep, return_inverse=return_inverse)
 
-    def time_frame_duplicated_wide(self):
-        self.df2.duplicated()
+    def time_frame_duplicated_wide(self, keep, return_inverse):
+        self.df2.duplicated(keep=keep, return_inverse=return_inverse)
+
+    def time_frame_duplicated_mixed(self, keep, return_inverse):
+        self.df3.duplicated(keep=keep, return_inverse=return_inverse)
 
 
 class XS(object):
asv_bench/benchmarks/index_object.py (18 additions, 0 deletions)
@@ -84,6 +84,24 @@ def time_modulo(self, dtype):
         self.index % 2
 
 
+class Duplicated(object):
+
+    goal_time = 0.2
+    params = (['first', 'last', False], [True, False])
+    param_names = ['keep', 'return_inverse']
+
+    def setup(self, keep, return_inverse):
+        if keep is False and return_inverse:
+            raise NotImplementedError
+
+        n, k = 200, 1000
+        base = tm.makeStringIndex(n)
+        self.idx = Index(base[np.random.choice(n, k * n)])
+
+    def time_duplicated(self, keep, return_inverse):
+        self.idx.duplicated(keep=keep, return_inverse=return_inverse)
+
+
 class Range(object):
 
     goal_time = 0.2
asv_bench/benchmarks/multiindex_object.py (9 additions, 4 deletions)
@@ -83,17 +83,22 @@ def time_is_monotonic(self):
 class Duplicated(object):
 
     goal_time = 0.2
+    params = (['first', 'last', False], [True, False])
+    param_names = ['keep', 'return_inverse']
 
-    def setup(self):
-        n, k = 200, 5000
+    def setup(self, keep, return_inverse):
+        if keep is False and return_inverse:
+            raise NotImplementedError
+
+        n, k = 200, 1000
         levels = [np.arange(n),
                   tm.makeStringIndex(n).values,
                   1000 + np.arange(n)]
         labels = [np.random.choice(n, (k * n)) for lev in levels]
         self.mi = MultiIndex(levels=levels, labels=labels)
 
-    def time_duplicated(self):
-        self.mi.duplicated()
+    def time_duplicated(self, keep, return_inverse):
+        self.mi.duplicated(keep=keep, return_inverse=return_inverse)
 
 
 class Sortlevel(object):
asv_bench/benchmarks/series_methods.py (18 additions, 0 deletions)
@@ -192,3 +192,21 @@ def setup(self):
 
     def time_series_datetimeindex_repr(self):
         getattr(self.s, 'a', None)
+
+
+class Duplicated(object):
+
+    goal_time = 0.2
+    params = (['first', 'last', False], [True, False])
+    param_names = ['keep', 'return_inverse']
+
+    def setup(self, keep, return_inverse):
+        if keep is False and return_inverse:
+            raise NotImplementedError
+
+        n, k = 200, 1000
+        base = tm.makeStringIndex(n)
+        self.s = Series(base[np.random.choice(n, k * n)])
+
+    def time_series_duplicated(self, keep, return_inverse):
+        self.s.duplicated(keep=keep, return_inverse=return_inverse)
doc/source/whatsnew/v0.24.0.txt (46 additions, 0 deletions)
@@ -159,6 +159,52 @@ This is the same behavior as ``Series.values`` for categorical data. See
 :ref:`whatsnew_0240.api_breaking.interval_values` for more.
 
 
+.. _whatsnew_0240.enhancements.duplicated_inverse:
+
+The `duplicated`-method has gained the `return_inverse` kwarg
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

    [Review comment, Member]
    Double backticks where you now use single backticks (also on the lines
    below); in rst, double backticks give code-styled text.

+
+The `duplicated`-method for `Series`, `DataFrame` and all flavors of `Index` has gained a `return_inverse` keyword,
+which is ``False`` by default. Specifying ``return_inverse=True`` will add an object to the output (which therefore becomes a tuple)
+that allows reconstructing the original object from the deduplicated, unique subset (:issue:`21357`).
+
+For ``Index`` objects, the inverse is an ``np.ndarray``:
+
+.. ipython:: python
+
+    idx = pd.Index(['a', 'b', 'b', 'c', 'a'])
+    isduplicate, inverse = idx.duplicated(return_inverse=True)  # default: keep='first'
+    isduplicate
+    inverse
+
+This allows reconstructing the original ``Index`` as follows:
+
+.. ipython:: python
+
+    unique = idx[~isduplicate]  # same as idx.drop_duplicates()
+    unique
+
+    reconstruct = unique[inverse]
+    reconstruct.equals(idx)
+
+For ``DataFrame`` and ``Series`` the inverse needs to take the original index into account as well,
+and is therefore a ``Series``, which contains the mapping from the index of the deduplicated,
+unique subset back to the original index.
+
+.. ipython:: python
+
+    df = pd.DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']},
+                      index=[1, 4, 9, 16, 25])
+    df
+    isduplicate, inverse = df.duplicated(keep='last', return_inverse=True)
+    isduplicate
+    inverse
+
+    unique = df.loc[~isduplicate]  # same as df.drop_duplicates(keep='last')
+    unique
+    reconstruct = unique.reindex(inverse.values).set_index(inverse.index)
+    reconstruct.equals(df)
+
+
 .. _whatsnew_0240.enhancements.other:
 
 Other Enhancements
pandas/core/algorithms.py (55 additions, 4 deletions)
@@ -770,7 +770,7 @@ def _value_counts_arraylike(values, dropna):
     return keys, counts
 
 
-def duplicated(values, keep='first'):
+def duplicated(values, keep='first', return_inverse=False):
     """
     Return boolean ndarray denoting duplicate values.
 
@@ -785,16 +785,67 @@ def duplicated(values, keep='first'):
         occurrence.
         - ``last`` : Mark duplicates as ``True`` except for the last
           occurrence.
-        - False : Mark all duplicates as ``True``.
+        - False : Mark all duplicates as ``True``. This option is not
+          compatible with ``return_inverse``.
+    return_inverse : boolean, default False
+        If True, also return the selection of (integer) indices from the array
+        of unique values (created e.g. by selecting the boolean complement of
+        the first output, or by using `.drop_duplicates` with the same
+        `keep`-parameter) that can be used to reconstruct "values".
+
+        .. versionadded:: 0.24.0
 
     Returns
     -------
-    duplicated : ndarray
+    duplicated : ndarray or tuple of ndarray if ``return_inverse`` is True
     """
 
+    if return_inverse and keep is False:
+        raise ValueError("The parameters return_inverse=True and "
+                         "keep=False cannot be used together (impossible "
+                         "to calculate an inverse when discarding all "
+                         "instances of a duplicate).")
+
     values, dtype, ndtype = _ensure_data(values)
     f = getattr(htable, "duplicated_{dtype}".format(dtype=ndtype))
-    return f(values, keep=keep)
+    isdup = f(values, keep=keep)
+    if not return_inverse:
+        return isdup
+    elif not isdup.any():
+        # no need to calculate inverse if no duplicates
+        inv = np.arange(len(values))
+        return isdup, inv

    [Review comment, Member]
    Is this always going to hold true? For example, if we work with a Series
    that is not sequentially indexed starting at 0 but doesn't contain
    duplicates, is this going to return the appropriate result?

    [Reply, Contributor Author]
    This is just the base version of duplicated, which always returns an
    np.ndarray. There's a wrapper in IndexOpsMixin.duplicated (i.e. in
    core/base.py) that takes care of adapting this for Series.

+
+    if keep == 'first':
+        # o2u: original indices to indices of ARRAY of unique values
+        # u2o: reduplication from array of unique values to original array

    [Review comment, Contributor (@jreback)]
    I'd rather you not use np.unique at all; it's not as performant as
    pd.unique, doesn't handle all dtypes, and sorts.

    Further, please use actual names here, and really avoid using
    abbreviations in any library code.

    [Reply, Contributor Author (@h-vetinari), Sep 23, 2018]
    > I'd rather you not use np.unique at all; it's not as performant as
    > pd.unique, doesn't handle all dtypes, and sorts.

    Yes, it would be nicer to have this implemented in the cython hashtable
    functions, but that performance improvement is for a follow-up. np.unique
    is an easy solution and is invoked only for return_inverse=True (and we're
    only calling it on a series of ints, not objects, because the
    factorization for that hasn't changed!).
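    For concreteness, a minimal standalone sketch (an editor's illustration,
    not part of the diff) of the sorting difference discussed here:

        import numpy as np
        import pandas as pd

        arr = np.array([3, 1, 3, 2])

        # np.unique returns the unique values in sorted order ...
        np.unique(arr)   # array([1, 2, 3])

        # ... while pd.unique preserves the order of first appearance
        pd.unique(arr)   # array([3, 1, 2])

    The reply continues below.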

    [Reply, Contributor Author, continued]
    > Further, please use actual names here, and really avoid using
    > abbreviations in any library code.

    There's a fully commented, thoroughly explained and very localized part
    where these appear. Not sure how this is unclear, but will adapt...

    [Reply, Contributor Author]
    Finally, the core changes are minimal (but I understand that it looks like
    a lot). TL;DR: the implementation moves to core.algorithms and everything
    else wraps around that (+ doc improvements). Tests are expanded, and ASVs
    added.

+        # this fits together in the way that values[o2u] are the unique values
+        # and values[o2u][u2o] == values
+        _, o2u, u2o = np.unique(values, return_index=True,
+                                return_inverse=True)
+    elif keep == 'last':
+        # np.unique takes the first occurrence as the unique value,
+        # so we flip values so that first becomes last
+        values = values[::-1]
+        _, o2u, u2o = np.unique(values, return_index=True,
+                                return_inverse=True)
+        # the values in "values" correspond(ed) to the index of "values",
+        # which is simply np.arange(len(values)).
+        # By flipping "values" around, we need to do the same for the index,
+        # ___because o2u and u2o are relative to that order___.
+        # Finally, to fit with the original order again, we need to flip the
+        # result around one last time.
+        o2u, u2o = np.arange(len(values))[::-1][o2u], u2o[::-1]
+
+    # np.unique yields a ___sorted___ list of uniques, and o2u/u2o are relative
+    # to this order. To restore the original order, we argsort o2u, because o2u
+    # would be ordered if np.unique had not sorted implicitly. The first
+    # argsort gives the permutation from o2u to its sorted form, but we need
+    # the inverse permutation (the map from the unsorted uniques to o2u, from
+    # which we can continue with u2o). This inversion (as a permutation) is
+    # achieved by the second argsort.
+    inv = np.argsort(np.argsort(o2u))[u2o]
+    return isdup, inv
 
 
 def mode(values, dropna=True):
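The double argsort at the end of this hunk is the subtle step. A minimal
standalone NumPy sketch (not part of the diff) of the keep='first' branch,
showing that the computed inverse indexes into the uniques in order of first
appearance:

    import numpy as np

    values = np.array([3, 1, 3, 2, 1])

    # sorted uniques, index of each unique's first occurrence (o2u),
    # and each element's position among the sorted uniques (u2o)
    _, o2u, u2o = np.unique(values, return_index=True, return_inverse=True)
    # o2u == [1, 3, 0], u2o == [2, 0, 2, 1, 0]

    # argsort of argsort inverts the implicit sorting permutation, so that
    # inv indexes into the uniques taken in order of first appearance
    inv = np.argsort(np.argsort(o2u))[u2o]
    # inv == [0, 1, 0, 2, 1]

    unique_in_order = values[np.sort(o2u)]   # array([3, 1, 2])
    assert (unique_in_order[inv] == values).all()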
pandas/core/base.py (27 additions, 4 deletions)
@@ -1246,16 +1246,39 @@ def drop_duplicates(self, keep='first', inplace=False):
         else:
             return result
 
-    def duplicated(self, keep='first'):
+    def duplicated(self, keep='first', return_inverse=False):
         from pandas.core.algorithms import duplicated
+
+        if return_inverse and keep is False:
+            raise ValueError("The parameters return_inverse=True and "
+                             "keep=False cannot be used together (impossible "
+                             "to calculate an inverse when discarding all "
+                             "instances of a duplicate).")
+
         if isinstance(self, ABCIndexClass):
             if self.is_unique:
-                return np.zeros(len(self), dtype=np.bool)
-            return duplicated(self, keep=keep)
-        else:
+                isdup = np.zeros(len(self), dtype=np.bool)
+                if not return_inverse:
+                    return isdup
+                return isdup, np.arange(len(self))
+            # core.algorithms.duplicated has the same output signature as
+            # Index.duplicated -> no need to distinguish cases here
+            return duplicated(self, keep=keep, return_inverse=return_inverse)
+
+        # Series case
+        if not return_inverse:
             return self._constructor(duplicated(self, keep=keep),
                                      index=self.index).__finalize__(self)
+
+        # return_inverse = True
+        isdup_array, inv_array = duplicated(self, keep=keep,
+                                            return_inverse=True)
+        isdup = self._constructor(isdup_array,
+                                  index=self.index).__finalize__(self)
+        inv = self._constructor(self.loc[~isdup_array].index[inv_array],
+                                index=self.index)
+        return isdup, inv
 
     # ----------------------------------------------------------------------
     # abstracts
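A usage sketch for the Series branch above (it assumes the PR's proposed
return_inverse API, which is not part of released pandas):

    import pandas as pd

    s = pd.Series(['a', 'b', 'b', 'a'], index=[10, 20, 30, 40])

    # proposed API: the inverse is a Series that maps each original index
    # label to the index label of the unique row carrying the same value
    isdup, inv = s.duplicated(return_inverse=True)  # keep='first'
    unique = s[~isdup]  # same as s.drop_duplicates()
    # inv.values == [10, 20, 20, 10], inv.index == [10, 20, 30, 40]

    # rebuild the original: fetch the unique rows in reconstruction order,
    # then restore the original index labels
    reconstruct = unique.reindex(inv.values)
    reconstruct.index = inv.index
    assert reconstruct.equals(s)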