WIP: add return_inverse to Series/Index as well
h-vetinari committed Jun 29, 2018
1 parent b08dc3d commit 819bc9a
Showing 6 changed files with 116 additions and 54 deletions.
47 changes: 43 additions & 4 deletions pandas/core/algorithms.py
@@ -771,7 +771,7 @@ def _value_counts_arraylike(values, dropna):
return keys, counts


def duplicated(values, keep='first'):
def duplicated(values, keep='first', return_inverse=False):
"""
Return boolean ndarray denoting duplicate values.
@@ -786,16 +786,55 @@ def duplicated(values, keep='first'):
occurrence.
- ``last`` : Mark duplicates as ``True`` except for the last
occurrence.
- False : Mark all duplicates as ``True``.
- False : Mark all duplicates as ``True``. This option is not
compatible with ``return_inverse``.
return_inverse : boolean, default False
Determines whether the mapping from unique elements to the original
index should be returned. If True, the output is a tuple.
.. versionadded:: 0.24.0
Returns
-------
duplicated : ndarray
duplicated : ndarray or tuple of ndarray if return_inverse is True
"""

if return_inverse and not keep:
raise ValueError("The parameters return_inverse=True and "
"keep=False cannot be used together (impossible "
"to calculate an inverse when discarding all "
"instances of a duplicate).")

values, dtype, ndtype = _ensure_data(values)
f = getattr(htable, "duplicated_{dtype}".format(dtype=ndtype))
return f(values, keep=keep)
isdup = f(values, keep=keep)
if not return_inverse:
return isdup
elif not isdup.any():
# no need to calculate inverse if no duplicates
inv = np.array(range(len(values)))
return isdup, inv

if keep == 'first':
# o2u: original indices corresponding to the entries in the ARRAY of
# unique values (i.e. the first occurrence of each unique value)
# u2o: reduplication from the array of unique values to the original array
_, o2u, u2o = np.unique(values, return_inverse=True,
return_index=True)
inv = o2u[u2o]
elif keep == 'last':
# np.unique takes the first occurrence as the unique value,
# so we flip the values so that the first occurrence becomes the last
values = values[::-1]
_, o2u, u2o = np.unique(values, return_inverse=True,
return_index=True)
# the entries of o2u/u2o correspond(ed) to positions in values,
# which are simply np.array(range(len(values))).
# Having flipped values around, we need to do the same for those
# positions, ___because o2u and u2o are relative to that order___.
# Finally, to fit with the original order again, we need to flip the
# result around one last time.
inv = np.array(range(len(values)))[::-1][o2u][u2o][::-1]
return isdup, inv


def mode(values, dropna=True):
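For illustration, a minimal NumPy-only sketch of the inverse construction used in the hunk above (the example array is made up, and np.arange stands in for np.array(range(...))); it shows how the index/inverse arrays returned by np.unique combine into a mapping back to the kept occurrence:

import numpy as np

values = np.array([1, 4, 1, 8, 4])

# keep='first': np.unique returns (in fixed order) the sorted unique values,
# the position of the first occurrence of each unique value (o2u), and the
# slot of every original element within the unique array (u2o).
_, o2u, u2o = np.unique(values, return_index=True, return_inverse=True)
inv_first = o2u[u2o]  # array([0, 1, 0, 3, 1])

# keep='last': flip the values so that the last occurrence becomes the first,
# and flip the positional index before and after applying o2u/u2o, exactly as
# in the branch above.
flipped = values[::-1]
_, o2u, u2o = np.unique(flipped, return_index=True, return_inverse=True)
inv_last = np.arange(len(values))[::-1][o2u][u2o][::-1]  # array([2, 4, 2, 3, 4])

# Either way, the inverse points at the kept occurrence of each element,
# so indexing with it round-trips the original values.
assert (values[inv_first] == values).all()
assert (values[inv_last] == values).all()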
29 changes: 25 additions & 4 deletions pandas/core/base.py
@@ -1242,16 +1242,37 @@ def drop_duplicates(self, keep='first', inplace=False):
else:
return result

def duplicated(self, keep='first'):
def duplicated(self, keep='first', return_inverse=False):
from pandas.core.algorithms import duplicated

if return_inverse and not keep:
raise ValueError("The parameters return_inverse=True and "
"keep=False cannot be used together (impossible "
"to calculate an inverse when discarding all "
"instances of a duplicate).")

if isinstance(self, ABCIndexClass):
if self.is_unique:
return np.zeros(len(self), dtype=np.bool)
return duplicated(self, keep=keep)
else:
isdup = np.zeros(len(self), dtype=np.bool)
if not return_inverse:
return isdup
return isdup, np.array(range(len(self)))
# algorithms.duplicated has the same output signature as
# Index.duplicated -> no need to distinguish cases here
return duplicated(self, keep=keep, return_inverse=return_inverse)

# Series case
if not return_inverse:
return self._constructor(duplicated(self, keep=keep),
index=self.index).__finalize__(self)

isdup_array, inv_array = duplicated(self, keep=keep,
return_inverse=return_inverse)
isdup = self._constructor(isdup_array,
index=self.index).__finalize__(self)
inv = self._constructor(self.index[inv_array], index=self.index)
return isdup, inv

# ----------------------------------------------------------------------
# abstracts

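A usage sketch of the Series branch above (hedged: the feature is WIP and the exact shape of the return value may still change; data and labels are made up, and the expected output follows from the construction of isdup and inv in this diff):

import pandas as pd

s = pd.Series([1, 4, 1, 8, 4], index=list('abcde'))
isdup, inv = s.duplicated(keep='first', return_inverse=True)

# isdup: a  False   inv: a  a
#        b  False        b  b
#        c   True        c  a
#        d  False        d  d
#        e   True        e  b
#
# inv holds, for every row, the index label of the occurrence that is kept,
# so s.loc[inv.values].values should reproduce s.values.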
39 changes: 11 additions & 28 deletions pandas/core/frame.py
@@ -4364,7 +4364,7 @@ def duplicated(self, subset=None, keep='first', return_inverse=False):
compatible with ``return_inverse``.
return_inverse : boolean, default False
Determines whether the mapping from unique elements to the original
index should be returned. If true, the output is a tuple.
index should be returned. If True, the output is a tuple.
.. versionadded:: 0.24.0
@@ -4373,12 +4373,14 @@ def duplicated(self, subset=None, keep='first', return_inverse=False):
duplicated : Series or tuple of Series if return_inverse is True
"""
from pandas.core.sorting import get_group_index
from pandas._libs.hashtable import duplicated_int64, _SIZE_HINT_LIMIT
from pandas._libs.hashtable import _SIZE_HINT_LIMIT
from pandas.core.algorithms import duplicated

if return_inverse and not keep:
raise ValueError("The parameters return_inverse=True and "
"keep=False cannot be used together (impossible "
"to calculate an inverse when discarding values)")
"to calculate an inverse when discarding all "
"instances of a duplicate).")

def f(vals):
labels, shape = algorithms.factorize(
@@ -4404,32 +4406,13 @@ def f(vals):
labels, shape = map(list, zip(*map(f, vals)))

ids = get_group_index(labels, shape, sort=False, xnull=False)
isdup = Series(duplicated_int64(ids, keep), index=self.index)
if not return_inverse:
return isdup
elif not isdup.any():
# no need to calculate inverse if no duplicates
inv = Series(self.index, index=self.index)
return isdup, inv

if keep == 'first':
# o2u: original indices to indices of ARRAY of unique values
# u2o: reduplication from array of unique values to original array
_, o2u, u2o = np.unique(ids, return_inverse=True,
return_index=True)
inv = Series(self.index[o2u][u2o], index=self.index)
elif keep == 'last':
# np.unique takes first occurrence as unique value,
# so we flip ids that first becomes last
ids = ids[::-1]
_, o2u, u2o = np.unique(ids, return_inverse=True,
return_index=True)
# the values in the ids-array correspond(ed) to self.index -
# by flipping ids around, we need to do the same for self.index,
# ___because o2u and u2o are relative to that order___.
# Finally, to fit with 'index=self.index' in the constructor,
# we need to flip the values around one last time
inv = Series(self.index[::-1][o2u][u2o][::-1], index=self.index)
return Series(duplicated(ids, keep=keep), index=self.index)

isdup_array, inv_array = duplicated(ids, keep=keep,
return_inverse=return_inverse)
isdup = Series(isdup_array, index=self.index)
inv = Series(self.index[inv_array], index=self.index)
return isdup, inv

# ----------------------------------------------------------------------
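A corresponding sketch for the DataFrame path, which now routes through the same algorithms.duplicated call on the group ids (example data is made up; with a subset, duplicates are determined from those columns only, while the inverse still refers to row labels):

import pandas as pd

df = pd.DataFrame({'col': [1, 4, 1, 8, 4],
                   'tag': list('vwxyz')})

isdup, inv = df.duplicated(subset=['col'], keep='last', return_inverse=True)
# isdup -> [True, True, False, False, False]
# inv   -> [2, 4, 2, 3, 4]  (row labels of the last occurrence of each 'col' value)

A nice side effect of the refactor is that frame.py no longer carries its own duplicated_int64/np.unique logic; the inverse computation now lives in a single place in algorithms.py.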
21 changes: 14 additions & 7 deletions pandas/core/indexes/base.py
@@ -4432,7 +4432,7 @@ def drop_duplicates(self, keep='first'):
"""
return super(Index, self).drop_duplicates(keep=keep)

def duplicated(self, keep='first'):
def duplicated(self, keep='first', return_inverse=False):
"""
Indicate duplicate index values.
@@ -4449,7 +4449,17 @@ def duplicated(self, keep='first'):
occurrence.
- 'last' : Mark duplicates as ``True`` except for the last
occurrence.
- ``False`` : Mark all duplicates as ``True``.
- ``False`` : Mark all duplicates as ``True``. This option is not
compatible with ``return_inverse``.
return_inverse : boolean, default False
Determines whether the mapping from unique elements to the original
index should be returned. If True, the output is a tuple.
.. versionadded:: 0.24.0
Returns
-------
duplicated : ndarray or tuple of ndarray if return_inverse is True
Examples
--------
@@ -4476,17 +4486,14 @@ def duplicated(self, keep='first'):
>>> idx.duplicated(keep=False)
array([ True, False, True, False, True])
Returns
-------
numpy.ndarray
See Also
--------
pandas.Series.duplicated : Equivalent method on pandas.Series
pandas.DataFrame.duplicated : Equivalent method on pandas.DataFrame
pandas.Index.drop_duplicates : Remove duplicate values from Index
"""
return super(Index, self).duplicated(keep=keep)
return super(Index, self).duplicated(keep=keep,
return_inverse=return_inverse)

_index_shared_docs['fillna'] = """
Fill NA/NaN values with the specified value
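To tie this to the docstring example above, a hedged sketch of what the new parameter returns for a non-unique Index (this path goes straight through algorithms.duplicated, so both parts of the tuple are plain ndarrays and the inverse is positional):

import pandas as pd

idx = pd.Index(['lama', 'cow', 'lama', 'beetle', 'lama'])
isdup, inv = idx.duplicated(keep='first', return_inverse=True)
# isdup -> array([False, False, True, False, True])
# inv   -> array([0, 1, 0, 3, 0])
# inv is positional, so idx[inv] re-creates the original Index.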
13 changes: 9 additions & 4 deletions pandas/core/indexes/multi.py
@@ -930,14 +930,19 @@ def f(k, stringify):
return hash_tuple(key)

@Appender(Index.duplicated.__doc__)
def duplicated(self, keep='first'):
def duplicated(self, keep='first', return_inverse=False):
from pandas.core.sorting import get_group_index
from pandas._libs.hashtable import duplicated_int64
from pandas.core.algorithms import duplicated

if return_inverse and not keep:
raise ValueError("The parameters return_inverse=True and "
"keep=False cannot be used together (impossible "
"to calculate an inverse when discarding all "
"instances of a duplicate).")

shape = map(len, self.levels)
ids = get_group_index(self.labels, shape, sort=False, xnull=False)

return duplicated_int64(ids, keep)
return duplicated(ids, keep=keep, return_inverse=return_inverse)

def fillna(self, value=None, downcast=None):
"""
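For a MultiIndex, the tuples are first collapsed to integer group ids via get_group_index and the same machinery then runs on those ids. A short usage sketch (made-up data; the tuple return mirrors the flat Index case above):

import pandas as pd

mi = pd.MultiIndex.from_tuples(
    [('a', 1), ('b', 2), ('a', 1), ('c', 3), ('b', 2)])

# Conceptually each tuple maps to a group id first, e.g.
# ('a', 1) -> 0, ('b', 2) -> 1, ('c', 3) -> 2,
# and duplicates/inverse are then computed on those ids as for a flat array.
isdup, inv = mi.duplicated(keep='first', return_inverse=True)
# isdup -> array([False, False, True, False, True])
# inv   -> array([0, 1, 0, 3, 1])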
21 changes: 14 additions & 7 deletions pandas/core/series.py
@@ -1581,7 +1581,7 @@ def drop_duplicates(self, keep='first', inplace=False):
"""
return super(Series, self).drop_duplicates(keep=keep, inplace=inplace)

def duplicated(self, keep='first'):
def duplicated(self, keep='first', return_inverse=False):
"""
Indicate duplicate Series values.
@@ -1596,7 +1596,17 @@ def duplicated(self, keep='first'):
occurrence.
- 'last' : Mark duplicates as ``True`` except for the last
occurrence.
- ``False`` : Mark all duplicates as ``True``.
- ``False`` : Mark all duplicates as ``True``. This option is not
compatible with ``return_inverse``.
return_inverse : boolean, default False
Determines whether the mapping from unique elements to the original
index should be returned. If True, the output is a tuple.
.. versionadded:: 0.24.0
Returns
-------
duplicated : Series or tuple of Series if return_inverse is True
Examples
--------
@@ -1643,17 +1653,14 @@ def duplicated(self, keep='first'):
4 True
dtype: bool
Returns
-------
pandas.core.series.Series
See Also
--------
pandas.Index.duplicated : Equivalent method on pandas.Index
pandas.DataFrame.duplicated : Equivalent method on pandas.DataFrame
pandas.Series.drop_duplicates : Remove duplicate values from Series
"""
return super(Series, self).duplicated(keep=keep)
return super(Series, self).duplicated(keep=keep,
return_inverse=return_inverse)

def idxmin(self, axis=None, skipna=True, *args, **kwargs):
"""
