From 819bc9a31c12e252a66ee4cbc66fed2801da8acf Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Fri, 29 Jun 2018 19:24:22 +0200 Subject: [PATCH] WIP: add return_inverse to Series/Index as well --- pandas/core/algorithms.py | 47 +++++++++++++++++++++++++++++++++--- pandas/core/base.py | 29 +++++++++++++++++++--- pandas/core/frame.py | 39 +++++++++--------------------- pandas/core/indexes/base.py | 21 ++++++++++------ pandas/core/indexes/multi.py | 13 +++++++--- pandas/core/series.py | 21 ++++++++++------ 6 files changed, 116 insertions(+), 54 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 9e34b8eb55ccb7..566d0ac3e91a5d 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -771,7 +771,7 @@ def _value_counts_arraylike(values, dropna): return keys, counts -def duplicated(values, keep='first'): +def duplicated(values, keep='first', return_inverse=False): """ Return boolean ndarray denoting duplicate values. @@ -786,16 +786,55 @@ def duplicated(values, keep='first'): occurrence. - ``last`` : Mark duplicates as ``True`` except for the last occurrence. - - False : Mark all duplicates as ``True``. + - False : Mark all duplicates as ``True``. This option is not + compatible with ``return_inverse``. + return_inverse : boolean, default False + Determines whether the mapping from unique elements to the original + index should be returned. If True, the output is a tuple. + + .. 
versionadded:: 0.24.0 Returns ------- - duplicated : ndarray + duplicated : ndarray or tuple of ndarray if return_inverse is True """ + if return_inverse and not keep: + raise ValueError("The parameters return_inverse=True and " + "keep=False cannot be used together (impossible " + "to calculate an inverse when discarding all " + "instances of a duplicate).") + values, dtype, ndtype = _ensure_data(values) f = getattr(htable, "duplicated_{dtype}".format(dtype=ndtype)) - return f(values, keep=keep) + isdup = f(values, keep=keep) + if not return_inverse: + return isdup + elif not isdup.any(): + # no need to calculate inverse if no duplicates + inv = np.array(range(len(values))) + return isdup, inv + + if keep == 'first': + # o2u: original indices to indices of ARRAY of unique values + # u2o: reduplication from array of unique values to original array + _, o2u, u2o = np.unique(values, return_inverse=True, + return_index=True) + inv = o2u[u2o] + elif keep == 'last': + # np.unique takes first occurrence as unique value, + # so we flip ids so that first becomes last + values = values[::-1] + _, o2u, u2o = np.unique(values, return_inverse=True, + return_index=True) + # the values in the ids-array correspond(ed) to the index of value, + # which is simply np.array(range(len(values))). + # By flipping ids around, we need to do the same for the index, + # ___because o2u and u2o are relative to that order___. + # Finally, to fit with the original order again, we need to flip the + # values around one last time. 
+ inv = np.array(range(len(values)))[::-1][o2u][u2o][::-1] + return isdup, inv def mode(values, dropna=True): diff --git a/pandas/core/base.py b/pandas/core/base.py index 6625a3bbe97d70..7defe6b7d438e2 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1242,16 +1242,37 @@ def drop_duplicates(self, keep='first', inplace=False): else: return result - def duplicated(self, keep='first'): + def duplicated(self, keep='first', return_inverse=False): from pandas.core.algorithms import duplicated + + if return_inverse and not keep: + raise ValueError("The parameters return_inverse=True and " + "keep=False cannot be used together (impossible " + "to calculate an inverse when discarding all " + "instances of a duplicate).") + if isinstance(self, ABCIndexClass): if self.is_unique: - return np.zeros(len(self), dtype=np.bool) - return duplicated(self, keep=keep) - else: + isdup = np.zeros(len(self), dtype=np.bool) + if not return_inverse: + return isdup + return isdup, np.array(range(len(self))) + # algorithms.duplicated has the same output signature as + # Index.duplicated -> no need to distinguish cases here + return duplicated(self, keep=keep, return_inverse=return_inverse) + + # Series case + if not return_inverse: return self._constructor(duplicated(self, keep=keep), index=self.index).__finalize__(self) + isdup_array, inv_array = duplicated(self, keep=keep, + return_inverse=return_inverse) + isdup = self._constructor(isdup_array, + index=self.index).__finalize__(self) + inv = self._constructor(self.index[inv_array], index=self.index) + return isdup, inv + # ---------------------------------------------------------------------- # abstracts diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e46cc12028c79e..f2000b6886e2ea 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4364,7 +4364,7 @@ def duplicated(self, subset=None, keep='first', return_inverse=False): compatible with ``return_inverse``. 
return_inverse : boolean, default False Determines whether the mapping from unique elements to the original - index should be returned. If true, the output is a tuple. + index should be returned. If True, the output is a tuple. .. versionadded:: 0.24.0 @@ -4373,12 +4373,14 @@ def duplicated(self, subset=None, keep='first', return_inverse=False): duplicated : Series or tuple of Series if return_inverse is True """ from pandas.core.sorting import get_group_index - from pandas._libs.hashtable import duplicated_int64, _SIZE_HINT_LIMIT + from pandas._libs.hashtable import _SIZE_HINT_LIMIT + from pandas.core.algorithms import duplicated if return_inverse and not keep: raise ValueError("The parameters return_inverse=True and " "keep=False cannot be used together (impossible " - "to calculate an inverse when discarding values)") + "to calculate an inverse when discarding all " + "instances of a duplicate).") def f(vals): labels, shape = algorithms.factorize( @@ -4404,32 +4406,13 @@ def f(vals): labels, shape = map(list, zip(*map(f, vals))) ids = get_group_index(labels, shape, sort=False, xnull=False) - isdup = Series(duplicated_int64(ids, keep), index=self.index) if not return_inverse: - return isdup - elif not isdup.any(): - # no need to calculate inverse if no duplicates - inv = Series(self.index, index=self.index) - return isdup, inv - - if keep == 'first': - # o2u: original indices to indices of ARRAY of unique values - # u2o: reduplication from array of unique values to original array - _, o2u, u2o = np.unique(ids, return_inverse=True, - return_index=True) - inv = Series(self.index[o2u][u2o], index=self.index) - elif keep == 'last': - # np.unique takes first occurrence as unique value, - # so we flip ids that first becomes last - ids = ids[::-1] - _, o2u, u2o = np.unique(ids, return_inverse=True, - return_index=True) - # the values in the ids-array correspond(ed) to self.index - - # by flipping ids around, we need to do the same for self.index, - # ___because o2u and 
u2o are relative to that order___. - # Finally, to fit with 'index=self.index' in the constructor, - # we need to flip the values around one last time - inv = Series(self.index[::-1][o2u][u2o][::-1], index=self.index) + return Series(duplicated(ids, keep=keep), index=self.index) + + isdup_array, inv_array = duplicated(ids, keep=keep, + return_inverse=return_inverse) + isdup = Series(isdup_array, index=self.index) + inv = Series(self.index[inv_array], index=self.index) return isdup, inv # ---------------------------------------------------------------------- diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 122f8662abb614..43274c3b8a97e0 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4432,7 +4432,7 @@ def drop_duplicates(self, keep='first'): """ return super(Index, self).drop_duplicates(keep=keep) - def duplicated(self, keep='first'): + def duplicated(self, keep='first', return_inverse=False): """ Indicate duplicate index values. @@ -4449,7 +4449,17 @@ def duplicated(self, keep='first'): occurrence. - 'last' : Mark duplicates as ``True`` except for the last occurrence. - - ``False`` : Mark all duplicates as ``True``. + - ``False`` : Mark all duplicates as ``True``. This option is not + compatible with ``return_inverse``. + return_inverse : boolean, default False + Determines whether the mapping from unique elements to the original + index should be returned. If True, the output is a tuple. + + .. 
versionadded:: 0.24.0 + + Returns + ------- + duplicated : ndarray or tuple of ndarray if return_inverse is True Examples -------- @@ -4476,17 +4486,14 @@ def duplicated(self, keep='first'): >>> idx.duplicated(keep=False) array([ True, False, True, False, True]) - Returns - ------- - numpy.ndarray - See Also -------- pandas.Series.duplicated : Equivalent method on pandas.Series pandas.DataFrame.duplicated : Equivalent method on pandas.DataFrame pandas.Index.drop_duplicates : Remove duplicate values from Index """ - return super(Index, self).duplicated(keep=keep) + return super(Index, self).duplicated(keep=keep, + return_inverse=return_inverse) _index_shared_docs['fillna'] = """ Fill NA/NaN values with the specified value diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index f9f3041bef0739..3514de7492b137 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -930,14 +930,19 @@ def f(k, stringify): return hash_tuple(key) @Appender(Index.duplicated.__doc__) - def duplicated(self, keep='first'): + def duplicated(self, keep='first', return_inverse=False): from pandas.core.sorting import get_group_index - from pandas._libs.hashtable import duplicated_int64 + from pandas.core.algorithms import duplicated + + if return_inverse and not keep: + raise ValueError("The parameters return_inverse=True and " + "keep=False cannot be used together (impossible " + "to calculate an inverse when discarding all " + "instances of a duplicate).") shape = map(len, self.levels) ids = get_group_index(self.labels, shape, sort=False, xnull=False) - - return duplicated_int64(ids, keep) + return duplicated(ids, keep=keep, return_inverse=return_inverse) def fillna(self, value=None, downcast=None): """ diff --git a/pandas/core/series.py b/pandas/core/series.py index cdb901d18767ce..c42481775b7404 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1581,7 +1581,7 @@ def drop_duplicates(self, keep='first', inplace=False): """ 
return super(Series, self).drop_duplicates(keep=keep, inplace=inplace) - def duplicated(self, keep='first'): + def duplicated(self, keep='first', return_inverse=False): """ Indicate duplicate Series values. @@ -1596,7 +1596,17 @@ def duplicated(self, keep='first'): occurrence. - 'last' : Mark duplicates as ``True`` except for the last occurrence. - - ``False`` : Mark all duplicates as ``True``. + - ``False`` : Mark all duplicates as ``True``. This option is not + compatible with ``return_inverse``. + return_inverse : boolean, default False + Determines whether the mapping from unique elements to the original + index should be returned. If True, the output is a tuple. + + .. versionadded:: 0.24.0 + + Returns + ------- + duplicated : Series or tuple of Series if return_inverse is True Examples -------- @@ -1643,17 +1653,14 @@ def duplicated(self, keep='first'): 4 True dtype: bool - Returns - ------- - pandas.core.series.Series - See Also -------- pandas.Index.duplicated : Equivalent method on pandas.Index pandas.DataFrame.duplicated : Equivalent method on pandas.DataFrame pandas.Series.drop_duplicates : Remove duplicate values from Series """ - return super(Series, self).duplicated(keep=keep) + return super(Series, self).duplicated(keep=keep, + return_inverse=return_inverse) def idxmin(self, axis=None, skipna=True, *args, **kwargs): """