From b61ac0e5dd516ae8ef3198783cee5b118dc928c4 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sun, 2 Dec 2018 21:54:21 +0100 Subject: [PATCH 1/5] API/ENH/DEPR: Series.unique returns Series; .unique gets return_inverse --- pandas/core/algorithms.py | 10 +- pandas/core/arrays/categorical.py | 28 ++++- pandas/core/base.py | 19 ++- pandas/core/series.py | 192 ++++++++++++++++++++++++++---- 4 files changed, 217 insertions(+), 32 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 1a4368ee8ea98..6fb568d20a426 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -271,7 +271,7 @@ def match(to_match, values, na_sentinel=-1): return result -def unique(values): +def unique(values, return_inverse=False): """ Hash table-based unique. Uniques are returned in order of appearance. This does NOT sort. @@ -355,7 +355,11 @@ def unique(values): htable, _, values, dtype, ndtype = _get_hashtable_algo(values) table = htable(len(values)) - uniques = table.unique(values) + if return_inverse: + uniques, inverse = table.unique(values, return_inverse=True) + else: + uniques = table.unique(values) + uniques = _reconstruct_data(uniques, dtype, original) if isinstance(original, ABCSeries) and is_datetime64tz_dtype(dtype): @@ -365,6 +369,8 @@ def unique(values): # TODO: it must return DatetimeArray with tz in pandas 2.0 uniques = uniques.astype(object).values + if return_inverse: + return uniques, inverse return uniques diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index ac1c34edba914..b2f37f350c95b 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2249,7 +2249,7 @@ def mode(self, dropna=True): codes = sorted(htable.mode_int64(ensure_int64(codes), dropna)) return self._constructor(values=codes, dtype=self.dtype, fastpath=True) - def unique(self): + def unique(self, return_inverse=False): """ Return the ``Categorical`` which ``categories`` and ``codes`` are 
unique. Unused categories are NOT returned. @@ -2259,9 +2259,22 @@ def unique(self): - ordered category: values are sorted by appearance order, categories keeps existing order. + Parameters + ---------- + return_inverse : boolean, default False + Whether to return the inverse of the unique values. If True, the + output will be a tuple where the second component is again an + np.ndarray that contains the mapping between the indices of the + elements in the calling Categorical and their locations in the + unique values. See examples for how to reconstruct. + + .. versionadded:: 0.24.0 + Returns ------- - unique values : ``Categorical`` + uniques : ``Categorical`` + inverse : np.ndarray (if `return_inverse=True`) + The inverse from the `uniques` back to the calling ``Categorical``. Examples -------- @@ -2293,7 +2306,10 @@ def unique(self): """ # unlike np.unique, unique1d does not sort - unique_codes = unique1d(self.codes) + if return_inverse: + unique_codes, inverse = unique1d(self.codes, return_inverse=True) + else: + unique_codes = unique1d(self.codes, return_inverse=False) cat = self.copy() # keep nan in codes @@ -2303,7 +2319,11 @@ def unique(self): take_codes = unique_codes[unique_codes != -1] if self.ordered: take_codes = np.sort(take_codes) - return cat.set_categories(cat.categories.take(take_codes)) + result = cat.set_categories(cat.categories.take(take_codes)) + + if return_inverse: + return result, inverse + return result def _values_for_factorize(self): codes = self.codes.astype('int64') diff --git a/pandas/core/base.py b/pandas/core/base.py index e7c3a45a710e0..7d1cf71c82e3b 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1208,15 +1208,24 @@ def value_counts(self, normalize=False, sort=True, ascending=False, normalize=normalize, bins=bins, dropna=dropna) return result - def unique(self): + def unique(self, return_inverse=False): values = self._values - if hasattr(values, 'unique'): - - result = values.unique() + if 
is_extension_array_dtype(values): + if return_inverse: + # as long as return_inverse is not part of the EA.unique + # contract, test if this works + try: + result = values.unique(return_inverse=return_inverse) + except TypeError: + raise ValueError('extension array of dtype {dtype} does ' + 'not yet support unique with ' + 'return_inverse.') + else: + result = values.unique() else: from pandas.core.algorithms import unique1d - result = unique1d(values) + result = unique1d(values, return_inverse=return_inverse) return result diff --git a/pandas/core/series.py b/pandas/core/series.py index 8d4d7677cca44..8b162d86c3a31 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1502,18 +1502,51 @@ def mode(self, dropna=True): # TODO: Add option for bins like value_counts() return algorithms.mode(self, dropna=dropna) - def unique(self): + def unique(self, return_inverse=False, raw=None): """ Return unique values of Series object. Uniques are returned in order of appearance. Hash table-based unique, therefore does NOT sort. + Parameters + ---------- + return_inverse : boolean, default False + Whether to return the inverse of the unique values. If True, the + output will be a tuple where the second component is again a Series + that contains the mapping between the indices of the elements in + the calling Series and their locations in the unique values. See + examples for how to reconstruct. + + Using `return_inverse=True` is not compatible with `raw=True`. + + .. versionadded:: 0.24.0 + + raw : boolean or None, default None + This parameter switches between different return types. If it is + True, the result will be `ndarray` (resp. a :class:`pd.Categorical` + in case of categorical data), which corresponds to the behavior + before v.0.24. + + If False (the future default behavior, starting with v.1.0), it + will always return a Series of the same type as the caller. + + .. 
versionadded:: 0.24.0 + Returns ------- - ndarray or Categorical - The unique values returned as a NumPy array. In case of categorical - data type, returned as a Categorical. + uniques : Series (if `raw=False`), else ndarray or Categorical + If `raw=False`, this is a Series which contains the uniques in + order of their appearance (and with their respective indices). + If `raw=True`, the unique values are returned as a numpy array, + or as a Categorical (in case of categorical data). + inverse : Series (if `return_inverse=True`) + The inverse from the `uniques` back to the calling Series. + + Raises + ------ + ValueError + If `raw=True` and `return_inverse=True`. See Also -------- @@ -1522,40 +1555,157 @@ def unique(self): Examples -------- - >>> pd.Series([2, 1, 3, 3], name='A').unique() - array([2, 1, 3]) + If `raw=False`, the output is a Series: + + >>> pd.Series([1, 1, 3, 2], name='A').unique(raw=False) + 0 1 + 2 3 + 3 2 + Name: A, dtype: int64 + + If `raw=True`, the output is an ndarray (if the data is not + categorical): + + >>> pd.Series([1, 1, 3, 2], name='A').unique(raw=True) + array([1, 3, 2]) + + This method also deals well with timestamps, - >>> pd.Series([pd.Timestamp('2016-01-01') for _ in range(3)]).unique() - array(['2016-01-01T00:00:00.000000000'], dtype='datetime64[ns]') + >>> pd.Series([pd.Timestamp('2016-01-01') + ... for _ in range(3)]).unique(raw=False) + 0 2016-01-01 + dtype: datetime64[ns] + + as well as timezones: >>> pd.Series([pd.Timestamp('2016-01-01', tz='US/Eastern') - ... for _ in range(3)]).unique() - array([Timestamp('2016-01-01 00:00:00-0500', tz='US/Eastern')], - dtype=object) + ... for _ in range(3)]).unique(raw=False) + 0 2016-01-01 00:00:00-05:00 + dtype: datetime64[ns, US/Eastern] An unordered Categorical will return categories in the order of appearance. 
- >>> pd.Series(pd.Categorical(list('baabc'))).unique() + >>> pd.Series(pd.Categorical(list('baabc'))).unique(raw=False) + 0 b + 1 a + 4 c + dtype: category + Categories (3, object): [b, a, c] + + >>> pd.Series(pd.Categorical(list('baabc'))).unique(raw=True) [b, a, c] Categories (3, object): [b, a, c] An ordered Categorical preserves the category ordering. >>> pd.Series(pd.Categorical(list('baabc'), categories=list('abc'), - ... ordered=True)).unique() - [b, a, c] + ... ordered=True)).unique(raw=False) + 0 b + 1 a + 4 c + dtype: category Categories (3, object): [a < b < c] - """ - result = super(Series, self).unique() - if is_datetime64tz_dtype(self.dtype): - # we are special casing datetime64tz_dtype - # to return an object array of tz-aware Timestamps + As an example for dealing with `return_inverse`, we consider the + following example (the reason we use a non-default index is only for + demonstration purposes, because this is also something the inverse + needs to reconstruct): + + >>> animals = pd.Series(['lama', 'cow', 'lama', 'beetle', 'lama'], + ... index=[1, 4, 9, 16, 25]) + + >>> animals_unique, inverse = animals.unique(raw=False, + ... return_inverse=True) + >>> animals_unique + 1 lama + 4 cow + 16 beetle + dtype: object + + >>> inverse + 1 1 + 4 4 + 9 1 + 16 16 + 25 1 + dtype: int64 + + This can be used to reconstruct the original object from its unique + values as follows + + >>> reconstruct = animals_unique.reindex(inverse) + >>> reconstruct + 1 lama + 4 cow + 1 lama + 16 beetle + 1 lama + dtype: object + + We see that the values of `animals` get reconstructed correctly, but + the index does not match yet -- consequently, the last step is to + correctly set the index. + + >>> reconstruct.index = inverse.index + >>> reconstruct + 1 lama + 4 cow + 9 lama + 16 beetle + 25 lama + dtype: object + + >>> reconstruct.equals(animals) + True + """ + if raw is None: + msg = ('A future version of pandas will return a Series here. 
' + 'To keep returning an ndarray / Categorical (the behavior ' + 'before v.0.24) and silence this warning, pass the keyword ' + '`raw=True`. To return a Series and silence this warning, ' + 'pass `raw=False`. In the future, the default will switch ' + 'to `raw=False`, and therefore, if an array is required as ' + 'output, the recommended way is to pass `raw=False` and ' + 'use `.array` on the result.') + warnings.warn(msg, FutureWarning, stacklevel=2) + raw = True + + if raw not in [True, False]: + raise ValueError('The keyword "raw" must be either True or False') + if return_inverse not in [True, False]: + raise ValueError('The keyword "return_inverse" must be either ' + 'True or False') + + if raw and return_inverse: + raise ValueError('The keyword "return_inverse=True" is not ' + 'supported if "raw=True"') + elif raw: + result = super(Series, self).unique() + + if is_datetime64tz_dtype(self.dtype): + # we are special casing datetime64tz_dtype + # to return an object array of tz-aware Timestamps + + # TODO: it must return DatetimeArray with tz in pandas 2.0 + result = result.astype(object).values + return result + + # for raw=False, we need the inverse in any case + result_array, inverse_array = super(Series, + self).unique(return_inverse=True) + + # construct indices of first occurrences. In principle, this could be + # returned from the cython methods, but this is not compatible with the + # (shared) signature for Index.unique + idx = ~Series(inverse_array).duplicated(keep='first') - # TODO: it must return DatetimeArray with tz in pandas 2.0 - result = result.astype(object).values + result = self._constructor(result_array, + index=self.index[idx]).__finalize__(self) + if return_inverse: + inverse = Series(result.index[inverse_array], index=self.index) + return result, inverse return result def drop_duplicates(self, keep='first', inplace=False): From 72929218ebfea769b3ead6db03baa24fd3c180e1 Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Wed, 5 Dec 2018 02:04:25 +0100 Subject: [PATCH 2/5] Fixes for tests --- pandas/core/algorithms.py | 5 ++++- pandas/core/base.py | 5 ++++- pandas/core/series.py | 6 +++--- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 6fb568d20a426..05e8df64d10cb 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -347,7 +347,10 @@ def unique(values, return_inverse=False): values = _ensure_arraylike(values) - if is_extension_array_dtype(values): + if isinstance(values, ABCSeries): + # this calls through Series, need raw=True to not raise warning + return values.unique(raw=True) + elif is_extension_array_dtype(values): # Dispatch to extension dtype's unique. return values.unique() diff --git a/pandas/core/base.py b/pandas/core/base.py index 7d1cf71c82e3b..d42027a491abc 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1244,7 +1244,10 @@ def nunique(self, dropna=True): ------- nunique : int """ - uniqs = self.unique() + if isinstance(self, ABCSeries): + uniqs = self.unique(raw=True) + else: + uniqs = self.unique() n = len(uniqs) if dropna and isna(uniqs).any(): n -= 1 diff --git a/pandas/core/series.py b/pandas/core/series.py index 8b162d86c3a31..1ce256f21f097 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1646,7 +1646,7 @@ def unique(self, return_inverse=False, raw=None): We see that the values of `animals` get reconstructed correctly, but the index does not match yet -- consequently, the last step is to correctly set the index. 
- + >>> reconstruct.index = inverse.index >>> reconstruct 1 lama @@ -1682,11 +1682,11 @@ def unique(self, return_inverse=False, raw=None): 'supported if "raw=True"') elif raw: result = super(Series, self).unique() - + if is_datetime64tz_dtype(self.dtype): # we are special casing datetime64tz_dtype # to return an object array of tz-aware Timestamps - + # TODO: it must return DatetimeArray with tz in pandas 2.0 result = result.astype(object).values return result From 10432d4c081d6208dfd39d84b7d559fb373388b2 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Wed, 5 Dec 2018 02:04:41 +0100 Subject: [PATCH 3/5] TST: first pass at tests --- .../arrays/categorical/test_analytics.py | 6 ++--- pandas/tests/arrays/sparse/test_array.py | 2 +- pandas/tests/extension/base/methods.py | 10 ++++++-- pandas/tests/frame/test_indexing.py | 2 +- pandas/tests/plotting/common.py | 2 +- pandas/tests/reshape/merge/test_merge.py | 4 +-- pandas/tests/series/test_duplicates.py | 15 +++++------ pandas/tests/test_algos.py | 6 ++--- pandas/tests/test_base.py | 25 ++++++++++++------- 9 files changed, 43 insertions(+), 29 deletions(-) diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py index 4251273e424dd..bd36cd1e0782f 100644 --- a/pandas/tests/arrays/categorical/test_analytics.py +++ b/pandas/tests/arrays/categorical/test_analytics.py @@ -183,13 +183,13 @@ def test_unique_index_series(self): tm.assert_categorical_equal(c.unique(), exp) tm.assert_index_equal(Index(c).unique(), Index(exp)) - tm.assert_categorical_equal(Series(c).unique(), exp) + tm.assert_categorical_equal(Series(c).unique(raw=True), exp) c = Categorical([1, 1, 2, 2], categories=[3, 2, 1]) exp = Categorical([1, 2], categories=[1, 2]) tm.assert_categorical_equal(c.unique(), exp) tm.assert_index_equal(Index(c).unique(), Index(exp)) - tm.assert_categorical_equal(Series(c).unique(), exp) + tm.assert_categorical_equal(Series(c).unique(raw=True), exp) c = Categorical([3, 1, 
2, 2, 1], categories=[3, 2, 1], ordered=True) # Categorical.unique keeps categories order if ordered=True @@ -197,7 +197,7 @@ def test_unique_index_series(self): tm.assert_categorical_equal(c.unique(), exp) tm.assert_index_equal(Index(c).unique(), Index(exp)) - tm.assert_categorical_equal(Series(c).unique(), exp) + tm.assert_categorical_equal(Series(c).unique(raw=True), exp) def test_shift(self): # GH 9416 diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index b8cef92f6a6d4..f4f20428cad1d 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -1150,7 +1150,7 @@ def test_first_fill_value_loc(arr, loc): ]) def test_unique_na_fill(arr, fill_value): a = pd.SparseArray(arr, fill_value=fill_value).unique() - b = pd.Series(arr).unique() + b = pd.Series(arr).unique(raw=True) assert isinstance(a, SparseArray) a = np.asarray(a) tm.assert_numpy_array_equal(a, b) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index e9a89c1af2f22..fea68725912ce 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas.core.dtypes.generic import ABCSeries + import pandas as pd import pandas.util.testing as tm @@ -75,11 +77,15 @@ def test_sort_values_frame(self, data_for_sorting, ascending): self.assert_frame_equal(result, expected) @pytest.mark.parametrize('box', [pd.Series, lambda x: x]) - @pytest.mark.parametrize('method', [lambda x: x.unique(), pd.unique]) + @pytest.mark.parametrize('method', [lambda x, **kwargs: x.unique(**kwargs), + pd.unique]) def test_unique(self, data, box, method): duplicated = box(data._from_sequence([data[0], data[0]])) - result = method(duplicated) + if isinstance(duplicated, ABCSeries) and method != pd.unique: + result = method(duplicated, raw=True) + else: + result = method(duplicated) assert len(result) == 1 assert 
isinstance(result, type(data)) diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/test_indexing.py index 0a61c844f1af8..d94b27f16397a 100644 --- a/pandas/tests/frame/test_indexing.py +++ b/pandas/tests/frame/test_indexing.py @@ -2226,7 +2226,7 @@ def verify(df, level, idx, indexer, check_index_type=True): for x in [2, 3, 3, 2, 3, 2, 3, 2]]), 'joline': np.random.randn(20).round(3) * 10}) - for idx in permutations(df['jim'].unique()): + for idx in permutations(df['jim'].unique(raw=True)): for i in range(3): verify_first_level(df, 'jim', idx[:i + 1]) diff --git a/pandas/tests/plotting/common.py b/pandas/tests/plotting/common.py index f41a3a10604af..b5ce068f7bf2c 100644 --- a/pandas/tests/plotting/common.py +++ b/pandas/tests/plotting/common.py @@ -156,7 +156,7 @@ def _check_visible(self, collections, visible=True): assert patch.get_visible() == visible def _get_colors_mapped(self, series, colors): - unique = series.unique() + unique = series.unique(raw=True) # unique and colors length can be differed # depending on slice value mapped = dict(zip(unique, colors)) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 94e180f9328d6..fb5cbfbefad0e 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -433,7 +433,7 @@ def test_merge_nosort(self): datetime(2010, 2, 3), datetime(2012, 2, 3)]} df = DataFrame.from_dict(d) - var3 = df.var3.unique() + var3 = df.var3.unique(raw=True) var3.sort() new = DataFrame.from_dict({"var3": var3, "var8": np.random.random(7)}) @@ -442,7 +442,7 @@ def test_merge_nosort(self): exp = merge(df, new, on='var3', sort=False) assert_frame_equal(result, exp) - assert (df.var3.unique() == result.var3.unique()).all() + assert (df.var3.unique(raw=True) == result.var3.unique(raw=True)).all() def test_merge_nan_right(self): df1 = DataFrame({"i1": [0, 1], "i2": [0, 1]}) diff --git a/pandas/tests/series/test_duplicates.py 
b/pandas/tests/series/test_duplicates.py index 26222637e3509..1d65e52aee0db 100644 --- a/pandas/tests/series/test_duplicates.py +++ b/pandas/tests/series/test_duplicates.py @@ -26,37 +26,38 @@ def test_unique(): # GH714 also, dtype=float s = Series([1.2345] * 100) s[::2] = np.nan - result = s.unique() + result = s.unique(raw=True) assert len(result) == 2 s = Series([1.2345] * 100, dtype='f4') s[::2] = np.nan - result = s.unique() + result = s.unique(raw=True) assert len(result) == 2 # NAs in object arrays #714 s = Series(['foo'] * 100, dtype='O') s[::2] = np.nan - result = s.unique() + result = s.unique(raw=True) assert len(result) == 2 # decision about None s = Series([1, 2, 3, None, None, None], dtype=object) - result = s.unique() + result = s.unique(raw=True) expected = np.array([1, 2, 3, None], dtype=object) tm.assert_numpy_array_equal(result, expected) # GH 18051 s = Series(Categorical([])) - tm.assert_categorical_equal(s.unique(), Categorical([]), check_dtype=False) + tm.assert_categorical_equal(s.unique(raw=True), Categorical([]), + check_dtype=False) s = Series(Categorical([np.nan])) - tm.assert_categorical_equal(s.unique(), Categorical([np.nan]), + tm.assert_categorical_equal(s.unique(raw=True), Categorical([np.nan]), check_dtype=False) def test_unique_data_ownership(): # it works! 
#1807 - Series(Series(["a", "c", "b"]).unique()).sort_values() + Series(Series(["a", "c", "b"]).unique(raw=True)).sort_values() def test_is_unique(): diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index c9d403f6696af..2ad67a93b7345 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -434,7 +434,7 @@ def test_categorical(self): # Series of categorical dtype s = Series(Categorical(list('baabc')), name='foo') - result = s.unique() + result = s.unique(raw=True) tm.assert_categorical_equal(result, expected) result = pd.unique(s) @@ -455,7 +455,7 @@ def test_datetime64tz_aware(self): result = Series( Index([Timestamp('20160101', tz='US/Eastern'), - Timestamp('20160101', tz='US/Eastern')])).unique() + Timestamp('20160101', tz='US/Eastern')])).unique(raw=True) expected = np.array([Timestamp('2016-01-01 00:00:00-0500', tz='US/Eastern')], dtype=object) tm.assert_numpy_array_equal(result, expected) @@ -1293,7 +1293,7 @@ def test_lookup_overflow(self, writable): def test_get_unique(self): s = Series([1, 2, 2**63, 2**63], dtype=np.uint64) exp = np.array([1, 2, 2**63], dtype=np.uint64) - tm.assert_numpy_array_equal(s.unique(), exp) + tm.assert_numpy_array_equal(s.unique(raw=True), exp) @pytest.mark.parametrize('nvals', [0, 10]) # resizing to 0 is special case @pytest.mark.parametrize('htable, uniques, dtype, safely_resizes', [ diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 47fafe2a900b4..55c72462b0e74 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -442,11 +442,12 @@ def test_value_counts_unique_nunique(self): assert result.index.name is None assert result.name == 'a' - result = o.unique() if isinstance(o, Index): + result = o.unique() assert isinstance(result, o.__class__) tm.assert_index_equal(result, orig) elif is_datetime64tz_dtype(o): + result = o.unique(raw=True) # datetimetz Series returns array of Timestamp assert result[0] == orig[0] for r in result: @@ -454,6 +455,7 @@ def 
test_value_counts_unique_nunique(self): tm.assert_numpy_array_equal(result, orig._values.astype(object).values) else: + result = o.unique(raw=True) tm.assert_numpy_array_equal(result, orig.values) assert o.nunique() == len(np.unique(o.values)) @@ -534,16 +536,18 @@ def test_value_counts_unique_nunique_null(self): assert result_s.index.name is None assert result_s.name == 'a' - result = o.unique() if isinstance(o, Index): + result = o.unique() tm.assert_index_equal(result, Index(values[1:], name='a')) elif is_datetime64tz_dtype(o): + result = o.unique(raw=True) # unable to compare NaT / nan vals = values[2:].astype(object).values tm.assert_numpy_array_equal(result[1:], vals) assert result[0] is pd.NaT else: + result = o.unique(raw=True) tm.assert_numpy_array_equal(result[1:], values[2:]) assert pd.isna(result[0]) @@ -565,7 +569,7 @@ def test_value_counts_inferred(self): tm.assert_index_equal(s.unique(), exp) else: exp = np.unique(np.array(s_values, dtype=np.object_)) - tm.assert_numpy_array_equal(s.unique(), exp) + tm.assert_numpy_array_equal(s.unique(raw=True), exp) assert s.nunique() == 4 # don't sort, have to sort after the fact as not sorting is @@ -605,7 +609,7 @@ def test_value_counts_bins(self): tm.assert_index_equal(s1.unique(), Index([1, 2, 3])) else: exp = np.array([1, 2, 3], dtype=np.int64) - tm.assert_numpy_array_equal(s1.unique(), exp) + tm.assert_numpy_array_equal(s1.unique(raw=True), exp) assert s1.nunique() == 3 @@ -637,7 +641,7 @@ def test_value_counts_bins(self): tm.assert_index_equal(s.unique(), exp) else: exp = np.array(['a', 'b', np.nan, 'd'], dtype=object) - tm.assert_numpy_array_equal(s.unique(), exp) + tm.assert_numpy_array_equal(s.unique(raw=True), exp) assert s.nunique() == 3 s = klass({}) @@ -648,7 +652,7 @@ def test_value_counts_bins(self): if isinstance(s, Index): tm.assert_index_equal(s.unique(), Index([]), exact=False) else: - tm.assert_numpy_array_equal(s.unique(), np.array([]), + tm.assert_numpy_array_equal(s.unique(raw=True), 
np.array([]), check_dtype=False) assert s.nunique() == 0 @@ -681,7 +685,7 @@ def test_value_counts_datetime64(self, klass): if isinstance(s, Index): tm.assert_index_equal(s.unique(), DatetimeIndex(expected)) else: - tm.assert_numpy_array_equal(s.unique(), expected) + tm.assert_numpy_array_equal(s.unique(raw=True), expected) assert s.nunique() == 3 @@ -697,7 +701,10 @@ def test_value_counts_datetime64(self, klass): expected_s[pd.NaT] = 1 tm.assert_series_equal(result, expected_s) - unique = s.unique() + if isinstance(s, Index): + unique = s.unique() + else: + unique = s.unique(raw=True) assert unique.dtype == 'datetime64[ns]' # numpy_array_equal cannot compare pd.NaT @@ -723,7 +730,7 @@ def test_value_counts_datetime64(self, klass): if isinstance(td, Index): tm.assert_index_equal(td.unique(), expected) else: - tm.assert_numpy_array_equal(td.unique(), expected.values) + tm.assert_numpy_array_equal(td.unique(raw=True), expected.values) td2 = timedelta(1) + (df.dt - df.dt) td2 = klass(td2, name='dt') From 9601d6be795a29ef40f4cb00e687c75c5e207dd0 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Wed, 5 Dec 2018 08:28:56 +0100 Subject: [PATCH 4/5] Add kwarg to Index --- pandas/core/indexes/base.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index a5b8e22070923..b10a239e74339 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1985,9 +1985,21 @@ def dropna(self, how='any'): .. versionadded:: 0.23.0 + return_inverse : boolean, default False + Whether to return the inverse of the unique values. If True, the + output will be a tuple where the second component is again an + np.ndarray that contains the mapping between the indices of the + elements in the calling Categorical and their locations in the + unique values. See examples for how to reconstruct. + + .. 
versionadded:: 0.24.0 + Returns ------- - Index without duplicates + uniques : Index + The ``Index`` without duplicates + inverse : np.ndarray (if `return_inverse=True`) + The inverse from the `uniques` back to the calling ``Index``. See Also -------- @@ -1996,9 +2008,14 @@ def dropna(self, how='any'): """) @Appender(_index_shared_docs['index_unique'] % _index_doc_kwargs) - def unique(self, level=None): + def unique(self, level=None, return_inverse=False): if level is not None: self._validate_index_level(level) + + if return_inverse: + result, inverse = super(Index, self).unique(return_inverse=True) + return self._shallow_copy(result), inverse + result = super(Index, self).unique() return self._shallow_copy(result) From 6fd279a6b8d562bca9f4ddb6a44bd02c15320b27 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Wed, 5 Dec 2018 08:31:40 +0100 Subject: [PATCH 5/5] Whatsnew --- doc/source/whatsnew/v0.24.0.rst | 60 +++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 15476c3bc2e13..2d08dc194befa 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -320,6 +320,64 @@ Example: See the :ref:`advanced docs on renaming` for more details. +.. _whatsnew_0240.enhancements.unique: + +Changes to the ``unique``-method +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The three related methods :meth:`pandas.unique`, :meth:`Series.unique` and +:meth:`Index.unique` now support the keyword ``return_inverse``, which, if passed, +makes the output a tuple where the second component is an object that contains the +mapping from the indices of the values to their location in the returned unique values. + +.. 
ipython:: python + + idx = pd.Index([1, 0, 0, 1]) + uniques, inverse = idx.unique(return_inverse=True) + uniques + inverse + reconstruct = uniques[inverse] + reconstruct.equals(idx) + +For :class:`Series`, the ``unique`` method has also gained the ``raw``-keyword, which +allows toggling between the behavior before v.0.24 (returning an ``np.ndarray`` +or ``Categorical``), and the future behavior of returning a ``Series``. + +.. ipython:: python + + pd.Series([1, 1, 3, 2], name='A').unique(raw=False) + pd.Series([1, 1, 3, 2], name='A').unique(raw=True) + +The ``return_inverse``-keyword is only available if ``raw=False``, since it is necessary +to reconstruct both the values and the index of a ``Series`` for an inverse (to illustrate +that the index is maintained, we pass a non-default index in the example below). + +.. ipython:: python + + animals = pd.Series(['lama', 'cow', 'lama', 'beetle', 'lama'], + index=[1, 4, 9, 16, 25]) + animals_unique, inverse = animals.unique(raw=False, return_inverse=True) + animals_unique + inverse + +This can be used to reconstruct the original object from its unique values as follows: + +.. ipython:: python + + reconstruct = animals_unique.reindex(inverse) + reconstruct + +We see that the values of ``animals`` get reconstructed correctly, but the index does +not match yet -- consequently, the last step is to correctly set the index. + + +.. ipython:: python + + reconstruct.index = inverse.index + reconstruct + reconstruct.equals(animals) + + .. _whatsnew_0240.enhancements.other: Other Enhancements @@ -1103,6 +1161,8 @@ Deprecations - :meth:`DataFrame.to_stata`, :meth:`read_stata`, :class:`StataReader` and :class:`StataWriter` have deprecated the ``encoding`` argument. The encoding of a Stata dta file is determined by the file type and cannot be changed (:issue:`21244`) - :meth:`MultiIndex.to_hierarchical` is deprecated and will be removed in a future version (:issue:`21613`) - :meth:`Series.ptp` is deprecated. 
Use ``numpy.ptp`` instead (:issue:`21614`) +- :meth:`Series.unique` has deprecated returning an array and will return a Series in the future. The behavior can be controlled by the ``raw``-keyword. + The recommended method to get an array is to pass ``raw=False`` and use ``.array`` on the result. - :meth:`Series.compress` is deprecated. Use ``Series[condition]`` instead (:issue:`18262`) - The signature of :meth:`Series.to_csv` has been uniformed to that of :meth:`DataFrame.to_csv`: the name of the first argument is now ``path_or_buf``, the order of subsequent arguments has changed, the ``header`` argument now defaults to ``True``. (:issue:`19715`) - :meth:`Categorical.from_codes` has deprecated providing float values for the ``codes`` argument. (:issue:`21767`)