From b61ac0e5dd516ae8ef3198783cee5b118dc928c4 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sun, 2 Dec 2018 21:54:21 +0100 Subject: [PATCH 1/5] API/ENH/DEPR: Series.unique returns Series; .unique gets return_inverse --- pandas/core/algorithms.py | 10 +- pandas/core/arrays/categorical.py | 28 ++++- pandas/core/base.py | 19 ++- pandas/core/series.py | 192 ++++++++++++++++++++++++++---- 4 files changed, 217 insertions(+), 32 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 1a4368ee8ea98..6fb568d20a426 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -271,7 +271,7 @@ def match(to_match, values, na_sentinel=-1): return result -def unique(values): +def unique(values, return_inverse=False): """ Hash table-based unique. Uniques are returned in order of appearance. This does NOT sort. @@ -355,7 +355,11 @@ def unique(values): htable, _, values, dtype, ndtype = _get_hashtable_algo(values) table = htable(len(values)) - uniques = table.unique(values) + if return_inverse: + uniques, inverse = table.unique(values, return_inverse=True) + else: + uniques = table.unique(values) + uniques = _reconstruct_data(uniques, dtype, original) if isinstance(original, ABCSeries) and is_datetime64tz_dtype(dtype): @@ -365,6 +369,8 @@ def unique(values): # TODO: it must return DatetimeArray with tz in pandas 2.0 uniques = uniques.astype(object).values + if return_inverse: + return uniques, inverse return uniques diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index ac1c34edba914..b2f37f350c95b 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2249,7 +2249,7 @@ def mode(self, dropna=True): codes = sorted(htable.mode_int64(ensure_int64(codes), dropna)) return self._constructor(values=codes, dtype=self.dtype, fastpath=True) - def unique(self): + def unique(self, return_inverse=False): """ Return the ``Categorical`` which ``categories`` and ``codes`` are 
unique. Unused categories are NOT returned. @@ -2259,9 +2259,22 @@ def unique(self): - ordered category: values are sorted by appearance order, categories keeps existing order. + Parameters + ---------- + return_inverse : boolean, default False + Whether to return the inverse of the unique values. If True, the + output will be a tuple where the second component is again an + np.ndarray that contains the mapping between the indices of the + elements in the calling Categorical and their locations in the + unique values. See examples for how to reconstruct. + + .. versionadded:: 0.24.0 + Returns ------- - unique values : ``Categorical`` + uniques : ``Categorical`` + inverse : np.ndarray (if `return_inverse=True`) + The inverse from the `uniques` back to the calling ``Categorical``. Examples -------- @@ -2293,7 +2306,10 @@ def unique(self): """ # unlike np.unique, unique1d does not sort - unique_codes = unique1d(self.codes) + if return_inverse: + unique_codes, inverse = unique1d(self.codes, return_inverse=True) + else: + unique_codes = unique1d(self.codes, return_inverse=False) cat = self.copy() # keep nan in codes @@ -2303,7 +2319,11 @@ def unique(self): take_codes = unique_codes[unique_codes != -1] if self.ordered: take_codes = np.sort(take_codes) - return cat.set_categories(cat.categories.take(take_codes)) + result = cat.set_categories(cat.categories.take(take_codes)) + + if return_inverse: + return result, inverse + return result def _values_for_factorize(self): codes = self.codes.astype('int64') diff --git a/pandas/core/base.py b/pandas/core/base.py index e7c3a45a710e0..7d1cf71c82e3b 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1208,15 +1208,24 @@ def value_counts(self, normalize=False, sort=True, ascending=False, normalize=normalize, bins=bins, dropna=dropna) return result - def unique(self): + def unique(self, return_inverse=False): values = self._values - if hasattr(values, 'unique'): - - result = values.unique() + if 
is_extension_array_dtype(values): + if return_inverse: + # as long as return_inverse is not part of the EA.unique + # contract, test if this works + try: + result = values.unique(return_inverse=return_inverse) + except TypeError: + raise ValueError('extension array of dtype {dtype} does ' + 'not yet support unique with ' + 'return_inverse.') + else: + result = values.unique() else: from pandas.core.algorithms import unique1d - result = unique1d(values) + result = unique1d(values, return_inverse=return_inverse) return result diff --git a/pandas/core/series.py b/pandas/core/series.py index 8d4d7677cca44..8b162d86c3a31 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1502,18 +1502,51 @@ def mode(self, dropna=True): # TODO: Add option for bins like value_counts() return algorithms.mode(self, dropna=dropna) - def unique(self): + def unique(self, return_inverse=False, raw=None): """ Return unique values of Series object. Uniques are returned in order of appearance. Hash table-based unique, therefore does NOT sort. + Parameters + ---------- + return_inverse : boolean, default False + Whether to return the inverse of the unique values. If True, the + output will be a tuple where the second component is again a Series + that contains the mapping between the indices of the elements in + the calling Series and their locations in the unique values. See + examples for how to reconstruct. + + Using `return_inverse=True` is not compatible with `raw=True`. + + .. versionadded:: 0.24.0 + + raw : boolean or None, default None + This parameter switches between different return types. If it is + True, the result will be `ndarray` (resp. a :class:`pd.Categorical` + in case of categorical data), which corresponds to the behavior + before v.0.24. + + If False (the future default behavior, starting with v.1.0), it + will always return a Series of the same type as the caller. + + .. 
versionadded:: 0.24.0 + Returns ------- - ndarray or Categorical - The unique values returned as a NumPy array. In case of categorical - data type, returned as a Categorical. + uniques : Series (if `raw=False`), else ndarray or Categorical + If `raw=False`, this is a Series which contains the uniques in + order of their appearance (and with their respective indices). + If `raw=True`, the unique values are returned as a numpy array, + or as a Categorical (in case of categorical data). + inverse : Series (if `return_inverse=True`) + The inverse from the `uniques` back to the calling Series. + + Raises + ------ + ValueError + If `raw=True` and `return_inverse=True`. See Also -------- @@ -1522,40 +1555,157 @@ def unique(self): Examples -------- - >>> pd.Series([2, 1, 3, 3], name='A').unique() - array([2, 1, 3]) + If `raw=False`, the output is a Series: + + >>> pd.Series([1, 1, 3, 2], name='A').unique(raw=False) + 0 1 + 2 3 + 3 2 + Name: A, dtype: int64 + + If `raw=True`, the output is an ndarray (if the data is not + categorical): + + >>> pd.Series([1, 1, 3, 2], name='A').unique(raw=True) + array([1, 3, 2]) + + This method also deals well with timestamps, - >>> pd.Series([pd.Timestamp('2016-01-01') for _ in range(3)]).unique() - array(['2016-01-01T00:00:00.000000000'], dtype='datetime64[ns]') + >>> pd.Series([pd.Timestamp('2016-01-01') + ... for _ in range(3)]).unique(raw=False) + 0 2016-01-01 + dtype: datetime64[ns] + + as well as timezones: >>> pd.Series([pd.Timestamp('2016-01-01', tz='US/Eastern') - ... for _ in range(3)]).unique() - array([Timestamp('2016-01-01 00:00:00-0500', tz='US/Eastern')], - dtype=object) + ... for _ in range(3)]).unique(raw=False) + 0 2016-01-01 00:00:00-05:00 + dtype: datetime64[ns, US/Eastern] An unordered Categorical will return categories in the order of appearance. 
- >>> pd.Series(pd.Categorical(list('baabc'))).unique() + >>> pd.Series(pd.Categorical(list('baabc'))).unique(raw=False) + 0 b + 1 a + 4 c + dtype: category + Categories (3, object): [b, a, c] + + >>> pd.Series(pd.Categorical(list('baabc'))).unique(raw=True) [b, a, c] Categories (3, object): [b, a, c] An ordered Categorical preserves the category ordering. >>> pd.Series(pd.Categorical(list('baabc'), categories=list('abc'), - ... ordered=True)).unique() - [b, a, c] + ... ordered=True)).unique(raw=False) + 0 b + 1 a + 4 c + dtype: category Categories (3, object): [a < b < c] - """ - result = super(Series, self).unique() - if is_datetime64tz_dtype(self.dtype): - # we are special casing datetime64tz_dtype - # to return an object array of tz-aware Timestamps + As an example for dealing with `return_inverse`, we consider the + following example (the reason we use a non-default index is only for + demonstration purposes, because this is also something the inverse + needs to reconstruct): + + >>> animals = pd.Series(['lama', 'cow', 'lama', 'beetle', 'lama'], + ... index=[1, 4, 9, 16, 25]) + + >>> animals_unique, inverse = animals.unique(raw=False, + ... return_inverse=True) + >>> animals_unique + 1 lama + 4 cow + 16 beetle + dtype: object + + >>> inverse + 1 1 + 4 4 + 9 1 + 16 16 + 25 1 + dtype: int64 + + This can be used to reconstruct the original object from its unique + values as follows + + >>> reconstruct = animals_unique.reindex(inverse) + >>> reconstruct + 1 lama + 4 cow + 1 lama + 16 beetle + 1 lama + dtype: object + + We see that the values of `animals` get reconstructed correctly, but + the index does not match yet -- consequently, the last step is to + correctly set the index. + + >>> reconstruct.index = inverse.index + >>> reconstruct + 1 lama + 4 cow + 9 lama + 16 beetle + 25 lama + dtype: object + + >>> reconstruct.equals(animals) + True + """ + if raw is None: + msg = ('A future version of pandas will return a Series here. 
' + 'To keep returning an ndarray / Categorical (the behavior ' + 'before v.0.24) and silence this warning, pass the keyword ' + '`raw=True`. To return a Series and silence this warning, ' + 'pass `raw=False`. In the future, the default will switch ' + 'to `raw=False`, and therefore, if an array is required as ' + 'output, the recommended way is to pass `raw=False` and ' + 'use `.array` on the result.') + warnings.warn(msg, FutureWarning, stacklevel=2) + raw = True + + if raw not in [True, False]: + raise ValueError('The keyword "raw" must be either True or False') + if return_inverse not in [True, False]: + raise ValueError('The keyword "return_inverse" must be either ' + 'True or False') + + if raw and return_inverse: + raise ValueError('The keyword "return_inverse=True" is not ' + 'supported if "raw=True"') + elif raw: + result = super(Series, self).unique() + + if is_datetime64tz_dtype(self.dtype): + # we are special casing datetime64tz_dtype + # to return an object array of tz-aware Timestamps + + # TODO: it must return DatetimeArray with tz in pandas 2.0 + result = result.astype(object).values + return result + + # for raw=False, we need the inverse in any case + result_array, inverse_array = super(Series, + self).unique(return_inverse=True) + + # construct indices of first occurrences. In principle, this could be + # returned from the cython methods, but this is not compatible with the + # (shared) signature for Index.unique + idx = ~Series(inverse_array).duplicated(keep='first') - # TODO: it must return DatetimeArray with tz in pandas 2.0 - result = result.astype(object).values + result = self._constructor(result_array, + index=self.index[idx]).__finalize__(self) + if return_inverse: + inverse = Series(result.index[inverse_array], index=self.index) + return result, inverse return result def drop_duplicates(self, keep='first', inplace=False): From 72929218ebfea769b3ead6db03baa24fd3c180e1 Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Wed, 5 Dec 2018 02:04:25 +0100 Subject: [PATCH 2/5] Fixes for tests --- pandas/core/algorithms.py | 5 ++++- pandas/core/base.py | 5 ++++- pandas/core/series.py | 6 +++--- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 6fb568d20a426..05e8df64d10cb 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -347,7 +347,10 @@ def unique(values, return_inverse=False): values = _ensure_arraylike(values) - if is_extension_array_dtype(values): + if isinstance(values, ABCSeries): + # this calls through Series, need raw=True to not raise warning + return values.unique(raw=True) + elif is_extension_array_dtype(values): # Dispatch to extension dtype's unique. return values.unique() diff --git a/pandas/core/base.py b/pandas/core/base.py index 7d1cf71c82e3b..d42027a491abc 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1244,7 +1244,10 @@ def nunique(self, dropna=True): ------- nunique : int """ - uniqs = self.unique() + if isinstance(self, ABCSeries): + uniqs = self.unique(raw=True) + else: + uniqs = self.unique() n = len(uniqs) if dropna and isna(uniqs).any(): n -= 1 diff --git a/pandas/core/series.py b/pandas/core/series.py index 8b162d86c3a31..1ce256f21f097 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1646,7 +1646,7 @@ def unique(self, return_inverse=False, raw=None): We see that the values of `animals` get reconstructed correctly, but the index does not match yet -- consequently, the last step is to correctly set the index. 
- + >>> reconstruct.index = inverse.index >>> reconstruct 1 lama @@ -1682,11 +1682,11 @@ def unique(self, return_inverse=False, raw=None): 'supported if "raw=True"') elif raw: result = super(Series, self).unique() - + if is_datetime64tz_dtype(self.dtype): # we are special casing datetime64tz_dtype # to return an object array of tz-aware Timestamps - + # TODO: it must return DatetimeArray with tz in pandas 2.0 result = result.astype(object).values return result From 10432d4c081d6208dfd39d84b7d559fb373388b2 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Wed, 5 Dec 2018 02:04:41 +0100 Subject: [PATCH 3/5] TST: first pass at tests --- .../arrays/categorical/test_analytics.py | 6 ++--- pandas/tests/arrays/sparse/test_array.py | 2 +- pandas/tests/extension/base/methods.py | 10 ++++++-- pandas/tests/frame/test_indexing.py | 2 +- pandas/tests/plotting/common.py | 2 +- pandas/tests/reshape/merge/test_merge.py | 4 +-- pandas/tests/series/test_duplicates.py | 15 +++++------ pandas/tests/test_algos.py | 6 ++--- pandas/tests/test_base.py | 25 ++++++++++++------- 9 files changed, 43 insertions(+), 29 deletions(-) diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py index 4251273e424dd..bd36cd1e0782f 100644 --- a/pandas/tests/arrays/categorical/test_analytics.py +++ b/pandas/tests/arrays/categorical/test_analytics.py @@ -183,13 +183,13 @@ def test_unique_index_series(self): tm.assert_categorical_equal(c.unique(), exp) tm.assert_index_equal(Index(c).unique(), Index(exp)) - tm.assert_categorical_equal(Series(c).unique(), exp) + tm.assert_categorical_equal(Series(c).unique(raw=True), exp) c = Categorical([1, 1, 2, 2], categories=[3, 2, 1]) exp = Categorical([1, 2], categories=[1, 2]) tm.assert_categorical_equal(c.unique(), exp) tm.assert_index_equal(Index(c).unique(), Index(exp)) - tm.assert_categorical_equal(Series(c).unique(), exp) + tm.assert_categorical_equal(Series(c).unique(raw=True), exp) c = Categorical([3, 1, 
2, 2, 1], categories=[3, 2, 1], ordered=True) # Categorical.unique keeps categories order if ordered=True @@ -197,7 +197,7 @@ def test_unique_index_series(self): tm.assert_categorical_equal(c.unique(), exp) tm.assert_index_equal(Index(c).unique(), Index(exp)) - tm.assert_categorical_equal(Series(c).unique(), exp) + tm.assert_categorical_equal(Series(c).unique(raw=True), exp) def test_shift(self): # GH 9416 diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index b8cef92f6a6d4..f4f20428cad1d 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -1150,7 +1150,7 @@ def test_first_fill_value_loc(arr, loc): ]) def test_unique_na_fill(arr, fill_value): a = pd.SparseArray(arr, fill_value=fill_value).unique() - b = pd.Series(arr).unique() + b = pd.Series(arr).unique(raw=True) assert isinstance(a, SparseArray) a = np.asarray(a) tm.assert_numpy_array_equal(a, b) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index e9a89c1af2f22..fea68725912ce 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas.core.dtypes.generic import ABCSeries + import pandas as pd import pandas.util.testing as tm @@ -75,11 +77,15 @@ def test_sort_values_frame(self, data_for_sorting, ascending): self.assert_frame_equal(result, expected) @pytest.mark.parametrize('box', [pd.Series, lambda x: x]) - @pytest.mark.parametrize('method', [lambda x: x.unique(), pd.unique]) + @pytest.mark.parametrize('method', [lambda x, **kwargs: x.unique(**kwargs), + pd.unique]) def test_unique(self, data, box, method): duplicated = box(data._from_sequence([data[0], data[0]])) - result = method(duplicated) + if isinstance(duplicated, ABCSeries) and method != pd.unique: + result = method(duplicated, raw=True) + else: + result = method(duplicated) assert len(result) == 1 assert 
isinstance(result, type(data)) diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/test_indexing.py index 0a61c844f1af8..d94b27f16397a 100644 --- a/pandas/tests/frame/test_indexing.py +++ b/pandas/tests/frame/test_indexing.py @@ -2226,7 +2226,7 @@ def verify(df, level, idx, indexer, check_index_type=True): for x in [2, 3, 3, 2, 3, 2, 3, 2]]), 'joline': np.random.randn(20).round(3) * 10}) - for idx in permutations(df['jim'].unique()): + for idx in permutations(df['jim'].unique(raw=True)): for i in range(3): verify_first_level(df, 'jim', idx[:i + 1]) diff --git a/pandas/tests/plotting/common.py b/pandas/tests/plotting/common.py index f41a3a10604af..b5ce068f7bf2c 100644 --- a/pandas/tests/plotting/common.py +++ b/pandas/tests/plotting/common.py @@ -156,7 +156,7 @@ def _check_visible(self, collections, visible=True): assert patch.get_visible() == visible def _get_colors_mapped(self, series, colors): - unique = series.unique() + unique = series.unique(raw=True) # unique and colors length can be differed # depending on slice value mapped = dict(zip(unique, colors)) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 94e180f9328d6..fb5cbfbefad0e 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -433,7 +433,7 @@ def test_merge_nosort(self): datetime(2010, 2, 3), datetime(2012, 2, 3)]} df = DataFrame.from_dict(d) - var3 = df.var3.unique() + var3 = df.var3.unique(raw=True) var3.sort() new = DataFrame.from_dict({"var3": var3, "var8": np.random.random(7)}) @@ -442,7 +442,7 @@ def test_merge_nosort(self): exp = merge(df, new, on='var3', sort=False) assert_frame_equal(result, exp) - assert (df.var3.unique() == result.var3.unique()).all() + assert (df.var3.unique(raw=True) == result.var3.unique(raw=True)).all() def test_merge_nan_right(self): df1 = DataFrame({"i1": [0, 1], "i2": [0, 1]}) diff --git a/pandas/tests/series/test_duplicates.py 
b/pandas/tests/series/test_duplicates.py index 26222637e3509..1d65e52aee0db 100644 --- a/pandas/tests/series/test_duplicates.py +++ b/pandas/tests/series/test_duplicates.py @@ -26,37 +26,38 @@ def test_unique(): # GH714 also, dtype=float s = Series([1.2345] * 100) s[::2] = np.nan - result = s.unique() + result = s.unique(raw=True) assert len(result) == 2 s = Series([1.2345] * 100, dtype='f4') s[::2] = np.nan - result = s.unique() + result = s.unique(raw=True) assert len(result) == 2 # NAs in object arrays #714 s = Series(['foo'] * 100, dtype='O') s[::2] = np.nan - result = s.unique() + result = s.unique(raw=True) assert len(result) == 2 # decision about None s = Series([1, 2, 3, None, None, None], dtype=object) - result = s.unique() + result = s.unique(raw=True) expected = np.array([1, 2, 3, None], dtype=object) tm.assert_numpy_array_equal(result, expected) # GH 18051 s = Series(Categorical([])) - tm.assert_categorical_equal(s.unique(), Categorical([]), check_dtype=False) + tm.assert_categorical_equal(s.unique(raw=True), Categorical([]), + check_dtype=False) s = Series(Categorical([np.nan])) - tm.assert_categorical_equal(s.unique(), Categorical([np.nan]), + tm.assert_categorical_equal(s.unique(raw=True), Categorical([np.nan]), check_dtype=False) def test_unique_data_ownership(): # it works! 
#1807 - Series(Series(["a", "c", "b"]).unique()).sort_values() + Series(Series(["a", "c", "b"]).unique(raw=True)).sort_values() def test_is_unique(): diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index c9d403f6696af..2ad67a93b7345 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -434,7 +434,7 @@ def test_categorical(self): # Series of categorical dtype s = Series(Categorical(list('baabc')), name='foo') - result = s.unique() + result = s.unique(raw=True) tm.assert_categorical_equal(result, expected) result = pd.unique(s) @@ -455,7 +455,7 @@ def test_datetime64tz_aware(self): result = Series( Index([Timestamp('20160101', tz='US/Eastern'), - Timestamp('20160101', tz='US/Eastern')])).unique() + Timestamp('20160101', tz='US/Eastern')])).unique(raw=True) expected = np.array([Timestamp('2016-01-01 00:00:00-0500', tz='US/Eastern')], dtype=object) tm.assert_numpy_array_equal(result, expected) @@ -1293,7 +1293,7 @@ def test_lookup_overflow(self, writable): def test_get_unique(self): s = Series([1, 2, 2**63, 2**63], dtype=np.uint64) exp = np.array([1, 2, 2**63], dtype=np.uint64) - tm.assert_numpy_array_equal(s.unique(), exp) + tm.assert_numpy_array_equal(s.unique(raw=True), exp) @pytest.mark.parametrize('nvals', [0, 10]) # resizing to 0 is special case @pytest.mark.parametrize('htable, uniques, dtype, safely_resizes', [ diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 47fafe2a900b4..55c72462b0e74 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -442,11 +442,12 @@ def test_value_counts_unique_nunique(self): assert result.index.name is None assert result.name == 'a' - result = o.unique() if isinstance(o, Index): + result = o.unique() assert isinstance(result, o.__class__) tm.assert_index_equal(result, orig) elif is_datetime64tz_dtype(o): + result = o.unique(raw=True) # datetimetz Series returns array of Timestamp assert result[0] == orig[0] for r in result: @@ -454,6 +455,7 @@ def 
test_value_counts_unique_nunique(self): tm.assert_numpy_array_equal(result, orig._values.astype(object).values) else: + result = o.unique(raw=True) tm.assert_numpy_array_equal(result, orig.values) assert o.nunique() == len(np.unique(o.values)) @@ -534,16 +536,18 @@ def test_value_counts_unique_nunique_null(self): assert result_s.index.name is None assert result_s.name == 'a' - result = o.unique() if isinstance(o, Index): + result = o.unique() tm.assert_index_equal(result, Index(values[1:], name='a')) elif is_datetime64tz_dtype(o): + result = o.unique(raw=True) # unable to compare NaT / nan vals = values[2:].astype(object).values tm.assert_numpy_array_equal(result[1:], vals) assert result[0] is pd.NaT else: + result = o.unique(raw=True) tm.assert_numpy_array_equal(result[1:], values[2:]) assert pd.isna(result[0]) @@ -565,7 +569,7 @@ def test_value_counts_inferred(self): tm.assert_index_equal(s.unique(), exp) else: exp = np.unique(np.array(s_values, dtype=np.object_)) - tm.assert_numpy_array_equal(s.unique(), exp) + tm.assert_numpy_array_equal(s.unique(raw=True), exp) assert s.nunique() == 4 # don't sort, have to sort after the fact as not sorting is @@ -605,7 +609,7 @@ def test_value_counts_bins(self): tm.assert_index_equal(s1.unique(), Index([1, 2, 3])) else: exp = np.array([1, 2, 3], dtype=np.int64) - tm.assert_numpy_array_equal(s1.unique(), exp) + tm.assert_numpy_array_equal(s1.unique(raw=True), exp) assert s1.nunique() == 3 @@ -637,7 +641,7 @@ def test_value_counts_bins(self): tm.assert_index_equal(s.unique(), exp) else: exp = np.array(['a', 'b', np.nan, 'd'], dtype=object) - tm.assert_numpy_array_equal(s.unique(), exp) + tm.assert_numpy_array_equal(s.unique(raw=True), exp) assert s.nunique() == 3 s = klass({}) @@ -648,7 +652,7 @@ def test_value_counts_bins(self): if isinstance(s, Index): tm.assert_index_equal(s.unique(), Index([]), exact=False) else: - tm.assert_numpy_array_equal(s.unique(), np.array([]), + tm.assert_numpy_array_equal(s.unique(raw=True), 
np.array([]), check_dtype=False) assert s.nunique() == 0 @@ -681,7 +685,7 @@ def test_value_counts_datetime64(self, klass): if isinstance(s, Index): tm.assert_index_equal(s.unique(), DatetimeIndex(expected)) else: - tm.assert_numpy_array_equal(s.unique(), expected) + tm.assert_numpy_array_equal(s.unique(raw=True), expected) assert s.nunique() == 3 @@ -697,7 +701,10 @@ def test_value_counts_datetime64(self, klass): expected_s[pd.NaT] = 1 tm.assert_series_equal(result, expected_s) - unique = s.unique() + if isinstance(s, Index): + unique = s.unique() + else: + unique = s.unique(raw=True) assert unique.dtype == 'datetime64[ns]' # numpy_array_equal cannot compare pd.NaT @@ -723,7 +730,7 @@ def test_value_counts_datetime64(self, klass): if isinstance(td, Index): tm.assert_index_equal(td.unique(), expected) else: - tm.assert_numpy_array_equal(td.unique(), expected.values) + tm.assert_numpy_array_equal(td.unique(raw=True), expected.values) td2 = timedelta(1) + (df.dt - df.dt) td2 = klass(td2, name='dt') From 9601d6be795a29ef40f4cb00e687c75c5e207dd0 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Wed, 5 Dec 2018 08:28:56 +0100 Subject: [PATCH 4/5] Add kwarg to Index --- pandas/core/indexes/base.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index a5b8e22070923..b10a239e74339 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1985,9 +1985,21 @@ def dropna(self, how='any'): .. versionadded:: 0.23.0 + return_inverse : boolean, default False + Whether to return the inverse of the unique values. If True, the + output will be a tuple where the second component is again an + np.ndarray that contains the mapping between the indices of the + elements in the calling Categorical and their locations in the + unique values. See examples for how to reconstruct. + + .. 
versionadded:: 0.24.0 + Returns ------- - Index without duplicates + uniques : Index + The ``Index`` without duplicates + inverse : np.ndarray (if `return_inverse=True`) + The inverse from the `uniques` back to the calling ``Index``. See Also -------- @@ -1996,9 +2008,14 @@ def dropna(self, how='any'): """) @Appender(_index_shared_docs['index_unique'] % _index_doc_kwargs) - def unique(self, level=None): + def unique(self, level=None, return_inverse=False): if level is not None: self._validate_index_level(level) + + if return_inverse: + result, inverse = super(Index, self).unique(return_inverse=True) + return self._shallow_copy(result), inverse + result = super(Index, self).unique() return self._shallow_copy(result) From 6fd279a6b8d562bca9f4ddb6a44bd02c15320b27 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Wed, 5 Dec 2018 08:31:40 +0100 Subject: [PATCH 5/5] Whatsnew --- doc/source/whatsnew/v0.24.0.rst | 60 +++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 15476c3bc2e13..2d08dc194befa 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -320,6 +320,64 @@ Example: See the :ref:`advanced docs on renaming` for more details. +.. _whatsnew_0240.enhancements.unique: + +Changes to the ``unique``-method +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The three related methods :meth:`pandas.unique`, :meth:`Series.unique` and +:meth:`Index.unique` now support the keyword ``return_inverse``, which, if passed, +makes the output a tuple where the second component is an object that contains the +mapping from the indices of the values to their location in the returned unique values. + +.. 
ipython:: python + + idx = pd.Index([1, 0, 0, 1]) + uniques, inverse = idx.unique(return_inverse=True) + uniques + inverse + reconstruct = uniques[inverse] + reconstruct.equals(idx) + +For :class:`Series`, the ``unique`` method has also gained the ``raw``-keyword, which +allows toggling between the behavior before v.0.24 (returning an ``np.ndarray`` +or ``Categorical``), and the future behavior of returning a ``Series``. + +.. ipython:: python + + pd.Series([1, 1, 3, 2], name='A').unique(raw=False) + pd.Series([1, 1, 3, 2], name='A').unique(raw=True) + +The ``return_inverse``-keyword is only available if ``raw=False``, since it is necessary +to reconstruct both the values and the index of a ``Series`` for an inverse (to illustrate +that the index is maintained, we pass a non-default index in the example below). + +.. ipython:: python + + animals = pd.Series(['lama', 'cow', 'lama', 'beetle', 'lama'], + index=[1, 4, 9, 16, 25]) + animals_unique, inverse = animals.unique(raw=False, return_inverse=True) + animals_unique + inverse + +This can be used to reconstruct the original object from its unique values as follows: + +.. ipython:: python + + reconstruct = animals_unique.reindex(inverse) + reconstruct + +We see that the values of ``animals`` get reconstructed correctly, but the index does +not match yet -- consequently, the last step is to correctly set the index. + + +.. ipython:: python + + reconstruct.index = inverse.index + reconstruct + reconstruct.equals(animals) + + .. _whatsnew_0240.enhancements.other: Other Enhancements @@ -1103,6 +1161,8 @@ Deprecations - :meth:`DataFrame.to_stata`, :meth:`read_stata`, :class:`StataReader` and :class:`StataWriter` have deprecated the ``encoding`` argument. The encoding of a Stata dta file is determined by the file type and cannot be changed (:issue:`21244`) - :meth:`MultiIndex.to_hierarchical` is deprecated and will be removed in a future version (:issue:`21613`) - :meth:`Series.ptp` is deprecated. 
Use ``numpy.ptp`` instead (:issue:`21614`) +- :meth:`Series.unique` has deprecated returning an array and will return a Series in the future. The behavior can be controlled by the ``raw``-keyword. + The recommended method to get an array is to pass ``raw=False`` and use ``.array`` on the result. - :meth:`Series.compress` is deprecated. Use ``Series[condition]`` instead (:issue:`18262`) - The signature of :meth:`Series.to_csv` has been uniformed to that of :meth:`DataFrame.to_csv`: the name of the first argument is now ``path_or_buf``, the order of subsequent arguments has changed, the ``header`` argument now defaults to ``True``. (:issue:`19715`) - :meth:`Categorical.from_codes` has deprecated providing float values for the ``codes`` argument. (:issue:`21767`)