From aa546c4465d80d1362ce02dee5a0a62d808db060 Mon Sep 17 00:00:00 2001
From: h-vetinari <33685575+h-vetinari@users.noreply.github.com>
Date: Thu, 29 Nov 2018 18:21:24 +0100
Subject: [PATCH] ENH: Add return_inverse to cython-unique; unify
 unique/factorize-code (#23400)

---
 pandas/_libs/hashtable_class_helper.pxi.in | 478 ++++++++++++---------
 pandas/core/algorithms.py                  |   2 +-
 pandas/tests/test_algos.py                 |  10 +-
 3 files changed, 293 insertions(+), 197 deletions(-)

diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
index a71023ed34f44d..7f4c2a6410870d 100644
--- a/pandas/_libs/hashtable_class_helper.pxi.in
+++ b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -356,11 +356,12 @@ cdef class {{name}}HashTable(HashTable):
 
     @cython.boundscheck(False)
     @cython.wraparound(False)
-    def _factorize(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
-                   Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
-                   object na_value=None):
+    def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
+                Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
+                object na_value=None, bint ignore_na=False,
+                bint return_inverse=False):
         """
-        Calculate unique values and labels (no sorting); ignores all NA-values
+        Calculate unique values and labels (no sorting!)
 
         Parameters
         ----------
@@ -374,13 +375,22 @@ cdef class {{name}}HashTable(HashTable):
             Sentinel value used for all NA-values in inverse
         na_value : object, default None
             Value to identify as missing. If na_value is None, then
-            any value satisfying val!=val are considered missing.
+            any value "val" satisfying val != val is considered missing.
+            If na_value is not None, then _additionally_, any value "val"
+            satisfying val == na_value is considered missing.
+        ignore_na : boolean, default False
+            Whether NA-values should be ignored for calculating the uniques. If
+            True, the labels corresponding to missing values will be set to
+            na_sentinel.
+        return_inverse : boolean, default False
+            Whether the mapping of the original array values to their location
+            in the vector of uniques should be returned.
 
         Returns
         -------
         uniques : ndarray[{{dtype}}]
             Unique values of input, not sorted
-        labels : ndarray[int64]
+        labels : ndarray[int64] (if return_inverse=True)
             The labels from values to uniques
         """
         cdef:
@@ -392,7 +402,8 @@ cdef class {{name}}HashTable(HashTable):
             {{name}}VectorData *ud
             bint use_na_value
 
-        labels = np.empty(n, dtype=np.int64)
+        if return_inverse:
+            labels = np.empty(n, dtype=np.int64)
         ud = uniques.data
         use_na_value = na_value is not None
 
@@ -410,20 +421,19 @@ cdef class {{name}}HashTable(HashTable):
             for i in range(n):
                 val = values[i]
 
-                if val != val or (use_na_value and val == na_value2):
+                if ignore_na and (val != val
+                                  or (use_na_value and val == na_value2)):
+                    # if missing values do not count as unique values (i.e. if
+                    # ignore_na is True), skip the hashtable entry for them,
+                    # and replace the corresponding label with na_sentinel
                     labels[i] = na_sentinel
                     continue
 
                 k = kh_get_{{dtype}}(self.table, val)
 
-                if k != self.table.n_buckets:
-                    # k falls into a previous bucket
-                    idx = self.table.vals[k]
-                    labels[i] = idx
-                else:
+                if k == self.table.n_buckets:
                     # k hasn't been seen yet
                     k = kh_put_{{dtype}}(self.table, val, &ret)
-                    self.table.vals[k] = count
 
                     if needs_resize(ud):
                         with gil:
@@ -433,23 +443,82 @@ cdef class {{name}}HashTable(HashTable):
                                                  "Vector.resize() needed")
                             uniques.resize()
                     append_data_{{dtype}}(ud, val)
-                    labels[i] = count
-                    count += 1
+                    if return_inverse:
+                        self.table.vals[k] = count
+                        labels[i] = count
+                        count += 1
+                elif return_inverse:
+                    # k falls into a previous bucket
+                    # only relevant in case we need to construct the inverse
+                    idx = self.table.vals[k]
+                    labels[i] = idx
+
+        if return_inverse:
+            return uniques.to_array(), np.asarray(labels)
+        return uniques.to_array()
 
-        return np.asarray(labels)
+    def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False):
+        """
+        Calculate unique values and labels (no sorting!)
+
+        Parameters
+        ----------
+        values : ndarray[{{dtype}}]
+            Array of values of which unique will be calculated
+        return_inverse : boolean, default False
+            Whether the mapping of the original array values to their location
+            in the vector of uniques should be returned.
+
+        Returns
+        -------
+        uniques : ndarray[{{dtype}}]
+            Unique values of input, not sorted
+        labels : ndarray[int64] (if return_inverse)
+            The labels from values to uniques
+        """
+        uniques = {{name}}Vector()
+        return self._unique(values, uniques, ignore_na=False,
+                            return_inverse=return_inverse)
 
     def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1,
                   object na_value=None):
-        uniques = {{name}}Vector()
-        labels = self._factorize(values, uniques=uniques,
-                                 na_sentinel=na_sentinel, na_value=na_value)
-        return labels, uniques.to_array()
+        """
+        Calculate unique values and labels (no sorting!)
+
+        Missing values are not included in the "uniques" for this method.
+        The labels for any missing values will be set to "na_sentinel"
+
+        Parameters
+        ----------
+        values : ndarray[{{dtype}}]
+            Array of values of which unique will be calculated
+        na_sentinel : Py_ssize_t, default -1
+            Sentinel value used for all NA-values in inverse
+        na_value : object, default None
+            Value to identify as missing. If na_value is None, then
+            any value "val" satisfying val != val is considered missing.
+            If na_value is not None, then _additionally_, any value "val"
+            satisfying val == na_value is considered missing.
+
+        Returns
+        -------
+        uniques : ndarray[{{dtype}}]
+            Unique values of input, not sorted
+        labels : ndarray[int64]
+            The labels from values to uniques
+        """
+        uniques_vector = {{name}}Vector()
+        return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
+                            na_value=na_value, ignore_na=True,
+                            return_inverse=True)
 
     def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
                    Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
                    object na_value=None):
-        return self._factorize(values, uniques, count_prior=count_prior,
-                               na_sentinel=na_sentinel, na_value=na_value)
+        _, labels = self._unique(values, uniques, count_prior=count_prior,
+                                 na_sentinel=na_sentinel, na_value=na_value,
+                                 ignore_na=True, return_inverse=True)
+        return labels
 
     @cython.boundscheck(False)
     def get_labels_groupby(self, const {{dtype}}_t[:] values):
@@ -496,44 +565,6 @@ cdef class {{name}}HashTable(HashTable):
 
         return np.asarray(labels), arr_uniques
 
-    @cython.boundscheck(False)
-    @cython.wraparound(False)
-    def unique(self, const {{dtype}}_t[:] values):
-        """
-        Calculate unique values without sorting
-
-        Parameters
-        ----------
-        values : ndarray[{{dtype}}]
-            Array of values of which unique will be calculated
-
-        Returns
-        -------
-        uniques : ndarray[{{dtype}}]
-            Unique values of input, not sorted
-        """
-        cdef:
-            Py_ssize_t i, n = len(values)
-            int ret = 0
-            {{dtype}}_t val
-            khiter_t k
-            {{name}}Vector uniques = {{name}}Vector()
-            {{name}}VectorData *ud
-
-        ud = uniques.data
-
-        with nogil:
-            for i in range(n):
-                val = values[i]
-                k = kh_get_{{dtype}}(self.table, val)
-                if k == self.table.n_buckets:
-                    kh_put_{{dtype}}(self.table, val, &ret)
-                    if needs_resize(ud):
-                        with gil:
-                            uniques.resize()
-                    append_data_{{dtype}}(ud, val)
-        return uniques.to_array()
-
 {{endfor}}
 
 
@@ -613,56 +644,6 @@ cdef class StringHashTable(HashTable):
         free(vecs)
         return labels
 
-    @cython.boundscheck(False)
-    @cython.wraparound(False)
-    def unique(self, ndarray[object] values):
-        """
-        Calculate unique values without sorting
-
-        Parameters
-        ----------
-        values : ndarray[object]
-            Array of values of which unique will be calculated
-
-        Returns
-        -------
-        uniques : ndarray[object]
-            Unique values of input, not sorted
-        """
-        cdef:
-            Py_ssize_t i, count, n = len(values)
-            int64_t[:] uindexer
-            int ret = 0
-            object val
-            ObjectVector uniques
-            khiter_t k
-            const char *v
-            const char **vecs
-
-        vecs = <const char **>malloc(n * sizeof(char *))
-        uindexer = np.empty(n, dtype=np.int64)
-        for i in range(n):
-            val = values[i]
-            v = util.get_c_string(val)
-            vecs[i] = v
-
-        count = 0
-        with nogil:
-            for i in range(n):
-                v = vecs[i]
-                k = kh_get_str(self.table, v)
-                if k == self.table.n_buckets:
-                    kh_put_str(self.table, v, &ret)
-                    uindexer[count] = i
-                    count += 1
-        free(vecs)
-
-        # uniques
-        uniques = ObjectVector()
-        for i in range(count):
-            uniques.append(values[uindexer[i]])
-        return uniques.to_array()
-
     @cython.boundscheck(False)
     def lookup(self, ndarray[object] values):
         cdef:
@@ -726,11 +707,12 @@ cdef class StringHashTable(HashTable):
 
     @cython.boundscheck(False)
     @cython.wraparound(False)
-    def _factorize(self, ndarray[object] values, ObjectVector uniques,
-                   Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
-                   object na_value=None):
+    def _unique(self, ndarray[object] values, ObjectVector uniques,
+                Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
+                object na_value=None, bint ignore_na=False,
+                bint return_inverse=False):
         """
-        Calculate unique values and labels (no sorting); ignores all NA-values
+        Calculate unique values and labels (no sorting!)
 
         Parameters
         ----------
@@ -743,13 +725,23 @@ cdef class StringHashTable(HashTable):
         na_sentinel : Py_ssize_t, default -1
             Sentinel value used for all NA-values in inverse
         na_value : object, default None
-            Value to identify as missing
+            Value to identify as missing. If na_value is None, then any value
+            that is not a string is considered missing. If na_value is
+            not None, then _additionally_ any value "val" satisfying
+            val == na_value is considered missing.
+        ignore_na : boolean, default False
+            Whether NA-values should be ignored for calculating the uniques. If
+            True, the labels corresponding to missing values will be set to
+            na_sentinel.
+        return_inverse : boolean, default False
+            Whether the mapping of the original array values to their location
+            in the vector of uniques should be returned.
 
         Returns
         -------
         uniques : ndarray[object]
             Unique values of input, not sorted
-        labels : ndarray[int64]
+        labels : ndarray[int64] (if return_inverse=True)
             The labels from values to uniques
         """
         cdef:
@@ -763,41 +755,50 @@ cdef class StringHashTable(HashTable):
             khiter_t k
             bint use_na_value
 
-        labels = np.zeros(n, dtype=np.int64)
+        if return_inverse:
+            labels = np.zeros(n, dtype=np.int64)
         uindexer = np.empty(n, dtype=np.int64)
         use_na_value = na_value is not None
 
-        # assign pointers and pre-filter out missing
+        # assign pointers and pre-filter out missing (if ignore_na)
         vecs = <const char **>malloc(n * sizeof(char *))
         for i in range(n):
             val = values[i]
 
-            if (isinstance(val, (str, unicode))
-                    and not (use_na_value and val == na_value)):
+            if (ignore_na
+                and (not isinstance(val, (str, unicode))
+                     or (use_na_value and val == na_value))):
+                # if missing values do not count as unique values (i.e. if
+                # ignore_na is True), we can skip the actual value, and
+                # replace the label with na_sentinel directly
+                labels[i] = na_sentinel
+            else:
+                # if ignore_na is False, we also stringify NaN/None/etc.
                 v = util.get_c_string(val)
                 vecs[i] = v
-            else:
-                labels[i] = na_sentinel
 
         # compute
         with nogil:
             for i in range(n):
-                if labels[i] == na_sentinel:
+                if ignore_na and labels[i] == na_sentinel:
+                    # skip entries for ignored missing values (see above)
                     continue
 
                 v = vecs[i]
                 k = kh_get_str(self.table, v)
-                if k != self.table.n_buckets:
-                    # k falls into a previous bucket
-                    idx = self.table.vals[k]
-                    labels[i] = <int64_t>idx
-                else:
+                if k == self.table.n_buckets:
                     # k hasn't been seen yet
                     k = kh_put_str(self.table, v, &ret)
-                    self.table.vals[k] = count
                     uindexer[count] = i
-                    labels[i] = <int64_t>count
+                    if return_inverse:
+                        self.table.vals[k] = count
+                        labels[i] = <int64_t>count
                     count += 1
+                elif return_inverse:
+                    # k falls into a previous bucket
+                    # only relevant in case we need to construct the inverse
+                    idx = self.table.vals[k]
+                    labels[i] = <int64_t>idx
 
         free(vecs)
 
@@ -805,20 +806,72 @@ cdef class StringHashTable(HashTable):
         for i in range(count):
             uniques.append(values[uindexer[i]])
 
-        return np.asarray(labels)
+        if return_inverse:
+            return uniques.to_array(), np.asarray(labels)
+        return uniques.to_array()
+
+    def unique(self, ndarray[object] values, bint return_inverse=False):
+        """
+        Calculate unique values and labels (no sorting!)
+
+        Parameters
+        ----------
+        values : ndarray[object]
+            Array of values of which unique will be calculated
+        return_inverse : boolean, default False
+            Whether the mapping of the original array values to their location
+            in the vector of uniques should be returned.
+
+        Returns
+        -------
+        uniques : ndarray[object]
+            Unique values of input, not sorted
+        labels : ndarray[int64] (if return_inverse)
+            The labels from values to uniques
+        """
+        uniques = ObjectVector()
+        return self._unique(values, uniques, ignore_na=False,
+                            return_inverse=return_inverse)
 
     def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
                   object na_value=None):
-        uniques = ObjectVector()
-        labels = self._factorize(values, uniques=uniques,
-                                 na_sentinel=na_sentinel, na_value=na_value)
-        return labels, uniques.to_array()
+        """
+        Calculate unique values and labels (no sorting!)
+
+        Missing values are not included in the "uniques" for this method.
+        The labels for any missing values will be set to "na_sentinel"
+
+        Parameters
+        ----------
+        values : ndarray[object]
+            Array of values of which unique will be calculated
+        na_sentinel : Py_ssize_t, default -1
+            Sentinel value used for all NA-values in inverse
+        na_value : object, default None
+            Value to identify as missing. If na_value is None, then any value
+            that is not a string is considered missing. If na_value is
+            not None, then _additionally_ any value "val" satisfying
+            val == na_value is considered missing.
+
+        Returns
+        -------
+        uniques : ndarray[object]
+            Unique values of input, not sorted
+        labels : ndarray[int64]
+            The labels from values to uniques
+        """
+        uniques_vector = ObjectVector()
+        return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
+                            na_value=na_value, ignore_na=True,
+                            return_inverse=True)
 
     def get_labels(self, ndarray[object] values, ObjectVector uniques,
                    Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
                    object na_value=None):
-        return self._factorize(values, uniques, count_prior=count_prior,
-                               na_sentinel=na_sentinel, na_value=na_value)
+        _, labels = self._unique(values, uniques, count_prior=count_prior,
+                                 na_sentinel=na_sentinel, na_value=na_value,
+                                 ignore_na=True, return_inverse=True)
+        return labels
 
 
 cdef class PyObjectHashTable(HashTable):
@@ -908,44 +961,12 @@ cdef class PyObjectHashTable(HashTable):
 
     @cython.boundscheck(False)
     @cython.wraparound(False)
-    def unique(self, ndarray[object] values):
+    def _unique(self, ndarray[object] values, ObjectVector uniques,
+                Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
+                object na_value=None, bint ignore_na=False,
+                bint return_inverse=False):
         """
-        Calculate unique values without sorting
-
-        Parameters
-        ----------
-        values : ndarray[object]
-            Array of values of which unique will be calculated
-
-        Returns
-        -------
-        uniques : ndarray[object]
-            Unique values of input, not sorted
-        """
-        cdef:
-            Py_ssize_t i, n = len(values)
-            int ret = 0
-            object val
-            khiter_t k
-            ObjectVector uniques = ObjectVector()
-
-        for i in range(n):
-            val = values[i]
-            hash(val)
-            k = kh_get_pymap(self.table, <PyObject*>val)
-            if k == self.table.n_buckets:
-                kh_put_pymap(self.table, <PyObject*>val, &ret)
-                uniques.append(val)
-
-        return uniques.to_array()
-
-    @cython.boundscheck(False)
-    @cython.wraparound(False)
-    def _factorize(self, ndarray[object] values, ObjectVector uniques,
-                   Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
-                   object na_value=None):
-        """
-        Calculate unique values and labels (no sorting); ignores all NA-values
+        Calculate unique values and labels (no sorting!)
 
         Parameters
         ----------
@@ -959,13 +980,22 @@ cdef class PyObjectHashTable(HashTable):
             Sentinel value used for all NA-values in inverse
         na_value : object, default None
             Value to identify as missing. If na_value is None, then None _plus_
-            any value satisfying val!=val are considered missing.
+            any value "val" satisfying val != val is considered missing.
+            If na_value is not None, then _additionally_, any value "val"
+            satisfying val == na_value is considered missing.
+        ignore_na : boolean, default False
+            Whether NA-values should be ignored for calculating the uniques. If
+            True, the labels corresponding to missing values will be set to
+            na_sentinel.
+        return_inverse : boolean, default False
+            Whether the mapping of the original array values to their location
+            in the vector of uniques should be returned.
 
         Returns
         -------
         uniques : ndarray[object]
             Unique values of input, not sorted
-        labels : ndarray[int64]
+        labels : ndarray[int64] (if return_inverse=True)
             The labels from values to uniques
         """
         cdef:
@@ -976,42 +1006,100 @@ cdef class PyObjectHashTable(HashTable):
             khiter_t k
             bint use_na_value
 
-        labels = np.empty(n, dtype=np.int64)
+        if return_inverse:
+            labels = np.empty(n, dtype=np.int64)
         use_na_value = na_value is not None
 
         for i in range(n):
             val = values[i]
             hash(val)
 
-            if ((val != val or val is None)
-                    or (use_na_value and val == na_value)):
+            if ignore_na and ((val != val or val is None)
+                              or (use_na_value and val == na_value)):
+                # if missing values do not count as unique values (i.e. if
+                # ignore_na is True), skip the hashtable entry for them, and
+                # replace the corresponding label with na_sentinel
                 labels[i] = na_sentinel
                 continue
 
             k = kh_get_pymap(self.table, <PyObject*>val)
-            if k != self.table.n_buckets:
-                # k falls into a previous bucket
-                idx = self.table.vals[k]
-                labels[i] = idx
-            else:
+            if k == self.table.n_buckets:
                 # k hasn't been seen yet
                 k = kh_put_pymap(self.table, <PyObject*>val, &ret)
-                self.table.vals[k] = count
                 uniques.append(val)
-                labels[i] = count
-                count += 1
+                if return_inverse:
+                    self.table.vals[k] = count
+                    labels[i] = count
+                    count += 1
+            elif return_inverse:
+                # k falls into a previous bucket
+                # only relevant in case we need to construct the inverse
+                idx = self.table.vals[k]
+                labels[i] = idx
+
+        if return_inverse:
+            return uniques.to_array(), np.asarray(labels)
+        return uniques.to_array()
+
+    def unique(self, ndarray[object] values, bint return_inverse=False):
+        """
+        Calculate unique values and labels (no sorting!)
 
-        return np.asarray(labels)
+        Parameters
+        ----------
+        values : ndarray[object]
+            Array of values of which unique will be calculated
+        return_inverse : boolean, default False
+            Whether the mapping of the original array values to their location
+            in the vector of uniques should be returned.
+
+        Returns
+        -------
+        uniques : ndarray[object]
+            Unique values of input, not sorted
+        labels : ndarray[int64] (if return_inverse)
+            The labels from values to uniques
+        """
+        uniques = ObjectVector()
+        return self._unique(values, uniques, ignore_na=False,
+                            return_inverse=return_inverse)
 
     def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
                   object na_value=None):
-        uniques = ObjectVector()
-        labels = self._factorize(values, uniques=uniques,
-                                 na_sentinel=na_sentinel, na_value=na_value)
-        return labels, uniques.to_array()
+        """
+        Calculate unique values and labels (no sorting!)
+
+        Missing values are not included in the "uniques" for this method.
+        The labels for any missing values will be set to "na_sentinel"
+
+        Parameters
+        ----------
+        values : ndarray[object]
+            Array of values of which unique will be calculated
+        na_sentinel : Py_ssize_t, default -1
+            Sentinel value used for all NA-values in inverse
+        na_value : object, default None
+            Value to identify as missing. If na_value is None, then None _plus_
+            any value "val" satisfying val != val is considered missing.
+            If na_value is not None, then _additionally_, any value "val"
+            satisfying val == na_value is considered missing.
+
+        Returns
+        -------
+        uniques : ndarray[object]
+            Unique values of input, not sorted
+        labels : ndarray[int64]
+            The labels from values to uniques
+        """
+        uniques_vector = ObjectVector()
+        return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
+                            na_value=na_value, ignore_na=True,
+                            return_inverse=True)
 
     def get_labels(self, ndarray[object] values, ObjectVector uniques,
                    Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
                    object na_value=None):
-        return self._factorize(values, uniques, count_prior=count_prior,
-                               na_sentinel=na_sentinel, na_value=na_value)
+        _, labels = self._unique(values, uniques, count_prior=count_prior,
+                                 na_sentinel=na_sentinel, na_value=na_value,
+                                 ignore_na=True, return_inverse=True)
+        return labels
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 7aceef8634e206..1a4368ee8ea98a 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -460,7 +460,7 @@ def _factorize_array(values, na_sentinel=-1, size_hint=None,
     (hash_klass, _), values = _get_data_algo(values, _hashtables)
 
     table = hash_klass(size_hint or len(values))
-    labels, uniques = table.factorize(values, na_sentinel=na_sentinel,
+    uniques, labels = table.factorize(values, na_sentinel=na_sentinel,
                                       na_value=na_value)
 
     labels = ensure_platform_int(labels)
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index fa33a1ceae0b94..c9d403f6696af1 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -1361,6 +1361,14 @@ def test_hashtable_unique(self, htable, tm_dtype, writable):
         result_unique = htable().unique(s_duplicated.values)
         tm.assert_numpy_array_equal(result_unique, expected_unique)
 
+        # test return_inverse=True
+        # reconstruction can only succeed if the inverse is correct
+        result_unique, result_inverse = htable().unique(s_duplicated.values,
+                                                        return_inverse=True)
+        tm.assert_numpy_array_equal(result_unique, expected_unique)
+        reconstr = result_unique[result_inverse]
+        tm.assert_numpy_array_equal(reconstr, s_duplicated.values)
+
     @pytest.mark.parametrize('htable, tm_dtype', [
         (ht.PyObjectHashTable, 'String'),
         (ht.StringHashTable, 'String'),
@@ -1383,7 +1391,7 @@ def test_hashtable_factorize(self, htable, tm_dtype, writable):
         s_duplicated.values.setflags(write=writable)
         na_mask = s_duplicated.isna().values
 
-        result_inverse, result_unique = htable().factorize(s_duplicated.values)
+        result_unique, result_inverse = htable().factorize(s_duplicated.values)
 
         # drop_duplicates has own cython code (hash_table_func_helper.pxi)
         # and is tested separately; keeps first occurrence like ht.factorize()