From 640162fab9498f56ab0c93748ef4655cd0fc449f Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Wed, 3 Oct 2018 08:15:28 +0200 Subject: [PATCH 01/26] Fix ASV import error --- asv_bench/benchmarks/indexing.py | 8 ++++---- asv_bench/benchmarks/join_merge.py | 7 ++++--- asv_bench/benchmarks/panel_ctor.py | 4 ++-- asv_bench/benchmarks/panel_methods.py | 3 ++- 4 files changed, 12 insertions(+), 10 deletions(-) diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index c5b147b152aa6..2850fa249725c 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -2,10 +2,10 @@ import numpy as np import pandas.util.testing as tm -from pandas import (Series, DataFrame, MultiIndex, Int64Index, Float64Index, - IntervalIndex, CategoricalIndex, - IndexSlice, concat, date_range) -from .pandas_vb_common import setup, Panel # noqa +from pandas import (Series, DataFrame, MultiIndex, Panel, + Int64Index, Float64Index, IntervalIndex, + CategoricalIndex, IndexSlice, concat, date_range) +from .pandas_vb_common import setup # noqa class NumericSeriesIndexing(object): diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index 7487a0d8489b7..6624c3d0aaf49 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -3,14 +3,15 @@ import numpy as np import pandas.util.testing as tm -from pandas import (DataFrame, Series, MultiIndex, date_range, concat, merge, - merge_asof) +from pandas import (DataFrame, Series, Panel, MultiIndex, + date_range, concat, merge, merge_asof) + try: from pandas import merge_ordered except ImportError: from pandas import ordered_merge as merge_ordered -from .pandas_vb_common import Panel, setup # noqa +from .pandas_vb_common import setup # noqa class Append(object): diff --git a/asv_bench/benchmarks/panel_ctor.py b/asv_bench/benchmarks/panel_ctor.py index ce946c76ed199..4614bbd198afa 100644 --- a/asv_bench/benchmarks/panel_ctor.py +++ b/asv_bench/benchmarks/panel_ctor.py @@ -1,9 +1,9 @@ import warnings from datetime import datetime, timedelta -from pandas import DataFrame, DatetimeIndex, date_range +from pandas import DataFrame, Panel, DatetimeIndex, date_range -from .pandas_vb_common import Panel, setup # noqa +from .pandas_vb_common import setup # noqa class DifferentIndexes(object): diff --git a/asv_bench/benchmarks/panel_methods.py b/asv_bench/benchmarks/panel_methods.py index a5b1a92e9cf67..4d19e9a87c507 100644 --- a/asv_bench/benchmarks/panel_methods.py +++ b/asv_bench/benchmarks/panel_methods.py @@ -1,8 +1,9 @@ import warnings import numpy as np +from pandas import Panel -from .pandas_vb_common import Panel, setup # noqa +from .pandas_vb_common import setup # noqa class PanelMethods(object): From 31d0dc59f0dcabd9570f2a849af38f564a006a94 Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Thu, 27 Sep 2018 23:52:42 +0200 Subject: [PATCH 02/26] Add return_inverse to hashtable.unique --- pandas/_libs/hashtable_class_helper.pxi.in | 110 +++++++++++++++------ 1 file changed, 79 insertions(+), 31 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index f294fd141a9f1..f6fccadf78904 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -355,14 +355,14 @@ cdef class {{name}}HashTable(HashTable): return np.asarray(locs) - def factorize(self, {{dtype}}_t values): + def factorize(self, {{dtype}}_t[:] values): uniques = {{name}}Vector() - labels = self.get_labels(values, uniques, 0, 0) + labels = self.get_labels(values, uniques, 0) return uniques.to_array(), labels @cython.boundscheck(False) def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, - Py_ssize_t count_prior, Py_ssize_t na_sentinel, + Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None): cdef: Py_ssize_t i, n = len(values) @@ -399,9 +399,11 @@ cdef class {{name}}HashTable(HashTable): k = kh_get_{{dtype}}(self.table, val) if k != self.table.n_buckets: + # k falls into a previous bucket idx = self.table.vals[k] labels[i] = idx else: + # k hasn't been seen yet k = kh_put_{{dtype}}(self.table, val, &ret) self.table.vals[k] = count @@ -464,27 +466,42 @@ cdef class {{name}}HashTable(HashTable): return np.asarray(labels), arr_uniques @cython.boundscheck(False) - def unique(self, const {{dtype}}_t[:] values): + def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False): cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - {{dtype}}_t val - khiter_t k - {{name}}Vector uniques = {{name}}Vector() - {{name}}VectorData *ud + Py_ssize_t i, idx, count = 0, n = len(values) + int64_t[:] labels + int ret = 0 + {{dtype}}_t val + khiter_t k + {{name}}Vector uniques = {{name}}Vector() + {{name}}VectorData *ud ud = uniques.data + if return_inverse: + labels = np.empty(n, dtype=np.int64) with nogil: for i in range(n): val = values[i] k = kh_get_{{dtype}}(self.table, val) - if k == self.table.n_buckets: - kh_put_{{dtype}}(self.table, val, &ret) + if return_inverse and k != self.table.n_buckets: + # k falls into a previous bucket + idx = self.table.vals[k] + labels[i] = idx + elif k == self.table.n_buckets: + # k hasn't been seen yet + k = kh_put_{{dtype}}(self.table, val, &ret) if needs_resize(ud): with gil: uniques.resize() append_data_{{dtype}}(ud, val) + if return_inverse: + self.table.vals[k] = count + labels[i] = count + count += 1 + + if return_inverse: + return uniques.to_array(), np.asarray(labels) return uniques.to_array() {{endfor}} @@ -567,45 +584,57 @@ cdef class StringHashTable(HashTable): return labels @cython.boundscheck(False) - def unique(self, ndarray[object] values): + def unique(self, ndarray[object] values, bint return_inverse=False): cdef: - Py_ssize_t i, count, n = len(values) + Py_ssize_t i, idx, count = 0, n = len(values) + int64_t[:] labels int64_t[:] uindexer int ret = 0 object val - ObjectVector uniques + ObjectVector uniques = ObjectVector() khiter_t k const char *v const char **vecs - vecs = malloc(n * sizeof(char *)) + if return_inverse: + labels = np.zeros(n, dtype=np.int64) uindexer = np.empty(n, dtype=np.int64) + + # assign pointers + vecs = malloc(n * sizeof(char *)) for i in range(n): val = values[i] v = util.get_c_string(val) vecs[i] = v - count = 0 + + # compute with nogil: for i in range(n): v = vecs[i] k = 
kh_get_str(self.table, v) - if k == self.table.n_buckets: - kh_put_str(self.table, v, &ret) + if return_inverse and k != self.table.n_buckets: + # k falls into a previous bucket + idx = self.table.vals[k] + labels[i] = idx + elif k == self.table.n_buckets: + # k hasn't been seen yet + k = kh_put_str(self.table, v, &ret) uindexer[count] = i + if return_inverse: + self.table.vals[k] = count + labels[i] = count count += 1 + free(vecs) # uniques - uniques = ObjectVector() for i in range(count): uniques.append(values[uindexer[i]]) - return uniques.to_array() - def factorize(self, ndarray[object] values): - uniques = ObjectVector() - labels = self.get_labels(values, uniques, 0, 0) - return uniques.to_array(), labels + if return_inverse: + return uniques.to_array(), np.asarray(labels) + return uniques.to_array() @cython.boundscheck(False) def lookup(self, ndarray[object] values): @@ -670,7 +699,7 @@ cdef class StringHashTable(HashTable): @cython.boundscheck(False) def get_labels(self, ndarray[object] values, ObjectVector uniques, - Py_ssize_t count_prior, int64_t na_sentinel, + Py_ssize_t count_prior=0, int64_t na_sentinel=-1, object na_value=None): cdef: Py_ssize_t i, n = len(values) @@ -814,26 +843,43 @@ cdef class PyObjectHashTable(HashTable): return np.asarray(locs) - def unique(self, ndarray[object] values): + @cython.boundscheck(False) + def unique(self, ndarray[object] values, bint return_inverse=False): cdef: - Py_ssize_t i, n = len(values) + Py_ssize_t i, idx, count = 0, n = len(values) + int64_t[:] labels int ret = 0 object val khiter_t k ObjectVector uniques = ObjectVector() + if return_inverse: + labels = np.empty(n, dtype=np.int64) + for i in range(n): val = values[i] hash(val) k = kh_get_pymap(self.table, val) - if k == self.table.n_buckets: - kh_put_pymap(self.table, val, &ret) + if return_inverse and k != self.table.n_buckets: + # k falls into a previous bucket + idx = self.table.vals[k] + labels[i] = idx + elif k == self.table.n_buckets: + # k hasn't been seen yet + k = kh_put_pymap(self.table, val, &ret) uniques.append(val) + if return_inverse: + self.table.vals[k] = count + labels[i] = count + count += 1 + if return_inverse: + return uniques.to_array(), np.asarray(labels) return uniques.to_array() + @cython.boundscheck(False) def get_labels(self, ndarray[object] values, ObjectVector uniques, - Py_ssize_t count_prior, int64_t na_sentinel, + Py_ssize_t count_prior=0, int64_t na_sentinel=-1, object na_value=None): cdef: Py_ssize_t i, n = len(values) @@ -858,9 +904,11 @@ cdef class PyObjectHashTable(HashTable): k = kh_get_pymap(self.table, val) if k != self.table.n_buckets: + # k falls into a previous bucket idx = self.table.vals[k] labels[i] = idx else: + # k hasn't been seen yet k = kh_put_pymap(self.table, val, &ret) self.table.vals[k] = count uniques.append(val) From c5e51478e6a19c8d6a4673020edea609666b6058 Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Sun, 30 Sep 2018 17:17:50 +0200 Subject: [PATCH 03/26] Pure copy/paste: Group unique/factorize functions next to each other --- pandas/_libs/hashtable_class_helper.pxi.in | 184 ++++++++++----------- 1 file changed, 92 insertions(+), 92 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index f6fccadf78904..0418939956b1c 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -355,6 +355,45 @@ cdef class {{name}}HashTable(HashTable): return np.asarray(locs) + @cython.boundscheck(False) + def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False): + cdef: + Py_ssize_t i, idx, count = 0, n = len(values) + int64_t[:] labels + int ret = 0 + {{dtype}}_t val + khiter_t k + {{name}}Vector uniques = {{name}}Vector() + {{name}}VectorData *ud + + ud = uniques.data + if return_inverse: + labels = np.empty(n, dtype=np.int64) + + with nogil: + for i in range(n): + val = values[i] + k = kh_get_{{dtype}}(self.table, val) + if return_inverse and k != self.table.n_buckets: + # k falls into a previous bucket + idx = self.table.vals[k] + labels[i] = idx + elif k == self.table.n_buckets: + # k hasn't been seen yet + k = kh_put_{{dtype}}(self.table, val, &ret) + if needs_resize(ud): + with gil: + uniques.resize() + append_data_{{dtype}}(ud, val) + if return_inverse: + self.table.vals[k] = count + labels[i] = count + count += 1 + + if return_inverse: + return uniques.to_array(), np.asarray(labels) + return uniques.to_array() + def factorize(self, {{dtype}}_t[:] values): uniques = {{name}}Vector() labels = self.get_labels(values, uniques, 0) @@ -465,45 +504,6 @@ cdef class {{name}}HashTable(HashTable): return np.asarray(labels), arr_uniques - @cython.boundscheck(False) - def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False): - cdef: - Py_ssize_t i, idx, count = 0, n = len(values) - int64_t[:] labels - int ret = 0 - {{dtype}}_t val - khiter_t k - {{name}}Vector uniques = {{name}}Vector() - {{name}}VectorData *ud - - ud = uniques.data - if return_inverse: - labels = np.empty(n, dtype=np.int64) - - with nogil: - for i in range(n): - val = values[i] - k = kh_get_{{dtype}}(self.table, val) - if return_inverse and k != self.table.n_buckets: - # k falls into a previous bucket - idx = self.table.vals[k] - labels[i] = idx - elif k == self.table.n_buckets: - # k hasn't been seen yet - k = kh_put_{{dtype}}(self.table, val, &ret) - if needs_resize(ud): - with gil: - uniques.resize() - append_data_{{dtype}}(ud, val) - if return_inverse: - self.table.vals[k] = count - labels[i] = count - count += 1 - - if return_inverse: - return uniques.to_array(), np.asarray(labels) - return uniques.to_array() - {{endfor}} @@ -583,59 +583,6 @@ cdef class StringHashTable(HashTable): free(vecs) return labels - @cython.boundscheck(False) - def unique(self, ndarray[object] values, bint return_inverse=False): - cdef: - Py_ssize_t i, idx, count = 0, n = len(values) - int64_t[:] labels - int64_t[:] uindexer - int ret = 0 - object val - ObjectVector uniques = ObjectVector() - khiter_t k - const char *v - const char **vecs - - if return_inverse: - labels = np.zeros(n, dtype=np.int64) - uindexer = np.empty(n, dtype=np.int64) - - # assign pointers - vecs = malloc(n * sizeof(char *)) - for i in range(n): - val = values[i] - v = util.get_c_string(val) - vecs[i] = v - - - # compute - with nogil: - for i in range(n): - v = vecs[i] - k = kh_get_str(self.table, v) - if return_inverse and k != 
self.table.n_buckets: - # k falls into a previous bucket - idx = self.table.vals[k] - labels[i] = idx - elif k == self.table.n_buckets: - # k hasn't been seen yet - k = kh_put_str(self.table, v, &ret) - uindexer[count] = i - if return_inverse: - self.table.vals[k] = count - labels[i] = count - count += 1 - - free(vecs) - - # uniques - for i in range(count): - uniques.append(values[uindexer[i]]) - - if return_inverse: - return uniques.to_array(), np.asarray(labels) - return uniques.to_array() - @cython.boundscheck(False) def lookup(self, ndarray[object] values): cdef: @@ -697,6 +644,59 @@ cdef class StringHashTable(HashTable): self.table.vals[k] = i free(vecs) + @cython.boundscheck(False) + def unique(self, ndarray[object] values, bint return_inverse=False): + cdef: + Py_ssize_t i, idx, count = 0, n = len(values) + int64_t[:] labels + int64_t[:] uindexer + int ret = 0 + object val + ObjectVector uniques = ObjectVector() + khiter_t k + const char *v + const char **vecs + + if return_inverse: + labels = np.zeros(n, dtype=np.int64) + uindexer = np.empty(n, dtype=np.int64) + + # assign pointers + vecs = malloc(n * sizeof(char *)) + for i in range(n): + val = values[i] + v = util.get_c_string(val) + vecs[i] = v + + + # compute + with nogil: + for i in range(n): + v = vecs[i] + k = kh_get_str(self.table, v) + if return_inverse and k != self.table.n_buckets: + # k falls into a previous bucket + idx = self.table.vals[k] + labels[i] = idx + elif k == self.table.n_buckets: + # k hasn't been seen yet + k = kh_put_str(self.table, v, &ret) + uindexer[count] = i + if return_inverse: + self.table.vals[k] = count + labels[i] = count + count += 1 + + free(vecs) + + # uniques + for i in range(count): + uniques.append(values[uindexer[i]]) + + if return_inverse: + return uniques.to_array(), np.asarray(labels) + return uniques.to_array() + @cython.boundscheck(False) def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior=0, int64_t na_sentinel=-1, From 9918d52b96f722495fb1a72135e991d725ba3cda Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Wed, 3 Oct 2018 23:07:07 +0200 Subject: [PATCH 04/26] Unify hashtable.factorize and .unique --- pandas/_libs/hashtable_class_helper.pxi.in | 253 ++++++++------------- 1 file changed, 91 insertions(+), 162 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 0418939956b1c..3d3d0ad66734b 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -356,64 +356,21 @@ cdef class {{name}}HashTable(HashTable): return np.asarray(locs) @cython.boundscheck(False) - def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False): + def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, + bint ignore_na=False, bint return_inverse=False, + Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, + object na_value=None): cdef: - Py_ssize_t i, idx, count = 0, n = len(values) + Py_ssize_t i, idx, count = count_prior, n = len(values) int64_t[:] labels int ret = 0 - {{dtype}}_t val - khiter_t k - {{name}}Vector uniques = {{name}}Vector() - {{name}}VectorData *ud - - ud = uniques.data - if return_inverse: - labels = np.empty(n, dtype=np.int64) - - with nogil: - for i in range(n): - val = values[i] - k = kh_get_{{dtype}}(self.table, val) - if return_inverse and k != self.table.n_buckets: - # k falls into a previous bucket - idx = self.table.vals[k] - labels[i] = idx - elif k == self.table.n_buckets: - # k hasn't been seen yet - k = kh_put_{{dtype}}(self.table, val, &ret) - if needs_resize(ud): - with gil: - uniques.resize() - append_data_{{dtype}}(ud, val) - if return_inverse: - self.table.vals[k] = count - labels[i] = count - count += 1 - - if return_inverse: - return uniques.to_array(), np.asarray(labels) - return uniques.to_array() - - def factorize(self, {{dtype}}_t[:] values): - uniques = {{name}}Vector() - labels = self.get_labels(values, uniques, 0) - return uniques.to_array(), labels - - @cython.boundscheck(False) - def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, - Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, - object na_value=None): - cdef: - Py_ssize_t i, n = len(values) - int64_t[:] labels - Py_ssize_t idx, count = count_prior - int ret = 0 {{dtype}}_t val, na_value2 khiter_t k {{name}}VectorData *ud bint use_na_value - labels = np.empty(n, dtype=np.int64) + if return_inverse: + labels = np.empty(n, dtype=np.int64) ud = uniques.data use_na_value = na_value is not None @@ -431,21 +388,19 @@ cdef class {{name}}HashTable(HashTable): for i in range(n): val = values[i] - if val != val or (use_na_value and val == na_value2): + if ignore_na and (val != val + or (use_na_value and val == na_value2)): labels[i] = na_sentinel continue k = kh_get_{{dtype}}(self.table, val) - - if k != self.table.n_buckets: + if return_inverse and k != self.table.n_buckets: # k falls into a previous bucket idx = self.table.vals[k] labels[i] = idx - else: + elif k == self.table.n_buckets: # k hasn't been seen yet k = kh_put_{{dtype}}(self.table, val, &ret) - self.table.vals[k] = count - if needs_resize(ud): with gil: if uniques.external_view_exists: @@ -454,10 +409,30 @@ cdef class {{name}}HashTable(HashTable): "Vector.resize() needed") uniques.resize() append_data_{{dtype}}(ud, val) - labels[i] = count + if return_inverse: + self.table.vals[k] = count + labels[i] = count count += 1 - return np.asarray(labels) + if return_inverse: + return uniques.to_array(), np.asarray(labels) + return uniques.to_array() + + def unique(self, const {{dtype}}_t[:] 
values, bint return_inverse=False): + return self._unique(values, uniques={{name}}Vector(), ignore_na=False, + return_inverse=return_inverse) + + def factorize(self, {{dtype}}_t[:] values): + return self._unique(values, uniques={{name}}Vector(), ignore_na=True, + return_inverse=True) + + def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, + Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, + object na_value=None): + _, labels = self._unique(values, uniques, ignore_na=True, + return_inverse=True, count_prior=count_prior, + na_sentinel=na_sentinel, na_value=na_value) + return labels @cython.boundscheck(False) def get_labels_groupby(self, const {{dtype}}_t[:] values): @@ -645,33 +620,45 @@ cdef class StringHashTable(HashTable): free(vecs) @cython.boundscheck(False) - def unique(self, ndarray[object] values, bint return_inverse=False): + def _unique(self, ndarray[object] values, ObjectVector uniques, + bint ignore_na=False, bint return_inverse=False, + Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, + object na_value=None): cdef: - Py_ssize_t i, idx, count = 0, n = len(values) + Py_ssize_t i, idx, count = count_prior, n = len(values) int64_t[:] labels int64_t[:] uindexer int ret = 0 object val - ObjectVector uniques = ObjectVector() - khiter_t k const char *v const char **vecs + khiter_t k + bint use_na_value if return_inverse: labels = np.zeros(n, dtype=np.int64) uindexer = np.empty(n, dtype=np.int64) + use_na_value = na_value is not None - # assign pointers + # assign pointers and pre-filter out missing (if ignore_na) vecs = malloc(n * sizeof(char *)) for i in range(n): val = values[i] - v = util.get_c_string(val) - vecs[i] = v + if not ignore_na or ((PyUnicode_Check(val) or PyString_Check(val)) + and not (use_na_value and val == na_value)): + # if ignore_na is False, we also stringify NaN/None/etc. 
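
Note: the NA-handling contract that the new ignore_na flag implements can be
sketched at the Python level. This is an illustration of the intended
behaviour only, not part of the patch:

    import numpy as np
    import pandas as pd

    arr = np.array([1.0, np.nan, 1.0, np.nan])

    # unique (ignore_na=False): NaN is kept as a regular value
    pd.unique(arr)                      # -> array([ 1., nan])

    # factorize (ignore_na=True): NaN is mapped to na_sentinel (-1)
    codes, uniques = pd.factorize(arr)
    # codes -> array([ 0, -1,  0, -1]), uniques -> array([1.])
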
+ v = util.get_c_string(val) + vecs[i] = v + else: + labels[i] = na_sentinel # compute with nogil: for i in range(n): + if ignore_na and labels[i] == na_sentinel: + continue + v = vecs[i] k = kh_get_str(self.table, v) if return_inverse and k != self.table.n_buckets: @@ -697,65 +684,21 @@ cdef class StringHashTable(HashTable): return uniques.to_array(), np.asarray(labels) return uniques.to_array() - @cython.boundscheck(False) - def get_labels(self, ndarray[object] values, ObjectVector uniques, - Py_ssize_t count_prior=0, int64_t na_sentinel=-1, - object na_value=None): - cdef: - Py_ssize_t i, n = len(values) - int64_t[:] labels - int64_t[:] uindexer - Py_ssize_t idx, count = count_prior - int ret = 0 - object val - const char *v - const char **vecs - khiter_t k - bint use_na_value - - # these by-definition *must* be strings - labels = np.zeros(n, dtype=np.int64) - uindexer = np.empty(n, dtype=np.int64) - use_na_value = na_value is not None - - # pre-filter out missing - # and assign pointers - vecs = malloc(n * sizeof(char *)) - for i in range(n): - val = values[i] - - if ((PyUnicode_Check(val) or PyString_Check(val)) and - not (use_na_value and val == na_value)): - v = util.get_c_string(val) - vecs[i] = v - else: - labels[i] = na_sentinel - - # compute - with nogil: - for i in range(n): - if labels[i] == na_sentinel: - continue - - v = vecs[i] - k = kh_get_str(self.table, v) - if k != self.table.n_buckets: - idx = self.table.vals[k] - labels[i] = idx - else: - k = kh_put_str(self.table, v, &ret) - self.table.vals[k] = count - uindexer[count] = i - labels[i] = count - count += 1 - - free(vecs) + def unique(self, ndarray[object] values, bint return_inverse=False): + return self._unique(values, uniques=ObjectVector(), ignore_na=False, + return_inverse=return_inverse) - # uniques - for i in range(count): - uniques.append(values[uindexer[i]]) + def factorize(self, ndarray[object] values): + return self._unique(values, uniques=ObjectVector(), ignore_na=True, + return_inverse=True) - return np.asarray(labels) + def get_labels(self, ndarray[object] values, ObjectVector uniques, + Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, + object na_value=None): + _, labels = self._unique(values, uniques, ignore_na=True, + return_inverse=True, count_prior=count_prior, + na_sentinel=na_sentinel, na_value=na_value) + return labels cdef class PyObjectHashTable(HashTable): @@ -844,21 +787,31 @@ cdef class PyObjectHashTable(HashTable): return np.asarray(locs) @cython.boundscheck(False) - def unique(self, ndarray[object] values, bint return_inverse=False): + def _unique(self, ndarray[object] values, ObjectVector uniques, + bint ignore_na=False, bint return_inverse=False, + Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, + object na_value=None): cdef: - Py_ssize_t i, idx, count = 0, n = len(values) + Py_ssize_t i, idx, count = count_prior, n = len(values) int64_t[:] labels int ret = 0 object val khiter_t k - ObjectVector uniques = ObjectVector() + bint use_na_value if return_inverse: labels = np.empty(n, dtype=np.int64) + use_na_value = na_value is not None for i in range(n): val = values[i] hash(val) + + if ignore_na and ((val != val or val is None) + or (use_na_value and val == na_value)): + labels[i] = na_sentinel + continue + k = kh_get_pymap(self.table, val) if return_inverse and k != self.table.n_buckets: # k falls into a previous bucket @@ -877,42 +830,18 @@ cdef class PyObjectHashTable(HashTable): return uniques.to_array(), np.asarray(labels) return uniques.to_array() - @cython.boundscheck(False) 
- def get_labels(self, ndarray[object] values, ObjectVector uniques, - Py_ssize_t count_prior=0, int64_t na_sentinel=-1, - object na_value=None): - cdef: - Py_ssize_t i, n = len(values) - int64_t[:] labels - Py_ssize_t idx, count = count_prior - int ret = 0 - object val - khiter_t k - bint use_na_value - - labels = np.empty(n, dtype=np.int64) - use_na_value = na_value is not None - - for i in range(n): - val = values[i] - hash(val) - - if ((val != val or val is None) or - (use_na_value and val == na_value)): - labels[i] = na_sentinel - continue + def unique(self, ndarray[object] values, bint return_inverse=False): + return self._unique(values, uniques=ObjectVector(), ignore_na=False, + return_inverse=return_inverse) - k = kh_get_pymap(self.table, val) - if k != self.table.n_buckets: - # k falls into a previous bucket - idx = self.table.vals[k] - labels[i] = idx - else: - # k hasn't been seen yet - k = kh_put_pymap(self.table, val, &ret) - self.table.vals[k] = count - uniques.append(val) - labels[i] = count - count += 1 + def factorize(self, ndarray[object] values): + return self._unique(values, uniques=ObjectVector(), ignore_na=True, + return_inverse=True) - return np.asarray(labels) + def get_labels(self, ndarray[object] values, ObjectVector uniques, + Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, + object na_value=None): + _, labels = self._unique(values, uniques, ignore_na=True, + return_inverse=True, count_prior=count_prior, + na_sentinel=na_sentinel, na_value=na_value) + return labels From 52ae84e7f05d86ca228448b412fff76b99a0a1b0 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Thu, 4 Oct 2018 16:48:07 +0200 Subject: [PATCH 05/26] Force compilation of different code paths --- pandas/_libs/hashtable_class_helper.pxi.in | 39 ++++++++++++++++++++-- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 3d3d0ad66734b..4d02b24734c29 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -419,8 +419,19 @@ cdef class {{name}}HashTable(HashTable): return uniques.to_array() def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False): + # define separate functions with/without inverse to force compilation + # of the different code paths for boolean "return_inverse" + if return_inverse: + return self._unique_with_inverse(values) + return self._unique_no_inverse(values) + + def _unique_no_inverse(self, const {{dtype}}_t[:] values): + return self._unique(values, uniques={{name}}Vector(), ignore_na=False, + return_inverse=False) + + def _unique_with_inverse(self, const {{dtype}}_t[:] values): return self._unique(values, uniques={{name}}Vector(), ignore_na=False, - return_inverse=return_inverse) + return_inverse=True) def factorize(self, {{dtype}}_t[:] values): return self._unique(values, uniques={{name}}Vector(), ignore_na=True, @@ -685,8 +696,19 @@ cdef class StringHashTable(HashTable): return uniques.to_array() def unique(self, ndarray[object] values, bint return_inverse=False): + # define separate functions with/without inverse to force compilation + # of the different code paths for boolean "return_inverse" + if return_inverse: + return self._unique_with_inverse(values) + return self._unique_no_inverse(values) + + def _unique_no_inverse(self, ndarray[object] values): return self._unique(values, uniques=ObjectVector(), ignore_na=False, - return_inverse=return_inverse) + return_inverse=False) + + def 
_unique_with_inverse(self, ndarray[object] values): + return self._unique(values, uniques=ObjectVector(), ignore_na=False, + return_inverse=True) def factorize(self, ndarray[object] values): return self._unique(values, uniques=ObjectVector(), ignore_na=True, @@ -831,8 +853,19 @@ cdef class PyObjectHashTable(HashTable): return uniques.to_array() def unique(self, ndarray[object] values, bint return_inverse=False): + # define separate functions with/without inverse to force compilation + # of the different code paths for boolean "return_inverse" + if return_inverse: + return self._unique_with_inverse(values) + return self._unique_no_inverse(values) + + def _unique_no_inverse(self, ndarray[object] values): return self._unique(values, uniques=ObjectVector(), ignore_na=False, - return_inverse=return_inverse) + return_inverse=False) + + def _unique_with_inverse(self, ndarray[object] values): + return self._unique(values, uniques=ObjectVector(), ignore_na=False, + return_inverse=True) def factorize(self, ndarray[object] values): return self._unique(values, uniques=ObjectVector(), ignore_na=True, From dbe4e0ed81e826ca631c7b70c0117b5026a5f079 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Thu, 4 Oct 2018 22:17:36 +0200 Subject: [PATCH 06/26] Add separate functions for return_inverse=False --- pandas/_libs/hashtable_class_helper.pxi.in | 113 +++++++++++++++------ 1 file changed, 80 insertions(+), 33 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 4d02b24734c29..e179445bbeac7 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -418,21 +418,35 @@ cdef class {{name}}HashTable(HashTable): return uniques.to_array(), np.asarray(labels) return uniques.to_array() + @cython.boundscheck(False) + def _unique_no_inverse(self, const {{dtype}}_t[:] values): + # define separate functions without inverse for performance + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + {{dtype}}_t val + khiter_t k + {{name}}Vector uniques = {{name}}Vector() + {{name}}VectorData *ud + ud = uniques.data + with nogil: + for i in range(n): + val = values[i] + k = kh_get_{{dtype}}(self.table, val) + if k == self.table.n_buckets: + kh_put_{{dtype}}(self.table, val, &ret) + if needs_resize(ud): + with gil: + uniques.resize() + append_data_{{dtype}}(ud, val) + return uniques.to_array() + def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False): - # define separate functions with/without inverse to force compilation - # of the different code paths for boolean "return_inverse" if return_inverse: - return self._unique_with_inverse(values) + return self._unique(values, uniques={{name}}Vector(), ignore_na=False, + return_inverse=True) return self._unique_no_inverse(values) - def _unique_no_inverse(self, const {{dtype}}_t[:] values): - return self._unique(values, uniques={{name}}Vector(), ignore_na=False, - return_inverse=False) - - def _unique_with_inverse(self, const {{dtype}}_t[:] values): - return self._unique(values, uniques={{name}}Vector(), ignore_na=False, - return_inverse=True) - def factorize(self, {{dtype}}_t[:] values): return self._unique(values, uniques={{name}}Vector(), ignore_na=True, return_inverse=True) @@ -695,21 +709,46 @@ cdef class StringHashTable(HashTable): return uniques.to_array(), np.asarray(labels) return uniques.to_array() + @cython.boundscheck(False) + def _unique_no_inverse(self, ndarray[object] values): + # define separate functions without inverse for performance 
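
Note: splitting out a dedicated no-inverse fast path is meant to be
behaviour-preserving. A minimal invariant check, assuming a build with this
patch applied (a fresh table per call, since a table keeps state across
calls):

    import numpy as np
    from pandas._libs import hashtable as ht

    vals = np.array([2, 1, 2, 3], dtype=np.int64)

    u1 = ht.Int64HashTable().unique(vals)                       # fast path
    u2, inv = ht.Int64HashTable().unique(vals, return_inverse=True)
    assert (u1 == u2).all()          # both paths agree on the uniques
    assert (u2[inv] == vals).all()   # the inverse reconstructs the input
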
+ cdef: + Py_ssize_t i, count, n = len(values) + int64_t[:] uindexer + int ret = 0 + object val + ObjectVector uniques + khiter_t k + const char *v + const char **vecs + vecs = malloc(n * sizeof(char *)) + uindexer = np.empty(n, dtype=np.int64) + for i in range(n): + val = values[i] + v = util.get_c_string(val) + vecs[i] = v + count = 0 + with nogil: + for i in range(n): + v = vecs[i] + k = kh_get_str(self.table, v) + if k == self.table.n_buckets: + kh_put_str(self.table, v, &ret) + uindexer[count] = i + count += 1 + free(vecs) + # uniques + uniques = ObjectVector() + for i in range(count): + uniques.append(values[uindexer[i]]) + return uniques.to_array() + def unique(self, ndarray[object] values, bint return_inverse=False): - # define separate functions with/without inverse to force compilation - # of the different code paths for boolean "return_inverse" if return_inverse: - return self._unique_with_inverse(values) + return self._unique(values, uniques=ObjectVector(), ignore_na=False, + return_inverse=True) return self._unique_no_inverse(values) - def _unique_no_inverse(self, ndarray[object] values): - return self._unique(values, uniques=ObjectVector(), ignore_na=False, - return_inverse=False) - - def _unique_with_inverse(self, ndarray[object] values): - return self._unique(values, uniques=ObjectVector(), ignore_na=False, - return_inverse=True) - def factorize(self, ndarray[object] values): return self._unique(values, uniques=ObjectVector(), ignore_na=True, return_inverse=True) @@ -852,21 +891,29 @@ cdef class PyObjectHashTable(HashTable): return uniques.to_array(), np.asarray(labels) return uniques.to_array() + def _unique_no_inverse(self, ndarray[object] values): + # define separate functions without inverse for performance + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + object val + khiter_t k + ObjectVector uniques = ObjectVector() + for i in range(n): + val = values[i] + hash(val) + k = kh_get_pymap(self.table, val) + if k == self.table.n_buckets: + kh_put_pymap(self.table, val, &ret) + uniques.append(val) + return uniques.to_array() + def unique(self, ndarray[object] values, bint return_inverse=False): - # define separate functions with/without inverse to force compilation - # of the different code paths for boolean "return_inverse" if return_inverse: - return self._unique_with_inverse(values) + return self._unique(values, uniques=ObjectVector(), ignore_na=False, + return_inverse=True) return self._unique_no_inverse(values) - def _unique_no_inverse(self, ndarray[object] values): - return self._unique(values, uniques=ObjectVector(), ignore_na=False, - return_inverse=False) - - def _unique_with_inverse(self, ndarray[object] values): - return self._unique(values, uniques=ObjectVector(), ignore_na=False, - return_inverse=True) - def factorize(self, ndarray[object] values): return self._unique(values, uniques=ObjectVector(), ignore_na=True, return_inverse=True) From 8481e19619c8847b0bdb7dcb61726497f552b230 Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Thu, 4 Oct 2018 23:12:30 +0200 Subject: [PATCH 07/26] Finish split in _unique_with_inverse and _unique_no_inverse --- pandas/_libs/hashtable_class_helper.pxi.in | 116 ++++++++++----------- 1 file changed, 53 insertions(+), 63 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index e179445bbeac7..6e9afb8faa42d 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -356,10 +356,10 @@ cdef class {{name}}HashTable(HashTable): return np.asarray(locs) @cython.boundscheck(False) - def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, - bint ignore_na=False, bint return_inverse=False, - Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, - object na_value=None): + def _unique_with_inverse(self, const {{dtype}}_t[:] values, + {{name}}Vector uniques, bint ignore_na=False, + Py_ssize_t count_prior=0, + Py_ssize_t na_sentinel=-1, object na_value=None): cdef: Py_ssize_t i, idx, count = count_prior, n = len(values) int64_t[:] labels @@ -369,8 +369,7 @@ cdef class {{name}}HashTable(HashTable): {{name}}VectorData *ud bint use_na_value - if return_inverse: - labels = np.empty(n, dtype=np.int64) + labels = np.empty(n, dtype=np.int64) ud = uniques.data use_na_value = na_value is not None @@ -394,11 +393,11 @@ cdef class {{name}}HashTable(HashTable): continue k = kh_get_{{dtype}}(self.table, val) - if return_inverse and k != self.table.n_buckets: + if k != self.table.n_buckets: # k falls into a previous bucket idx = self.table.vals[k] labels[i] = idx - elif k == self.table.n_buckets: + else: # k hasn't been seen yet k = kh_put_{{dtype}}(self.table, val, &ret) if needs_resize(ud): @@ -409,14 +408,11 @@ cdef class {{name}}HashTable(HashTable): "Vector.resize() needed") uniques.resize() append_data_{{dtype}}(ud, val) - if return_inverse: - self.table.vals[k] = count - labels[i] = count + self.table.vals[k] = count + labels[i] = count count += 1 - if return_inverse: - return uniques.to_array(), np.asarray(labels) - return uniques.to_array() + return uniques.to_array(), np.asarray(labels) @cython.boundscheck(False) def _unique_no_inverse(self, const {{dtype}}_t[:] values): @@ -443,20 +439,21 @@ cdef class {{name}}HashTable(HashTable): def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False): if return_inverse: - return self._unique(values, uniques={{name}}Vector(), ignore_na=False, - return_inverse=True) + return self._unique_with_inverse(values, uniques={{name}}Vector(), + ignore_na=False) return self._unique_no_inverse(values) def factorize(self, {{dtype}}_t[:] values): - return self._unique(values, uniques={{name}}Vector(), ignore_na=True, - return_inverse=True) + return self._unique_with_inverse(values, uniques={{name}}Vector(), + ignore_na=True) def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None): - _, labels = self._unique(values, uniques, ignore_na=True, - return_inverse=True, count_prior=count_prior, - na_sentinel=na_sentinel, na_value=na_value) + _, labels = self._unique_with_inverse(values, uniques, ignore_na=True, + count_prior=count_prior, + na_sentinel=na_sentinel, + na_value=na_value) return labels @cython.boundscheck(False) @@ -645,10 +642,10 @@ cdef class StringHashTable(HashTable): free(vecs) @cython.boundscheck(False) - def _unique(self, ndarray[object] values, ObjectVector uniques, - bint ignore_na=False, bint return_inverse=False, - 
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, - object na_value=None): + def _unique_with_inverse(self, ndarray[object] values, + ObjectVector uniques, bint ignore_na=False, + Py_ssize_t count_prior=0, + Py_ssize_t na_sentinel=-1, object na_value=None): cdef: Py_ssize_t i, idx, count = count_prior, n = len(values) int64_t[:] labels @@ -660,8 +657,7 @@ cdef class StringHashTable(HashTable): khiter_t k bint use_na_value - if return_inverse: - labels = np.zeros(n, dtype=np.int64) + labels = np.zeros(n, dtype=np.int64) uindexer = np.empty(n, dtype=np.int64) use_na_value = na_value is not None @@ -686,17 +682,16 @@ cdef class StringHashTable(HashTable): v = vecs[i] k = kh_get_str(self.table, v) - if return_inverse and k != self.table.n_buckets: + if k != self.table.n_buckets: # k falls into a previous bucket idx = self.table.vals[k] labels[i] = idx - elif k == self.table.n_buckets: + else: # k hasn't been seen yet k = kh_put_str(self.table, v, &ret) uindexer[count] = i - if return_inverse: - self.table.vals[k] = count - labels[i] = count + self.table.vals[k] = count + labels[i] = count count += 1 free(vecs) @@ -705,9 +700,7 @@ cdef class StringHashTable(HashTable): for i in range(count): uniques.append(values[uindexer[i]]) - if return_inverse: - return uniques.to_array(), np.asarray(labels) - return uniques.to_array() + return uniques.to_array(), np.asarray(labels) @cython.boundscheck(False) def _unique_no_inverse(self, ndarray[object] values): @@ -745,20 +738,21 @@ cdef class StringHashTable(HashTable): def unique(self, ndarray[object] values, bint return_inverse=False): if return_inverse: - return self._unique(values, uniques=ObjectVector(), ignore_na=False, - return_inverse=True) + return self._unique_with_inverse(values, uniques=ObjectVector(), + ignore_na=False) return self._unique_no_inverse(values) def factorize(self, ndarray[object] values): - return self._unique(values, uniques=ObjectVector(), ignore_na=True, - return_inverse=True) + return self._unique_with_inverse(values, uniques=ObjectVector(), + ignore_na=True) def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None): - _, labels = self._unique(values, uniques, ignore_na=True, - return_inverse=True, count_prior=count_prior, - na_sentinel=na_sentinel, na_value=na_value) + _, labels = self._unique_with_inverse(values, uniques, ignore_na=True, + count_prior=count_prior, + na_sentinel=na_sentinel, + na_value=na_value) return labels @@ -848,10 +842,10 @@ cdef class PyObjectHashTable(HashTable): return np.asarray(locs) @cython.boundscheck(False) - def _unique(self, ndarray[object] values, ObjectVector uniques, - bint ignore_na=False, bint return_inverse=False, - Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, - object na_value=None): + def _unique_with_inverse(self, ndarray[object] values, + ObjectVector uniques, bint ignore_na=False, + Py_ssize_t count_prior=0, + Py_ssize_t na_sentinel=-1, object na_value=None): cdef: Py_ssize_t i, idx, count = count_prior, n = len(values) int64_t[:] labels @@ -860,8 +854,7 @@ cdef class PyObjectHashTable(HashTable): khiter_t k bint use_na_value - if return_inverse: - labels = np.empty(n, dtype=np.int64) + labels = np.empty(n, dtype=np.int64) use_na_value = na_value is not None for i in range(n): @@ -874,22 +867,19 @@ cdef class PyObjectHashTable(HashTable): continue k = kh_get_pymap(self.table, val) - if return_inverse and k != self.table.n_buckets: + if k != self.table.n_buckets: # k falls into a previous 
bucket idx = self.table.vals[k] labels[i] = idx - elif k == self.table.n_buckets: + else: # k hasn't been seen yet k = kh_put_pymap(self.table, val, &ret) uniques.append(val) - if return_inverse: - self.table.vals[k] = count - labels[i] = count + self.table.vals[k] = count + labels[i] = count count += 1 - if return_inverse: - return uniques.to_array(), np.asarray(labels) - return uniques.to_array() + return uniques.to_array(), np.asarray(labels) def _unique_no_inverse(self, ndarray[object] values): # define separate functions without inverse for performance @@ -910,18 +900,18 @@ cdef class PyObjectHashTable(HashTable): def unique(self, ndarray[object] values, bint return_inverse=False): if return_inverse: - return self._unique(values, uniques=ObjectVector(), ignore_na=False, - return_inverse=True) + return self._unique_with_inverse(values, uniques=ObjectVector(), + ignore_na=False) return self._unique_no_inverse(values) def factorize(self, ndarray[object] values): - return self._unique(values, uniques=ObjectVector(), ignore_na=True, - return_inverse=True) + return self._unique_with_inverse(values, uniques=ObjectVector(), ignore_na=True) def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None): - _, labels = self._unique(values, uniques, ignore_na=True, - return_inverse=True, count_prior=count_prior, - na_sentinel=na_sentinel, na_value=na_value) + _, labels = self._unique_with_inverse(values, uniques, ignore_na=True, + count_prior=count_prior, + na_sentinel=na_sentinel, + na_value=na_value) return labels From 27ceb4d649e3dad3e87545bf141ea200e5650a6f Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Thu, 4 Oct 2018 23:38:18 +0200 Subject: [PATCH 08/26] Add cython.wraparound(False) --- pandas/_libs/hashtable_class_helper.pxi.in | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 6e9afb8faa42d..285bdaac2ce12 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -356,6 +356,7 @@ cdef class {{name}}HashTable(HashTable): return np.asarray(locs) @cython.boundscheck(False) + @cython.wraparound(False) def _unique_with_inverse(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, bint ignore_na=False, Py_ssize_t count_prior=0, @@ -415,6 +416,7 @@ cdef class {{name}}HashTable(HashTable): return uniques.to_array(), np.asarray(labels) @cython.boundscheck(False) + @cython.wraparound(False) def _unique_no_inverse(self, const {{dtype}}_t[:] values): # define separate functions without inverse for performance cdef: @@ -642,6 +644,7 @@ cdef class StringHashTable(HashTable): free(vecs) @cython.boundscheck(False) + @cython.wraparound(False) def _unique_with_inverse(self, ndarray[object] values, ObjectVector uniques, bint ignore_na=False, Py_ssize_t count_prior=0, @@ -703,6 +706,7 @@ cdef class StringHashTable(HashTable): return uniques.to_array(), np.asarray(labels) @cython.boundscheck(False) + @cython.wraparound(False) def _unique_no_inverse(self, ndarray[object] values): # define separate functions without inverse for performance cdef: @@ -842,6 +846,7 @@ cdef class PyObjectHashTable(HashTable): return np.asarray(locs) @cython.boundscheck(False) + @cython.wraparound(False) def _unique_with_inverse(self, ndarray[object] values, ObjectVector uniques, bint ignore_na=False, Py_ssize_t count_prior=0, @@ -881,6 +886,8 @@ cdef class PyObjectHashTable(HashTable): return 
uniques.to_array(), np.asarray(labels) + @cython.boundscheck(False) + @cython.wraparound(False) def _unique_no_inverse(self, ndarray[object] values): # define separate functions without inverse for performance cdef: From b1705a995b561b9f36f0edab10f3f1bb3984b606 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sat, 6 Oct 2018 17:34:32 +0200 Subject: [PATCH 09/26] Unmove unique-implementation (review jreback) --- pandas/_libs/hashtable_class_helper.pxi.in | 156 ++++++++++----------- 1 file changed, 78 insertions(+), 78 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 285bdaac2ce12..f0c675596688b 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -415,30 +415,6 @@ cdef class {{name}}HashTable(HashTable): return uniques.to_array(), np.asarray(labels) - @cython.boundscheck(False) - @cython.wraparound(False) - def _unique_no_inverse(self, const {{dtype}}_t[:] values): - # define separate functions without inverse for performance - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - {{dtype}}_t val - khiter_t k - {{name}}Vector uniques = {{name}}Vector() - {{name}}VectorData *ud - ud = uniques.data - with nogil: - for i in range(n): - val = values[i] - k = kh_get_{{dtype}}(self.table, val) - if k == self.table.n_buckets: - kh_put_{{dtype}}(self.table, val, &ret) - if needs_resize(ud): - with gil: - uniques.resize() - append_data_{{dtype}}(ud, val) - return uniques.to_array() - def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False): if return_inverse: return self._unique_with_inverse(values, uniques={{name}}Vector(), @@ -503,6 +479,30 @@ cdef class {{name}}HashTable(HashTable): return np.asarray(labels), arr_uniques + @cython.boundscheck(False) + @cython.wraparound(False) + def _unique_no_inverse(self, const {{dtype}}_t[:] values): + # define separate functions without inverse for performance + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + {{dtype}}_t val + khiter_t k + {{name}}Vector uniques = {{name}}Vector() + {{name}}VectorData *ud + ud = uniques.data + with nogil: + for i in range(n): + val = values[i] + k = kh_get_{{dtype}}(self.table, val) + if k == self.table.n_buckets: + kh_put_{{dtype}}(self.table, val, &ret) + if needs_resize(ud): + with gil: + uniques.resize() + append_data_{{dtype}}(ud, val) + return uniques.to_array() + {{endfor}} @@ -582,6 +582,41 @@ cdef class StringHashTable(HashTable): free(vecs) return labels + @cython.boundscheck(False) + @cython.wraparound(False) + def _unique_no_inverse(self, ndarray[object] values): + # define separate functions without inverse for performance + cdef: + Py_ssize_t i, count, n = len(values) + int64_t[:] uindexer + int ret = 0 + object val + ObjectVector uniques + khiter_t k + const char *v + const char **vecs + vecs = malloc(n * sizeof(char *)) + uindexer = np.empty(n, dtype=np.int64) + for i in range(n): + val = values[i] + v = util.get_c_string(val) + vecs[i] = v + count = 0 + with nogil: + for i in range(n): + v = vecs[i] + k = kh_get_str(self.table, v) + if k == self.table.n_buckets: + kh_put_str(self.table, v, &ret) + uindexer[count] = i + count += 1 + free(vecs) + # uniques + uniques = ObjectVector() + for i in range(count): + uniques.append(values[uindexer[i]]) + return uniques.to_array() + @cython.boundscheck(False) def lookup(self, ndarray[object] values): cdef: @@ -705,41 +740,6 @@ cdef class StringHashTable(HashTable): return uniques.to_array(), np.asarray(labels) - 
@cython.boundscheck(False) - @cython.wraparound(False) - def _unique_no_inverse(self, ndarray[object] values): - # define separate functions without inverse for performance - cdef: - Py_ssize_t i, count, n = len(values) - int64_t[:] uindexer - int ret = 0 - object val - ObjectVector uniques - khiter_t k - const char *v - const char **vecs - vecs = malloc(n * sizeof(char *)) - uindexer = np.empty(n, dtype=np.int64) - for i in range(n): - val = values[i] - v = util.get_c_string(val) - vecs[i] = v - count = 0 - with nogil: - for i in range(n): - v = vecs[i] - k = kh_get_str(self.table, v) - if k == self.table.n_buckets: - kh_put_str(self.table, v, &ret) - uindexer[count] = i - count += 1 - free(vecs) - # uniques - uniques = ObjectVector() - for i in range(count): - uniques.append(values[uindexer[i]]) - return uniques.to_array() - def unique(self, ndarray[object] values, bint return_inverse=False): if return_inverse: return self._unique_with_inverse(values, uniques=ObjectVector(), @@ -845,6 +845,25 @@ cdef class PyObjectHashTable(HashTable): return np.asarray(locs) + @cython.boundscheck(False) + @cython.wraparound(False) + def _unique_no_inverse(self, ndarray[object] values): + # define separate functions without inverse for performance + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + object val + khiter_t k + ObjectVector uniques = ObjectVector() + for i in range(n): + val = values[i] + hash(val) + k = kh_get_pymap(self.table, val) + if k == self.table.n_buckets: + kh_put_pymap(self.table, val, &ret) + uniques.append(val) + return uniques.to_array() + @cython.boundscheck(False) @cython.wraparound(False) def _unique_with_inverse(self, ndarray[object] values, @@ -886,25 +905,6 @@ cdef class PyObjectHashTable(HashTable): return uniques.to_array(), np.asarray(labels) - @cython.boundscheck(False) - @cython.wraparound(False) - def _unique_no_inverse(self, ndarray[object] values): - # define separate functions without inverse for performance - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - object val - khiter_t k - ObjectVector uniques = ObjectVector() - for i in range(n): - val = values[i] - hash(val) - k = kh_get_pymap(self.table, val) - if k == self.table.n_buckets: - kh_put_pymap(self.table, val, &ret) - uniques.append(val) - return uniques.to_array() - def unique(self, ndarray[object] values, bint return_inverse=False): if return_inverse: return self._unique_with_inverse(values, uniques=ObjectVector(), From a6ed5ddc65ab2ed49ee63dda2ccb40179e8166ab Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Sat, 6 Oct 2018 17:42:31 +0200 Subject: [PATCH 10/26] Undo line artefacts --- pandas/_libs/hashtable_class_helper.pxi.in | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index f0c675596688b..ec6b1c575191a 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -394,6 +394,7 @@ cdef class {{name}}HashTable(HashTable): continue k = kh_get_{{dtype}}(self.table, val) + if k != self.table.n_buckets: # k falls into a previous bucket idx = self.table.vals[k] @@ -401,6 +402,8 @@ cdef class {{name}}HashTable(HashTable): else: # k hasn't been seen yet k = kh_put_{{dtype}}(self.table, val, &ret) + self.table.vals[k] = count + if needs_resize(ud): with gil: if uniques.external_view_exists: @@ -409,7 +412,6 @@ cdef class {{name}}HashTable(HashTable): "Vector.resize() needed") uniques.resize() append_data_{{dtype}}(ud, val) - self.table.vals[k] = count labels[i] = count count += 1 @@ -490,7 +492,9 @@ cdef class {{name}}HashTable(HashTable): khiter_t k {{name}}Vector uniques = {{name}}Vector() {{name}}VectorData *ud + ud = uniques.data + with nogil: for i in range(n): val = values[i] @@ -595,12 +599,14 @@ cdef class StringHashTable(HashTable): khiter_t k const char *v const char **vecs + vecs = malloc(n * sizeof(char *)) uindexer = np.empty(n, dtype=np.int64) for i in range(n): val = values[i] v = util.get_c_string(val) vecs[i] = v + count = 0 with nogil: for i in range(n): @@ -611,6 +617,7 @@ cdef class StringHashTable(HashTable): uindexer[count] = i count += 1 free(vecs) + # uniques uniques = ObjectVector() for i in range(count): @@ -727,8 +734,8 @@ cdef class StringHashTable(HashTable): else: # k hasn't been seen yet k = kh_put_str(self.table, v, &ret) - uindexer[count] = i self.table.vals[k] = count + uindexer[count] = i labels[i] = count count += 1 @@ -855,6 +862,7 @@ cdef class PyObjectHashTable(HashTable): object val khiter_t k ObjectVector uniques = ObjectVector() + for i in range(n): val = values[i] hash(val) @@ -862,6 +870,7 @@ cdef class PyObjectHashTable(HashTable): if k == self.table.n_buckets: kh_put_pymap(self.table, val, &ret) uniques.append(val) + return uniques.to_array() @cython.boundscheck(False) @@ -894,13 +903,13 @@ cdef class PyObjectHashTable(HashTable): if k != self.table.n_buckets: # k falls into a previous bucket idx = self.table.vals[k] - labels[i] = idx + labels[i] = idx else: # k hasn't been seen yet k = kh_put_pymap(self.table, val, &ret) - uniques.append(val) self.table.vals[k] = count - labels[i] = count + uniques.append(val) + labels[i] = count count += 1 return uniques.to_array(), np.asarray(labels) From 19eaf32e03d58d92ab84bccef62639384dfb97ae Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Sun, 7 Oct 2018 23:06:46 +0200 Subject: [PATCH 11/26] Clean up test_algos.test_vector_resize --- pandas/tests/test_algos.py | 72 +++++++++++++++++++------------------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index b2ddbf715b480..8303cacba0960 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -15,7 +15,6 @@ from pandas import compat from pandas._libs import (groupby as libgroupby, algos as libalgos, hashtable as ht) -from pandas._libs.hashtable import unique_label_indices from pandas.compat import lrange, range import pandas.core.algorithms as algos import pandas.core.common as com @@ -1266,41 +1265,42 @@ def test_get_unique(self): exp = np.array([1, 2, 2**63], dtype=np.uint64) tm.assert_numpy_array_equal(s.unique(), exp) - def test_vector_resize(self, writable): + @pytest.mark.parametrize('nvals', [0, 10]) # resizing to 0 is special case + @pytest.mark.parametrize('htable, uniques, dtype, safely_resizes', [ + (ht.PyObjectHashTable, ht.ObjectVector, 'object', False), + (ht.StringHashTable, ht.ObjectVector, 'object', True), + (ht.Float64HashTable, ht.Float64Vector, 'float64', False), + (ht.Int64HashTable, ht.Int64Vector, 'int64', False), + (ht.UInt64HashTable, ht.UInt64Vector, 'uint64', False)]) + def test_vector_resize(self, writable, htable, uniques, dtype, + safely_resizes, nvals): # Test for memory errors after internal vector - # reallocations (pull request #7157) - - def _test_vector_resize(htable, uniques, dtype, nvals, safely_resizes): - vals = np.array(np.random.randn(1000), dtype=dtype) - # GH 21688 ensure we can deal with readonly memory views - vals.setflags(write=writable) - # get_labels may append to uniques - htable.get_labels(vals[:nvals], uniques, 0, -1) - # to_array() set an external_view_exists flag on uniques. - tmp = uniques.to_array() - oldshape = tmp.shape - # subsequent get_labels() calls can no longer append to it - # (for all but StringHashTables + ObjectVector) - if safely_resizes: + # reallocations (GH 7157) + vals = np.array(np.random.randn(1000), dtype=dtype) + + # GH 21688 ensure we can deal with readonly memory views + vals.setflags(write=writable) + + # initialise instances + htable = htable() + uniques = uniques() + + # get_labels may append to uniques + htable.get_labels(vals[:nvals], uniques, 0, -1) + # to_array() sets an external_view_exists flag on uniques. 
+ tmp = uniques.to_array() + oldshape = tmp.shape + + # subsequent get_labels() calls can no longer append to it + # (except for StringHashTables + ObjectVector) + if safely_resizes: + htable.get_labels(vals, uniques, 0, -1) + else: + with tm.assert_raises_regex(ValueError, 'external reference.*'): htable.get_labels(vals, uniques, 0, -1) - else: - with pytest.raises(ValueError) as excinfo: - htable.get_labels(vals, uniques, 0, -1) - assert str(excinfo.value).startswith('external reference') - uniques.to_array() # should not raise here - assert tmp.shape == oldshape - - test_cases = [ - (ht.PyObjectHashTable, ht.ObjectVector, 'object', False), - (ht.StringHashTable, ht.ObjectVector, 'object', True), - (ht.Float64HashTable, ht.Float64Vector, 'float64', False), - (ht.Int64HashTable, ht.Int64Vector, 'int64', False), - (ht.UInt64HashTable, ht.UInt64Vector, 'uint64', False)] - - for (tbl, vect, dtype, safely_resizes) in test_cases: - # resizing to empty is a special case - _test_vector_resize(tbl(), vect(), dtype, 0, safely_resizes) - _test_vector_resize(tbl(), vect(), dtype, 10, safely_resizes) + + uniques.to_array() # should not raise here + assert tmp.shape == oldshape def test_quantile(): @@ -1315,14 +1315,14 @@ def test_unique_label_indices(): a = np.random.randint(1, 1 << 10, 1 << 15).astype('i8') - left = unique_label_indices(a) + left = ht.unique_label_indices(a) right = np.unique(a, return_index=True)[1] tm.assert_numpy_array_equal(left, right, check_dtype=False) a[np.random.choice(len(a), 10)] = -1 - left = unique_label_indices(a) + left = ht.unique_label_indices(a) right = np.unique(a, return_index=True)[1][1:] tm.assert_numpy_array_equal(left, right, check_dtype=False) From ce7626f1b0f29771e7f287ae4188d05dd3e94d25 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Mon, 8 Oct 2018 00:00:17 +0200 Subject: [PATCH 12/26] Add test for hashtable.unique (esp. 
for return_inverse=True) --- pandas/tests/test_algos.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 8303cacba0960..6ebc275e0c9d0 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1302,6 +1302,38 @@ def test_vector_resize(self, writable, htable, uniques, dtype, uniques.to_array() # should not raise here assert tmp.shape == oldshape + @pytest.mark.parametrize('htable, tm_dtype', [ + (ht.PyObjectHashTable, 'String'), + (ht.StringHashTable, 'String'), + (ht.Float64HashTable, 'Float'), + (ht.Int64HashTable, 'Int'), + (ht.UInt64HashTable, 'UInt')]) + def test_hashtable_unique(self, htable, tm_dtype): + # output of maker has guaranteed unique elements + maker = getattr(tm, 'make' + tm_dtype + 'Index') + s = Series(maker(1000)) + if htable == ht.Float64HashTable: + # add NaN for float column + s.loc[500] = np.nan + elif htable == ht.PyObjectHashTable: + # use different NaN types for object column + s.loc[500:502] = [np.nan, None, pd.NaT] + + # create duplicated selection + s_duplicated = s.sample(frac=3, replace=True) + + # drop_duplicates has own cython code (khash) and is tested separately + # keeps first occurrence like ht.unique + expected_unique = s_duplicated.drop_duplicates(keep='first').values + result_unique = htable().unique(s_duplicated.values) + tm.assert_numpy_array_equal(result_unique, expected_unique) + + result_unique, result_inverse = htable().unique(s_duplicated.values, + return_inverse=True) + tm.assert_numpy_array_equal(result_unique, expected_unique) + reconstr = result_unique[result_inverse] + tm.assert_numpy_array_equal(reconstr, s_duplicated.values) + def test_quantile(): s = Series(np.random.randn(100)) From 7b9014fdf1ac227fb0efb6a82e406064526a046e Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Mon, 8 Oct 2018 00:43:59 +0200 Subject: [PATCH 13/26] Review (jreback) --- pandas/_libs/hashtable_class_helper.pxi.in | 176 +++++++++++++++------ pandas/tests/test_algos.py | 6 - 2 files changed, 129 insertions(+), 53 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index ec6b1c575191a..17af5b6fb2d90 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -358,9 +358,32 @@ cdef class {{name}}HashTable(HashTable): @cython.boundscheck(False) @cython.wraparound(False) def _unique_with_inverse(self, const {{dtype}}_t[:] values, - {{name}}Vector uniques, bint ignore_na=False, - Py_ssize_t count_prior=0, + {{name}}Vector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None): + """ + Calculate unique values without sorting; ignores all NA-values + + Parameters + ---------- + values : ndarray[{{dtype}}] + Array of values of which unique will be calculated + uniques : {{name}}Vector + Vector into which uniques will be written + count_prior : Py_ssize_t, default 0 + Number of existing entries in uniques + na_sentinel : Py_ssize_t, default -1 + Sentinel value used for all NA-values in inverse + na_value : object, default None + Value to identify as missing. If na_value is None, then + any value satisfying val != val is considered missing.
+ + Returns + ------- + uniques : ndarray[{{dtype}}] + Unique values of input, not sorted + labels : ndarray[int64] + The labels from values to uniques + """ cdef: Py_ssize_t i, idx, count = count_prior, n = len(values) int64_t[:] labels @@ -388,8 +411,7 @@ cdef class {{name}}HashTable(HashTable): for i in range(n): val = values[i] - if ignore_na and (val != val - or (use_na_value and val == na_value2)): + if val != val or (use_na_value and val == na_value2): labels[i] = na_sentinel continue @@ -417,20 +439,13 @@ cdef class {{name}}HashTable(HashTable): return uniques.to_array(), np.asarray(labels) - def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False): - if return_inverse: - return self._unique_with_inverse(values, uniques={{name}}Vector(), - ignore_na=False) - return self._unique_no_inverse(values) - def factorize(self, {{dtype}}_t[:] values): - return self._unique_with_inverse(values, uniques={{name}}Vector(), - ignore_na=True) + return self._unique_with_inverse(values, uniques={{name}}Vector()) def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None): - _, labels = self._unique_with_inverse(values, uniques, ignore_na=True, + _, labels = self._unique_with_inverse(values, uniques, count_prior=count_prior, na_sentinel=na_sentinel, na_value=na_value) @@ -483,8 +498,20 @@ cdef class {{name}}HashTable(HashTable): @cython.boundscheck(False) @cython.wraparound(False) - def _unique_no_inverse(self, const {{dtype}}_t[:] values): - # define separate functions without inverse for performance + def unique(self, const {{dtype}}_t[:] values): + """ + Calculate unique values without sorting + + Parameters + ---------- + values : ndarray[object] + Array of values of which unique will be calculated + + Returns + ------- + uniques : ndarray[object] + Unique values of input, not sorted + """ cdef: Py_ssize_t i, n = len(values) int ret = 0 @@ -588,8 +615,20 @@ cdef class StringHashTable(HashTable): @cython.boundscheck(False) @cython.wraparound(False) - def _unique_no_inverse(self, ndarray[object] values): - # define separate functions without inverse for performance + def unique(self, ndarray[object] values): + """ + Calculate unique values without sorting + + Parameters + ---------- + values : ndarray[object] + Array of values of which unique will be calculated + + Returns + ------- + uniques : ndarray[object] + Unique values of input, not sorted + """ cdef: Py_ssize_t i, count, n = len(values) int64_t[:] uindexer @@ -688,9 +727,31 @@ cdef class StringHashTable(HashTable): @cython.boundscheck(False) @cython.wraparound(False) def _unique_with_inverse(self, ndarray[object] values, - ObjectVector uniques, bint ignore_na=False, - Py_ssize_t count_prior=0, + ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None): + """ + Calculate unique values without sorting; ignores all NA-values + + Parameters + ---------- + values : ndarray[object] + Array of values of which unique will be calculated + uniques : ObjectVector + Vector into which uniques will be written + count_prior : Py_ssize_t, default 0 + Number of existing entries in uniques + na_sentinel : Py_ssize_t, default -1 + Sentinel value used for all NA-values in inverse + na_value : object, default None + Value to identify as missing + + Returns + ------- + uniques : ndarray[object] + Unique values of input, not sorted + labels : ndarray[int64] + The labels from values to uniques + """ cdef: Py_ssize_t i, 
idx, count = count_prior, n = len(values) int64_t[:] labels @@ -706,14 +767,13 @@ cdef class StringHashTable(HashTable): uindexer = np.empty(n, dtype=np.int64) use_na_value = na_value is not None - # assign pointers and pre-filter out missing (if ignore_na) + # assign pointers and pre-filter out missing vecs = malloc(n * sizeof(char *)) for i in range(n): val = values[i] - if not ignore_na or ((PyUnicode_Check(val) or PyString_Check(val)) - and not (use_na_value and val == na_value)): - # if ignore_na is False, we also stringify NaN/None/etc. + if (PyUnicode_Check(val) or PyString_Check(val)) + and not (use_na_value and val == na_value)): v = util.get_c_string(val) vecs[i] = v else: @@ -722,7 +782,7 @@ cdef class StringHashTable(HashTable): # compute with nogil: for i in range(n): - if ignore_na and labels[i] == na_sentinel: + if labels[i] == na_sentinel: continue v = vecs[i] @@ -747,20 +807,13 @@ cdef class StringHashTable(HashTable): return uniques.to_array(), np.asarray(labels) - def unique(self, ndarray[object] values, bint return_inverse=False): - if return_inverse: - return self._unique_with_inverse(values, uniques=ObjectVector(), - ignore_na=False) - return self._unique_no_inverse(values) - def factorize(self, ndarray[object] values): - return self._unique_with_inverse(values, uniques=ObjectVector(), - ignore_na=True) + def factorize(self, ndarray[object] values): + return self._unique_with_inverse(values, uniques=ObjectVector()) def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None): - _, labels = self._unique_with_inverse(values, uniques, ignore_na=True, + _, labels = self._unique_with_inverse(values, uniques, count_prior=count_prior, na_sentinel=na_sentinel, na_value=na_value) @@ -854,8 +907,20 @@ cdef class PyObjectHashTable(HashTable): @cython.boundscheck(False) @cython.wraparound(False) - def _unique_no_inverse(self, ndarray[object] values): - # define separate functions without inverse for performance + def unique(self, ndarray[object] values): + """ + Calculate unique values without sorting + + Parameters + ---------- + values : ndarray[object] + Array of values of which unique will be calculated + + Returns + ------- + uniques : ndarray[object] + Unique values of input, not sorted + """ cdef: Py_ssize_t i, n = len(values) int ret = 0 @@ -876,9 +941,32 @@ cdef class PyObjectHashTable(HashTable): @cython.boundscheck(False) @cython.wraparound(False) def _unique_with_inverse(self, ndarray[object] values, - ObjectVector uniques, bint ignore_na=False, - Py_ssize_t count_prior=0, + ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None): + """ + Calculate unique values without sorting; ignores all NA-values + + Parameters + ---------- + values : ndarray[object] + Array of values of which unique will be calculated + uniques : ObjectVector + Vector into which uniques will be written + count_prior : Py_ssize_t, default 0 + Number of existing entries in uniques + na_sentinel : Py_ssize_t, default -1 + Sentinel value used for all NA-values in inverse + na_value : object, default None + Value to identify as missing. If na_value is None, then None _plus_ + any value satisfying val != val is considered missing.
+ + Returns + ------- + uniques : ndarray[object] + Unique values of input, not sorted + labels : ndarray[int64] + The labels from values to uniques + """ cdef: Py_ssize_t i, idx, count = count_prior, n = len(values) int64_t[:] labels @@ -894,8 +982,8 @@ cdef class PyObjectHashTable(HashTable): val = values[i] hash(val) - if ignore_na and ((val != val or val is None) - or (use_na_value and val == na_value)): + if ((val != val or val is None) + or (use_na_value and val == na_value)): labels[i] = na_sentinel continue @@ -914,19 +1002,13 @@ cdef class PyObjectHashTable(HashTable): return uniques.to_array(), np.asarray(labels) - def unique(self, ndarray[object] values, bint return_inverse=False): - if return_inverse: - return self._unique_with_inverse(values, uniques=ObjectVector(), - ignore_na=False) - return self._unique_no_inverse(values) - def factorize(self, ndarray[object] values): - return self._unique_with_inverse(values, uniques=ObjectVector(), ignore_na=True) + return self._unique_with_inverse(values, uniques=ObjectVector()) def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None): - _, labels = self._unique_with_inverse(values, uniques, ignore_na=True, + _, labels = self._unique_with_inverse(values, uniques, count_prior=count_prior, na_sentinel=na_sentinel, na_value=na_value) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 6ebc275e0c9d0..afed3aece807d 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1328,12 +1328,6 @@ def test_hashtable_unique(self, htable, tm_dtype): result_unique = htable().unique(s_duplicated.values) tm.assert_numpy_array_equal(result_unique, expected_unique) - result_unique, result_inverse = htable().unique(s_duplicated.values, - return_inverse=True) - tm.assert_numpy_array_equal(result_unique, expected_unique) - reconstr = result_unique[result_inverse] - tm.assert_numpy_array_equal(reconstr, s_duplicated.values) - def test_quantile(): s = Series(np.random.randn(100)) From 471c4da6479a6b14fcddc8f1220743414b364c5a Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Mon, 8 Oct 2018 07:43:04 +0200 Subject: [PATCH 14/26] Fix typo --- pandas/_libs/hashtable_class_helper.pxi.in | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 17af5b6fb2d90..397c8b9a2219d 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -772,8 +772,8 @@ cdef class StringHashTable(HashTable): for i in range(n): val = values[i] - if (PyUnicode_Check(val) or PyString_Check(val)) - and not (use_na_value and val == na_value)): + if ((PyUnicode_Check(val) or PyString_Check(val)) + and not (use_na_value and val == na_value)): v = util.get_c_string(val) vecs[i] = v else: From 9d453786f976969b1d1c5fef07db00004ecef7b5 Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Mon, 8 Oct 2018 18:45:02 +0200 Subject: [PATCH 15/26] Small fixes --- pandas/_libs/hashtable_class_helper.pxi.in | 8 ++++---- pandas/tests/test_algos.py | 10 ++++++---- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 397c8b9a2219d..1cd0e0d1e8982 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -361,7 +361,7 @@ cdef class {{name}}HashTable(HashTable): {{name}}Vector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None): """ - Calculate unique values without sorting; ignores all NA-values + Calculate unique values and labels (no sorting); ignores all NA-values Parameters ---------- @@ -730,7 +730,7 @@ cdef class StringHashTable(HashTable): ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None): """ - Calculate unique values without sorting; ignores all NA-values + Calculate unique values and labels (no sorting); ignores all NA-values Parameters ---------- @@ -944,7 +944,7 @@ cdef class PyObjectHashTable(HashTable): ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None): """ - Calculate unique values without sorting; ignores all NA-values + Calculate unique values and labels (no sorting); ignores all NA-values Parameters ---------- @@ -983,7 +983,7 @@ cdef class PyObjectHashTable(HashTable): hash(val) if ((val != val or val is None) - or (use_na_value and val == na_value)): + or (use_na_value and val == na_value)): labels[i] = na_sentinel continue diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index afed3aece807d..9aa77665995de 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1278,10 +1278,12 @@ def test_vector_resize(self, writable, htable, uniques, dtype, # reallocations (GH 7157) vals = np.array(np.random.randn(1000), dtype=dtype) - # GH 21688 ensure we can deal with readonly memory views + # GH 21688 ensures we can deal with read-only memory views vals.setflags(write=writable) - # initialise instances + # initialise instances; cannot initialise in parametrization, + # as otherwise external views would be held on the array (which is + # one of the things this test is checking) htable = htable() uniques = uniques() @@ -1322,8 +1324,8 @@ def test_hashtable_unique(self, htable, tm_dtype): # create duplicated selection s_duplicated = s.sample(frac=3, replace=True) - # drop_duplicates has own cython code (khash) and is tested separately - # keeps first occurrence like ht.unique + # drop_duplicates has own cython code (hash_table_func_helper.pxi) + # and is tested separately; keeps first occurrence like ht.unique() expected_unique = s_duplicated.drop_duplicates(keep='first').values result_unique = htable().unique(s_duplicated.values) tm.assert_numpy_array_equal(result_unique, expected_unique) From 00b2ccb031099ea1f33fe380dcbdb93402e93779 Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Mon, 8 Oct 2018 18:50:50 +0200 Subject: [PATCH 16/26] Review (jorisvandenbossche) --- pandas/_libs/hashtable_class_helper.pxi.in | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 1cd0e0d1e8982..620e564df922b 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -440,7 +440,8 @@ cdef class {{name}}HashTable(HashTable): return uniques.to_array(), np.asarray(labels) def factorize(self, {{dtype}}_t[:] values): - return self._unique_with_inverse(values, uniques={{name}}Vector()) + uniques = {{name}}Vector() + return self._unique_with_inverse(values, uniques=uniques) def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, @@ -808,7 +809,8 @@ cdef class StringHashTable(HashTable): return uniques.to_array(), np.asarray(labels) def factorize(self, ndarray[object] values): - return self._unique_with_inverse(values, uniques=ObjectVector()) + uniques = ObjectVector() + return self._unique_with_inverse(values, uniques=uniques) def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, @@ -1003,7 +1005,8 @@ cdef class PyObjectHashTable(HashTable): return uniques.to_array(), np.asarray(labels) def factorize(self, ndarray[object] values): - return self._unique_with_inverse(values, uniques=ObjectVector()) + uniques = ObjectVector() + return self._unique_with_inverse(values, uniques=uniques) def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, From a267d4a2e872a48986b35c771814fdf8617b0792 Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Thu, 11 Oct 2018 21:58:33 +0200 Subject: [PATCH 17/26] Review (jorisvandenbossche) --- pandas/_libs/hashtable_class_helper.pxi.in | 66 +++++++++++----------- pandas/core/algorithms.py | 8 +-- pandas/tests/test_algos.py | 54 +++++++++++++++++- 3 files changed, 88 insertions(+), 40 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 627285771c76e..dcac9962f05c6 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -357,9 +357,9 @@ cdef class {{name}}HashTable(HashTable): @cython.boundscheck(False) @cython.wraparound(False) - def _unique_with_inverse(self, const {{dtype}}_t[:] values, - {{name}}Vector uniques, Py_ssize_t count_prior=0, - Py_ssize_t na_sentinel=-1, object na_value=None): + def _factorize(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, + Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, + object na_value=None): """ Calculate unique values and labels (no sorting); ignores all NA-values @@ -437,20 +437,20 @@ cdef class {{name}}HashTable(HashTable): labels[i] = count count += 1 - return uniques.to_array(), np.asarray(labels) + return np.asarray(labels) - def factorize(self, {{dtype}}_t[:] values): + def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1, + object na_value=None): uniques = {{name}}Vector() - return self._unique_with_inverse(values, uniques=uniques) + labels = self._factorize(values, uniques=uniques, + na_sentinel=na_sentinel, na_value=na_value) + return labels, uniques.to_array() def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None): - _, labels = self._unique_with_inverse(values, uniques, - count_prior=count_prior, - na_sentinel=na_sentinel, - na_value=na_value) - return labels + return self._factorize(values, uniques, count_prior=count_prior, + na_sentinel=na_sentinel, na_value=na_value) @cython.boundscheck(False) def get_labels_groupby(self, const {{dtype}}_t[:] values): @@ -727,9 +727,9 @@ cdef class StringHashTable(HashTable): @cython.boundscheck(False) @cython.wraparound(False) - def _unique_with_inverse(self, ndarray[object] values, - ObjectVector uniques, Py_ssize_t count_prior=0, - Py_ssize_t na_sentinel=-1, object na_value=None): + def _factorize(self, ndarray[object] values, ObjectVector uniques, + Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, + object na_value=None): """ Calculate unique values and labels (no sorting); ignores all NA-values @@ -806,20 +806,20 @@ cdef class StringHashTable(HashTable): for i in range(count): uniques.append(values[uindexer[i]]) - return uniques.to_array(), np.asarray(labels) + return np.asarray(labels) - def factorize(self, ndarray[object] values): + def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1, + object na_value=None): uniques = ObjectVector() - return self._unique_with_inverse(values, uniques=uniques) + labels = self._factorize(values, uniques=uniques, + na_sentinel=na_sentinel, na_value=na_value) + return labels, uniques.to_array() def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None): - _, labels = self._unique_with_inverse(values, uniques, - count_prior=count_prior, - na_sentinel=na_sentinel, - na_value=na_value) - return labels + return self._factorize(values, uniques, count_prior=count_prior, + na_sentinel=na_sentinel, 
na_value=na_value) cdef class PyObjectHashTable(HashTable): @@ -942,9 +942,9 @@ cdef class PyObjectHashTable(HashTable): @cython.boundscheck(False) @cython.wraparound(False) - def _unique_with_inverse(self, ndarray[object] values, - ObjectVector uniques, Py_ssize_t count_prior=0, - Py_ssize_t na_sentinel=-1, object na_value=None): + def _factorize(self, ndarray[object] values, ObjectVector uniques, + Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, + object na_value=None): """ Calculate unique values and labels (no sorting); ignores all NA-values @@ -1002,17 +1002,17 @@ cdef class PyObjectHashTable(HashTable): labels[i] = count count += 1 - return uniques.to_array(), np.asarray(labels) + return np.asarray(labels) - def factorize(self, ndarray[object] values): + def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1, + object na_value=None): uniques = ObjectVector() - return self._unique_with_inverse(values, uniques=uniques) + labels = self._factorize(values, uniques=uniques, + na_sentinel=na_sentinel, na_value=na_value) + return labels, uniques.to_array() def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None): - _, labels = self._unique_with_inverse(values, uniques, - count_prior=count_prior, - na_sentinel=na_sentinel, - na_value=na_value) - return labels + return self._factorize(values, uniques, count_prior=count_prior, + na_sentinel=na_sentinel, na_value=na_value) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index e91cc8ec1e996..073ca2bf248c9 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -468,15 +468,13 @@ def _factorize_array(values, na_sentinel=-1, size_hint=None, ------- labels, uniques : ndarray """ - (hash_klass, vec_klass), values = _get_data_algo(values, _hashtables) + (hash_klass, _), values = _get_data_algo(values, _hashtables) table = hash_klass(size_hint or len(values)) - uniques = vec_klass() - labels = table.get_labels(values, uniques, 0, na_sentinel, - na_value=na_value) + labels, uniques = table.factorize(values, na_sentinel=na_sentinel, + na_value=na_value) labels = ensure_platform_int(labels) - uniques = uniques.to_array() return labels, uniques diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 9aa77665995de..ac49515406aa2 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1322,14 +1322,64 @@ def test_hashtable_unique(self, htable, tm_dtype): s.loc[500:502] = [np.nan, None, pd.NaT] # create duplicated selection - s_duplicated = s.sample(frac=3, replace=True) + s_duplicated = s.sample(frac=3, replace=True).reset_index(drop=True) # drop_duplicates has own cython code (hash_table_func_helper.pxi) - # and is tested separately; keeps first occurrence like ht.unique() + # and is tested separately; keeps first occurrence like ht.unique() expected_unique = s_duplicated.drop_duplicates(keep='first').values result_unique = htable().unique(s_duplicated.values) tm.assert_numpy_array_equal(result_unique, expected_unique) + @pytest.mark.parametrize('na_sentinel', [-1]) + @pytest.mark.parametrize('htable, tm_dtype', [ + (ht.PyObjectHashTable, 'String'), + (ht.StringHashTable, 'String'), + (ht.Float64HashTable, 'Float'), + (ht.Int64HashTable, 'Int'), + (ht.UInt64HashTable, 'UInt')]) + def test_hashtable_factorize(self, htable, tm_dtype, na_sentinel): + # output of maker has guaranteed unique elements + maker = getattr(tm, 'make' + tm_dtype + 'Index') + s = Series(maker(1000)) + if 
htable == ht.Float64HashTable: + # add NaN for float column + s.loc[500] = np.nan + elif htable == ht.PyObjectHashTable: + # use different NaN types for object column + s.loc[500:502] = [np.nan, None, pd.NaT] + + # create duplicated selection + idx_duplicated = pd.Series(s.index).sample(frac=3, replace=True) + s_duplicated = s[idx_duplicated.values].reset_index(drop=True) + na_mask = s_duplicated.isna().values + + result_inverse, result_unique = htable().factorize(s_duplicated.values) + + # drop_duplicates has own cython code (hash_table_func_helper.pxi) + # and is tested separately; keeps first occurrence like ht.unique() + expected_unique = s_duplicated.dropna().drop_duplicates(keep='first') + expected_unique = expected_unique.values + tm.assert_numpy_array_equal(result_unique, expected_unique) + + # ignore NaNs for calculating inverse + _, values2unique, unique2values = np.unique(idx_duplicated[~na_mask], + return_inverse=True, + return_index=True) + expected_inverse = np.ones(s_duplicated.shape, + dtype=np.intp) * na_sentinel + + # np.unique yields a __SORTED__ list of uniques, and values2unique + # resp. unique2values are relative to this order. To restore the + # original order, we argsort values2unique, because values2unique would + # be ordered if np.unique had not sorted implicitly. The first argsort + # gives the permutation from values2unique to its sorted form, but we + # need the inverse permutation (the map from the unsorted uniques to + # values2unique, from which we can continue with unique2values). + # This inversion (as a permutation) is achieved by the second argsort. + inverse_no_na = np.argsort(np.argsort(values2unique))[unique2values] + expected_inverse[~na_mask] = inverse_no_na + tm.assert_numpy_array_equal(result_inverse, expected_inverse) + def test_quantile(): s = Series(np.random.randn(100)) From 7f1bb4020322fc78dd6fe43eba169695395a3780 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Fri, 12 Oct 2018 00:04:34 +0200 Subject: [PATCH 18/26] Improve comment --- pandas/tests/test_algos.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index ac49515406aa2..61e5f60142a5d 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1356,7 +1356,8 @@ def test_hashtable_factorize(self, htable, tm_dtype, na_sentinel): result_inverse, result_unique = htable().factorize(s_duplicated.values) # drop_duplicates has own cython code (hash_table_func_helper.pxi) - # and is tested separately; keeps first occurrence like ht.unique() + # and is tested separately; keeps first occurrence like ht.factorize() + # since factorize removes all NaNs, we do the same here expected_unique = s_duplicated.dropna().drop_duplicates(keep='first') expected_unique = expected_unique.values tm.assert_numpy_array_equal(result_unique, expected_unique)
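The double argsort in the comment block above is the standard numpy idiom for inverting a permutation; a self-contained sketch with small concrete values (illustrative only, mirroring the test's expected-inverse computation in spirit):

    import numpy as np

    values = np.array([30, 10, 30, 20])
    _, values2unique, unique2values = np.unique(values, return_index=True,
                                                return_inverse=True)
    # np.unique sorts its uniques; argsort(values2unique) maps positions in
    # first-occurrence order to sorted positions, and the second argsort
    # inverts that permutation, mapping sorted positions back to
    # first-occurrence labels.
    relabel = np.argsort(np.argsort(values2unique))
    labels = relabel[unique2values]
    uniques = values[np.sort(values2unique)]
    assert list(uniques) == [30, 10, 20]  # first-occurrence order
    assert list(labels) == [0, 1, 0, 2]
    assert (uniques[labels] == values).all()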
Vetinari" Date: Fri, 12 Oct 2018 08:25:31 +0200 Subject: [PATCH 19/26] Test for writable; expand comments --- pandas/tests/test_algos.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 61e5f60142a5d..fdeb8f67c2ff0 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1310,7 +1310,7 @@ def test_vector_resize(self, writable, htable, uniques, dtype, (ht.Float64HashTable, 'Float'), (ht.Int64HashTable, 'Int'), (ht.UInt64HashTable, 'UInt')]) - def test_hashtable_unique(self, htable, tm_dtype): + def test_hashtable_unique(self, htable, tm_dtype, writable): # output of maker has guaranteed unique elements maker = getattr(tm, 'make' + tm_dtype + 'Index') s = Series(maker(1000)) @@ -1323,6 +1323,7 @@ def test_hashtable_unique(self, htable, tm_dtype): # create duplicated selection s_duplicated = s.sample(frac=3, replace=True).reset_index(drop=True) + s_duplicated.values.setflags(write=writable) # drop_duplicates has own cython code (hash_table_func_helper.pxi) # and is tested separately; keeps first occurrence like ht.unique() @@ -1330,14 +1331,15 @@ def test_hashtable_unique(self, htable, tm_dtype): result_unique = htable().unique(s_duplicated.values) tm.assert_numpy_array_equal(result_unique, expected_unique) - @pytest.mark.parametrize('na_sentinel', [-1]) + @pytest.mark.parametrize('na_sentinel', [-1, 1001]) @pytest.mark.parametrize('htable, tm_dtype', [ (ht.PyObjectHashTable, 'String'), (ht.StringHashTable, 'String'), (ht.Float64HashTable, 'Float'), (ht.Int64HashTable, 'Int'), (ht.UInt64HashTable, 'UInt')]) - def test_hashtable_factorize(self, htable, tm_dtype, na_sentinel): + def test_hashtable_factorize(self, htable, tm_dtype, + na_sentinel, writable): # output of maker has guaranteed unique elements maker = getattr(tm, 'make' + tm_dtype + 'Index') s = Series(maker(1000)) @@ -1348,12 +1350,15 @@ def test_hashtable_factorize(self, htable, tm_dtype, na_sentinel): # use different NaN types for object column s.loc[500:502] = [np.nan, None, pd.NaT] - # create duplicated selection + # create duplicated selection (with known indices per duplicate!) idx_duplicated = pd.Series(s.index).sample(frac=3, replace=True) s_duplicated = s[idx_duplicated.values].reset_index(drop=True) + s_duplicated.values.setflags(write=writable) na_mask = s_duplicated.isna().values - result_inverse, result_unique = htable().factorize(s_duplicated.values) + result_tuple = htable().factorize(s_duplicated.values, + na_sentinel=na_sentinel) + result_inverse, result_unique = result_tuple # drop_duplicates has own cython code (hash_table_func_helper.pxi) # and is tested separately; keeps first occurrence like ht.factorize() @@ -1362,7 +1367,11 @@ def test_hashtable_factorize(self, htable, tm_dtype, na_sentinel): expected_unique = expected_unique.values tm.assert_numpy_array_equal(result_unique, expected_unique) - # ignore NaNs for calculating inverse + # ignore NaNs for calculating inverse because factorize drops all NaNs! + # values2unique: mapping indices of original to indices of uniques + # unique2values: reduplication from array of uniques to original array + # this fits together in the way that values[values2unique] are the + # uniques (from np.unique!) and uniques[unique2values] == original _, values2unique, unique2values = np.unique(idx_duplicated[~na_mask], return_inverse=True, return_index=True) From 08d7f507081e8e55b39b23e810c9d9b4c7f0ac98 Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Fri, 12 Oct 2018 08:33:50 +0200 Subject: [PATCH 20/26] Simplify factorize test --- pandas/tests/test_algos.py | 35 +++++++---------------------------- 1 file changed, 7 insertions(+), 28 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index fdeb8f67c2ff0..6477e9fd79340 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1331,15 +1331,13 @@ def test_hashtable_unique(self, htable, tm_dtype, writable): result_unique = htable().unique(s_duplicated.values) tm.assert_numpy_array_equal(result_unique, expected_unique) - @pytest.mark.parametrize('na_sentinel', [-1, 1001]) @pytest.mark.parametrize('htable, tm_dtype', [ (ht.PyObjectHashTable, 'String'), (ht.StringHashTable, 'String'), (ht.Float64HashTable, 'Float'), (ht.Int64HashTable, 'Int'), (ht.UInt64HashTable, 'UInt')]) - def test_hashtable_factorize(self, htable, tm_dtype, - na_sentinel, writable): + def test_hashtable_factorize(self, htable, tm_dtype, writable): # output of maker has guaranteed unique elements maker = getattr(tm, 'make' + tm_dtype + 'Index') s = Series(maker(1000)) @@ -1356,9 +1354,7 @@ def test_hashtable_factorize(self, htable, tm_dtype, s_duplicated.values.setflags(write=writable) na_mask = s_duplicated.isna().values - result_tuple = htable().factorize(s_duplicated.values, - na_sentinel=na_sentinel) - result_inverse, result_unique = result_tuple + result_inverse, result_unique = htable().factorize(s_duplicated.values) # drop_duplicates has own cython code (hash_table_func_helper.pxi) # and is tested separately; keeps first occurrence like ht.factorize() @@ -1367,28 +1363,11 @@ def test_hashtable_factorize(self, htable, tm_dtype, expected_unique = expected_unique.values tm.assert_numpy_array_equal(result_unique, expected_unique) - # ignore NaNs for calculating inverse because factorize drops all NaNs! - # values2unique: mapping indices of original to indices of uniques - # unique2values: reduplication from array of uniques to original array - # this fits together in the way that values[values2unique] are the - # uniques (from np.unique!) and uniques[unique2values] == original - _, values2unique, unique2values = np.unique(idx_duplicated[~na_mask], - return_inverse=True, - return_index=True) - expected_inverse = np.ones(s_duplicated.shape, - dtype=np.intp) * na_sentinel - - # np.unique yields a __SORTED__ list of uniques, and values2unique - # resp. unique2values are relative to this order. To restore the - # original order, we argsort values2unique, because values2unique would - # be ordered if np.unique had not sorted implicitly. The first argsort - # gives the permutation from values2unique to its sorted form, but we - # need the inverse permutation (the map from the unsorted uniques to - # values2unique, from which we can continue with unique2values). - # This inversion (as a permutation) is achieved by the second argsort. - inverse_no_na = np.argsort(np.argsort(values2unique))[unique2values] - expected_inverse[~na_mask] = inverse_no_na - tm.assert_numpy_array_equal(result_inverse, expected_inverse) + # reconstruction can only succeed if the inverse is correct. + # Since factorize removes the NaNs, those have to be excluded + result_reconstruct = result_unique[result_inverse[~na_mask]] + expected_reconstruct = s_duplicated.dropna().values + tm.assert_numpy_array_equal(result_reconstruct, expected_reconstruct) def test_quantile(): From d91be98ec686ad5ee7af1790011638586f0d2914 Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Fri, 12 Oct 2018 08:40:19 +0200 Subject: [PATCH 21/26] Add simple test --- pandas/tests/test_algos.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 6477e9fd79340..c776cd844e6ff 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -249,6 +249,17 @@ def test_uint64_factorize(self, writable): tm.assert_numpy_array_equal(labels, exp_labels) tm.assert_numpy_array_equal(uniques, exp_uniques) + def test_object_factorize(self, writable): + data = np.array(['a', 'c', None, np.nan, 'a', 'b', pd.NaT, 'c'], + dtype=object) + data.setflags(write=writable) + exp_labels = np.array([0, 1, -1, -1, 0, 2, -1, 1], dtype=np.intp) + exp_uniques = np.array(['a', 'c', 'b'], dtype=object) + + labels, uniques = algos.factorize(data) + tm.assert_numpy_array_equal(labels, exp_labels) + tm.assert_numpy_array_equal(uniques, exp_uniques) + def test_deprecate_order(self): # gh 19727 - check warning is raised for deprecated keyword, order. # Test not valid once order keyword is removed. From e27ec9a85d6d62fbadf2c6f3fcf435327262f60e Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Fri, 12 Oct 2018 08:42:22 +0200 Subject: [PATCH 22/26] Tiny fixes --- pandas/tests/test_algos.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index c776cd844e6ff..f6438e70e692e 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1370,12 +1370,11 @@ def test_hashtable_factorize(self, htable, tm_dtype, writable): # drop_duplicates has own cython code (hash_table_func_helper.pxi) # and is tested separately; keeps first occurrence like ht.factorize() # since factorize removes all NaNs, we do the same here - expected_unique = s_duplicated.dropna().drop_duplicates(keep='first') - expected_unique = expected_unique.values + expected_unique = s_duplicated.dropna().drop_duplicates().values tm.assert_numpy_array_equal(result_unique, expected_unique) - # reconstruction can only succeed if the inverse is correct. - # Since factorize removes the NaNs, those have to be excluded + # reconstruction can only succeed if the inverse is correct. Since + # factorize removes the NaNs, those have to be excluded here as well result_reconstruct = result_unique[result_inverse[~na_mask]] expected_reconstruct = s_duplicated.dropna().values tm.assert_numpy_array_equal(result_reconstruct, expected_reconstruct) From d825be0ac1805ea4f0c35427efc9237d98a41200 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sat, 13 Oct 2018 00:52:19 +0200 Subject: [PATCH 23/26] Remove idx_duplicated from test (now unnecessary) --- pandas/tests/test_algos.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index f6438e70e692e..1fb9bf6c3caf9 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1359,9 +1359,8 @@ def test_hashtable_factorize(self, htable, tm_dtype, writable): # use different NaN types for object column s.loc[500:502] = [np.nan, None, pd.NaT] - # create duplicated selection (with known indices per duplicate!) 
- idx_duplicated = pd.Series(s.index).sample(frac=3, replace=True) - s_duplicated = s[idx_duplicated.values].reset_index(drop=True) + # create duplicated selection + s_duplicated = s.sample(frac=3, replace=True).reset_index(drop=True) s_duplicated.values.setflags(write=writable) na_mask = s_duplicated.isna().values From 1a342d09d686f7e9545d89fa8c77a42613486953 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sun, 14 Oct 2018 21:19:44 +0200 Subject: [PATCH 24/26] Review (jreback) --- pandas/_libs/hashtable_class_helper.pxi.in | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index dcac9962f05c6..c061102fbaddc 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -505,12 +505,12 @@ cdef class {{name}}HashTable(HashTable): Parameters ---------- - values : ndarray[object] + values : ndarray[{{dtype}}] Array of values of which unique will be calculated Returns ------- - uniques : ndarray[object] + uniques : ndarray[{{dtype}}] Unique values of input, not sorted """ cdef: From 3438727cf21b059be2928a3193f0a1ab4cc84bf5 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Wed, 17 Oct 2018 23:36:58 +0200 Subject: [PATCH 25/26] Review (jreback) --- pandas/tests/test_algos.py | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 57d085314dc29..557669260604a 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -227,19 +227,42 @@ def test_complex_sorting(self): pytest.raises(TypeError, algos.factorize, x17[::-1], sort=True) + def test_float64_factorize(self, writable): + data = np.array([1.0, 1e8, 1.0, 1e-8, 1e8, 1.0], dtype=np.float64) + data.setflags(write=writable) + exp_labels = np.array([0, 1, 0, 2, 1, 0], dtype=np.intp) + exp_uniques = np.array([1.0, 1e8, 1e-8], dtype=np.float64) + + labels, uniques = algos.factorize(data) + tm.assert_numpy_array_equal(labels, exp_labels) + tm.assert_numpy_array_equal(uniques, exp_uniques) + def test_uint64_factorize(self, writable): - data = np.array([2**63, 1, 2**63], dtype=np.uint64) + data = np.array([2**64 - 1, 1, 2**64 - 1], dtype=np.uint64) data.setflags(write=writable) exp_labels = np.array([0, 1, 0], dtype=np.intp) - exp_uniques = np.array([2**63, 1], dtype=np.uint64) + exp_uniques = np.array([2**64 - 1, 1], dtype=np.uint64) labels, uniques = algos.factorize(data) tm.assert_numpy_array_equal(labels, exp_labels) tm.assert_numpy_array_equal(uniques, exp_uniques) - data = np.array([2**63, -1, 2**63], dtype=object) + def test_int64_factorize(self, writable): + data = np.array([2**63 - 1, -2**63, 2**63 - 1], dtype=np.int64) + data.setflags(write=writable) exp_labels = np.array([0, 1, 0], dtype=np.intp) - exp_uniques = np.array([2**63, -1], dtype=object) + exp_uniques = np.array([2**63 - 1, -2**63], dtype=np.int64) + + labels, uniques = algos.factorize(data) + tm.assert_numpy_array_equal(labels, exp_labels) + tm.assert_numpy_array_equal(uniques, exp_uniques) + + def test_string_factorize(self, writable): + data = np.array(['a', 'c', 'a', 'b', 'c'], + dtype=object) + data.setflags(write=writable) + exp_labels = np.array([0, 1, 0, 2, 1], dtype=np.intp) + exp_uniques = np.array(['a', 'c', 'b'], dtype=object) labels, uniques = algos.factorize(data) tm.assert_numpy_array_equal(labels, exp_labels) From 6d0e86b0f62a63902ff35bc6575d6f3c3e82b6a2 Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Thu, 18 Oct 2018 08:23:59 +0200 Subject: [PATCH 26/26] Retrigger Circle