From 640162fab9498f56ab0c93748ef4655cd0fc449f Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Wed, 3 Oct 2018 08:15:28 +0200 Subject: [PATCH 01/26] Fix ASV import error --- asv_bench/benchmarks/indexing.py | 8 ++++---- asv_bench/benchmarks/join_merge.py | 7 ++++--- asv_bench/benchmarks/panel_ctor.py | 4 ++-- asv_bench/benchmarks/panel_methods.py | 3 ++- 4 files changed, 12 insertions(+), 10 deletions(-) diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index c5b147b152aa6..2850fa249725c 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -2,10 +2,10 @@ import numpy as np import pandas.util.testing as tm -from pandas import (Series, DataFrame, MultiIndex, Int64Index, Float64Index, - IntervalIndex, CategoricalIndex, - IndexSlice, concat, date_range) -from .pandas_vb_common import setup, Panel # noqa +from pandas import (Series, DataFrame, MultiIndex, Panel, + Int64Index, Float64Index, IntervalIndex, + CategoricalIndex, IndexSlice, concat, date_range) +from .pandas_vb_common import setup # noqa class NumericSeriesIndexing(object): diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index 7487a0d8489b7..6624c3d0aaf49 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -3,14 +3,15 @@ import numpy as np import pandas.util.testing as tm -from pandas import (DataFrame, Series, MultiIndex, date_range, concat, merge, - merge_asof) +from pandas import (DataFrame, Series, Panel, MultiIndex, + date_range, concat, merge, merge_asof) + try: from pandas import merge_ordered except ImportError: from pandas import ordered_merge as merge_ordered -from .pandas_vb_common import Panel, setup # noqa +from .pandas_vb_common import setup # noqa class Append(object): diff --git a/asv_bench/benchmarks/panel_ctor.py b/asv_bench/benchmarks/panel_ctor.py index ce946c76ed199..4614bbd198afa 100644 --- a/asv_bench/benchmarks/panel_ctor.py +++ b/asv_bench/benchmarks/panel_ctor.py @@ -1,9 +1,9 @@ import warnings from datetime import datetime, timedelta -from pandas import DataFrame, DatetimeIndex, date_range +from pandas import DataFrame, Panel, DatetimeIndex, date_range -from .pandas_vb_common import Panel, setup # noqa +from .pandas_vb_common import setup # noqa class DifferentIndexes(object): diff --git a/asv_bench/benchmarks/panel_methods.py b/asv_bench/benchmarks/panel_methods.py index a5b1a92e9cf67..4d19e9a87c507 100644 --- a/asv_bench/benchmarks/panel_methods.py +++ b/asv_bench/benchmarks/panel_methods.py @@ -1,8 +1,9 @@ import warnings import numpy as np +from pandas import Panel -from .pandas_vb_common import Panel, setup # noqa +from .pandas_vb_common import setup # noqa class PanelMethods(object): From 31d0dc59f0dcabd9570f2a849af38f564a006a94 Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Thu, 27 Sep 2018 23:52:42 +0200 Subject: [PATCH 02/26] Add return_inverse to hashtable.unique --- pandas/_libs/hashtable_class_helper.pxi.in | 110 +++++++++++++++------ 1 file changed, 79 insertions(+), 31 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index f294fd141a9f1..f6fccadf78904 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -355,14 +355,14 @@ cdef class {{name}}HashTable(HashTable): return np.asarray(locs) - def factorize(self, {{dtype}}_t values): + def factorize(self, {{dtype}}_t[:] values): uniques = {{name}}Vector() - labels = self.get_labels(values, uniques, 0, 0) + labels = self.get_labels(values, uniques, 0) return uniques.to_array(), labels @cython.boundscheck(False) def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, - Py_ssize_t count_prior, Py_ssize_t na_sentinel, + Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None): cdef: Py_ssize_t i, n = len(values) @@ -399,9 +399,11 @@ cdef class {{name}}HashTable(HashTable): k = kh_get_{{dtype}}(self.table, val) if k != self.table.n_buckets: + # k falls into a previous bucket idx = self.table.vals[k] labels[i] = idx else: + # k hasn't been seen yet k = kh_put_{{dtype}}(self.table, val, &ret) self.table.vals[k] = count @@ -464,27 +466,42 @@ cdef class {{name}}HashTable(HashTable): return np.asarray(labels), arr_uniques @cython.boundscheck(False) - def unique(self, const {{dtype}}_t[:] values): + def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False): cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - {{dtype}}_t val - khiter_t k - {{name}}Vector uniques = {{name}}Vector() - {{name}}VectorData *ud + Py_ssize_t i, idx, count = 0, n = len(values) + int64_t[:] labels + int ret = 0 + {{dtype}}_t val + khiter_t k + {{name}}Vector uniques = {{name}}Vector() + {{name}}VectorData *ud ud = uniques.data + if return_inverse: + labels = np.empty(n, dtype=np.int64) with nogil: for i in range(n): val = values[i] k = kh_get_{{dtype}}(self.table, val) - if k == self.table.n_buckets: - kh_put_{{dtype}}(self.table, val, &ret) + if return_inverse and k != self.table.n_buckets: + # k falls into a previous bucket + idx = self.table.vals[k] + labels[i] = idx + elif k == self.table.n_buckets: + # k hasn't been seen yet + k = kh_put_{{dtype}}(self.table, val, &ret) if needs_resize(ud): with gil: uniques.resize() append_data_{{dtype}}(ud, val) + if return_inverse: + self.table.vals[k] = count + labels[i] = count + count += 1 + + if return_inverse: + return uniques.to_array(), np.asarray(labels) return uniques.to_array() {{endfor}} @@ -567,45 +584,57 @@ cdef class StringHashTable(HashTable): return labels @cython.boundscheck(False) - def unique(self, ndarray[object] values): + def unique(self, ndarray[object] values, bint return_inverse=False): cdef: - Py_ssize_t i, count, n = len(values) + Py_ssize_t i, idx, count = 0, n = len(values) + int64_t[:] labels int64_t[:] uindexer int ret = 0 object val - ObjectVector uniques + ObjectVector uniques = ObjectVector() khiter_t k const char *v const char **vecs - vecs = malloc(n * sizeof(char *)) + if return_inverse: + labels = np.zeros(n, dtype=np.int64) uindexer = np.empty(n, dtype=np.int64) + + # assign pointers + vecs = malloc(n * sizeof(char *)) for i in range(n): val = values[i] v = util.get_c_string(val) vecs[i] = v - count = 0 + + # compute with nogil: for i in range(n): v = vecs[i] k = 
kh_get_str(self.table, v) - if k == self.table.n_buckets: - kh_put_str(self.table, v, &ret) + if return_inverse and k != self.table.n_buckets: + # k falls into a previous bucket + idx = self.table.vals[k] + labels[i] = idx + elif k == self.table.n_buckets: + # k hasn't been seen yet + k = kh_put_str(self.table, v, &ret) uindexer[count] = i + if return_inverse: + self.table.vals[k] = count + labels[i] = count count += 1 + free(vecs) # uniques - uniques = ObjectVector() for i in range(count): uniques.append(values[uindexer[i]]) - return uniques.to_array() - def factorize(self, ndarray[object] values): - uniques = ObjectVector() - labels = self.get_labels(values, uniques, 0, 0) - return uniques.to_array(), labels + if return_inverse: + return uniques.to_array(), np.asarray(labels) + return uniques.to_array() @cython.boundscheck(False) def lookup(self, ndarray[object] values): @@ -670,7 +699,7 @@ cdef class StringHashTable(HashTable): @cython.boundscheck(False) def get_labels(self, ndarray[object] values, ObjectVector uniques, - Py_ssize_t count_prior, int64_t na_sentinel, + Py_ssize_t count_prior=0, int64_t na_sentinel=-1, object na_value=None): cdef: Py_ssize_t i, n = len(values) @@ -814,26 +843,43 @@ cdef class PyObjectHashTable(HashTable): return np.asarray(locs) - def unique(self, ndarray[object] values): + @cython.boundscheck(False) + def unique(self, ndarray[object] values, bint return_inverse=False): cdef: - Py_ssize_t i, n = len(values) + Py_ssize_t i, idx, count = 0, n = len(values) + int64_t[:] labels int ret = 0 object val khiter_t k ObjectVector uniques = ObjectVector() + if return_inverse: + labels = np.empty(n, dtype=np.int64) + for i in range(n): val = values[i] hash(val) k = kh_get_pymap(self.table, val) - if k == self.table.n_buckets: - kh_put_pymap(self.table, val, &ret) + if return_inverse and k != self.table.n_buckets: + # k falls into a previous bucket + idx = self.table.vals[k] + labels[i] = idx + elif k == self.table.n_buckets: + # k hasn't been seen yet + k = kh_put_pymap(self.table, val, &ret) uniques.append(val) + if return_inverse: + self.table.vals[k] = count + labels[i] = count + count += 1 + if return_inverse: + return uniques.to_array(), np.asarray(labels) return uniques.to_array() + @cython.boundscheck(False) def get_labels(self, ndarray[object] values, ObjectVector uniques, - Py_ssize_t count_prior, int64_t na_sentinel, + Py_ssize_t count_prior=0, int64_t na_sentinel=-1, object na_value=None): cdef: Py_ssize_t i, n = len(values) @@ -858,9 +904,11 @@ cdef class PyObjectHashTable(HashTable): k = kh_get_pymap(self.table, val) if k != self.table.n_buckets: + # k falls into a previous bucket idx = self.table.vals[k] labels[i] = idx else: + # k hasn't been seen yet k = kh_put_pymap(self.table, val, &ret) self.table.vals[k] = count uniques.append(val) From c5e51478e6a19c8d6a4673020edea609666b6058 Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Sun, 30 Sep 2018 17:17:50 +0200 Subject: [PATCH 03/26] Pure copy/paste: Group unique/factorize functions next to each other --- pandas/_libs/hashtable_class_helper.pxi.in | 184 ++++++++++----------- 1 file changed, 92 insertions(+), 92 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index f6fccadf78904..0418939956b1c 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -355,6 +355,45 @@ cdef class {{name}}HashTable(HashTable): return np.asarray(locs) + @cython.boundscheck(False) + def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False): + cdef: + Py_ssize_t i, idx, count = 0, n = len(values) + int64_t[:] labels + int ret = 0 + {{dtype}}_t val + khiter_t k + {{name}}Vector uniques = {{name}}Vector() + {{name}}VectorData *ud + + ud = uniques.data + if return_inverse: + labels = np.empty(n, dtype=np.int64) + + with nogil: + for i in range(n): + val = values[i] + k = kh_get_{{dtype}}(self.table, val) + if return_inverse and k != self.table.n_buckets: + # k falls into a previous bucket + idx = self.table.vals[k] + labels[i] = idx + elif k == self.table.n_buckets: + # k hasn't been seen yet + k = kh_put_{{dtype}}(self.table, val, &ret) + if needs_resize(ud): + with gil: + uniques.resize() + append_data_{{dtype}}(ud, val) + if return_inverse: + self.table.vals[k] = count + labels[i] = count + count += 1 + + if return_inverse: + return uniques.to_array(), np.asarray(labels) + return uniques.to_array() + def factorize(self, {{dtype}}_t[:] values): uniques = {{name}}Vector() labels = self.get_labels(values, uniques, 0) @@ -465,45 +504,6 @@ cdef class {{name}}HashTable(HashTable): return np.asarray(labels), arr_uniques - @cython.boundscheck(False) - def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False): - cdef: - Py_ssize_t i, idx, count = 0, n = len(values) - int64_t[:] labels - int ret = 0 - {{dtype}}_t val - khiter_t k - {{name}}Vector uniques = {{name}}Vector() - {{name}}VectorData *ud - - ud = uniques.data - if return_inverse: - labels = np.empty(n, dtype=np.int64) - - with nogil: - for i in range(n): - val = values[i] - k = kh_get_{{dtype}}(self.table, val) - if return_inverse and k != self.table.n_buckets: - # k falls into a previous bucket - idx = self.table.vals[k] - labels[i] = idx - elif k == self.table.n_buckets: - # k hasn't been seen yet - k = kh_put_{{dtype}}(self.table, val, &ret) - if needs_resize(ud): - with gil: - uniques.resize() - append_data_{{dtype}}(ud, val) - if return_inverse: - self.table.vals[k] = count - labels[i] = count - count += 1 - - if return_inverse: - return uniques.to_array(), np.asarray(labels) - return uniques.to_array() - {{endfor}} @@ -583,59 +583,6 @@ cdef class StringHashTable(HashTable): free(vecs) return labels - @cython.boundscheck(False) - def unique(self, ndarray[object] values, bint return_inverse=False): - cdef: - Py_ssize_t i, idx, count = 0, n = len(values) - int64_t[:] labels - int64_t[:] uindexer - int ret = 0 - object val - ObjectVector uniques = ObjectVector() - khiter_t k - const char *v - const char **vecs - - if return_inverse: - labels = np.zeros(n, dtype=np.int64) - uindexer = np.empty(n, dtype=np.int64) - - # assign pointers - vecs = malloc(n * sizeof(char *)) - for i in range(n): - val = values[i] - v = util.get_c_string(val) - vecs[i] = v - - - # compute - with nogil: - for i in range(n): - v = vecs[i] - k = kh_get_str(self.table, v) - if return_inverse and k != 
self.table.n_buckets: - # k falls into a previous bucket - idx = self.table.vals[k] - labels[i] = idx - elif k == self.table.n_buckets: - # k hasn't been seen yet - k = kh_put_str(self.table, v, &ret) - uindexer[count] = i - if return_inverse: - self.table.vals[k] = count - labels[i] = count - count += 1 - - free(vecs) - - # uniques - for i in range(count): - uniques.append(values[uindexer[i]]) - - if return_inverse: - return uniques.to_array(), np.asarray(labels) - return uniques.to_array() - @cython.boundscheck(False) def lookup(self, ndarray[object] values): cdef: @@ -697,6 +644,59 @@ cdef class StringHashTable(HashTable): self.table.vals[k] = i free(vecs) + @cython.boundscheck(False) + def unique(self, ndarray[object] values, bint return_inverse=False): + cdef: + Py_ssize_t i, idx, count = 0, n = len(values) + int64_t[:] labels + int64_t[:] uindexer + int ret = 0 + object val + ObjectVector uniques = ObjectVector() + khiter_t k + const char *v + const char **vecs + + if return_inverse: + labels = np.zeros(n, dtype=np.int64) + uindexer = np.empty(n, dtype=np.int64) + + # assign pointers + vecs = malloc(n * sizeof(char *)) + for i in range(n): + val = values[i] + v = util.get_c_string(val) + vecs[i] = v + + + # compute + with nogil: + for i in range(n): + v = vecs[i] + k = kh_get_str(self.table, v) + if return_inverse and k != self.table.n_buckets: + # k falls into a previous bucket + idx = self.table.vals[k] + labels[i] = idx + elif k == self.table.n_buckets: + # k hasn't been seen yet + k = kh_put_str(self.table, v, &ret) + uindexer[count] = i + if return_inverse: + self.table.vals[k] = count + labels[i] = count + count += 1 + + free(vecs) + + # uniques + for i in range(count): + uniques.append(values[uindexer[i]]) + + if return_inverse: + return uniques.to_array(), np.asarray(labels) + return uniques.to_array() + @cython.boundscheck(False) def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior=0, int64_t na_sentinel=-1, From 9918d52b96f722495fb1a72135e991d725ba3cda Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Wed, 3 Oct 2018 23:07:07 +0200 Subject: [PATCH 04/26] Unify hashtable.factorize and .unique --- pandas/_libs/hashtable_class_helper.pxi.in | 253 ++++++++------------- 1 file changed, 91 insertions(+), 162 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 0418939956b1c..3d3d0ad66734b 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -356,64 +356,21 @@ cdef class {{name}}HashTable(HashTable): return np.asarray(locs) @cython.boundscheck(False) - def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False): + def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, + bint ignore_na=False, bint return_inverse=False, + Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, + object na_value=None): cdef: - Py_ssize_t i, idx, count = 0, n = len(values) + Py_ssize_t i, idx, count = count_prior, n = len(values) int64_t[:] labels int ret = 0 - {{dtype}}_t val - khiter_t k - {{name}}Vector uniques = {{name}}Vector() - {{name}}VectorData *ud - - ud = uniques.data - if return_inverse: - labels = np.empty(n, dtype=np.int64) - - with nogil: - for i in range(n): - val = values[i] - k = kh_get_{{dtype}}(self.table, val) - if return_inverse and k != self.table.n_buckets: - # k falls into a previous bucket - idx = self.table.vals[k] - labels[i] = idx - elif k == self.table.n_buckets: - # k hasn't been seen yet - k = kh_put_{{dtype}}(self.table, val, &ret) - if needs_resize(ud): - with gil: - uniques.resize() - append_data_{{dtype}}(ud, val) - if return_inverse: - self.table.vals[k] = count - labels[i] = count - count += 1 - - if return_inverse: - return uniques.to_array(), np.asarray(labels) - return uniques.to_array() - - def factorize(self, {{dtype}}_t[:] values): - uniques = {{name}}Vector() - labels = self.get_labels(values, uniques, 0) - return uniques.to_array(), labels - - @cython.boundscheck(False) - def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, - Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, - object na_value=None): - cdef: - Py_ssize_t i, n = len(values) - int64_t[:] labels - Py_ssize_t idx, count = count_prior - int ret = 0 {{dtype}}_t val, na_value2 khiter_t k {{name}}VectorData *ud bint use_na_value - labels = np.empty(n, dtype=np.int64) + if return_inverse: + labels = np.empty(n, dtype=np.int64) ud = uniques.data use_na_value = na_value is not None @@ -431,21 +388,19 @@ cdef class {{name}}HashTable(HashTable): for i in range(n): val = values[i] - if val != val or (use_na_value and val == na_value2): + if ignore_na and (val != val + or (use_na_value and val == na_value2)): labels[i] = na_sentinel continue k = kh_get_{{dtype}}(self.table, val) - - if k != self.table.n_buckets: + if return_inverse and k != self.table.n_buckets: # k falls into a previous bucket idx = self.table.vals[k] labels[i] = idx - else: + elif k == self.table.n_buckets: # k hasn't been seen yet k = kh_put_{{dtype}}(self.table, val, &ret) - self.table.vals[k] = count - if needs_resize(ud): with gil: if uniques.external_view_exists: @@ -454,10 +409,30 @@ cdef class {{name}}HashTable(HashTable): "Vector.resize() needed") uniques.resize() append_data_{{dtype}}(ud, val) - labels[i] = count + if return_inverse: + self.table.vals[k] = count + labels[i] = count count += 1 - return np.asarray(labels) + if return_inverse: + return uniques.to_array(), np.asarray(labels) + return uniques.to_array() + + def unique(self, const {{dtype}}_t[:] 
values, bint return_inverse=False): + return self._unique(values, uniques={{name}}Vector(), ignore_na=False, + return_inverse=return_inverse) + + def factorize(self, {{dtype}}_t[:] values): + return self._unique(values, uniques={{name}}Vector(), ignore_na=True, + return_inverse=True) + + def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, + Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, + object na_value=None): + _, labels = self._unique(values, uniques, ignore_na=True, + return_inverse=True, count_prior=count_prior, + na_sentinel=na_sentinel, na_value=na_value) + return labels @cython.boundscheck(False) def get_labels_groupby(self, const {{dtype}}_t[:] values): @@ -645,33 +620,45 @@ cdef class StringHashTable(HashTable): free(vecs) @cython.boundscheck(False) - def unique(self, ndarray[object] values, bint return_inverse=False): + def _unique(self, ndarray[object] values, ObjectVector uniques, + bint ignore_na=False, bint return_inverse=False, + Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, + object na_value=None): cdef: - Py_ssize_t i, idx, count = 0, n = len(values) + Py_ssize_t i, idx, count = count_prior, n = len(values) int64_t[:] labels int64_t[:] uindexer int ret = 0 object val - ObjectVector uniques = ObjectVector() - khiter_t k const char *v const char **vecs + khiter_t k + bint use_na_value if return_inverse: labels = np.zeros(n, dtype=np.int64) uindexer = np.empty(n, dtype=np.int64) + use_na_value = na_value is not None - # assign pointers + # assign pointers and pre-filter out missing (if ignore_na) vecs = malloc(n * sizeof(char *)) for i in range(n): val = values[i] - v = util.get_c_string(val) - vecs[i] = v + if not ignore_na or ((PyUnicode_Check(val) or PyString_Check(val)) + and not (use_na_value and val == na_value)): + # if ignore_na is False, we also stringify NaN/None/etc. 
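
Note: the NA-handling contract that the new ignore_na flag implements can be
sketched at the Python level. This is an illustration of the intended
behaviour only, not part of the patch:

    import numpy as np
    import pandas as pd

    arr = np.array([1.0, np.nan, 1.0, np.nan])

    # unique (ignore_na=False): NaN is kept as a regular value
    pd.unique(arr)                      # -> array([ 1., nan])

    # factorize (ignore_na=True): NaN is mapped to na_sentinel (-1)
    codes, uniques = pd.factorize(arr)
    # codes -> array([ 0, -1,  0, -1]), uniques -> array([1.])
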
+ v = util.get_c_string(val) + vecs[i] = v + else: + labels[i] = na_sentinel # compute with nogil: for i in range(n): + if ignore_na and labels[i] == na_sentinel: + continue + v = vecs[i] k = kh_get_str(self.table, v) if return_inverse and k != self.table.n_buckets: @@ -697,65 +684,21 @@ cdef class StringHashTable(HashTable): return uniques.to_array(), np.asarray(labels) return uniques.to_array() - @cython.boundscheck(False) - def get_labels(self, ndarray[object] values, ObjectVector uniques, - Py_ssize_t count_prior=0, int64_t na_sentinel=-1, - object na_value=None): - cdef: - Py_ssize_t i, n = len(values) - int64_t[:] labels - int64_t[:] uindexer - Py_ssize_t idx, count = count_prior - int ret = 0 - object val - const char *v - const char **vecs - khiter_t k - bint use_na_value - - # these by-definition *must* be strings - labels = np.zeros(n, dtype=np.int64) - uindexer = np.empty(n, dtype=np.int64) - use_na_value = na_value is not None - - # pre-filter out missing - # and assign pointers - vecs = malloc(n * sizeof(char *)) - for i in range(n): - val = values[i] - - if ((PyUnicode_Check(val) or PyString_Check(val)) and - not (use_na_value and val == na_value)): - v = util.get_c_string(val) - vecs[i] = v - else: - labels[i] = na_sentinel - - # compute - with nogil: - for i in range(n): - if labels[i] == na_sentinel: - continue - - v = vecs[i] - k = kh_get_str(self.table, v) - if k != self.table.n_buckets: - idx = self.table.vals[k] - labels[i] = idx - else: - k = kh_put_str(self.table, v, &ret) - self.table.vals[k] = count - uindexer[count] = i - labels[i] = count - count += 1 - - free(vecs) + def unique(self, ndarray[object] values, bint return_inverse=False): + return self._unique(values, uniques=ObjectVector(), ignore_na=False, + return_inverse=return_inverse) - # uniques - for i in range(count): - uniques.append(values[uindexer[i]]) + def factorize(self, ndarray[object] values): + return self._unique(values, uniques=ObjectVector(), ignore_na=True, + return_inverse=True) - return np.asarray(labels) + def get_labels(self, ndarray[object] values, ObjectVector uniques, + Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, + object na_value=None): + _, labels = self._unique(values, uniques, ignore_na=True, + return_inverse=True, count_prior=count_prior, + na_sentinel=na_sentinel, na_value=na_value) + return labels cdef class PyObjectHashTable(HashTable): @@ -844,21 +787,31 @@ cdef class PyObjectHashTable(HashTable): return np.asarray(locs) @cython.boundscheck(False) - def unique(self, ndarray[object] values, bint return_inverse=False): + def _unique(self, ndarray[object] values, ObjectVector uniques, + bint ignore_na=False, bint return_inverse=False, + Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, + object na_value=None): cdef: - Py_ssize_t i, idx, count = 0, n = len(values) + Py_ssize_t i, idx, count = count_prior, n = len(values) int64_t[:] labels int ret = 0 object val khiter_t k - ObjectVector uniques = ObjectVector() + bint use_na_value if return_inverse: labels = np.empty(n, dtype=np.int64) + use_na_value = na_value is not None for i in range(n): val = values[i] hash(val) + + if ignore_na and ((val != val or val is None) + or (use_na_value and val == na_value)): + labels[i] = na_sentinel + continue + k = kh_get_pymap(self.table, val) if return_inverse and k != self.table.n_buckets: # k falls into a previous bucket @@ -877,42 +830,18 @@ cdef class PyObjectHashTable(HashTable): return uniques.to_array(), np.asarray(labels) return uniques.to_array() - @cython.boundscheck(False) 
- def get_labels(self, ndarray[object] values, ObjectVector uniques, - Py_ssize_t count_prior=0, int64_t na_sentinel=-1, - object na_value=None): - cdef: - Py_ssize_t i, n = len(values) - int64_t[:] labels - Py_ssize_t idx, count = count_prior - int ret = 0 - object val - khiter_t k - bint use_na_value - - labels = np.empty(n, dtype=np.int64) - use_na_value = na_value is not None - - for i in range(n): - val = values[i] - hash(val) - - if ((val != val or val is None) or - (use_na_value and val == na_value)): - labels[i] = na_sentinel - continue + def unique(self, ndarray[object] values, bint return_inverse=False): + return self._unique(values, uniques=ObjectVector(), ignore_na=False, + return_inverse=return_inverse) - k = kh_get_pymap(self.table, val) - if k != self.table.n_buckets: - # k falls into a previous bucket - idx = self.table.vals[k] - labels[i] = idx - else: - # k hasn't been seen yet - k = kh_put_pymap(self.table, val, &ret) - self.table.vals[k] = count - uniques.append(val) - labels[i] = count - count += 1 + def factorize(self, ndarray[object] values): + return self._unique(values, uniques=ObjectVector(), ignore_na=True, + return_inverse=True) - return np.asarray(labels) + def get_labels(self, ndarray[object] values, ObjectVector uniques, + Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, + object na_value=None): + _, labels = self._unique(values, uniques, ignore_na=True, + return_inverse=True, count_prior=count_prior, + na_sentinel=na_sentinel, na_value=na_value) + return labels From 52ae84e7f05d86ca228448b412fff76b99a0a1b0 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Thu, 4 Oct 2018 16:48:07 +0200 Subject: [PATCH 05/26] Force compilation of different code paths --- pandas/_libs/hashtable_class_helper.pxi.in | 39 ++++++++++++++++++++-- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 3d3d0ad66734b..4d02b24734c29 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -419,8 +419,19 @@ cdef class {{name}}HashTable(HashTable): return uniques.to_array() def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False): + # define separate functions with/without inverse to force compilation + # of the different code paths for boolean "return_inverse" + if return_inverse: + return self._unique_with_inverse(values) + return self._unique_no_inverse(values) + + def _unique_no_inverse(self, const {{dtype}}_t[:] values): + return self._unique(values, uniques={{name}}Vector(), ignore_na=False, + return_inverse=False) + + def _unique_with_inverse(self, const {{dtype}}_t[:] values): return self._unique(values, uniques={{name}}Vector(), ignore_na=False, - return_inverse=return_inverse) + return_inverse=True) def factorize(self, {{dtype}}_t[:] values): return self._unique(values, uniques={{name}}Vector(), ignore_na=True, @@ -685,8 +696,19 @@ cdef class StringHashTable(HashTable): return uniques.to_array() def unique(self, ndarray[object] values, bint return_inverse=False): + # define separate functions with/without inverse to force compilation + # of the different code paths for boolean "return_inverse" + if return_inverse: + return self._unique_with_inverse(values) + return self._unique_no_inverse(values) + + def _unique_no_inverse(self, ndarray[object] values): return self._unique(values, uniques=ObjectVector(), ignore_na=False, - return_inverse=return_inverse) + return_inverse=False) + + def 
_unique_with_inverse(self, ndarray[object] values): + return self._unique(values, uniques=ObjectVector(), ignore_na=False, + return_inverse=True) def factorize(self, ndarray[object] values): return self._unique(values, uniques=ObjectVector(), ignore_na=True, @@ -831,8 +853,19 @@ cdef class PyObjectHashTable(HashTable): return uniques.to_array() def unique(self, ndarray[object] values, bint return_inverse=False): + # define separate functions with/without inverse to force compilation + # of the different code paths for boolean "return_inverse" + if return_inverse: + return self._unique_with_inverse(values) + return self._unique_no_inverse(values) + + def _unique_no_inverse(self, ndarray[object] values): return self._unique(values, uniques=ObjectVector(), ignore_na=False, - return_inverse=return_inverse) + return_inverse=False) + + def _unique_with_inverse(self, ndarray[object] values): + return self._unique(values, uniques=ObjectVector(), ignore_na=False, + return_inverse=True) def factorize(self, ndarray[object] values): return self._unique(values, uniques=ObjectVector(), ignore_na=True, From dbe4e0ed81e826ca631c7b70c0117b5026a5f079 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Thu, 4 Oct 2018 22:17:36 +0200 Subject: [PATCH 06/26] Add separate functions for return_inverse=False --- pandas/_libs/hashtable_class_helper.pxi.in | 113 +++++++++++++++------ 1 file changed, 80 insertions(+), 33 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 4d02b24734c29..e179445bbeac7 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -418,21 +418,35 @@ cdef class {{name}}HashTable(HashTable): return uniques.to_array(), np.asarray(labels) return uniques.to_array() + @cython.boundscheck(False) + def _unique_no_inverse(self, const {{dtype}}_t[:] values): + # define separate functions without inverse for performance + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + {{dtype}}_t val + khiter_t k + {{name}}Vector uniques = {{name}}Vector() + {{name}}VectorData *ud + ud = uniques.data + with nogil: + for i in range(n): + val = values[i] + k = kh_get_{{dtype}}(self.table, val) + if k == self.table.n_buckets: + kh_put_{{dtype}}(self.table, val, &ret) + if needs_resize(ud): + with gil: + uniques.resize() + append_data_{{dtype}}(ud, val) + return uniques.to_array() + def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False): - # define separate functions with/without inverse to force compilation - # of the different code paths for boolean "return_inverse" if return_inverse: - return self._unique_with_inverse(values) + return self._unique(values, uniques={{name}}Vector(), ignore_na=False, + return_inverse=True) return self._unique_no_inverse(values) - def _unique_no_inverse(self, const {{dtype}}_t[:] values): - return self._unique(values, uniques={{name}}Vector(), ignore_na=False, - return_inverse=False) - - def _unique_with_inverse(self, const {{dtype}}_t[:] values): - return self._unique(values, uniques={{name}}Vector(), ignore_na=False, - return_inverse=True) - def factorize(self, {{dtype}}_t[:] values): return self._unique(values, uniques={{name}}Vector(), ignore_na=True, return_inverse=True) @@ -695,21 +709,46 @@ cdef class StringHashTable(HashTable): return uniques.to_array(), np.asarray(labels) return uniques.to_array() + @cython.boundscheck(False) + def _unique_no_inverse(self, ndarray[object] values): + # define separate functions without inverse for performance 
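
Note: splitting out a dedicated no-inverse fast path is meant to be
behaviour-preserving. A minimal invariant check, assuming a build with this
patch applied (a fresh table per call, since a table keeps state across
calls):

    import numpy as np
    from pandas._libs import hashtable as ht

    vals = np.array([2, 1, 2, 3], dtype=np.int64)

    u1 = ht.Int64HashTable().unique(vals)                       # fast path
    u2, inv = ht.Int64HashTable().unique(vals, return_inverse=True)
    assert (u1 == u2).all()          # both paths agree on the uniques
    assert (u2[inv] == vals).all()   # the inverse reconstructs the input
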
+ cdef: + Py_ssize_t i, count, n = len(values) + int64_t[:] uindexer + int ret = 0 + object val + ObjectVector uniques + khiter_t k + const char *v + const char **vecs + vecs = malloc(n * sizeof(char *)) + uindexer = np.empty(n, dtype=np.int64) + for i in range(n): + val = values[i] + v = util.get_c_string(val) + vecs[i] = v + count = 0 + with nogil: + for i in range(n): + v = vecs[i] + k = kh_get_str(self.table, v) + if k == self.table.n_buckets: + kh_put_str(self.table, v, &ret) + uindexer[count] = i + count += 1 + free(vecs) + # uniques + uniques = ObjectVector() + for i in range(count): + uniques.append(values[uindexer[i]]) + return uniques.to_array() + def unique(self, ndarray[object] values, bint return_inverse=False): - # define separate functions with/without inverse to force compilation - # of the different code paths for boolean "return_inverse" if return_inverse: - return self._unique_with_inverse(values) + return self._unique(values, uniques=ObjectVector(), ignore_na=False, + return_inverse=True) return self._unique_no_inverse(values) - def _unique_no_inverse(self, ndarray[object] values): - return self._unique(values, uniques=ObjectVector(), ignore_na=False, - return_inverse=False) - - def _unique_with_inverse(self, ndarray[object] values): - return self._unique(values, uniques=ObjectVector(), ignore_na=False, - return_inverse=True) - def factorize(self, ndarray[object] values): return self._unique(values, uniques=ObjectVector(), ignore_na=True, return_inverse=True) @@ -852,21 +891,29 @@ cdef class PyObjectHashTable(HashTable): return uniques.to_array(), np.asarray(labels) return uniques.to_array() + def _unique_no_inverse(self, ndarray[object] values): + # define separate functions without inverse for performance + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + object val + khiter_t k + ObjectVector uniques = ObjectVector() + for i in range(n): + val = values[i] + hash(val) + k = kh_get_pymap(self.table, val) + if k == self.table.n_buckets: + kh_put_pymap(self.table, val, &ret) + uniques.append(val) + return uniques.to_array() + def unique(self, ndarray[object] values, bint return_inverse=False): - # define separate functions with/without inverse to force compilation - # of the different code paths for boolean "return_inverse" if return_inverse: - return self._unique_with_inverse(values) + return self._unique(values, uniques=ObjectVector(), ignore_na=False, + return_inverse=True) return self._unique_no_inverse(values) - def _unique_no_inverse(self, ndarray[object] values): - return self._unique(values, uniques=ObjectVector(), ignore_na=False, - return_inverse=False) - - def _unique_with_inverse(self, ndarray[object] values): - return self._unique(values, uniques=ObjectVector(), ignore_na=False, - return_inverse=True) - def factorize(self, ndarray[object] values): return self._unique(values, uniques=ObjectVector(), ignore_na=True, return_inverse=True) From 8481e19619c8847b0bdb7dcb61726497f552b230 Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Thu, 4 Oct 2018 23:12:30 +0200 Subject: [PATCH 07/26] Finish split in _unique_with_inverse and _unique_no_inverse --- pandas/_libs/hashtable_class_helper.pxi.in | 116 ++++++++++----------- 1 file changed, 53 insertions(+), 63 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index e179445bbeac7..6e9afb8faa42d 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -356,10 +356,10 @@ cdef class {{name}}HashTable(HashTable): return np.asarray(locs) @cython.boundscheck(False) - def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, - bint ignore_na=False, bint return_inverse=False, - Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, - object na_value=None): + def _unique_with_inverse(self, const {{dtype}}_t[:] values, + {{name}}Vector uniques, bint ignore_na=False, + Py_ssize_t count_prior=0, + Py_ssize_t na_sentinel=-1, object na_value=None): cdef: Py_ssize_t i, idx, count = count_prior, n = len(values) int64_t[:] labels @@ -369,8 +369,7 @@ cdef class {{name}}HashTable(HashTable): {{name}}VectorData *ud bint use_na_value - if return_inverse: - labels = np.empty(n, dtype=np.int64) + labels = np.empty(n, dtype=np.int64) ud = uniques.data use_na_value = na_value is not None @@ -394,11 +393,11 @@ cdef class {{name}}HashTable(HashTable): continue k = kh_get_{{dtype}}(self.table, val) - if return_inverse and k != self.table.n_buckets: + if k != self.table.n_buckets: # k falls into a previous bucket idx = self.table.vals[k] labels[i] = idx - elif k == self.table.n_buckets: + else: # k hasn't been seen yet k = kh_put_{{dtype}}(self.table, val, &ret) if needs_resize(ud): @@ -409,14 +408,11 @@ cdef class {{name}}HashTable(HashTable): "Vector.resize() needed") uniques.resize() append_data_{{dtype}}(ud, val) - if return_inverse: - self.table.vals[k] = count - labels[i] = count + self.table.vals[k] = count + labels[i] = count count += 1 - if return_inverse: - return uniques.to_array(), np.asarray(labels) - return uniques.to_array() + return uniques.to_array(), np.asarray(labels) @cython.boundscheck(False) def _unique_no_inverse(self, const {{dtype}}_t[:] values): @@ -443,20 +439,21 @@ cdef class {{name}}HashTable(HashTable): def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False): if return_inverse: - return self._unique(values, uniques={{name}}Vector(), ignore_na=False, - return_inverse=True) + return self._unique_with_inverse(values, uniques={{name}}Vector(), + ignore_na=False) return self._unique_no_inverse(values) def factorize(self, {{dtype}}_t[:] values): - return self._unique(values, uniques={{name}}Vector(), ignore_na=True, - return_inverse=True) + return self._unique_with_inverse(values, uniques={{name}}Vector(), + ignore_na=True) def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None): - _, labels = self._unique(values, uniques, ignore_na=True, - return_inverse=True, count_prior=count_prior, - na_sentinel=na_sentinel, na_value=na_value) + _, labels = self._unique_with_inverse(values, uniques, ignore_na=True, + count_prior=count_prior, + na_sentinel=na_sentinel, + na_value=na_value) return labels @cython.boundscheck(False) @@ -645,10 +642,10 @@ cdef class StringHashTable(HashTable): free(vecs) @cython.boundscheck(False) - def _unique(self, ndarray[object] values, ObjectVector uniques, - bint ignore_na=False, bint return_inverse=False, - 
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, - object na_value=None): + def _unique_with_inverse(self, ndarray[object] values, + ObjectVector uniques, bint ignore_na=False, + Py_ssize_t count_prior=0, + Py_ssize_t na_sentinel=-1, object na_value=None): cdef: Py_ssize_t i, idx, count = count_prior, n = len(values) int64_t[:] labels @@ -660,8 +657,7 @@ cdef class StringHashTable(HashTable): khiter_t k bint use_na_value - if return_inverse: - labels = np.zeros(n, dtype=np.int64) + labels = np.zeros(n, dtype=np.int64) uindexer = np.empty(n, dtype=np.int64) use_na_value = na_value is not None @@ -686,17 +682,16 @@ cdef class StringHashTable(HashTable): v = vecs[i] k = kh_get_str(self.table, v) - if return_inverse and k != self.table.n_buckets: + if k != self.table.n_buckets: # k falls into a previous bucket idx = self.table.vals[k] labels[i] = idx - elif k == self.table.n_buckets: + else: # k hasn't been seen yet k = kh_put_str(self.table, v, &ret) uindexer[count] = i - if return_inverse: - self.table.vals[k] = count - labels[i] = count + self.table.vals[k] = count + labels[i] = count count += 1 free(vecs) @@ -705,9 +700,7 @@ cdef class StringHashTable(HashTable): for i in range(count): uniques.append(values[uindexer[i]]) - if return_inverse: - return uniques.to_array(), np.asarray(labels) - return uniques.to_array() + return uniques.to_array(), np.asarray(labels) @cython.boundscheck(False) def _unique_no_inverse(self, ndarray[object] values): @@ -745,20 +738,21 @@ cdef class StringHashTable(HashTable): def unique(self, ndarray[object] values, bint return_inverse=False): if return_inverse: - return self._unique(values, uniques=ObjectVector(), ignore_na=False, - return_inverse=True) + return self._unique_with_inverse(values, uniques=ObjectVector(), + ignore_na=False) return self._unique_no_inverse(values) def factorize(self, ndarray[object] values): - return self._unique(values, uniques=ObjectVector(), ignore_na=True, - return_inverse=True) + return self._unique_with_inverse(values, uniques=ObjectVector(), + ignore_na=True) def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None): - _, labels = self._unique(values, uniques, ignore_na=True, - return_inverse=True, count_prior=count_prior, - na_sentinel=na_sentinel, na_value=na_value) + _, labels = self._unique_with_inverse(values, uniques, ignore_na=True, + count_prior=count_prior, + na_sentinel=na_sentinel, + na_value=na_value) return labels @@ -848,10 +842,10 @@ cdef class PyObjectHashTable(HashTable): return np.asarray(locs) @cython.boundscheck(False) - def _unique(self, ndarray[object] values, ObjectVector uniques, - bint ignore_na=False, bint return_inverse=False, - Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, - object na_value=None): + def _unique_with_inverse(self, ndarray[object] values, + ObjectVector uniques, bint ignore_na=False, + Py_ssize_t count_prior=0, + Py_ssize_t na_sentinel=-1, object na_value=None): cdef: Py_ssize_t i, idx, count = count_prior, n = len(values) int64_t[:] labels @@ -860,8 +854,7 @@ cdef class PyObjectHashTable(HashTable): khiter_t k bint use_na_value - if return_inverse: - labels = np.empty(n, dtype=np.int64) + labels = np.empty(n, dtype=np.int64) use_na_value = na_value is not None for i in range(n): @@ -874,22 +867,19 @@ cdef class PyObjectHashTable(HashTable): continue k = kh_get_pymap(self.table, val) - if return_inverse and k != self.table.n_buckets: + if k != self.table.n_buckets: # k falls into a previous 
bucket idx = self.table.vals[k] labels[i] = idx - elif k == self.table.n_buckets: + else: # k hasn't been seen yet k = kh_put_pymap(self.table, val, &ret) uniques.append(val) - if return_inverse: - self.table.vals[k] = count - labels[i] = count + self.table.vals[k] = count + labels[i] = count count += 1 - if return_inverse: - return uniques.to_array(), np.asarray(labels) - return uniques.to_array() + return uniques.to_array(), np.asarray(labels) def _unique_no_inverse(self, ndarray[object] values): # define separate functions without inverse for performance @@ -910,18 +900,18 @@ cdef class PyObjectHashTable(HashTable): def unique(self, ndarray[object] values, bint return_inverse=False): if return_inverse: - return self._unique(values, uniques=ObjectVector(), ignore_na=False, - return_inverse=True) + return self._unique_with_inverse(values, uniques=ObjectVector(), + ignore_na=False) return self._unique_no_inverse(values) def factorize(self, ndarray[object] values): - return self._unique(values, uniques=ObjectVector(), ignore_na=True, - return_inverse=True) + return self._unique_with_inverse(values, uniques=ObjectVector(), ignore_na=True) def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None): - _, labels = self._unique(values, uniques, ignore_na=True, - return_inverse=True, count_prior=count_prior, - na_sentinel=na_sentinel, na_value=na_value) + _, labels = self._unique_with_inverse(values, uniques, ignore_na=True, + count_prior=count_prior, + na_sentinel=na_sentinel, + na_value=na_value) return labels From 27ceb4d649e3dad3e87545bf141ea200e5650a6f Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Thu, 4 Oct 2018 23:38:18 +0200 Subject: [PATCH 08/26] Add cython.wraparound(False) --- pandas/_libs/hashtable_class_helper.pxi.in | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 6e9afb8faa42d..285bdaac2ce12 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -356,6 +356,7 @@ cdef class {{name}}HashTable(HashTable): return np.asarray(locs) @cython.boundscheck(False) + @cython.wraparound(False) def _unique_with_inverse(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, bint ignore_na=False, Py_ssize_t count_prior=0, @@ -415,6 +416,7 @@ cdef class {{name}}HashTable(HashTable): return uniques.to_array(), np.asarray(labels) @cython.boundscheck(False) + @cython.wraparound(False) def _unique_no_inverse(self, const {{dtype}}_t[:] values): # define separate functions without inverse for performance cdef: @@ -642,6 +644,7 @@ cdef class StringHashTable(HashTable): free(vecs) @cython.boundscheck(False) + @cython.wraparound(False) def _unique_with_inverse(self, ndarray[object] values, ObjectVector uniques, bint ignore_na=False, Py_ssize_t count_prior=0, @@ -703,6 +706,7 @@ cdef class StringHashTable(HashTable): return uniques.to_array(), np.asarray(labels) @cython.boundscheck(False) + @cython.wraparound(False) def _unique_no_inverse(self, ndarray[object] values): # define separate functions without inverse for performance cdef: @@ -842,6 +846,7 @@ cdef class PyObjectHashTable(HashTable): return np.asarray(locs) @cython.boundscheck(False) + @cython.wraparound(False) def _unique_with_inverse(self, ndarray[object] values, ObjectVector uniques, bint ignore_na=False, Py_ssize_t count_prior=0, @@ -881,6 +886,8 @@ cdef class PyObjectHashTable(HashTable): return 
uniques.to_array(), np.asarray(labels) + @cython.boundscheck(False) + @cython.wraparound(False) def _unique_no_inverse(self, ndarray[object] values): # define separate functions without inverse for performance cdef: From b1705a995b561b9f36f0edab10f3f1bb3984b606 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sat, 6 Oct 2018 17:34:32 +0200 Subject: [PATCH 09/26] Unmove unique-implementation (review jreback) --- pandas/_libs/hashtable_class_helper.pxi.in | 156 ++++++++++----------- 1 file changed, 78 insertions(+), 78 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 285bdaac2ce12..f0c675596688b 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -415,30 +415,6 @@ cdef class {{name}}HashTable(HashTable): return uniques.to_array(), np.asarray(labels) - @cython.boundscheck(False) - @cython.wraparound(False) - def _unique_no_inverse(self, const {{dtype}}_t[:] values): - # define separate functions without inverse for performance - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - {{dtype}}_t val - khiter_t k - {{name}}Vector uniques = {{name}}Vector() - {{name}}VectorData *ud - ud = uniques.data - with nogil: - for i in range(n): - val = values[i] - k = kh_get_{{dtype}}(self.table, val) - if k == self.table.n_buckets: - kh_put_{{dtype}}(self.table, val, &ret) - if needs_resize(ud): - with gil: - uniques.resize() - append_data_{{dtype}}(ud, val) - return uniques.to_array() - def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False): if return_inverse: return self._unique_with_inverse(values, uniques={{name}}Vector(), @@ -503,6 +479,30 @@ cdef class {{name}}HashTable(HashTable): return np.asarray(labels), arr_uniques + @cython.boundscheck(False) + @cython.wraparound(False) + def _unique_no_inverse(self, const {{dtype}}_t[:] values): + # define separate functions without inverse for performance + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + {{dtype}}_t val + khiter_t k + {{name}}Vector uniques = {{name}}Vector() + {{name}}VectorData *ud + ud = uniques.data + with nogil: + for i in range(n): + val = values[i] + k = kh_get_{{dtype}}(self.table, val) + if k == self.table.n_buckets: + kh_put_{{dtype}}(self.table, val, &ret) + if needs_resize(ud): + with gil: + uniques.resize() + append_data_{{dtype}}(ud, val) + return uniques.to_array() + {{endfor}} @@ -582,6 +582,41 @@ cdef class StringHashTable(HashTable): free(vecs) return labels + @cython.boundscheck(False) + @cython.wraparound(False) + def _unique_no_inverse(self, ndarray[object] values): + # define separate functions without inverse for performance + cdef: + Py_ssize_t i, count, n = len(values) + int64_t[:] uindexer + int ret = 0 + object val + ObjectVector uniques + khiter_t k + const char *v + const char **vecs + vecs = malloc(n * sizeof(char *)) + uindexer = np.empty(n, dtype=np.int64) + for i in range(n): + val = values[i] + v = util.get_c_string(val) + vecs[i] = v + count = 0 + with nogil: + for i in range(n): + v = vecs[i] + k = kh_get_str(self.table, v) + if k == self.table.n_buckets: + kh_put_str(self.table, v, &ret) + uindexer[count] = i + count += 1 + free(vecs) + # uniques + uniques = ObjectVector() + for i in range(count): + uniques.append(values[uindexer[i]]) + return uniques.to_array() + @cython.boundscheck(False) def lookup(self, ndarray[object] values): cdef: @@ -705,41 +740,6 @@ cdef class StringHashTable(HashTable): return uniques.to_array(), np.asarray(labels) - 
@cython.boundscheck(False) - @cython.wraparound(False) - def _unique_no_inverse(self, ndarray[object] values): - # define separate functions without inverse for performance - cdef: - Py_ssize_t i, count, n = len(values) - int64_t[:] uindexer - int ret = 0 - object val - ObjectVector uniques - khiter_t k - const char *v - const char **vecs - vecs = malloc(n * sizeof(char *)) - uindexer = np.empty(n, dtype=np.int64) - for i in range(n): - val = values[i] - v = util.get_c_string(val) - vecs[i] = v - count = 0 - with nogil: - for i in range(n): - v = vecs[i] - k = kh_get_str(self.table, v) - if k == self.table.n_buckets: - kh_put_str(self.table, v, &ret) - uindexer[count] = i - count += 1 - free(vecs) - # uniques - uniques = ObjectVector() - for i in range(count): - uniques.append(values[uindexer[i]]) - return uniques.to_array() - def unique(self, ndarray[object] values, bint return_inverse=False): if return_inverse: return self._unique_with_inverse(values, uniques=ObjectVector(), @@ -845,6 +845,25 @@ cdef class PyObjectHashTable(HashTable): return np.asarray(locs) + @cython.boundscheck(False) + @cython.wraparound(False) + def _unique_no_inverse(self, ndarray[object] values): + # define separate functions without inverse for performance + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + object val + khiter_t k + ObjectVector uniques = ObjectVector() + for i in range(n): + val = values[i] + hash(val) + k = kh_get_pymap(self.table, val) + if k == self.table.n_buckets: + kh_put_pymap(self.table, val, &ret) + uniques.append(val) + return uniques.to_array() + @cython.boundscheck(False) @cython.wraparound(False) def _unique_with_inverse(self, ndarray[object] values, @@ -886,25 +905,6 @@ cdef class PyObjectHashTable(HashTable): return uniques.to_array(), np.asarray(labels) - @cython.boundscheck(False) - @cython.wraparound(False) - def _unique_no_inverse(self, ndarray[object] values): - # define separate functions without inverse for performance - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - object val - khiter_t k - ObjectVector uniques = ObjectVector() - for i in range(n): - val = values[i] - hash(val) - k = kh_get_pymap(self.table, val) - if k == self.table.n_buckets: - kh_put_pymap(self.table, val, &ret) - uniques.append(val) - return uniques.to_array() - def unique(self, ndarray[object] values, bint return_inverse=False): if return_inverse: return self._unique_with_inverse(values, uniques=ObjectVector(), From a6ed5ddc65ab2ed49ee63dda2ccb40179e8166ab Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Sat, 6 Oct 2018 17:42:31 +0200 Subject: [PATCH 10/26] Undo line artefacts --- pandas/_libs/hashtable_class_helper.pxi.in | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index f0c675596688b..ec6b1c575191a 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -394,6 +394,7 @@ cdef class {{name}}HashTable(HashTable): continue k = kh_get_{{dtype}}(self.table, val) + if k != self.table.n_buckets: # k falls into a previous bucket idx = self.table.vals[k] @@ -401,6 +402,8 @@ cdef class {{name}}HashTable(HashTable): else: # k hasn't been seen yet k = kh_put_{{dtype}}(self.table, val, &ret) + self.table.vals[k] = count + if needs_resize(ud): with gil: if uniques.external_view_exists: @@ -409,7 +412,6 @@ cdef class {{name}}HashTable(HashTable): "Vector.resize() needed") uniques.resize() append_data_{{dtype}}(ud, val) - self.table.vals[k] = count labels[i] = count count += 1 @@ -490,7 +492,9 @@ cdef class {{name}}HashTable(HashTable): khiter_t k {{name}}Vector uniques = {{name}}Vector() {{name}}VectorData *ud + ud = uniques.data + with nogil: for i in range(n): val = values[i] @@ -595,12 +599,14 @@ cdef class StringHashTable(HashTable): khiter_t k const char *v const char **vecs + vecs = malloc(n * sizeof(char *)) uindexer = np.empty(n, dtype=np.int64) for i in range(n): val = values[i] v = util.get_c_string(val) vecs[i] = v + count = 0 with nogil: for i in range(n): @@ -611,6 +617,7 @@ cdef class StringHashTable(HashTable): uindexer[count] = i count += 1 free(vecs) + # uniques uniques = ObjectVector() for i in range(count): @@ -727,8 +734,8 @@ cdef class StringHashTable(HashTable): else: # k hasn't been seen yet k = kh_put_str(self.table, v, &ret) - uindexer[count] = i self.table.vals[k] = count + uindexer[count] = i labels[i] = count count += 1 @@ -855,6 +862,7 @@ cdef class PyObjectHashTable(HashTable): object val khiter_t k ObjectVector uniques = ObjectVector() + for i in range(n): val = values[i] hash(val) @@ -862,6 +870,7 @@ cdef class PyObjectHashTable(HashTable): if k == self.table.n_buckets: kh_put_pymap(self.table, val, &ret) uniques.append(val) + return uniques.to_array() @cython.boundscheck(False) @@ -894,13 +903,13 @@ cdef class PyObjectHashTable(HashTable): if k != self.table.n_buckets: # k falls into a previous bucket idx = self.table.vals[k] - labels[i] = idx + labels[i] = idx else: # k hasn't been seen yet k = kh_put_pymap(self.table, val, &ret) - uniques.append(val) self.table.vals[k] = count - labels[i] = count + uniques.append(val) + labels[i] = count count += 1 return uniques.to_array(), np.asarray(labels) From 19eaf32e03d58d92ab84bccef62639384dfb97ae Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Sun, 7 Oct 2018 23:06:46 +0200 Subject: [PATCH 11/26] Clean up test_algos.test_vector_resize --- pandas/tests/test_algos.py | 72 +++++++++++++++++++------------------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index b2ddbf715b480..8303cacba0960 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -15,7 +15,6 @@ from pandas import compat from pandas._libs import (groupby as libgroupby, algos as libalgos, hashtable as ht) -from pandas._libs.hashtable import unique_label_indices from pandas.compat import lrange, range import pandas.core.algorithms as algos import pandas.core.common as com @@ -1266,41 +1265,42 @@ def test_get_unique(self): exp = np.array([1, 2, 2**63], dtype=np.uint64) tm.assert_numpy_array_equal(s.unique(), exp) - def test_vector_resize(self, writable): + @pytest.mark.parametrize('nvals', [0, 10]) # resizing to 0 is special case + @pytest.mark.parametrize('htable, uniques, dtype, safely_resizes', [ + (ht.PyObjectHashTable, ht.ObjectVector, 'object', False), + (ht.StringHashTable, ht.ObjectVector, 'object', True), + (ht.Float64HashTable, ht.Float64Vector, 'float64', False), + (ht.Int64HashTable, ht.Int64Vector, 'int64', False), + (ht.UInt64HashTable, ht.UInt64Vector, 'uint64', False)]) + def test_vector_resize(self, writable, htable, uniques, dtype, + safely_resizes, nvals): # Test for memory errors after internal vector - # reallocations (pull request #7157) - - def _test_vector_resize(htable, uniques, dtype, nvals, safely_resizes): - vals = np.array(np.random.randn(1000), dtype=dtype) - # GH 21688 ensure we can deal with readonly memory views - vals.setflags(write=writable) - # get_labels may append to uniques - htable.get_labels(vals[:nvals], uniques, 0, -1) - # to_array() set an external_view_exists flag on uniques. - tmp = uniques.to_array() - oldshape = tmp.shape - # subsequent get_labels() calls can no longer append to it - # (for all but StringHashTables + ObjectVector) - if safely_resizes: + # reallocations (GH 7157) + vals = np.array(np.random.randn(1000), dtype=dtype) + + # GH 21688 ensure we can deal with readonly memory views + vals.setflags(write=writable) + + # initialise instances + htable = htable() + uniques = uniques() + + # get_labels may append to uniques + htable.get_labels(vals[:nvals], uniques, 0, -1) + # to_array() sets an external_view_exists flag on uniques. 
+ tmp = uniques.to_array() + oldshape = tmp.shape + + # subsequent get_labels() calls can no longer append to it + # (except for StringHashTables + ObjectVector) + if safely_resizes: + htable.get_labels(vals, uniques, 0, -1) + else: + with tm.assert_raises_regex(ValueError, 'external reference.*'): htable.get_labels(vals, uniques, 0, -1) - else: - with pytest.raises(ValueError) as excinfo: - htable.get_labels(vals, uniques, 0, -1) - assert str(excinfo.value).startswith('external reference') - uniques.to_array() # should not raise here - assert tmp.shape == oldshape - - test_cases = [ - (ht.PyObjectHashTable, ht.ObjectVector, 'object', False), - (ht.StringHashTable, ht.ObjectVector, 'object', True), - (ht.Float64HashTable, ht.Float64Vector, 'float64', False), - (ht.Int64HashTable, ht.Int64Vector, 'int64', False), - (ht.UInt64HashTable, ht.UInt64Vector, 'uint64', False)] - - for (tbl, vect, dtype, safely_resizes) in test_cases: - # resizing to empty is a special case - _test_vector_resize(tbl(), vect(), dtype, 0, safely_resizes) - _test_vector_resize(tbl(), vect(), dtype, 10, safely_resizes) + + uniques.to_array() # should not raise here + assert tmp.shape == oldshape def test_quantile(): @@ -1315,14 +1315,14 @@ def test_unique_label_indices(): a = np.random.randint(1, 1 << 10, 1 << 15).astype('i8') - left = unique_label_indices(a) + left = ht.unique_label_indices(a) right = np.unique(a, return_index=True)[1] tm.assert_numpy_array_equal(left, right, check_dtype=False) a[np.random.choice(len(a), 10)] = -1 - left = unique_label_indices(a) + left = ht.unique_label_indices(a) right = np.unique(a, return_index=True)[1][1:] tm.assert_numpy_array_equal(left, right, check_dtype=False) From ce7626f1b0f29771e7f287ae4188d05dd3e94d25 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Mon, 8 Oct 2018 00:00:17 +0200 Subject: [PATCH 12/26] Add test for hashtable.unique (esp. 
for return_inverse=True) --- pandas/tests/test_algos.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 8303cacba0960..6ebc275e0c9d0 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1302,6 +1302,38 @@ def test_vector_resize(self, writable, htable, uniques, dtype, uniques.to_array() # should not raise here assert tmp.shape == oldshape + @pytest.mark.parametrize('htable, tm_dtype', [ + (ht.PyObjectHashTable, 'String'), + (ht.StringHashTable, 'String'), + (ht.Float64HashTable, 'Float'), + (ht.Int64HashTable, 'Int'), + (ht.UInt64HashTable, 'UInt')]) + def test_hashtable_unique(self, htable, tm_dtype): + # output of maker has guaranteed unique elements + maker = getattr(tm, 'make' + tm_dtype + 'Index') + s = Series(maker(1000)) + if htable == ht.Float64HashTable: + # add NaN for float column + s.loc[500] = np.nan + elif htable == ht.PyObjectHashTable: + # use different NaN types for object column + s.loc[500:502] = [np.nan, None, pd.NaT] + + # create duplicated selection + s_duplicated = s.sample(frac=3, replace=True) + + # drop_duplicates has own cython code (khash) and is tested separately + # keeps first occurrence like ht.unique + expected_unique = s_duplicated.drop_duplicates(keep='first').values + result_unique = htable().unique(s_duplicated.values) + tm.assert_numpy_array_equal(result_unique, expected_unique) + + result_unique, result_inverse = htable().unique(s_duplicated.values, + return_inverse=True) + tm.assert_numpy_array_equal(result_unique, expected_unique) + reconstr = result_unique[result_inverse] + tm.assert_numpy_array_equal(reconstr, s_duplicated.values) + def test_quantile(): s = Series(np.random.randn(100)) From 7b9014fdf1ac227fb0efb6a82e406064526a046e Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Mon, 8 Oct 2018 00:43:59 +0200 Subject: [PATCH 13/26] Review (jreback) --- pandas/_libs/hashtable_class_helper.pxi.in | 176 +++++++++++++++------ pandas/tests/test_algos.py | 6 - 2 files changed, 129 insertions(+), 53 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index ec6b1c575191a..17af5b6fb2d90 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -358,9 +358,32 @@ cdef class {{name}}HashTable(HashTable): @cython.boundscheck(False) @cython.wraparound(False) def _unique_with_inverse(self, const {{dtype}}_t[:] values, - {{name}}Vector uniques, bint ignore_na=False, - Py_ssize_t count_prior=0, + {{name}}Vector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None): + """ + Calculate unique values without sorting; ignores all NA-values + + Parameters + ---------- + values : ndarray[{{dtype}}] + Array of values of which unique will be calculated + uniques : {{name}}Vector + Vector into which uniques will be written + count_prior : Py_ssize_t, default 0 + Number of existing entries in uniques + na_sentinel : Py_ssize_t, default -1 + Sentinel value used for all NA-values in inverse + na_value : object, default None + Value to identify as missing. If na_value is None, then + any value satisfying val != val is considered missing.
+ + Returns + ------- + uniques : ndarray[{{dtype}}] + Unique values of input, not sorted + labels : ndarray[int64] + The labels from values to uniques + """ cdef: Py_ssize_t i, idx, count = count_prior, n = len(values) int64_t[:] labels @@ -388,8 +411,7 @@ cdef class {{name}}HashTable(HashTable): for i in range(n): val = values[i] - if ignore_na and (val != val - or (use_na_value and val == na_value2)): + if val != val or (use_na_value and val == na_value2): labels[i] = na_sentinel continue @@ -417,20 +439,13 @@ cdef class {{name}}HashTable(HashTable): return uniques.to_array(), np.asarray(labels) - def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False): - if return_inverse: - return self._unique_with_inverse(values, uniques={{name}}Vector(), - ignore_na=False) - return self._unique_no_inverse(values) - def factorize(self, {{dtype}}_t[:] values): - return self._unique_with_inverse(values, uniques={{name}}Vector(), - ignore_na=True) + return self._unique_with_inverse(values, uniques={{name}}Vector()) def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None): - _, labels = self._unique_with_inverse(values, uniques, ignore_na=True, + _, labels = self._unique_with_inverse(values, uniques, count_prior=count_prior, na_sentinel=na_sentinel, na_value=na_value) @@ -483,8 +498,20 @@ cdef class {{name}}HashTable(HashTable): @cython.boundscheck(False) @cython.wraparound(False) - def _unique_no_inverse(self, const {{dtype}}_t[:] values): - # define separate functions without inverse for performance + def unique(self, const {{dtype}}_t[:] values): + """ + Calculate unique values without sorting + + Parameters + ---------- + values : ndarray[object] + Array of values of which unique will be calculated + + Returns + ------- + uniques : ndarray[object] + Unique values of input, not sorted + """ cdef: Py_ssize_t i, n = len(values) int ret = 0 @@ -588,8 +615,20 @@ cdef class StringHashTable(HashTable): @cython.boundscheck(False) @cython.wraparound(False) - def _unique_no_inverse(self, ndarray[object] values): - # define separate functions without inverse for performance + def unique(self, ndarray[object] values): + """ + Calculate unique values without sorting + + Parameters + ---------- + values : ndarray[object] + Array of values of which unique will be calculated + + Returns + ------- + uniques : ndarray[object] + Unique values of input, not sorted + """ cdef: Py_ssize_t i, count, n = len(values) int64_t[:] uindexer @@ -688,9 +727,31 @@ cdef class StringHashTable(HashTable): @cython.boundscheck(False) @cython.wraparound(False) def _unique_with_inverse(self, ndarray[object] values, - ObjectVector uniques, bint ignore_na=False, - Py_ssize_t count_prior=0, + ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None): + """ + Calculate unique values without sorting; ignores all NA-values + + Parameters + ---------- + values : ndarray[object] + Array of values of which unique will be calculated + uniques : ObjectVector + Vector into which uniques will be written + count_prior : Py_ssize_t, default 0 + Number of existing entries in uniques + na_sentinel : Py_ssize_t, default -1 + Sentinel value used for all NA-values in inverse + na_value : object, default None + Value to identify as missing + + Returns + ------- + uniques : ndarray[object] + Unique values of input, not sorted + labels : ndarray[int64] + The labels from values to uniques + """ cdef: Py_ssize_t i, 
idx, count = count_prior, n = len(values) int64_t[:] labels @@ -706,14 +767,13 @@ cdef class StringHashTable(HashTable): uindexer = np.empty(n, dtype=np.int64) use_na_value = na_value is not None - # assign pointers and pre-filter out missing (if ignore_na) + # assign pointers and pre-filter out missing vecs = malloc(n * sizeof(char *)) for i in range(n): val = values[i] - if not ignore_na or ((PyUnicode_Check(val) or PyString_Check(val)) - and not (use_na_value and val == na_value)): - # if ignore_na is False, we also stringify NaN/None/etc. + if (PyUnicode_Check(val) or PyString_Check(val)) + and not (use_na_value and val == na_value)): v = util.get_c_string(val) vecs[i] = v else: @@ -722,7 +782,7 @@ cdef class StringHashTable(HashTable): # compute with nogil: for i in range(n): - if ignore_na and labels[i] == na_sentinel: + if labels[i] == na_sentinel: continue v = vecs[i] @@ -747,20 +807,13 @@ cdef class StringHashTable(HashTable): return uniques.to_array(), np.asarray(labels) - def unique(self, ndarray[object] values, bint return_inverse=False): - if return_inverse: - return self._unique_with_inverse(values, uniques=ObjectVector(), - ignore_na=False) - return self._unique_no_inverse(values) - def factorize(self, ndarray[object] values): - return self._unique_with_inverse(values, uniques=ObjectVector(), - ignore_na=True) + def factorize(self, ndarray[object] values): + return self._unique_with_inverse(values, uniques=ObjectVector()) def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None): - _, labels = self._unique_with_inverse(values, uniques, ignore_na=True, + _, labels = self._unique_with_inverse(values, uniques, count_prior=count_prior, na_sentinel=na_sentinel, na_value=na_value) @@ -854,8 +907,20 @@ cdef class PyObjectHashTable(HashTable): @cython.boundscheck(False) @cython.wraparound(False) - def _unique_no_inverse(self, ndarray[object] values): - # define separate functions without inverse for performance + def unique(self, ndarray[object] values): + """ + Calculate unique values without sorting + + Parameters + ---------- + values : ndarray[object] + Array of values of which unique will be calculated + + Returns + ------- + uniques : ndarray[object] + Unique values of input, not sorted + """ cdef: Py_ssize_t i, n = len(values) int ret = 0 @@ -876,9 +941,32 @@ cdef class PyObjectHashTable(HashTable): @cython.boundscheck(False) @cython.wraparound(False) def _unique_with_inverse(self, ndarray[object] values, - ObjectVector uniques, bint ignore_na=False, - Py_ssize_t count_prior=0, + ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None): + """ + Calculate unique values without sorting; ignores all NA-values + + Parameters + ---------- + values : ndarray[object] + Array of values of which unique will be calculated + uniques : ObjectVector + Vector into which uniques will be written + count_prior : Py_ssize_t, default 0 + Number of existing entries in uniques + na_sentinel : Py_ssize_t, default -1 + Sentinel value used for all NA-values in inverse + na_value : object, default None + Value to identify as missing. If na_value is None, then None _plus_ + any value satisfying val != val is considered missing.
+ + Returns + ------- + uniques : ndarray[object] + Unique values of input, not sorted + labels : ndarray[int64] + The labels from values to uniques + """ cdef: Py_ssize_t i, idx, count = count_prior, n = len(values) int64_t[:] labels @@ -894,8 +982,8 @@ cdef class PyObjectHashTable(HashTable): val = values[i] hash(val) - if ignore_na and ((val != val or val is None) - or (use_na_value and val == na_value)): + if ((val != val or val is None) + or (use_na_value and val == na_value)): labels[i] = na_sentinel continue @@ -914,19 +1002,13 @@ cdef class PyObjectHashTable(HashTable): return uniques.to_array(), np.asarray(labels) - def unique(self, ndarray[object] values, bint return_inverse=False): - if return_inverse: - return self._unique_with_inverse(values, uniques=ObjectVector(), - ignore_na=False) - return self._unique_no_inverse(values) - def factorize(self, ndarray[object] values): - return self._unique_with_inverse(values, uniques=ObjectVector(), ignore_na=True) + return self._unique_with_inverse(values, uniques=ObjectVector()) def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None): - _, labels = self._unique_with_inverse(values, uniques, ignore_na=True, + _, labels = self._unique_with_inverse(values, uniques, count_prior=count_prior, na_sentinel=na_sentinel, na_value=na_value) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 6ebc275e0c9d0..afed3aece807d 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1328,12 +1328,6 @@ def test_hashtable_unique(self, htable, tm_dtype): result_unique = htable().unique(s_duplicated.values) tm.assert_numpy_array_equal(result_unique, expected_unique) - result_unique, result_inverse = htable().unique(s_duplicated.values, - return_inverse=True) - tm.assert_numpy_array_equal(result_unique, expected_unique) - reconstr = result_unique[result_inverse] - tm.assert_numpy_array_equal(reconstr, s_duplicated.values) - def test_quantile(): s = Series(np.random.randn(100)) From 471c4da6479a6b14fcddc8f1220743414b364c5a Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Mon, 8 Oct 2018 07:43:04 +0200 Subject: [PATCH 14/26] Fix typo --- pandas/_libs/hashtable_class_helper.pxi.in | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 17af5b6fb2d90..397c8b9a2219d 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -772,8 +772,8 @@ cdef class StringHashTable(HashTable): for i in range(n): val = values[i] - if (PyUnicode_Check(val) or PyString_Check(val)) - and not (use_na_value and val == na_value)): + if ((PyUnicode_Check(val) or PyString_Check(val)) + and not (use_na_value and val == na_value)): v = util.get_c_string(val) vecs[i] = v else: From 9d453786f976969b1d1c5fef07db00004ecef7b5 Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Mon, 8 Oct 2018 18:45:02 +0200 Subject: [PATCH 15/26] Small fixes --- pandas/_libs/hashtable_class_helper.pxi.in | 8 ++++---- pandas/tests/test_algos.py | 10 ++++++---- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 397c8b9a2219d..1cd0e0d1e8982 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -361,7 +361,7 @@ cdef class {{name}}HashTable(HashTable): {{name}}Vector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None): """ - Calculate unique values without sorting; ignores all NA-values + Calculate unique values and labels (no sorting); ignores all NA-values Parameters ---------- @@ -730,7 +730,7 @@ cdef class StringHashTable(HashTable): ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None): """ - Calculate unique values without sorting; ignores all NA-values + Calculate unique values and labels (no sorting); ignores all NA-values Parameters ---------- @@ -944,7 +944,7 @@ cdef class PyObjectHashTable(HashTable): ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None): """ - Calculate unique values without sorting; ignores all NA-values + Calculate unique values and labels (no sorting); ignores all NA-values Parameters ---------- @@ -983,7 +983,7 @@ cdef class PyObjectHashTable(HashTable): hash(val) if ((val != val or val is None) - or (use_na_value and val == na_value)): + or (use_na_value and val == na_value)): labels[i] = na_sentinel continue diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index afed3aece807d..9aa77665995de 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1278,10 +1278,12 @@ def test_vector_resize(self, writable, htable, uniques, dtype, # reallocations (GH 7157) vals = np.array(np.random.randn(1000), dtype=dtype) - # GH 21688 ensure we can deal with readonly memory views + # GH 21688 ensures we can deal with read-only memory views vals.setflags(write=writable) - # initialise instances + # initialise instances; cannot initialise in parametrization, + # as otherwise external views would be held on the array (which is + # one of the things this test is checking) htable = htable() uniques = uniques() @@ -1322,8 +1324,8 @@ def test_hashtable_unique(self, htable, tm_dtype): # create duplicated selection s_duplicated = s.sample(frac=3, replace=True) - # drop_duplicates has own cython code (khash) and is tested separately - # keeps first occurrence like ht.unique + # drop_duplicates has own cython code (hash_table_func_helper.pxi) + # and is tested separately; keeps first occurrence like ht.unique() expected_unique = s_duplicated.drop_duplicates(keep='first').values result_unique = htable().unique(s_duplicated.values) tm.assert_numpy_array_equal(result_unique, expected_unique) From 00b2ccb031099ea1f33fe380dcbdb93402e93779 Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Mon, 8 Oct 2018 18:50:50 +0200 Subject: [PATCH 16/26] Review (jorisvandenbossche) --- pandas/_libs/hashtable_class_helper.pxi.in | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 1cd0e0d1e8982..620e564df922b 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -440,7 +440,8 @@ cdef class {{name}}HashTable(HashTable): return uniques.to_array(), np.asarray(labels) def factorize(self, {{dtype}}_t[:] values): - return self._unique_with_inverse(values, uniques={{name}}Vector()) + uniques = {{name}}Vector() + return self._unique_with_inverse(values, uniques=uniques) def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, @@ -808,7 +809,8 @@ cdef class StringHashTable(HashTable): return uniques.to_array(), np.asarray(labels) def factorize(self, ndarray[object] values): - return self._unique_with_inverse(values, uniques=ObjectVector()) + uniques = ObjectVector() + return self._unique_with_inverse(values, uniques=uniques) def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, @@ -1003,7 +1005,8 @@ cdef class PyObjectHashTable(HashTable): return uniques.to_array(), np.asarray(labels) def factorize(self, ndarray[object] values): - return self._unique_with_inverse(values, uniques=ObjectVector()) + uniques = ObjectVector() + return self._unique_with_inverse(values, uniques=uniques) def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, From a267d4a2e872a48986b35c771814fdf8617b0792 Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Thu, 11 Oct 2018 21:58:33 +0200 Subject: [PATCH 17/26] Review (jorisvandenbossche) --- pandas/_libs/hashtable_class_helper.pxi.in | 66 +++++++++++----------- pandas/core/algorithms.py | 8 +-- pandas/tests/test_algos.py | 54 +++++++++++++++++- 3 files changed, 88 insertions(+), 40 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 627285771c76e..dcac9962f05c6 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -357,9 +357,9 @@ cdef class {{name}}HashTable(HashTable): @cython.boundscheck(False) @cython.wraparound(False) - def _unique_with_inverse(self, const {{dtype}}_t[:] values, - {{name}}Vector uniques, Py_ssize_t count_prior=0, - Py_ssize_t na_sentinel=-1, object na_value=None): + def _factorize(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, + Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, + object na_value=None): """ Calculate unique values and labels (no sorting); ignores all NA-values @@ -437,20 +437,20 @@ cdef class {{name}}HashTable(HashTable): labels[i] = count count += 1 - return uniques.to_array(), np.asarray(labels) + return np.asarray(labels) - def factorize(self, {{dtype}}_t[:] values): + def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1, + object na_value=None): uniques = {{name}}Vector() - return self._unique_with_inverse(values, uniques=uniques) + labels = self._factorize(values, uniques=uniques, + na_sentinel=na_sentinel, na_value=na_value) + return labels, uniques.to_array() def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None): - _, labels = self._unique_with_inverse(values, uniques, - count_prior=count_prior, - na_sentinel=na_sentinel, - na_value=na_value) - return labels + return self._factorize(values, uniques, count_prior=count_prior, + na_sentinel=na_sentinel, na_value=na_value) @cython.boundscheck(False) def get_labels_groupby(self, const {{dtype}}_t[:] values): @@ -727,9 +727,9 @@ cdef class StringHashTable(HashTable): @cython.boundscheck(False) @cython.wraparound(False) - def _unique_with_inverse(self, ndarray[object] values, - ObjectVector uniques, Py_ssize_t count_prior=0, - Py_ssize_t na_sentinel=-1, object na_value=None): + def _factorize(self, ndarray[object] values, ObjectVector uniques, + Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, + object na_value=None): """ Calculate unique values and labels (no sorting); ignores all NA-values @@ -806,20 +806,20 @@ cdef class StringHashTable(HashTable): for i in range(count): uniques.append(values[uindexer[i]]) - return uniques.to_array(), np.asarray(labels) + return np.asarray(labels) - def factorize(self, ndarray[object] values): + def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1, + object na_value=None): uniques = ObjectVector() - return self._unique_with_inverse(values, uniques=uniques) + labels = self._factorize(values, uniques=uniques, + na_sentinel=na_sentinel, na_value=na_value) + return labels, uniques.to_array() def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None): - _, labels = self._unique_with_inverse(values, uniques, - count_prior=count_prior, - na_sentinel=na_sentinel, - na_value=na_value) - return labels + return self._factorize(values, uniques, count_prior=count_prior, + na_sentinel=na_sentinel, 
na_value=na_value) cdef class PyObjectHashTable(HashTable): @@ -942,9 +942,9 @@ cdef class PyObjectHashTable(HashTable): @cython.boundscheck(False) @cython.wraparound(False) - def _unique_with_inverse(self, ndarray[object] values, - ObjectVector uniques, Py_ssize_t count_prior=0, - Py_ssize_t na_sentinel=-1, object na_value=None): + def _factorize(self, ndarray[object] values, ObjectVector uniques, + Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, + object na_value=None): """ Calculate unique values and labels (no sorting); ignores all NA-values @@ -1002,17 +1002,17 @@ cdef class PyObjectHashTable(HashTable): labels[i] = count count += 1 - return uniques.to_array(), np.asarray(labels) + return np.asarray(labels) - def factorize(self, ndarray[object] values): + def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1, + object na_value=None): uniques = ObjectVector() - return self._unique_with_inverse(values, uniques=uniques) + labels = self._factorize(values, uniques=uniques, + na_sentinel=na_sentinel, na_value=na_value) + return labels, uniques.to_array() def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None): - _, labels = self._unique_with_inverse(values, uniques, - count_prior=count_prior, - na_sentinel=na_sentinel, - na_value=na_value) - return labels + return self._factorize(values, uniques, count_prior=count_prior, + na_sentinel=na_sentinel, na_value=na_value) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index e91cc8ec1e996..073ca2bf248c9 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -468,15 +468,13 @@ def _factorize_array(values, na_sentinel=-1, size_hint=None, ------- labels, uniques : ndarray """ - (hash_klass, vec_klass), values = _get_data_algo(values, _hashtables) + (hash_klass, _), values = _get_data_algo(values, _hashtables) table = hash_klass(size_hint or len(values)) - uniques = vec_klass() - labels = table.get_labels(values, uniques, 0, na_sentinel, - na_value=na_value) + labels, uniques = table.factorize(values, na_sentinel=na_sentinel, + na_value=na_value) labels = ensure_platform_int(labels) - uniques = uniques.to_array() return labels, uniques diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 9aa77665995de..ac49515406aa2 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1322,14 +1322,64 @@ def test_hashtable_unique(self, htable, tm_dtype): s.loc[500:502] = [np.nan, None, pd.NaT] # create duplicated selection - s_duplicated = s.sample(frac=3, replace=True) + s_duplicated = s.sample(frac=3, replace=True).reset_index(drop=True) # drop_duplicates has own cython code (hash_table_func_helper.pxi) - # and is tested separately; keeps first occurrence like ht.unique() + # and is tested separately; keeps first occurrence like ht.unique() expected_unique = s_duplicated.drop_duplicates(keep='first').values result_unique = htable().unique(s_duplicated.values) tm.assert_numpy_array_equal(result_unique, expected_unique) + @pytest.mark.parametrize('na_sentinel', [-1]) + @pytest.mark.parametrize('htable, tm_dtype', [ + (ht.PyObjectHashTable, 'String'), + (ht.StringHashTable, 'String'), + (ht.Float64HashTable, 'Float'), + (ht.Int64HashTable, 'Int'), + (ht.UInt64HashTable, 'UInt')]) + def test_hashtable_factorize(self, htable, tm_dtype, na_sentinel): + # output of maker has guaranteed unique elements + maker = getattr(tm, 'make' + tm_dtype + 'Index') + s = Series(maker(1000)) + if 
htable == ht.Float64HashTable: + # add NaN for float column + s.loc[500] = np.nan + elif htable == ht.PyObjectHashTable: + # use different NaN types for object column + s.loc[500:502] = [np.nan, None, pd.NaT] + + # create duplicated selection + idx_duplicated = pd.Series(s.index).sample(frac=3, replace=True) + s_duplicated = s[idx_duplicated.values].reset_index(drop=True) + na_mask = s_duplicated.isna().values + + result_inverse, result_unique = htable().factorize(s_duplicated.values) + + # drop_duplicates has own cython code (hash_table_func_helper.pxi) + # and is tested separately; keeps first occurrence like ht.unique() + expected_unique = s_duplicated.dropna().drop_duplicates(keep='first') + expected_unique = expected_unique.values + tm.assert_numpy_array_equal(result_unique, expected_unique) + + # ignore NaNs for calculating inverse + _, values2unique, unique2values = np.unique(idx_duplicated[~na_mask], + return_inverse=True, + return_index=True) + expected_inverse = np.ones(s_duplicated.shape, + dtype=np.intp) * na_sentinel + + # np.unique yields a __SORTED__ list of uniques, and values2unique + # resp. unique2values are relative to this order. To restore the + # original order, we argsort values2unique, because values2unique would + # be ordered if np.unique had not sorted implicitly. The first argsort + # gives the permutation from values2unique to its sorted form, but we + # need the inverse permutation (the map from the unsorted uniques to + # values2unique, from which we can continue with unique2values). + # This inversion (as a permutation) is achieved by the second argsort. + inverse_no_na = np.argsort(np.argsort(values2unique))[unique2values] + expected_inverse[~na_mask] = inverse_no_na + tm.assert_numpy_array_equal(result_inverse, expected_inverse) + def test_quantile(): s = Series(np.random.randn(100)) From 7f1bb4020322fc78dd6fe43eba169695395a3780 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Fri, 12 Oct 2018 00:04:34 +0200 Subject: [PATCH 18/26] Improve comment --- pandas/tests/test_algos.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index ac49515406aa2..61e5f60142a5d 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1356,7 +1356,8 @@ def test_hashtable_factorize(self, htable, tm_dtype, na_sentinel): result_inverse, result_unique = htable().factorize(s_duplicated.values) # drop_duplicates has own cython code (hash_table_func_helper.pxi) - # and is tested separately; keeps first occurrence like ht.unique() + # and is tested separately; keeps first occurrence like ht.factorize() + # since factorize removes all NaNs, we do the same here expected_unique = s_duplicated.dropna().drop_duplicates(keep='first') expected_unique = expected_unique.values tm.assert_numpy_array_equal(result_unique, expected_unique)
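The double argsort in the comment block above is the standard numpy idiom for inverting a permutation; a self-contained sketch with small concrete values (illustrative only, mirroring the test's expected-inverse computation in spirit):

    import numpy as np

    values = np.array([30, 10, 30, 20])
    _, values2unique, unique2values = np.unique(values, return_index=True,
                                                return_inverse=True)
    # np.unique sorts its uniques; argsort(values2unique) maps positions in
    # first-occurrence order to sorted positions, and the second argsort
    # inverts that permutation, mapping sorted positions back to
    # first-occurrence labels.
    relabel = np.argsort(np.argsort(values2unique))
    labels = relabel[unique2values]
    uniques = values[np.sort(values2unique)]
    assert list(uniques) == [30, 10, 20]  # first-occurrence order
    assert list(labels) == [0, 1, 0, 2]
    assert (uniques[labels] == values).all()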
Vetinari" Date: Fri, 12 Oct 2018 08:25:31 +0200 Subject: [PATCH 19/26] Test for writable; expand comments --- pandas/tests/test_algos.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 61e5f60142a5d..fdeb8f67c2ff0 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1310,7 +1310,7 @@ def test_vector_resize(self, writable, htable, uniques, dtype, (ht.Float64HashTable, 'Float'), (ht.Int64HashTable, 'Int'), (ht.UInt64HashTable, 'UInt')]) - def test_hashtable_unique(self, htable, tm_dtype): + def test_hashtable_unique(self, htable, tm_dtype, writable): # output of maker has guaranteed unique elements maker = getattr(tm, 'make' + tm_dtype + 'Index') s = Series(maker(1000)) @@ -1323,6 +1323,7 @@ def test_hashtable_unique(self, htable, tm_dtype): # create duplicated selection s_duplicated = s.sample(frac=3, replace=True).reset_index(drop=True) + s_duplicated.values.setflags(write=writable) # drop_duplicates has own cython code (hash_table_func_helper.pxi) # and is tested separately; keeps first occurrence like ht.unique() @@ -1330,14 +1331,15 @@ def test_hashtable_unique(self, htable, tm_dtype): result_unique = htable().unique(s_duplicated.values) tm.assert_numpy_array_equal(result_unique, expected_unique) - @pytest.mark.parametrize('na_sentinel', [-1]) + @pytest.mark.parametrize('na_sentinel', [-1, 1001]) @pytest.mark.parametrize('htable, tm_dtype', [ (ht.PyObjectHashTable, 'String'), (ht.StringHashTable, 'String'), (ht.Float64HashTable, 'Float'), (ht.Int64HashTable, 'Int'), (ht.UInt64HashTable, 'UInt')]) - def test_hashtable_factorize(self, htable, tm_dtype, na_sentinel): + def test_hashtable_factorize(self, htable, tm_dtype, + na_sentinel, writable): # output of maker has guaranteed unique elements maker = getattr(tm, 'make' + tm_dtype + 'Index') s = Series(maker(1000)) @@ -1348,12 +1350,15 @@ def test_hashtable_factorize(self, htable, tm_dtype, na_sentinel): # use different NaN types for object column s.loc[500:502] = [np.nan, None, pd.NaT] - # create duplicated selection + # create duplicated selection (with known indices per duplicate!) idx_duplicated = pd.Series(s.index).sample(frac=3, replace=True) s_duplicated = s[idx_duplicated.values].reset_index(drop=True) + s_duplicated.values.setflags(write=writable) na_mask = s_duplicated.isna().values - result_inverse, result_unique = htable().factorize(s_duplicated.values) + result_tuple = htable().factorize(s_duplicated.values, + na_sentinel=na_sentinel) + result_inverse, result_unique = result_tuple # drop_duplicates has own cython code (hash_table_func_helper.pxi) # and is tested separately; keeps first occurrence like ht.factorize() @@ -1362,7 +1367,11 @@ def test_hashtable_factorize(self, htable, tm_dtype, na_sentinel): expected_unique = expected_unique.values tm.assert_numpy_array_equal(result_unique, expected_unique) - # ignore NaNs for calculating inverse + # ignore NaNs for calculating inverse because factorize drops all NaNs! + # values2unique: mapping indices of original to indices of uniques + # unique2values: reduplication from array of uniques to original array + # this fits together in the way that values[values2unique] are the + # uniques (from np.unique!) and uniques[unique2values] == original _, values2unique, unique2values = np.unique(idx_duplicated[~na_mask], return_inverse=True, return_index=True) From 08d7f507081e8e55b39b23e810c9d9b4c7f0ac98 Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Fri, 12 Oct 2018 08:33:50 +0200 Subject: [PATCH 20/26] Simplify factorize test --- pandas/tests/test_algos.py | 35 +++++++---------------------------- 1 file changed, 7 insertions(+), 28 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index fdeb8f67c2ff0..6477e9fd79340 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1331,15 +1331,13 @@ def test_hashtable_unique(self, htable, tm_dtype, writable): result_unique = htable().unique(s_duplicated.values) tm.assert_numpy_array_equal(result_unique, expected_unique) - @pytest.mark.parametrize('na_sentinel', [-1, 1001]) @pytest.mark.parametrize('htable, tm_dtype', [ (ht.PyObjectHashTable, 'String'), (ht.StringHashTable, 'String'), (ht.Float64HashTable, 'Float'), (ht.Int64HashTable, 'Int'), (ht.UInt64HashTable, 'UInt')]) - def test_hashtable_factorize(self, htable, tm_dtype, - na_sentinel, writable): + def test_hashtable_factorize(self, htable, tm_dtype, writable): # output of maker has guaranteed unique elements maker = getattr(tm, 'make' + tm_dtype + 'Index') s = Series(maker(1000)) @@ -1356,9 +1354,7 @@ def test_hashtable_factorize(self, htable, tm_dtype, s_duplicated.values.setflags(write=writable) na_mask = s_duplicated.isna().values - result_tuple = htable().factorize(s_duplicated.values, - na_sentinel=na_sentinel) - result_inverse, result_unique = result_tuple + result_inverse, result_unique = htable().factorize(s_duplicated.values) # drop_duplicates has own cython code (hash_table_func_helper.pxi) # and is tested separately; keeps first occurrence like ht.factorize() @@ -1367,28 +1363,11 @@ def test_hashtable_factorize(self, htable, tm_dtype, expected_unique = expected_unique.values tm.assert_numpy_array_equal(result_unique, expected_unique) - # ignore NaNs for calculating inverse because factorize drops all NaNs! - # values2unique: mapping indices of original to indices of uniques - # unique2values: reduplication from array of uniques to original array - # this fits together in the way that values[values2unique] are the - # uniques (from np.unique!) and uniques[unique2values] == original - _, values2unique, unique2values = np.unique(idx_duplicated[~na_mask], - return_inverse=True, - return_index=True) - expected_inverse = np.ones(s_duplicated.shape, - dtype=np.intp) * na_sentinel - - # np.unique yields a __SORTED__ list of uniques, and values2unique - # resp. unique2values are relative to this order. To restore the - # original order, we argsort values2unique, because values2unique would - # be ordered if np.unique had not sorted implicitly. The first argsort - # gives the permutation from values2unique to its sorted form, but we - # need the inverse permutation (the map from the unsorted uniques to - # values2unique, from which we can continue with unique2values). - # This inversion (as a permutation) is achieved by the second argsort. - inverse_no_na = np.argsort(np.argsort(values2unique))[unique2values] - expected_inverse[~na_mask] = inverse_no_na - tm.assert_numpy_array_equal(result_inverse, expected_inverse) + # reconstruction can only succeed if the inverse is correct. + # Since factorize removes the NaNs, those have to be excluded + result_reconstruct = result_unique[result_inverse[~na_mask]] + expected_reconstruct = s_duplicated.dropna().values + tm.assert_numpy_array_equal(result_reconstruct, expected_reconstruct) def test_quantile(): From d91be98ec686ad5ee7af1790011638586f0d2914 Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Fri, 12 Oct 2018 08:40:19 +0200 Subject: [PATCH 21/26] Add simple test --- pandas/tests/test_algos.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 6477e9fd79340..c776cd844e6ff 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -249,6 +249,17 @@ def test_uint64_factorize(self, writable): tm.assert_numpy_array_equal(labels, exp_labels) tm.assert_numpy_array_equal(uniques, exp_uniques) + def test_object_factorize(self, writable): + data = np.array(['a', 'c', None, np.nan, 'a', 'b', pd.NaT, 'c'], + dtype=object) + data.setflags(write=writable) + exp_labels = np.array([0, 1, -1, -1, 0, 2, -1, 1], dtype=np.intp) + exp_uniques = np.array(['a', 'c', 'b'], dtype=object) + + labels, uniques = algos.factorize(data) + tm.assert_numpy_array_equal(labels, exp_labels) + tm.assert_numpy_array_equal(uniques, exp_uniques) + def test_deprecate_order(self): # gh 19727 - check warning is raised for deprecated keyword, order. # Test not valid once order keyword is removed. From e27ec9a85d6d62fbadf2c6f3fcf435327262f60e Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Fri, 12 Oct 2018 08:42:22 +0200 Subject: [PATCH 22/26] Tiny fixes --- pandas/tests/test_algos.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index c776cd844e6ff..f6438e70e692e 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1370,12 +1370,11 @@ def test_hashtable_factorize(self, htable, tm_dtype, writable): # drop_duplicates has own cython code (hash_table_func_helper.pxi) # and is tested separately; keeps first occurrence like ht.factorize() # since factorize removes all NaNs, we do the same here - expected_unique = s_duplicated.dropna().drop_duplicates(keep='first') - expected_unique = expected_unique.values + expected_unique = s_duplicated.dropna().drop_duplicates().values tm.assert_numpy_array_equal(result_unique, expected_unique) - # reconstruction can only succeed if the inverse is correct. - # Since factorize removes the NaNs, those have to be excluded + # reconstruction can only succeed if the inverse is correct. Since + # factorize removes the NaNs, those have to be excluded here as well result_reconstruct = result_unique[result_inverse[~na_mask]] expected_reconstruct = s_duplicated.dropna().values tm.assert_numpy_array_equal(result_reconstruct, expected_reconstruct) From d825be0ac1805ea4f0c35427efc9237d98a41200 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sat, 13 Oct 2018 00:52:19 +0200 Subject: [PATCH 23/26] Remove idx_duplicated from test (now unnecessary) --- pandas/tests/test_algos.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index f6438e70e692e..1fb9bf6c3caf9 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1359,9 +1359,8 @@ def test_hashtable_factorize(self, htable, tm_dtype, writable): # use different NaN types for object column s.loc[500:502] = [np.nan, None, pd.NaT] - # create duplicated selection (with known indices per duplicate!) 
- idx_duplicated = pd.Series(s.index).sample(frac=3, replace=True) - s_duplicated = s[idx_duplicated.values].reset_index(drop=True) + # create duplicated selection + s_duplicated = s.sample(frac=3, replace=True).reset_index(drop=True) s_duplicated.values.setflags(write=writable) na_mask = s_duplicated.isna().values From 1a342d09d686f7e9545d89fa8c77a42613486953 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sun, 14 Oct 2018 21:19:44 +0200 Subject: [PATCH 24/26] Review (jreback) --- pandas/_libs/hashtable_class_helper.pxi.in | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index dcac9962f05c6..c061102fbaddc 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -505,12 +505,12 @@ cdef class {{name}}HashTable(HashTable): Parameters ---------- - values : ndarray[object] + values : ndarray[{{dtype}}] Array of values of which unique will be calculated Returns ------- - uniques : ndarray[object] + uniques : ndarray[{{dtype}}] Unique values of input, not sorted """ cdef: From 3438727cf21b059be2928a3193f0a1ab4cc84bf5 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Wed, 17 Oct 2018 23:36:58 +0200 Subject: [PATCH 25/26] Review (jreback) --- pandas/tests/test_algos.py | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 57d085314dc29..557669260604a 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -227,19 +227,42 @@ def test_complex_sorting(self): pytest.raises(TypeError, algos.factorize, x17[::-1], sort=True) + def test_float64_factorize(self, writable): + data = np.array([1.0, 1e8, 1.0, 1e-8, 1e8, 1.0], dtype=np.float64) + data.setflags(write=writable) + exp_labels = np.array([0, 1, 0, 2, 1, 0], dtype=np.intp) + exp_uniques = np.array([1.0, 1e8, 1e-8], dtype=np.float64) + + labels, uniques = algos.factorize(data) + tm.assert_numpy_array_equal(labels, exp_labels) + tm.assert_numpy_array_equal(uniques, exp_uniques) + def test_uint64_factorize(self, writable): - data = np.array([2**63, 1, 2**63], dtype=np.uint64) + data = np.array([2**64 - 1, 1, 2**64 - 1], dtype=np.uint64) data.setflags(write=writable) exp_labels = np.array([0, 1, 0], dtype=np.intp) - exp_uniques = np.array([2**63, 1], dtype=np.uint64) + exp_uniques = np.array([2**64 - 1, 1], dtype=np.uint64) labels, uniques = algos.factorize(data) tm.assert_numpy_array_equal(labels, exp_labels) tm.assert_numpy_array_equal(uniques, exp_uniques) - data = np.array([2**63, -1, 2**63], dtype=object) + def test_int64_factorize(self, writable): + data = np.array([2**63 - 1, -2**63, 2**63 - 1], dtype=np.int64) + data.setflags(write=writable) exp_labels = np.array([0, 1, 0], dtype=np.intp) - exp_uniques = np.array([2**63, -1], dtype=object) + exp_uniques = np.array([2**63 - 1, -2**63], dtype=np.int64) + + labels, uniques = algos.factorize(data) + tm.assert_numpy_array_equal(labels, exp_labels) + tm.assert_numpy_array_equal(uniques, exp_uniques) + + def test_string_factorize(self, writable): + data = np.array(['a', 'c', 'a', 'b', 'c'], + dtype=object) + data.setflags(write=writable) + exp_labels = np.array([0, 1, 0, 2, 1], dtype=np.intp) + exp_uniques = np.array(['a', 'c', 'b'], dtype=object) labels, uniques = algos.factorize(data) tm.assert_numpy_array_equal(labels, exp_labels) From 6d0e86b0f62a63902ff35bc6575d6f3c3e82b6a2 Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Thu, 18 Oct 2018 08:23:59 +0200 Subject: [PATCH 26/26] Retrigger Circle