From aa546c4465d80d1362ce02dee5a0a62d808db060 Mon Sep 17 00:00:00 2001 From: h-vetinari <33685575+h-vetinari@users.noreply.github.com> Date: Thu, 29 Nov 2018 18:21:24 +0100 Subject: [PATCH] ENH: Add return_inverse to cython-unique; unify unique/factorize-code (#23400) --- pandas/_libs/hashtable_class_helper.pxi.in | 478 ++++++++++++--------- pandas/core/algorithms.py | 2 +- pandas/tests/test_algos.py | 10 +- 3 files changed, 293 insertions(+), 197 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index a71023ed34f44d..7f4c2a6410870d 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -356,11 +356,12 @@ cdef class {{name}}HashTable(HashTable): @cython.boundscheck(False) @cython.wraparound(False) - def _factorize(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, - Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, - object na_value=None): + def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, + Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, + object na_value=None, bint ignore_na=False, + bint return_inverse=False): """ - Calculate unique values and labels (no sorting); ignores all NA-values + Calculate unique values and labels (no sorting!) Parameters ---------- @@ -374,13 +375,22 @@ cdef class {{name}}HashTable(HashTable): Sentinel value used for all NA-values in inverse na_value : object, default None Value to identify as missing. If na_value is None, then - any value satisfying val!=val are considered missing. + any value "val" satisfying val != val is considered missing. + If na_value is not None, then _additionally_, any value "val" + satisfying val == na_value is considered missing. + ignore_na : boolean, default False + Whether NA-values should be ignored for calculating the uniques. If + True, the labels corresponding to missing values will be set to + na_sentinel. + return_inverse : boolean, default False + Whether the mapping of the original array values to their location + in the vector of uniques should be returned. Returns ------- uniques : ndarray[{{dtype}}] Unique values of input, not sorted - labels : ndarray[int64] + labels : ndarray[int64] (if return_inverse=True) The labels from values to uniques """ cdef: @@ -392,7 +402,8 @@ cdef class {{name}}HashTable(HashTable): {{name}}VectorData *ud bint use_na_value - labels = np.empty(n, dtype=np.int64) + if return_inverse: + labels = np.empty(n, dtype=np.int64) ud = uniques.data use_na_value = na_value is not None @@ -410,20 +421,19 @@ cdef class {{name}}HashTable(HashTable): for i in range(n): val = values[i] - if val != val or (use_na_value and val == na_value2): + if ignore_na and (val != val + or (use_na_value and val == na_value2)): + # if missing values do not count as unique values (i.e. if + # ignore_na is True), skip the hashtable entry for them, + # and replace the corresponding label with na_sentinel labels[i] = na_sentinel continue k = kh_get_{{dtype}}(self.table, val) - if k != self.table.n_buckets: - # k falls into a previous bucket - idx = self.table.vals[k] - labels[i] = idx - else: + if k == self.table.n_buckets: # k hasn't been seen yet k = kh_put_{{dtype}}(self.table, val, &ret) - self.table.vals[k] = count if needs_resize(ud): with gil: @@ -433,23 +443,82 @@ cdef class {{name}}HashTable(HashTable): "Vector.resize() needed") uniques.resize() append_data_{{dtype}}(ud, val) - labels[i] = count - count += 1 + if return_inverse: + self.table.vals[k] = count + labels[i] = count + count += 1 + elif return_inverse: + # k falls into a previous bucket + # only relevant in case we need to construct the inverse + idx = self.table.vals[k] + labels[i] = idx + + if return_inverse: + return uniques.to_array(), np.asarray(labels) + return uniques.to_array() - return np.asarray(labels) + def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False): + """ + Calculate unique values and labels (no sorting!) + + Parameters + ---------- + values : ndarray[{{dtype}}] + Array of values of which unique will be calculated + return_inverse : boolean, default False + Whether the mapping of the original array values to their location + in the vector of uniques should be returned. + + Returns + ------- + uniques : ndarray[{{dtype}}] + Unique values of input, not sorted + labels : ndarray[int64] (if return_inverse) + The labels from values to uniques + """ + uniques = {{name}}Vector() + return self._unique(values, uniques, ignore_na=False, + return_inverse=return_inverse) def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1, object na_value=None): - uniques = {{name}}Vector() - labels = self._factorize(values, uniques=uniques, - na_sentinel=na_sentinel, na_value=na_value) - return labels, uniques.to_array() + """ + Calculate unique values and labels (no sorting!) + + Missing values are not included in the "uniques" for this method. + The labels for any missing values will be set to "na_sentinel" + + Parameters + ---------- + values : ndarray[{{dtype}}] + Array of values of which unique will be calculated + na_sentinel : Py_ssize_t, default -1 + Sentinel value used for all NA-values in inverse + na_value : object, default None + Value to identify as missing. If na_value is None, then + any value "val" satisfying val != val is considered missing. + If na_value is not None, then _additionally_, any value "val" + satisfying val == na_value is considered missing. + + Returns + ------- + uniques : ndarray[{{dtype}}] + Unique values of input, not sorted + labels : ndarray[int64] + The labels from values to uniques + """ + uniques_vector = {{name}}Vector() + return self._unique(values, uniques_vector, na_sentinel=na_sentinel, + na_value=na_value, ignore_na=True, + return_inverse=True) def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None): - return self._factorize(values, uniques, count_prior=count_prior, - na_sentinel=na_sentinel, na_value=na_value) + _, labels = self._unique(values, uniques, count_prior=count_prior, + na_sentinel=na_sentinel, na_value=na_value, + ignore_na=True, return_inverse=True) + return labels @cython.boundscheck(False) def get_labels_groupby(self, const {{dtype}}_t[:] values): @@ -496,44 +565,6 @@ cdef class {{name}}HashTable(HashTable): return np.asarray(labels), arr_uniques - @cython.boundscheck(False) - @cython.wraparound(False) - def unique(self, const {{dtype}}_t[:] values): - """ - Calculate unique values without sorting - - Parameters - ---------- - values : ndarray[{{dtype}}] - Array of values of which unique will be calculated - - Returns - ------- - uniques : ndarray[{{dtype}}] - Unique values of input, not sorted - """ - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - {{dtype}}_t val - khiter_t k - {{name}}Vector uniques = {{name}}Vector() - {{name}}VectorData *ud - - ud = uniques.data - - with nogil: - for i in range(n): - val = values[i] - k = kh_get_{{dtype}}(self.table, val) - if k == self.table.n_buckets: - kh_put_{{dtype}}(self.table, val, &ret) - if needs_resize(ud): - with gil: - uniques.resize() - append_data_{{dtype}}(ud, val) - return uniques.to_array() - {{endfor}} @@ -613,56 +644,6 @@ cdef class StringHashTable(HashTable): free(vecs) return labels - @cython.boundscheck(False) - @cython.wraparound(False) - def unique(self, ndarray[object] values): - """ - Calculate unique values without sorting - - Parameters - ---------- - values : ndarray[object] - Array of values of which unique will be calculated - - Returns - ------- - uniques : ndarray[object] - Unique values of input, not sorted - """ - cdef: - Py_ssize_t i, count, n = len(values) - int64_t[:] uindexer - int ret = 0 - object val - ObjectVector uniques - khiter_t k - const char *v - const char **vecs - - vecs = malloc(n * sizeof(char *)) - uindexer = np.empty(n, dtype=np.int64) - for i in range(n): - val = values[i] - v = util.get_c_string(val) - vecs[i] = v - - count = 0 - with nogil: - for i in range(n): - v = vecs[i] - k = kh_get_str(self.table, v) - if k == self.table.n_buckets: - kh_put_str(self.table, v, &ret) - uindexer[count] = i - count += 1 - free(vecs) - - # uniques - uniques = ObjectVector() - for i in range(count): - uniques.append(values[uindexer[i]]) - return uniques.to_array() - @cython.boundscheck(False) def lookup(self, ndarray[object] values): cdef: @@ -726,11 +707,12 @@ cdef class StringHashTable(HashTable): @cython.boundscheck(False) @cython.wraparound(False) - def _factorize(self, ndarray[object] values, ObjectVector uniques, - Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, - object na_value=None): + def _unique(self, ndarray[object] values, ObjectVector uniques, + Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, + object na_value=None, bint ignore_na=False, + bint return_inverse=False): """ - Calculate unique values and labels (no sorting); ignores all NA-values + Calculate unique values and labels (no sorting!) Parameters ---------- @@ -743,13 +725,23 @@ cdef class StringHashTable(HashTable): na_sentinel : Py_ssize_t, default -1 Sentinel value used for all NA-values in inverse na_value : object, default None - Value to identify as missing + Value to identify as missing. If na_value is None, then any value + that is not a string is considered missing. If na_value is + not None, then _additionally_ any value "val" satisfying + val == na_value is considered missing. + ignore_na : boolean, default False + Whether NA-values should be ignored for calculating the uniques. If + True, the labels corresponding to missing values will be set to + na_sentinel. + return_inverse : boolean, default False + Whether the mapping of the original array values to their location + in the vector of uniques should be returned. Returns ------- uniques : ndarray[object] Unique values of input, not sorted - labels : ndarray[int64] + labels : ndarray[int64] (if return_inverse=True) The labels from values to uniques """ cdef: @@ -763,41 +755,50 @@ cdef class StringHashTable(HashTable): khiter_t k bint use_na_value - labels = np.zeros(n, dtype=np.int64) + if return_inverse: + labels = np.zeros(n, dtype=np.int64) uindexer = np.empty(n, dtype=np.int64) use_na_value = na_value is not None - # assign pointers and pre-filter out missing + # assign pointers and pre-filter out missing (if ignore_na) vecs = malloc(n * sizeof(char *)) for i in range(n): val = values[i] - if (isinstance(val, (str, unicode)) - and not (use_na_value and val == na_value)): + if (ignore_na + and (not isinstance(val, (str, unicode)) + or (use_na_value and val == na_value))): + # if missing values do not count as unique values (i.e. if + # ignore_na is True), we can skip the actual value, and + # replace the label with na_sentinel directly + labels[i] = na_sentinel + else: + # if ignore_na is False, we also stringify NaN/None/etc. v = util.get_c_string(val) vecs[i] = v - else: - labels[i] = na_sentinel # compute with nogil: for i in range(n): - if labels[i] == na_sentinel: + if ignore_na and labels[i] == na_sentinel: + # skip entries for ignored missing values (see above) continue v = vecs[i] k = kh_get_str(self.table, v) - if k != self.table.n_buckets: - # k falls into a previous bucket - idx = self.table.vals[k] - labels[i] = idx - else: + if k == self.table.n_buckets: # k hasn't been seen yet k = kh_put_str(self.table, v, &ret) - self.table.vals[k] = count uindexer[count] = i - labels[i] = count + if return_inverse: + self.table.vals[k] = count + labels[i] = count count += 1 + elif return_inverse: + # k falls into a previous bucket + # only relevant in case we need to construct the inverse + idx = self.table.vals[k] + labels[i] = idx free(vecs) @@ -805,20 +806,72 @@ cdef class StringHashTable(HashTable): for i in range(count): uniques.append(values[uindexer[i]]) - return np.asarray(labels) + if return_inverse: + return uniques.to_array(), np.asarray(labels) + return uniques.to_array() + + def unique(self, ndarray[object] values, bint return_inverse=False): + """ + Calculate unique values and labels (no sorting!) + + Parameters + ---------- + values : ndarray[object] + Array of values of which unique will be calculated + return_inverse : boolean, default False + Whether the mapping of the original array values to their location + in the vector of uniques should be returned. + + Returns + ------- + uniques : ndarray[object] + Unique values of input, not sorted + labels : ndarray[int64] (if return_inverse) + The labels from values to uniques + """ + uniques = ObjectVector() + return self._unique(values, uniques, ignore_na=False, + return_inverse=return_inverse) def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1, object na_value=None): - uniques = ObjectVector() - labels = self._factorize(values, uniques=uniques, - na_sentinel=na_sentinel, na_value=na_value) - return labels, uniques.to_array() + """ + Calculate unique values and labels (no sorting!) + + Missing values are not included in the "uniques" for this method. + The labels for any missing values will be set to "na_sentinel" + + Parameters + ---------- + values : ndarray[object] + Array of values of which unique will be calculated + na_sentinel : Py_ssize_t, default -1 + Sentinel value used for all NA-values in inverse + na_value : object, default None + Value to identify as missing. If na_value is None, then any value + that is not a string is considered missing. If na_value is + not None, then _additionally_ any value "val" satisfying + val == na_value is considered missing. + + Returns + ------- + uniques : ndarray[object] + Unique values of input, not sorted + labels : ndarray[int64] + The labels from values to uniques + """ + uniques_vector = ObjectVector() + return self._unique(values, uniques_vector, na_sentinel=na_sentinel, + na_value=na_value, ignore_na=True, + return_inverse=True) def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None): - return self._factorize(values, uniques, count_prior=count_prior, - na_sentinel=na_sentinel, na_value=na_value) + _, labels = self._unique(values, uniques, count_prior=count_prior, + na_sentinel=na_sentinel, na_value=na_value, + ignore_na=True, return_inverse=True) + return labels cdef class PyObjectHashTable(HashTable): @@ -908,44 +961,12 @@ cdef class PyObjectHashTable(HashTable): @cython.boundscheck(False) @cython.wraparound(False) - def unique(self, ndarray[object] values): + def _unique(self, ndarray[object] values, ObjectVector uniques, + Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, + object na_value=None, bint ignore_na=False, + bint return_inverse=False): """ - Calculate unique values without sorting - - Parameters - ---------- - values : ndarray[object] - Array of values of which unique will be calculated - - Returns - ------- - uniques : ndarray[object] - Unique values of input, not sorted - """ - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - object val - khiter_t k - ObjectVector uniques = ObjectVector() - - for i in range(n): - val = values[i] - hash(val) - k = kh_get_pymap(self.table, val) - if k == self.table.n_buckets: - kh_put_pymap(self.table, val, &ret) - uniques.append(val) - - return uniques.to_array() - - @cython.boundscheck(False) - @cython.wraparound(False) - def _factorize(self, ndarray[object] values, ObjectVector uniques, - Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, - object na_value=None): - """ - Calculate unique values and labels (no sorting); ignores all NA-values + Calculate unique values and labels (no sorting!) Parameters ---------- @@ -959,13 +980,22 @@ cdef class PyObjectHashTable(HashTable): Sentinel value used for all NA-values in inverse na_value : object, default None Value to identify as missing. If na_value is None, then None _plus_ - any value satisfying val!=val are considered missing. + any value "val" satisfying val != val is considered missing. + If na_value is not None, then _additionally_, any value "val" + satisfying val == na_value is considered missing. + ignore_na : boolean, default False + Whether NA-values should be ignored for calculating the uniques. If + True, the labels corresponding to missing values will be set to + na_sentinel. + return_inverse : boolean, default False + Whether the mapping of the original array values to their location + in the vector of uniques should be returned. Returns ------- uniques : ndarray[object] Unique values of input, not sorted - labels : ndarray[int64] + labels : ndarray[int64] (if return_inverse=True) The labels from values to uniques """ cdef: @@ -976,42 +1006,100 @@ cdef class PyObjectHashTable(HashTable): khiter_t k bint use_na_value - labels = np.empty(n, dtype=np.int64) + if return_inverse: + labels = np.empty(n, dtype=np.int64) use_na_value = na_value is not None for i in range(n): val = values[i] hash(val) - if ((val != val or val is None) - or (use_na_value and val == na_value)): + if ignore_na and ((val != val or val is None) + or (use_na_value and val == na_value)): + # if missing values do not count as unique values (i.e. if + # ignore_na is True), skip the hashtable entry for them, and + # replace the corresponding label with na_sentinel labels[i] = na_sentinel continue k = kh_get_pymap(self.table, val) - if k != self.table.n_buckets: - # k falls into a previous bucket - idx = self.table.vals[k] - labels[i] = idx - else: + if k == self.table.n_buckets: # k hasn't been seen yet k = kh_put_pymap(self.table, val, &ret) - self.table.vals[k] = count uniques.append(val) - labels[i] = count - count += 1 + if return_inverse: + self.table.vals[k] = count + labels[i] = count + count += 1 + elif return_inverse: + # k falls into a previous bucket + # only relevant in case we need to construct the inverse + idx = self.table.vals[k] + labels[i] = idx + + if return_inverse: + return uniques.to_array(), np.asarray(labels) + return uniques.to_array() + + def unique(self, ndarray[object] values, bint return_inverse=False): + """ + Calculate unique values and labels (no sorting!) - return np.asarray(labels) + Parameters + ---------- + values : ndarray[object] + Array of values of which unique will be calculated + return_inverse : boolean, default False + Whether the mapping of the original array values to their location + in the vector of uniques should be returned. + + Returns + ------- + uniques : ndarray[object] + Unique values of input, not sorted + labels : ndarray[int64] (if return_inverse) + The labels from values to uniques + """ + uniques = ObjectVector() + return self._unique(values, uniques, ignore_na=False, + return_inverse=return_inverse) def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1, object na_value=None): - uniques = ObjectVector() - labels = self._factorize(values, uniques=uniques, - na_sentinel=na_sentinel, na_value=na_value) - return labels, uniques.to_array() + """ + Calculate unique values and labels (no sorting!) + + Missing values are not included in the "uniques" for this method. + The labels for any missing values will be set to "na_sentinel" + + Parameters + ---------- + values : ndarray[object] + Array of values of which unique will be calculated + na_sentinel : Py_ssize_t, default -1 + Sentinel value used for all NA-values in inverse + na_value : object, default None + Value to identify as missing. If na_value is None, then None _plus_ + any value "val" satisfying val != val is considered missing. + If na_value is not None, then _additionally_, any value "val" + satisfying val == na_value is considered missing. + + Returns + ------- + uniques : ndarray[object] + Unique values of input, not sorted + labels : ndarray[int64] + The labels from values to uniques + """ + uniques_vector = ObjectVector() + return self._unique(values, uniques_vector, na_sentinel=na_sentinel, + na_value=na_value, ignore_na=True, + return_inverse=True) def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None): - return self._factorize(values, uniques, count_prior=count_prior, - na_sentinel=na_sentinel, na_value=na_value) + _, labels = self._unique(values, uniques, count_prior=count_prior, + na_sentinel=na_sentinel, na_value=na_value, + ignore_na=True, return_inverse=True) + return labels diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 7aceef8634e206..1a4368ee8ea98a 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -460,7 +460,7 @@ def _factorize_array(values, na_sentinel=-1, size_hint=None, (hash_klass, _), values = _get_data_algo(values, _hashtables) table = hash_klass(size_hint or len(values)) - labels, uniques = table.factorize(values, na_sentinel=na_sentinel, + uniques, labels = table.factorize(values, na_sentinel=na_sentinel, na_value=na_value) labels = ensure_platform_int(labels) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index fa33a1ceae0b94..c9d403f6696af1 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1361,6 +1361,14 @@ def test_hashtable_unique(self, htable, tm_dtype, writable): result_unique = htable().unique(s_duplicated.values) tm.assert_numpy_array_equal(result_unique, expected_unique) + # test return_inverse=True + # reconstruction can only succeed if the inverse is correct + result_unique, result_inverse = htable().unique(s_duplicated.values, + return_inverse=True) + tm.assert_numpy_array_equal(result_unique, expected_unique) + reconstr = result_unique[result_inverse] + tm.assert_numpy_array_equal(reconstr, s_duplicated.values) + @pytest.mark.parametrize('htable, tm_dtype', [ (ht.PyObjectHashTable, 'String'), (ht.StringHashTable, 'String'), @@ -1383,7 +1391,7 @@ def test_hashtable_factorize(self, htable, tm_dtype, writable): s_duplicated.values.setflags(write=writable) na_mask = s_duplicated.isna().values - result_inverse, result_unique = htable().factorize(s_duplicated.values) + result_unique, result_inverse = htable().factorize(s_duplicated.values) # drop_duplicates has own cython code (hash_table_func_helper.pxi) # and is tested separately; keeps first occurrence like ht.factorize()