diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 6052768f69e84..6b50c65e29a4b 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -355,14 +355,87 @@ cdef class {{name}}HashTable(HashTable): return np.asarray(locs) + def unique(self, const {{dtype}}_t[:] values, bint return_inverse): + """ + Calculate unique values and labels (no sorting!) + + Parameters + ---------- + values : ndarray[{{dtype}}] + Array of values of which unique will be calculated + return_inverse : boolean + Whether the mapping of the original array values to their location + in the vector of uniques should be returned. + + Returns + ------- + uniques : ndarray[{{dtype}}] + Unique values of input, not sorted + labels : ndarray[int64] (if return_inverse) + The labels from values to uniques + """ + uniques = {{name}}Vector() + # explicitly compile path without inverse for performance + # the last three arguments are not relevant for this method, but we + # don't use kwargs to avoid cython perf hit (just using default values) + if return_inverse: + return self._unique_with_inverse(values, uniques, 0, -1, None) + return self._unique_no_inverse(values, uniques, 0, -1, None) + + def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel, + object na_value): + """ + Calculate unique values and labels (no sorting!) + + Missing values are not included in the "uniques" for this method. + The labels for any missing values will be set to "na_sentinel" + + Parameters + ---------- + values : ndarray[{{dtype}}] + Array of values of which unique will be calculated + na_sentinel : Py_ssize_t, default -1 + Sentinel value used for all NA-values in inverse + na_value : object, default None + Value to identify as missing. If na_value is None, then + any value "val" satisfying val != val is considered missing. + If na_value is not None, then _additionally_, any value "val" + satisfying val == na_value is considered missing. + + Returns + ------- + labels : ndarray[int64] + The labels from values to uniques + uniques : ndarray[{{dtype}}] + Unique values of input, not sorted + """ + # reduced signature compared to _factorize + # not necessary to have uniques-vector, count_prior + uniques = {{name}}Vector() + return self._factorize(values, uniques, 0, na_sentinel, na_value) + +{{py: +# tuples of "func_name, return_inverse, ignore_na" +unique_funcs = [('_unique_no_inverse', False, False), + ('_unique_with_inverse', True, False), + ('_factorize', True, True), + ('get_labels', True, True)] +}} + +{{for func_name, return_inverse, ignore_na in unique_funcs}} + @cython.boundscheck(False) @cython.wraparound(False) - def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, - bint ignore_na, bint return_uniques, - bint return_inverse, Py_ssize_t count_prior, - Py_ssize_t na_sentinel, object na_value): + def {{func_name}}(self, const {{dtype}}_t[:] values, + {{name}}Vector uniques, Py_ssize_t count_prior, + Py_ssize_t na_sentinel, object na_value): """ Calculate unique values and labels (no sorting!) +{{if func_name == '_factorize' or func_name == 'get_labels'}} + + Missing values are not included in the "uniques" for this method. + The labels for any missing values will be set to "na_sentinel" +{{endif}} Parameters ---------- @@ -370,17 +443,6 @@ cdef class {{name}}HashTable(HashTable): Array of values of which unique will be calculated uniques : {{name}}Vector Vector into which uniques will be written - ignore_na : boolean - Whether NA-values should be ignored for calculating the uniques. If - True, the labels corresponding to missing values will be set to - na_sentinel. - return_uniques : boolean - Whether to return the content of the passed "uniques" vector as an - np.ndarray at the end. If False, the vector passed to "uniques" - must be explicitly read and transformed by the user. - return_inverse : boolean - Whether the mapping of the original array values to their location - in the vector of uniques should be returned. count_prior : Py_ssize_t Number of existing entries in uniques na_sentinel : Py_ssize_t @@ -393,10 +455,23 @@ cdef class {{name}}HashTable(HashTable): Returns ------- - uniques : ndarray[{{dtype}}] (if return_uniques) +{{if func_name == '_unique_no_inverse'}} + uniques : ndarray[{{dtype}}] Unique values of input, not sorted - labels : ndarray[int64] (if return_inverse=True) +{{elif func_name == '_unique_with_inverse'}} + uniques : ndarray[{{dtype}}] + Unique values of input, not sorted + labels : ndarray[int64] + The labels from values to uniques +{{elif func_name == '_factorize' # switched output order for factorize}} + labels : ndarray[int64] The labels from values to uniques + uniques : ndarray[{{dtype}}] + Unique values of input, not sorted +{{elif func_name == 'get_labels'}} + labels : ndarray[int64] + The labels from values to uniques +{{endif}} """ cdef: Py_ssize_t i, idx, count = count_prior, n = len(values) @@ -407,7 +482,7 @@ cdef class {{name}}HashTable(HashTable): {{name}}VectorData *ud bint use_na_value - if return_inverse: + if {{return_inverse}}: labels = np.empty(n, dtype=np.int64) ud = uniques.data use_na_value = na_value is not None @@ -426,8 +501,8 @@ cdef class {{name}}HashTable(HashTable): for i in range(n): val = values[i] - if ignore_na and (val != val - or (use_na_value and val == na_value2)): + if {{ignore_na}} and (val != val + or (use_na_value and val == na_value2)): labels[i] = na_sentinel continue @@ -445,54 +520,27 @@ cdef class {{name}}HashTable(HashTable): "Vector.resize() needed") uniques.resize() append_data_{{dtype}}(ud, val) - if return_inverse: + if {{return_inverse}}: self.table.vals[k] = count labels[i] = count count += 1 - elif return_inverse: + elif {{return_inverse}}: # k falls into a previous bucket # only relevant in case we need to construct the inverse idx = self.table.vals[k] labels[i] = idx - if return_uniques and return_inverse: - return uniques.to_array(), np.asarray(labels) - elif return_uniques: - return uniques.to_array() - elif return_inverse: - return np.asarray(labels) - - def unique(self, const {{dtype}}_t[:] values, bint return_inverse): - uniques = {{name}}Vector() - return self._unique(values, uniques, - False, # ignore_na - True, # return_uniques - return_inverse, - # the rest are of the parameters are not relevant, - # but we don't use kwargs to avoid cython perf hit - 0, # count_prior - -1, # na_sentinel - None) # na_value +{{if func_name == '_unique_no_inverse'}} + return uniques.to_array() +{{elif func_name == '_unique_with_inverse'}} + return uniques.to_array(), np.asarray(labels) +{{elif func_name == '_factorize'}} + return np.asarray(labels), uniques.to_array() +{{elif func_name == 'get_labels'}} + return np.asarray(labels) +{{endif}} - def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel, - object na_value): - uniques = {{name}}Vector() - # factorize has reversed outputs compared to _unique (see "[::-1]") - return self._unique(values, uniques, - True, # ignore_na - True, # return_uniques - True, # return_inverse - 0, # count_prior - na_sentinel, na_value)[::-1] - - def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, - Py_ssize_t count_prior, Py_ssize_t na_sentinel, - object na_value): - return self._unique(values, uniques, - True, # ignore_na - False, # return_uniques - True, # return_inverse - count_prior, na_sentinel, na_value) +{{endfor}} @cython.boundscheck(False) def get_labels_groupby(self, const {{dtype}}_t[:] values): @@ -679,14 +727,87 @@ cdef class StringHashTable(HashTable): self.table.vals[k] = i free(vecs) + def unique(self, ndarray[object] values, bint return_inverse): + """ + Calculate unique values and labels (no sorting!) + + Parameters + ---------- + values : ndarray[object] + Array of values of which unique will be calculated + return_inverse : boolean + Whether the mapping of the original array values to their location + in the vector of uniques should be returned. + + Returns + ------- + uniques : ndarray[object] + Unique values of input, not sorted + labels : ndarray[int64] (if return_inverse) + The labels from values to uniques + """ + uniques = ObjectVector() + # explicitly compile path without inverse for performance + # the last three arguments are not relevant for this method, but we + # don't use kwargs to avoid cython perf hit (just using default values) + if return_inverse: + return self._unique_with_inverse(values, uniques, 0, -1, None) + return self._unique_no_inverse(values, uniques, 0, -1, None) + + def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel, + object na_value): + """ + Calculate unique values and labels (no sorting!) + + Missing values are not included in the "uniques" for this method. + The labels for any missing values will be set to "na_sentinel" + + Parameters + ---------- + values : ndarray[object] + Array of values of which unique will be calculated + na_sentinel : Py_ssize_t, default -1 + Sentinel value used for all NA-values in inverse + na_value : object, default None + Value to identify as missing. If na_value is None, then any value + that is not a string is considered missing. If na_value is + not None, then _additionally_ any value "val" satisfying + val == na_value is considered missing. + + Returns + ------- + labels : ndarray[int64] + The labels from values to uniques + uniques : ndarray[object] + Unique values of input, not sorted + """ + # reduced signature compared to _factorize + # not necessary to have uniques-vector, count_prior + uniques = ObjectVector() + return self._factorize(values, uniques, 0, na_sentinel, na_value) + +{{py: +# tuples of "func_name, return_inverse, ignore_na" +unique_funcs = [('_unique_no_inverse', False, False), + ('_unique_with_inverse', True, False), + ('_factorize', True, True), + ('get_labels', True, True)] +}} + +{{for func_name, return_inverse, ignore_na in unique_funcs}} + @cython.boundscheck(False) @cython.wraparound(False) - def _unique(self, ndarray[object] values, ObjectVector uniques, - bint ignore_na, bint return_uniques, bint return_inverse, - Py_ssize_t count_prior, Py_ssize_t na_sentinel, - object na_value): + def {{func_name}}(self, ndarray[object] values, ObjectVector uniques, + Py_ssize_t count_prior, Py_ssize_t na_sentinel, + object na_value): """ - Calculate unique values and labels (no sorting) + Calculate unique values and labels (no sorting!) +{{if func_name == '_factorize' or func_name == 'get_labels'}} + + Missing values are not included in the "uniques" for this method. + The labels for any missing values will be set to "na_sentinel" +{{endif}} Parameters ---------- @@ -694,17 +815,6 @@ cdef class StringHashTable(HashTable): Array of values of which unique will be calculated uniques : ObjectVector Vector into which uniques will be written - ignore_na : boolean - Whether NA-values should be ignored for calculating the uniques. If - True, the labels corresponding to missing values will be set to - na_sentinel. - return_uniques : boolean - Whether to return the content of the passed "uniques" vector as an - np.ndarray at the end. If False, the vector passed to "uniques" - must be explicitly read and transformed by the user. - return_inverse : boolean - Whether the mapping of the original array values to their location - in the vector of uniques should be returned. count_prior : Py_ssize_t Number of existing entries in uniques na_sentinel : Py_ssize_t @@ -717,10 +827,23 @@ cdef class StringHashTable(HashTable): Returns ------- - uniques : ndarray[object] (if return_uniques) +{{if func_name == '_unique_no_inverse'}} + uniques : ndarray[object] Unique values of input, not sorted - labels : ndarray[int64] (if return_inverse) +{{elif func_name == '_unique_with_inverse'}} + uniques : ndarray[object] + Unique values of input, not sorted + labels : ndarray[int64] + The labels from values to uniques +{{elif func_name == '_factorize' # switched output order for factorize}} + labels : ndarray[int64] The labels from values to uniques + uniques : ndarray[object] + Unique values of input, not sorted +{{elif func_name == 'get_labels'}} + labels : ndarray[int64] + The labels from values to uniques +{{endif}} """ cdef: Py_ssize_t i, idx, count = count_prior, n = len(values) @@ -733,7 +856,7 @@ cdef class StringHashTable(HashTable): khiter_t k bint use_na_value - if return_inverse: + if {{return_inverse}}: labels = np.zeros(n, dtype=np.int64) uindexer = np.empty(n, dtype=np.int64) use_na_value = na_value is not None @@ -743,8 +866,9 @@ cdef class StringHashTable(HashTable): for i in range(n): val = values[i] - if ignore_na and (not (PyUnicode_Check(val) or PyString_Check(val)) - or (use_na_value and val == na_value)): + if ({{ignore_na}} + and (not (PyUnicode_Check(val) or PyString_Check(val)) + or (use_na_value and val == na_value))): # missing value labels[i] = na_sentinel else: @@ -755,7 +879,7 @@ cdef class StringHashTable(HashTable): # compute with nogil: for i in range(n): - if ignore_na and labels[i] == na_sentinel: + if {{ignore_na}} and labels[i] == na_sentinel: continue v = vecs[i] @@ -764,11 +888,11 @@ cdef class StringHashTable(HashTable): # k hasn't been seen yet k = kh_put_str(self.table, v, &ret) uindexer[count] = i - if return_inverse: + if {{return_inverse}}: self.table.vals[k] = count labels[i] = count count += 1 - elif return_inverse: + elif {{return_inverse}}: # k falls into a previous bucket # only relevant in case we need to construct the inverse idx = self.table.vals[k] @@ -780,44 +904,17 @@ cdef class StringHashTable(HashTable): for i in range(count): uniques.append(values[uindexer[i]]) - if return_uniques and return_inverse: - return uniques.to_array(), np.asarray(labels) - elif return_uniques: - return uniques.to_array() - elif return_inverse: - return np.asarray(labels) - - def unique(self, ndarray[object] values, bint return_inverse=False): - uniques = ObjectVector() - return self._unique(values, uniques, - False, # ignore_na - True, # return_uniques - return_inverse, - # the rest are of the parameters are not relevant, - # but we don't use kwargs to avoid cython perf hit - 0, # count_prior - -1, # na_sentinel - None) # na_value +{{if func_name == '_unique_no_inverse'}} + return uniques.to_array() +{{elif func_name == '_unique_with_inverse'}} + return uniques.to_array(), np.asarray(labels) +{{elif func_name == '_factorize'}} + return np.asarray(labels), uniques.to_array() +{{elif func_name == 'get_labels'}} + return np.asarray(labels) +{{endif}} - def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel, - object na_value): - uniques = ObjectVector() - # factorize has reversed outputs compared to _unique (see "[::-1]") - return self._unique(values, uniques, - True, # ignore_na - True, # return_uniques - True, # return_inverse - 0, # count_prior - na_sentinel, na_value)[::-1] - - def get_labels(self, ndarray[object] values, ObjectVector uniques, - Py_ssize_t count_prior, Py_ssize_t na_sentinel, - object na_value): - return self._unique(values, uniques, - True, # ignore_na - False, # return_uniques - True, # return_inverse - count_prior, na_sentinel, na_value) +{{endfor}} cdef class PyObjectHashTable(HashTable): @@ -905,14 +1002,87 @@ cdef class PyObjectHashTable(HashTable): return np.asarray(locs) + def unique(self, ndarray[object] values, bint return_inverse): + """ + Calculate unique values and labels (no sorting!) + + Parameters + ---------- + values : ndarray[object] + Array of values of which unique will be calculated + return_inverse : boolean + Whether the mapping of the original array values to their location + in the vector of uniques should be returned. + + Returns + ------- + uniques : ndarray[object] + Unique values of input, not sorted + labels : ndarray[int64] (if return_inverse) + The labels from values to uniques + """ + uniques = ObjectVector() + # explicitly compile path without inverse for performance + # the last three arguments are not relevant for this method, but we + # don't use kwargs to avoid cython perf hit (just using default values) + if return_inverse: + return self._unique_with_inverse(values, uniques, 0, -1, None) + return self._unique_no_inverse(values, uniques, 0, -1, None) + + def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel, + object na_value): + """ + Calculate unique values and labels (no sorting!) + + Missing values are not included in the "uniques" for this method. + The labels for any missing values will be set to "na_sentinel" + + Parameters + ---------- + values : ndarray[object] + Array of values of which unique will be calculated + na_sentinel : Py_ssize_t, default -1 + Sentinel value used for all NA-values in inverse + na_value : object, default None + Value to identify as missing. If na_value is None, then None _plus_ + any value "val" satisfying val != val is considered missing. + If na_value is not None, then _additionally_, any value "val" + satisfying val == na_value is considered missing. + + Returns + ------- + labels : ndarray[int64] + The labels from values to uniques + uniques : ndarray[object] + Unique values of input, not sorted + """ + # reduced signature compared to _factorize + # not necessary to have uniques-vector, count_prior + uniques = ObjectVector() + return self._factorize(values, uniques, 0, na_sentinel, na_value) + +{{py: +# tuples of "func_name, return_inverse, ignore_na" +unique_funcs = [('_unique_no_inverse', False, False), + ('_unique_with_inverse', True, False), + ('_factorize', True, True), + ('get_labels', True, True)] +}} + +{{for func_name, return_inverse, ignore_na in unique_funcs}} + @cython.boundscheck(False) @cython.wraparound(False) - def _unique(self, ndarray[object] values, ObjectVector uniques, - bint ignore_na, bint return_uniques, bint return_inverse, - Py_ssize_t count_prior, Py_ssize_t na_sentinel, - object na_value): + def {{func_name}}(self, ndarray[object] values, ObjectVector uniques, + Py_ssize_t count_prior, Py_ssize_t na_sentinel, + object na_value): """ - Calculate unique values and labels (no sorting) + Calculate unique values and labels (no sorting!) +{{if func_name == '_factorize' or func_name == 'get_labels'}} + + Missing values are not included in the "uniques" for this method. + The labels for any missing values will be set to "na_sentinel" +{{endif}} Parameters ---------- @@ -920,17 +1090,6 @@ cdef class PyObjectHashTable(HashTable): Array of values of which unique will be calculated uniques : ObjectVector Vector into which uniques will be written - ignore_na : boolean - Whether NA-values should be ignored for calculating the uniques. If - True, the labels corresponding to missing values will be set to - na_sentinel. - return_uniques : boolean - Whether to return the content of the passed "uniques" vector as an - np.ndarray at the end. If False, the vector passed to "uniques" - must be explicitly read and transformed by the user. - return_inverse : boolean - Whether the mapping of the original array values to their location - in the vector of uniques should be returned. count_prior : Py_ssize_t Number of existing entries in uniques na_sentinel : Py_ssize_t @@ -943,10 +1102,23 @@ cdef class PyObjectHashTable(HashTable): Returns ------- - uniques : ndarray[object] (if return_uniques) +{{if func_name == '_unique_no_inverse'}} + uniques : ndarray[object] Unique values of input, not sorted - labels : ndarray[int64] (if return_inverse) +{{elif func_name == '_unique_with_inverse'}} + uniques : ndarray[object] + Unique values of input, not sorted + labels : ndarray[int64] The labels from values to uniques +{{elif func_name == '_factorize' # switched output order for factorize}} + labels : ndarray[int64] + The labels from values to uniques + uniques : ndarray[object] + Unique values of input, not sorted +{{elif func_name == 'get_labels'}} + labels : ndarray[int64] + The labels from values to uniques +{{endif}} """ cdef: Py_ssize_t i, idx, count = count_prior, n = len(values) @@ -956,7 +1128,7 @@ cdef class PyObjectHashTable(HashTable): khiter_t k bint use_na_value - if return_inverse: + if {{return_inverse}}: labels = np.empty(n, dtype=np.int64) use_na_value = na_value is not None @@ -964,8 +1136,8 @@ cdef class PyObjectHashTable(HashTable): val = values[i] hash(val) - if ignore_na and ((val != val or val is None) - or (use_na_value and val == na_value)): + if {{ignore_na}} and ((val != val or val is None) + or (use_na_value and val == na_value)): labels[i] = na_sentinel continue @@ -974,51 +1146,24 @@ cdef class PyObjectHashTable(HashTable): # k hasn't been seen yet k = kh_put_pymap(self.table, val, &ret) uniques.append(val) - if return_inverse: + if {{return_inverse}}: self.table.vals[k] = count labels[i] = count count += 1 - elif return_inverse: + elif {{return_inverse}}: # k falls into a previous bucket # only relevant in case we need to construct the inverse idx = self.table.vals[k] labels[i] = idx - if return_uniques and return_inverse: - return uniques.to_array(), np.asarray(labels) - elif return_uniques: - return uniques.to_array() - elif return_inverse: - return np.asarray(labels) - - def unique(self, ndarray[object] values, bint return_inverse=False): - uniques = ObjectVector() - return self._unique(values, uniques, - False, # ignore_na - True, # return_uniques - return_inverse, - # the rest are of the parameters are not relevant, - # but we don't use kwargs to avoid cython perf hit - 0, # count_prior - -1, # na_sentinel - None) # na_value +{{if func_name == '_unique_no_inverse'}} + return uniques.to_array() +{{elif func_name == '_unique_with_inverse'}} + return uniques.to_array(), np.asarray(labels) +{{elif func_name == '_factorize'}} + return np.asarray(labels), uniques.to_array() +{{elif func_name == 'get_labels'}} + return np.asarray(labels) +{{endif}} - def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel, - object na_value): - uniques = ObjectVector() - # factorize has reversed outputs compared to _unique (see "[::-1]") - return self._unique(values, uniques, - True, # ignore_na - True, # return_uniques - True, # return_inverse - 0, # count_prior - na_sentinel, na_value)[::-1] - - def get_labels(self, ndarray[object] values, ObjectVector uniques, - Py_ssize_t count_prior, Py_ssize_t na_sentinel, - object na_value): - return self._unique(values, uniques, - True, # ignore_na - False, # return_uniques - True, # return_inverse - count_prior, na_sentinel, na_value) +{{endfor}}