Skip to content

Commit

Permalink
CLN: prepare unifying hashtable.factorize and .unique; add doc-strings (
Browse files Browse the repository at this point in the history
  • Loading branch information
h-vetinari authored and jreback committed Oct 18, 2018
1 parent e5196aa commit 99e6401
Show file tree
Hide file tree
Showing 3 changed files with 319 additions and 74 deletions.
206 changes: 177 additions & 29 deletions pandas/_libs/hashtable_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -355,19 +355,38 @@ cdef class {{name}}HashTable(HashTable):

return np.asarray(locs)

def factorize(self, {{dtype}}_t values):
uniques = {{name}}Vector()
labels = self.get_labels(values, uniques, 0, 0)
return uniques.to_array(), labels

@cython.boundscheck(False)
def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
Py_ssize_t count_prior, Py_ssize_t na_sentinel,
@cython.wraparound(False)
def _factorize(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
object na_value=None):
"""
Calculate unique values and labels (no sorting); ignores all NA-values

Parameters
----------
values : ndarray[{{dtype}}]
Array of values of which unique will be calculated
uniques : {{name}}Vector
Vector into which uniques will be written
count_prior : Py_ssize_t, default 0
Number of existing entries in uniques
na_sentinel : Py_ssize_t, default -1
Sentinel value used for all NA-values in inverse
na_value : object, default None
Value to identify as missing. If na_value is None, then
any value satisfying val!=val is considered missing.

Returns
-------
labels : ndarray[int64]
The labels from values to uniques. The unique values themselves
are not returned; they are written into the ``uniques`` vector
passed in by the caller.
"""
cdef:
Py_ssize_t i, n = len(values)
Py_ssize_t i, idx, count = count_prior, n = len(values)
int64_t[:] labels
Py_ssize_t idx, count = count_prior
int ret = 0
{{dtype}}_t val, na_value2
khiter_t k
Expand Down Expand Up @@ -399,9 +418,11 @@ cdef class {{name}}HashTable(HashTable):
k = kh_get_{{dtype}}(self.table, val)

if k != self.table.n_buckets:
# k falls into a previous bucket
idx = self.table.vals[k]
labels[i] = idx
else:
# k hasn't been seen yet
k = kh_put_{{dtype}}(self.table, val, &ret)
self.table.vals[k] = count

Expand All @@ -418,6 +439,19 @@ cdef class {{name}}HashTable(HashTable):

return np.asarray(labels)

def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1,
object na_value=None):
"""
Calculate labels and unique values (no sorting) for values

Creates a new {{name}}Vector, hands it to _factorize together with
na_sentinel and na_value (count_prior is left at its default), and
converts the vector with to_array() before returning.

Parameters
----------
values : ndarray[{{dtype}}]
Array of values to factorize
na_sentinel : Py_ssize_t, default -1
Passed through to _factorize
na_value : object, default None
Passed through to _factorize

Returns
-------
labels
The result of _factorize
uniques
The vector filled by _factorize, converted with to_array()
"""
uniques = {{name}}Vector()
labels = self._factorize(values, uniques=uniques,
na_sentinel=na_sentinel, na_value=na_value)
# labels come first in the returned tuple, uniques second
return labels, uniques.to_array()

def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
object na_value=None):
"""
Thin wrapper around _factorize

Forwards values, the caller-supplied uniques vector, count_prior,
na_sentinel and na_value to _factorize unchanged and returns its
result.
"""
return self._factorize(values, uniques, count_prior=count_prior,
na_sentinel=na_sentinel, na_value=na_value)

@cython.boundscheck(False)
def get_labels_groupby(self, const {{dtype}}_t[:] values):
cdef:
Expand Down Expand Up @@ -464,7 +498,21 @@ cdef class {{name}}HashTable(HashTable):
return np.asarray(labels), arr_uniques

@cython.boundscheck(False)
@cython.wraparound(False)
def unique(self, const {{dtype}}_t[:] values):
"""
Calculate unique values without sorting

Parameters
----------
values : ndarray[{{dtype}}]
Array of values of which unique will be calculated

Returns
-------
uniques : ndarray[{{dtype}}]
Unique values of input, not sorted
"""
cdef:
Py_ssize_t i, n = len(values)
int ret = 0
Expand Down Expand Up @@ -567,7 +615,21 @@ cdef class StringHashTable(HashTable):
return labels

@cython.boundscheck(False)
@cython.wraparound(False)
def unique(self, ndarray[object] values):
"""
Calculate unique values without sorting

Parameters
----------
values : ndarray[object]
Array of values of which unique will be calculated

Returns
-------
uniques : ndarray[object]
Unique values of input, not sorted
"""
cdef:
Py_ssize_t i, count, n = len(values)
int64_t[:] uindexer
Expand Down Expand Up @@ -602,11 +664,6 @@ cdef class StringHashTable(HashTable):
uniques.append(values[uindexer[i]])
return uniques.to_array()

def factorize(self, ndarray[object] values):
uniques = ObjectVector()
labels = self.get_labels(values, uniques, 0, 0)
return uniques.to_array(), labels

@cython.boundscheck(False)
def lookup(self, ndarray[object] values):
cdef:
Expand Down Expand Up @@ -669,34 +726,55 @@ cdef class StringHashTable(HashTable):
free(vecs)

@cython.boundscheck(False)
def get_labels(self, ndarray[object] values, ObjectVector uniques,
Py_ssize_t count_prior, int64_t na_sentinel,
@cython.wraparound(False)
def _factorize(self, ndarray[object] values, ObjectVector uniques,
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
object na_value=None):
"""
Calculate unique values and labels (no sorting); ignores all NA-values

Parameters
----------
values : ndarray[object]
Array of values of which unique will be calculated
uniques : ObjectVector
Vector into which uniques will be written
count_prior : Py_ssize_t, default 0
Number of existing entries in uniques
na_sentinel : Py_ssize_t, default -1
Sentinel value used for all NA-values in inverse
na_value : object, default None
Value to identify as missing

Returns
-------
labels : ndarray[int64]
The labels from values to uniques. The unique values themselves
are not returned; they are written into the ``uniques`` vector
passed in by the caller.
"""
cdef:
Py_ssize_t i, n = len(values)
Py_ssize_t i, idx, count = count_prior, n = len(values)
int64_t[:] labels
int64_t[:] uindexer
Py_ssize_t idx, count = count_prior
int ret = 0
object val
const char *v
const char **vecs
khiter_t k
bint use_na_value

# these by-definition *must* be strings
labels = np.zeros(n, dtype=np.int64)
uindexer = np.empty(n, dtype=np.int64)
use_na_value = na_value is not None

# pre-filter out missing
# and assign pointers
# assign pointers and pre-filter out missing
vecs = <const char **> malloc(n * sizeof(char *))
for i in range(n):
val = values[i]

if ((PyUnicode_Check(val) or PyString_Check(val)) and
not (use_na_value and val == na_value)):
if ((PyUnicode_Check(val) or PyString_Check(val))
and not (use_na_value and val == na_value)):
v = util.get_c_string(val)
vecs[i] = v
else:
Expand All @@ -711,9 +789,11 @@ cdef class StringHashTable(HashTable):
v = vecs[i]
k = kh_get_str(self.table, v)
if k != self.table.n_buckets:
# k falls into a previous bucket
idx = self.table.vals[k]
labels[i] = <int64_t>idx
else:
# k hasn't been seen yet
k = kh_put_str(self.table, v, &ret)
self.table.vals[k] = count
uindexer[count] = i
Expand All @@ -728,6 +808,19 @@ cdef class StringHashTable(HashTable):

return np.asarray(labels)

def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
object na_value=None):
"""
Calculate labels and unique values (no sorting) for values

Creates a new ObjectVector, hands it to _factorize together with
na_sentinel and na_value (count_prior is left at its default), and
converts the vector with to_array() before returning.

Parameters
----------
values : ndarray[object]
Array of values to factorize
na_sentinel : Py_ssize_t, default -1
Passed through to _factorize
na_value : object, default None
Passed through to _factorize

Returns
-------
labels
The result of _factorize
uniques
The vector filled by _factorize, converted with to_array()
"""
uniques = ObjectVector()
labels = self._factorize(values, uniques=uniques,
na_sentinel=na_sentinel, na_value=na_value)
# labels come first in the returned tuple, uniques second
return labels, uniques.to_array()

def get_labels(self, ndarray[object] values, ObjectVector uniques,
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
object na_value=None):
"""
Thin wrapper around _factorize

Forwards values, the caller-supplied uniques vector, count_prior,
na_sentinel and na_value to _factorize unchanged and returns its
result.
"""
return self._factorize(values, uniques, count_prior=count_prior,
na_sentinel=na_sentinel, na_value=na_value)


cdef class PyObjectHashTable(HashTable):

Expand Down Expand Up @@ -814,7 +907,22 @@ cdef class PyObjectHashTable(HashTable):

return np.asarray(locs)

@cython.boundscheck(False)
@cython.wraparound(False)
def unique(self, ndarray[object] values):
"""
Calculate unique values without sorting

Parameters
----------
values : ndarray[object]
Array of values of which unique will be calculated

Returns
-------
uniques : ndarray[object]
Unique values of input, not sorted
"""
cdef:
Py_ssize_t i, n = len(values)
int ret = 0
Expand All @@ -832,13 +940,38 @@ cdef class PyObjectHashTable(HashTable):

return uniques.to_array()

def get_labels(self, ndarray[object] values, ObjectVector uniques,
Py_ssize_t count_prior, int64_t na_sentinel,
@cython.boundscheck(False)
@cython.wraparound(False)
def _factorize(self, ndarray[object] values, ObjectVector uniques,
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
object na_value=None):
"""
Calculate unique values and labels (no sorting); ignores all NA-values

Parameters
----------
values : ndarray[object]
Array of values of which unique will be calculated
uniques : ObjectVector
Vector into which uniques will be written
count_prior : Py_ssize_t, default 0
Number of existing entries in uniques
na_sentinel : Py_ssize_t, default -1
Sentinel value used for all NA-values in inverse
na_value : object, default None
Value to identify as missing. If na_value is None, then None _plus_
any value satisfying val!=val are considered missing.

Returns
-------
labels : ndarray[int64]
The labels from values to uniques. The unique values themselves
are not returned; they are appended to the ``uniques`` vector
passed in by the caller.
"""
cdef:
Py_ssize_t i, n = len(values)
Py_ssize_t i, idx, count = count_prior, n = len(values)
int64_t[:] labels
Py_ssize_t idx, count = count_prior
int ret = 0
object val
khiter_t k
Expand All @@ -851,20 +984,35 @@ cdef class PyObjectHashTable(HashTable):
val = values[i]
hash(val)

if ((val != val or val is None) or
(use_na_value and val == na_value)):
if ((val != val or val is None)
or (use_na_value and val == na_value)):
labels[i] = na_sentinel
continue

k = kh_get_pymap(self.table, <PyObject*>val)
if k != self.table.n_buckets:
# k falls into a previous bucket
idx = self.table.vals[k]
labels[i] = idx
else:
# k hasn't been seen yet
k = kh_put_pymap(self.table, <PyObject*>val, &ret)
self.table.vals[k] = count
uniques.append(val)
labels[i] = count
count += 1

return np.asarray(labels)

def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
object na_value=None):
"""
Calculate labels and unique values (no sorting) for values

Creates a new ObjectVector, hands it to _factorize together with
na_sentinel and na_value (count_prior is left at its default), and
converts the vector with to_array() before returning.

Parameters
----------
values : ndarray[object]
Array of values to factorize
na_sentinel : Py_ssize_t, default -1
Passed through to _factorize
na_value : object, default None
Passed through to _factorize

Returns
-------
labels
The result of _factorize
uniques
The vector filled by _factorize, converted with to_array()
"""
uniques = ObjectVector()
labels = self._factorize(values, uniques=uniques,
na_sentinel=na_sentinel, na_value=na_value)
# labels come first in the returned tuple, uniques second
return labels, uniques.to_array()

def get_labels(self, ndarray[object] values, ObjectVector uniques,
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
object na_value=None):
"""
Thin wrapper around _factorize

Forwards values, the caller-supplied uniques vector, count_prior,
na_sentinel and na_value to _factorize unchanged and returns its
result.
"""
return self._factorize(values, uniques, count_prior=count_prior,
na_sentinel=na_sentinel, na_value=na_value)
8 changes: 3 additions & 5 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -467,15 +467,13 @@ def _factorize_array(values, na_sentinel=-1, size_hint=None,
-------
labels, uniques : ndarray
"""
(hash_klass, vec_klass), values = _get_data_algo(values, _hashtables)
(hash_klass, _), values = _get_data_algo(values, _hashtables)

table = hash_klass(size_hint or len(values))
uniques = vec_klass()
labels = table.get_labels(values, uniques, 0, na_sentinel,
na_value=na_value)
labels, uniques = table.factorize(values, na_sentinel=na_sentinel,
na_value=na_value)

labels = ensure_platform_int(labels)
uniques = uniques.to_array()
return labels, uniques


Expand Down
Loading

0 comments on commit 99e6401

Please sign in to comment.