Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CLN: prepare unifying hashtable.factorize and .unique; add doc-strings #22986

Merged
merged 33 commits into from
Oct 18, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
640162f
Fix ASV import error
h-vetinari Oct 3, 2018
31d0dc5
Add return_inverse to hashtable.unique
h-vetinari Sep 27, 2018
c5e5147
Pure copy/paste: Group unique/factorize functions next to each other
h-vetinari Sep 30, 2018
9918d52
Unify hashtable.factorize and .unique
h-vetinari Oct 3, 2018
52ae84e
Force compilation of different code paths
h-vetinari Oct 4, 2018
dbe4e0e
Add separate functions for return_inverse=False
h-vetinari Oct 4, 2018
8481e19
Finish split in _unique_with_inverse and _unique_no_inverse
h-vetinari Oct 4, 2018
27ceb4d
Add cython.wraparound(False)
h-vetinari Oct 4, 2018
f5cd5e9
Merge remote-tracking branch 'upstream/master' into re_factor_ize
h-vetinari Oct 6, 2018
b1705a9
Unmove unique-implementation (review jreback)
h-vetinari Oct 6, 2018
a6ed5dd
Undo line artefacts
h-vetinari Oct 6, 2018
17752ce
Merge remote-tracking branch 'upstream/master' into re_factor_ize
h-vetinari Oct 7, 2018
19eaf32
Clean up test_algos.test_vector_resize
h-vetinari Oct 7, 2018
ce7626f
Add test for hashtable.unique (esp. for return_inverse=True)
h-vetinari Oct 7, 2018
7b9014f
Review (jreback)
h-vetinari Oct 7, 2018
471c4da
Fix typo
h-vetinari Oct 8, 2018
9d45378
Small fixes
h-vetinari Oct 8, 2018
00b2ccb
Review (jorisvandenbossche)
h-vetinari Oct 8, 2018
8687315
Merge remote-tracking branch 'upstream/master' into re_factor_ize
h-vetinari Oct 11, 2018
a267d4a
Review (jorisvandenbossche)
h-vetinari Oct 11, 2018
7f1bb40
Improve comment
h-vetinari Oct 11, 2018
9593992
Test for writable; expand comments
h-vetinari Oct 12, 2018
08d7f50
Simplify factorize test
h-vetinari Oct 12, 2018
d91be98
Add simple test
h-vetinari Oct 12, 2018
e27ec9a
Tiny fixes
h-vetinari Oct 12, 2018
d825be0
Remove idx_duplicated from test (now unnecessary)
h-vetinari Oct 12, 2018
1a342d0
Review (jreback)
h-vetinari Oct 14, 2018
28e0441
Merge remote-tracking branch 'upstream/master' into re_factor_ize
h-vetinari Oct 15, 2018
d65e4fd
Merge remote-tracking branch 'upstream/master' into re_factor_ize
h-vetinari Oct 15, 2018
bca615c
Merge remote-tracking branch 'upstream/master' into re_factor_ize
h-vetinari Oct 17, 2018
3438727
Review (jreback)
h-vetinari Oct 17, 2018
facc111
Merge remote-tracking branch 'upstream/master' into re_factor_ize
h-vetinari Oct 18, 2018
6d0e86b
Retrigger Circle
h-vetinari Oct 18, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
206 changes: 177 additions & 29 deletions pandas/_libs/hashtable_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -355,19 +355,38 @@ cdef class {{name}}HashTable(HashTable):

return np.asarray(locs)

def factorize(self, {{dtype}}_t values):
uniques = {{name}}Vector()
labels = self.get_labels(values, uniques, 0, 0)
return uniques.to_array(), labels

@cython.boundscheck(False)
def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
Py_ssize_t count_prior, Py_ssize_t na_sentinel,
@cython.wraparound(False)
def _factorize(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
object na_value=None):
"""
Calculate unique values and labels (no sorting); ignores all NA-values

Parameters
----------
values : ndarray[{{dtype}}]
Array of values of which unique will be calculated
uniques : {{name}}Vector
Vector into which uniques will be written
count_prior : Py_ssize_t, default 0
Number of existing entries in uniques
na_sentinel : Py_ssize_t, default -1
Sentinel value used for all NA-values in inverse
na_value : object, default None
Value to identify as missing. If na_value is None, then
any value satisfying val!=val are considered missing.

Returns
-------
uniques : ndarray[{{dtype}}]
Unique values of input, not sorted
labels : ndarray[int64]
The labels from values to uniques
"""
cdef:
Py_ssize_t i, n = len(values)
Py_ssize_t i, idx, count = count_prior, n = len(values)
int64_t[:] labels
Py_ssize_t idx, count = count_prior
int ret = 0
{{dtype}}_t val, na_value2
khiter_t k
Expand Down Expand Up @@ -399,9 +418,11 @@ cdef class {{name}}HashTable(HashTable):
k = kh_get_{{dtype}}(self.table, val)

if k != self.table.n_buckets:
# k falls into a previous bucket
idx = self.table.vals[k]
labels[i] = idx
else:
# k hasn't been seen yet
k = kh_put_{{dtype}}(self.table, val, &ret)
self.table.vals[k] = count

Expand All @@ -418,6 +439,19 @@ cdef class {{name}}HashTable(HashTable):

return np.asarray(labels)

def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1,
object na_value=None):
uniques = {{name}}Vector()
labels = self._factorize(values, uniques=uniques,
na_sentinel=na_sentinel, na_value=na_value)
return labels, uniques.to_array()

def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
object na_value=None):
return self._factorize(values, uniques, count_prior=count_prior,
na_sentinel=na_sentinel, na_value=na_value)

@cython.boundscheck(False)
def get_labels_groupby(self, const {{dtype}}_t[:] values):
cdef:
Expand Down Expand Up @@ -464,7 +498,21 @@ cdef class {{name}}HashTable(HashTable):
return np.asarray(labels), arr_uniques

@cython.boundscheck(False)
@cython.wraparound(False)
def unique(self, const {{dtype}}_t[:] values):
"""
Calculate unique values without sorting

Parameters
----------
values : ndarray[{{dtype}}]
Array of values of which unique will be calculated

Returns
-------
uniques : ndarray[{{dtype}}]
Unique values of input, not sorted
"""
cdef:
Py_ssize_t i, n = len(values)
int ret = 0
Expand Down Expand Up @@ -567,7 +615,21 @@ cdef class StringHashTable(HashTable):
return labels

@cython.boundscheck(False)
@cython.wraparound(False)
def unique(self, ndarray[object] values):
"""
Calculate unique values without sorting

Parameters
----------
values : ndarray[object]
Array of values of which unique will be calculated

Returns
-------
uniques : ndarray[object]
Unique values of input, not sorted
"""
cdef:
Py_ssize_t i, count, n = len(values)
int64_t[:] uindexer
Expand Down Expand Up @@ -602,11 +664,6 @@ cdef class StringHashTable(HashTable):
uniques.append(values[uindexer[i]])
return uniques.to_array()

def factorize(self, ndarray[object] values):
uniques = ObjectVector()
labels = self.get_labels(values, uniques, 0, 0)
return uniques.to_array(), labels

@cython.boundscheck(False)
def lookup(self, ndarray[object] values):
cdef:
Expand Down Expand Up @@ -669,34 +726,55 @@ cdef class StringHashTable(HashTable):
free(vecs)

@cython.boundscheck(False)
def get_labels(self, ndarray[object] values, ObjectVector uniques,
Py_ssize_t count_prior, int64_t na_sentinel,
@cython.wraparound(False)
def _factorize(self, ndarray[object] values, ObjectVector uniques,
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
object na_value=None):
"""
Calculate unique values and labels (no sorting); ignores all NA-values

Parameters
----------
values : ndarray[object]
Array of values of which unique will be calculated
uniques : ObjectVector
Vector into which uniques will be written
count_prior : Py_ssize_t, default 0
Number of existing entries in uniques
na_sentinel : Py_ssize_t, default -1
Sentinel value used for all NA-values in inverse
na_value : object, default None
Value to identify as missing

Returns
-------
uniques : ndarray[object]
Unique values of input, not sorted
labels : ndarray[int64]
The labels from values to uniques
"""
cdef:
Py_ssize_t i, n = len(values)
Py_ssize_t i, idx, count = count_prior, n = len(values)
int64_t[:] labels
int64_t[:] uindexer
Py_ssize_t idx, count = count_prior
int ret = 0
object val
const char *v
const char **vecs
khiter_t k
bint use_na_value

# these by-definition *must* be strings
labels = np.zeros(n, dtype=np.int64)
uindexer = np.empty(n, dtype=np.int64)
use_na_value = na_value is not None

# pre-filter out missing
# and assign pointers
# assign pointers and pre-filter out missing
vecs = <const char **> malloc(n * sizeof(char *))
for i in range(n):
val = values[i]

if ((PyUnicode_Check(val) or PyString_Check(val)) and
not (use_na_value and val == na_value)):
if ((PyUnicode_Check(val) or PyString_Check(val))
and not (use_na_value and val == na_value)):
v = util.get_c_string(val)
vecs[i] = v
else:
Expand All @@ -711,9 +789,11 @@ cdef class StringHashTable(HashTable):
v = vecs[i]
k = kh_get_str(self.table, v)
if k != self.table.n_buckets:
# k falls into a previous bucket
idx = self.table.vals[k]
labels[i] = <int64_t>idx
else:
# k hasn't been seen yet
k = kh_put_str(self.table, v, &ret)
self.table.vals[k] = count
uindexer[count] = i
Expand All @@ -728,6 +808,19 @@ cdef class StringHashTable(HashTable):

return np.asarray(labels)

def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
object na_value=None):
uniques = ObjectVector()
labels = self._factorize(values, uniques=uniques,
na_sentinel=na_sentinel, na_value=na_value)
return labels, uniques.to_array()

def get_labels(self, ndarray[object] values, ObjectVector uniques,
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
object na_value=None):
return self._factorize(values, uniques, count_prior=count_prior,
na_sentinel=na_sentinel, na_value=na_value)


cdef class PyObjectHashTable(HashTable):

Expand Down Expand Up @@ -814,7 +907,22 @@ cdef class PyObjectHashTable(HashTable):

return np.asarray(locs)

@cython.boundscheck(False)
@cython.wraparound(False)
def unique(self, ndarray[object] values):
"""
Calculate unique values without sorting

Parameters
----------
values : ndarray[object]
Array of values of which unique will be calculated

Returns
-------
uniques : ndarray[object]
Unique values of input, not sorted
"""
cdef:
Py_ssize_t i, n = len(values)
int ret = 0
Expand All @@ -832,13 +940,38 @@ cdef class PyObjectHashTable(HashTable):

return uniques.to_array()

def get_labels(self, ndarray[object] values, ObjectVector uniques,
Py_ssize_t count_prior, int64_t na_sentinel,
@cython.boundscheck(False)
@cython.wraparound(False)
def _factorize(self, ndarray[object] values, ObjectVector uniques,
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
object na_value=None):
"""
Calculate unique values and labels (no sorting); ignores all NA-values

Parameters
----------
values : ndarray[object]
Array of values of which unique will be calculated
uniques : ObjectVector
Vector into which uniques will be written
count_prior : Py_ssize_t, default 0
Number of existing entries in uniques
na_sentinel : Py_ssize_t, default -1
Sentinel value used for all NA-values in inverse
na_value : object, default None
Value to identify as missing. If na_value is None, then None _plus_
any value satisfying val!=val are considered missing.

Returns
-------
uniques : ndarray[object]
Unique values of input, not sorted
labels : ndarray[int64]
The labels from values to uniques
"""
cdef:
Py_ssize_t i, n = len(values)
Py_ssize_t i, idx, count = count_prior, n = len(values)
int64_t[:] labels
Py_ssize_t idx, count = count_prior
int ret = 0
object val
khiter_t k
Expand All @@ -851,20 +984,35 @@ cdef class PyObjectHashTable(HashTable):
val = values[i]
hash(val)

if ((val != val or val is None) or
(use_na_value and val == na_value)):
if ((val != val or val is None)
or (use_na_value and val == na_value)):
labels[i] = na_sentinel
continue

k = kh_get_pymap(self.table, <PyObject*>val)
if k != self.table.n_buckets:
# k falls into a previous bucket
idx = self.table.vals[k]
labels[i] = idx
else:
# k hasn't been seen yet
k = kh_put_pymap(self.table, <PyObject*>val, &ret)
self.table.vals[k] = count
uniques.append(val)
labels[i] = count
count += 1

return np.asarray(labels)

def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
object na_value=None):
uniques = ObjectVector()
labels = self._factorize(values, uniques=uniques,
na_sentinel=na_sentinel, na_value=na_value)
return labels, uniques.to_array()

def get_labels(self, ndarray[object] values, ObjectVector uniques,
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
object na_value=None):
return self._factorize(values, uniques, count_prior=count_prior,
na_sentinel=na_sentinel, na_value=na_value)
8 changes: 3 additions & 5 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -467,15 +467,13 @@ def _factorize_array(values, na_sentinel=-1, size_hint=None,
-------
labels, uniques : ndarray
"""
(hash_klass, vec_klass), values = _get_data_algo(values, _hashtables)
(hash_klass, _), values = _get_data_algo(values, _hashtables)

table = hash_klass(size_hint or len(values))
uniques = vec_klass()
labels = table.get_labels(values, uniques, 0, na_sentinel,
na_value=na_value)
labels, uniques = table.factorize(values, na_sentinel=na_sentinel,
na_value=na_value)

labels = ensure_platform_int(labels)
uniques = uniques.to_array()
return labels, uniques


Expand Down
Loading