Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PERF: Introducing hash tables for complex64 and complex128 #38179

Merged
merged 16 commits into from
Dec 30, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions pandas/_libs/hashtable.pxd
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
from numpy cimport intp_t, ndarray

from pandas._libs.khash cimport (
complex64_t,
complex128_t,
float32_t,
float64_t,
int8_t,
int16_t,
int32_t,
int64_t,
kh_complex64_t,
kh_complex128_t,
kh_float32_t,
kh_float64_t,
kh_int8_t,
Expand All @@ -19,6 +23,8 @@ from pandas._libs.khash cimport (
kh_uint16_t,
kh_uint32_t,
kh_uint64_t,
khcomplex64_t,
khcomplex128_t,
uint8_t,
uint16_t,
uint32_t,
Expand Down Expand Up @@ -90,6 +96,18 @@ cdef class Float32HashTable(HashTable):
cpdef get_item(self, float32_t val)
cpdef set_item(self, float32_t key, Py_ssize_t val)

# Declaration of the hash table whose keys are complex64_t values.
# `table` points at the underlying khash table (kh_complex64_t).
# get_item takes a complex64_t value to look up; set_item takes a
# complex64_t key together with the Py_ssize_t value to associate with it.
cdef class Complex64HashTable(HashTable):
cdef kh_complex64_t *table

cpdef get_item(self, complex64_t val)
cpdef set_item(self, complex64_t key, Py_ssize_t val)

# Declaration of the hash table whose keys are complex128_t values.
# `table` points at the underlying khash table (kh_complex128_t).
# get_item takes a complex128_t value to look up; set_item takes a
# complex128_t key together with the Py_ssize_t value to associate with it.
cdef class Complex128HashTable(HashTable):
cdef kh_complex128_t *table

cpdef get_item(self, complex128_t val)
cpdef set_item(self, complex128_t key, Py_ssize_t val)

cdef class PyObjectHashTable(HashTable):
cdef kh_pymap_t *table

Expand Down
12 changes: 11 additions & 1 deletion pandas/_libs/hashtable.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,17 @@ cnp.import_array()


from pandas._libs cimport util
from pandas._libs.khash cimport KHASH_TRACE_DOMAIN, kh_str_t, khiter_t
from pandas._libs.khash cimport (
KHASH_TRACE_DOMAIN,
are_equivalent_float32_t,
are_equivalent_float64_t,
are_equivalent_khcomplex64_t,
are_equivalent_khcomplex128_t,
kh_str_t,
khcomplex64_t,
khcomplex128_t,
khiter_t,
)
from pandas._libs.missing cimport checknull


Expand Down
155 changes: 115 additions & 40 deletions pandas/_libs/hashtable_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,73 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
{{py:

# name
cimported_types = ['float32',
complex_types = ['complex64',
'complex128']
}}

{{for name in complex_types}}
# Convert a {{name}}_t value into the kh{{name}}_t struct by copying
# the real and imaginary parts field by field. Declared nogil, so it can be
# called from code that has released the GIL.
cdef kh{{name}}_t to_kh{{name}}_t({{name}}_t val) nogil:
cdef kh{{name}}_t res
res.real = val.real
res.imag = val.imag
return res


# Inverse conversion: build a {{name}}_t from a kh{{name}}_t struct,
# again by copying the real and imaginary parts. Also nogil.
cdef {{name}}_t to_{{name}}(kh{{name}}_t val) nogil:
cdef {{name}}_t res
res.real = val.real
res.imag = val.imag
return res

{{endfor}}


{{py:


# name
c_types = ['khcomplex128_t',
'khcomplex64_t',
'float64_t',
'float32_t',
'int64_t',
'int32_t',
'int16_t',
'int8_t',
'uint64_t',
'uint32_t',
'uint16_t',
'uint8_t']
}}

{{for c_type in c_types}}

# NaN test specialised per C type at template-expansion time; nogil so it
# can be used inside `with nogil` loops.
cdef bint is_nan_{{c_type}}({{c_type}} val) nogil:
{{if c_type in {'khcomplex128_t', 'khcomplex64_t'} }}
# complex: NaN if either component compares unequal to itself
return val.real != val.real or val.imag != val.imag
{{elif c_type in {'float64_t', 'float32_t'} }}
# float: a value that compares unequal to itself is NaN
return val != val
{{else}}
# remaining entries of c_types are integer types, which have no NaN
return False
{{endif}}


# Equality helper with one name per C type. For complex and float types the
# function is not defined here (see the comment emitted below); for every
# other type it is defined as plain `==`.
{{if c_type in {'khcomplex128_t', 'khcomplex64_t', 'float64_t', 'float32_t'} }}
# are_equivalent_{{c_type}} is cimported via khash.pxd
{{else}}
cdef bint are_equivalent_{{c_type}}({{c_type}} val1, {{c_type}} val2) nogil:
return val1 == val2
{{endif}}

{{endfor}}


{{py:

# name
cimported_types = ['complex64',
'complex128',
'float32',
'float64',
'int8',
'int16',
Expand All @@ -32,6 +98,7 @@ from pandas._libs.khash cimport (
kh_put_{{name}},
kh_resize_{{name}},
)

{{endfor}}

# ----------------------------------------------------------------------
Expand All @@ -48,7 +115,9 @@ from pandas._libs.missing cimport C_NA
# but is included for completeness (rather ObjectVector is used
# for uniques in hashtables)

dtypes = [('Float64', 'float64', 'float64_t'),
dtypes = [('Complex128', 'complex128', 'khcomplex128_t'),
('Complex64', 'complex64', 'khcomplex64_t'),
('Float64', 'float64', 'float64_t'),
('Float32', 'float32', 'float32_t'),
('Int64', 'int64', 'int64_t'),
('Int32', 'int32', 'int32_t'),
Expand Down Expand Up @@ -94,6 +163,8 @@ ctypedef fused vector_data:
UInt8VectorData
Float64VectorData
Float32VectorData
Complex128VectorData
Complex64VectorData
StringVectorData

cdef inline bint needs_resize(vector_data *data) nogil:
Expand All @@ -106,7 +177,9 @@ cdef inline bint needs_resize(vector_data *data) nogil:
{{py:

# name, dtype, c_type
dtypes = [('Float64', 'float64', 'float64_t'),
dtypes = [('Complex128', 'complex128', 'khcomplex128_t'),
('Complex64', 'complex64', 'khcomplex64_t'),
('Float64', 'float64', 'float64_t'),
('UInt64', 'uint64', 'uint64_t'),
('Int64', 'int64', 'int64_t'),
('Float32', 'float32', 'float32_t'),
Expand Down Expand Up @@ -303,22 +376,24 @@ cdef class HashTable:

{{py:

# name, dtype, float_group
dtypes = [('Float64', 'float64', True),
('UInt64', 'uint64', False),
('Int64', 'int64', False),
('Float32', 'float32', True),
('UInt32', 'uint32', False),
('Int32', 'int32', False),
('UInt16', 'uint16', False),
('Int16', 'int16', False),
('UInt8', 'uint8', False),
('Int8', 'int8', False)]
# name, dtype, c_type, to_c_type
dtypes = [('Complex128', 'complex128', 'khcomplex128_t', 'to_khcomplex128_t'),
('Float64', 'float64', 'float64_t', ''),
('UInt64', 'uint64', 'uint64_t', ''),
('Int64', 'int64', 'int64_t', ''),
('Complex64', 'complex64', 'khcomplex64_t', 'to_khcomplex64_t'),
('Float32', 'float32', 'float32_t', ''),
('UInt32', 'uint32', 'uint32_t', ''),
('Int32', 'int32', 'int32_t', ''),
('UInt16', 'uint16', 'uint16_t', ''),
('Int16', 'int16', 'int16_t', ''),
('UInt8', 'uint8', 'uint8_t', ''),
('Int8', 'int8', 'int8_t', '')]

}}


{{for name, dtype, float_group in dtypes}}
{{for name, dtype, c_type, to_c_type in dtypes}}

cdef class {{name}}HashTable(HashTable):

Expand All @@ -339,7 +414,9 @@ cdef class {{name}}HashTable(HashTable):
def __contains__(self, object key):
cdef:
khiter_t k
k = kh_get_{{dtype}}(self.table, key)
{{c_type}} ckey
ckey = {{to_c_type}}(key)
k = kh_get_{{dtype}}(self.table, ckey)
return k != self.table.n_buckets

def sizeof(self, deep=False):
Expand All @@ -353,7 +430,9 @@ cdef class {{name}}HashTable(HashTable):
cpdef get_item(self, {{dtype}}_t val):
cdef:
khiter_t k
k = kh_get_{{dtype}}(self.table, val)
{{c_type}} cval
cval = {{to_c_type}}(val)
k = kh_get_{{dtype}}(self.table, cval)
if k != self.table.n_buckets:
return self.table.vals[k]
else:
Expand All @@ -363,9 +442,9 @@ cdef class {{name}}HashTable(HashTable):
cdef:
khiter_t k
int ret = 0

k = kh_put_{{dtype}}(self.table, key, &ret)
self.table.keys[k] = key
{{c_type}} ckey
ckey = {{to_c_type}}(key)
k = kh_put_{{dtype}}(self.table, ckey, &ret)
if kh_exist_{{dtype}}(self.table, k):
self.table.vals[k] = val
else:
Expand All @@ -376,12 +455,12 @@ cdef class {{name}}HashTable(HashTable):
cdef:
Py_ssize_t i, n = len(values)
int ret = 0
{{dtype}}_t key
{{c_type}} key
khiter_t k

with nogil:
for i in range(n):
key = keys[i]
key = {{to_c_type}}(keys[i])
k = kh_put_{{dtype}}(self.table, key, &ret)
self.table.vals[k] = <Py_ssize_t>values[i]

Expand All @@ -390,12 +469,12 @@ cdef class {{name}}HashTable(HashTable):
cdef:
Py_ssize_t i, n = len(values)
int ret = 0
{{dtype}}_t val
{{c_type}} val
khiter_t k

with nogil:
for i in range(n):
val = values[i]
val= {{to_c_type}}(values[i])
k = kh_put_{{dtype}}(self.table, val, &ret)
self.table.vals[k] = i

Expand All @@ -404,13 +483,13 @@ cdef class {{name}}HashTable(HashTable):
cdef:
Py_ssize_t i, n = len(values)
int ret = 0
{{dtype}}_t val
{{c_type}} val
khiter_t k
intp_t[:] locs = np.empty(n, dtype=np.intp)

with nogil:
for i in range(n):
val = values[i]
val = {{to_c_type}}(values[i])
k = kh_get_{{dtype}}(self.table, val)
if k != self.table.n_buckets:
locs[i] = self.table.vals[k]
Expand Down Expand Up @@ -466,7 +545,7 @@ cdef class {{name}}HashTable(HashTable):
Py_ssize_t i, idx, count = count_prior, n = len(values)
int64_t[:] labels
int ret = 0
{{dtype}}_t val, na_value2
{{c_type}} val, na_value2
khiter_t k
{{name}}VectorData *ud
bint use_na_value, use_mask
Expand All @@ -487,23 +566,21 @@ cdef class {{name}}HashTable(HashTable):
# We use None, to make it optional, which requires `object` type
# for the parameter. To please the compiler, we use na_value2,
# which is only used if it's *specified*.
na_value2 = <{{dtype}}_t>na_value
na_value2 = {{to_c_type}}(na_value)
else:
na_value2 = 0
na_value2 = {{to_c_type}}(0)

with nogil:
for i in range(n):
val = values[i]
val = {{to_c_type}}(values[i])

if ignore_na and use_mask:
if mask_values[i]:
labels[i] = na_sentinel
continue
elif ignore_na and (
{{if not name.lower().startswith(("uint", "int"))}}
val != val or
{{endif}}
(use_na_value and val == na_value2)
is_nan_{{c_type}}(val) or
(use_na_value and are_equivalent_{{c_type}}(val, na_value2))
):
# if missing values do not count as unique values (i.e. if
# ignore_na is True), skip the hashtable entry for them,
Expand Down Expand Up @@ -606,14 +683,15 @@ cdef class {{name}}HashTable(HashTable):
ignore_na=True, return_inverse=True)
return labels

{{if dtype == 'int64'}}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this added only for int64?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This function is used only in the int64 version:

table = hashtable.Int64HashTable(size_hint)
group_index = ensure_int64(group_index)
# note, group labels come out ascending (ie, 1,2,3 etc)
comp_ids, obs_group_ids = table.get_labels_groupby(group_index)

The logic in this function:

# specific for groupby
{{if dtype != 'uint64'}}
if val < 0:
labels[i] = -1
continue
{{endif}}

IIUC, negative numbers are special and should be skipped, but this is only a convention for the meaning of the values in group_index. For other types there is no such convention. Instead of inventing something, I've decided to delete the unused functions/versions.

@cython.boundscheck(False)
def get_labels_groupby(self, const {{dtype}}_t[:] values):
cdef:
Py_ssize_t i, n = len(values)
intp_t[:] labels
Py_ssize_t idx, count = 0
int ret = 0
{{dtype}}_t val
{{c_type}} val
khiter_t k
{{name}}Vector uniques = {{name}}Vector()
{{name}}VectorData *ud
Expand All @@ -623,14 +701,12 @@ cdef class {{name}}HashTable(HashTable):

with nogil:
for i in range(n):
val = values[i]
val = {{to_c_type}}(values[i])

# specific for groupby
{{if dtype != 'uint64'}}
if val < 0:
labels[i] = -1
continue
{{endif}}

k = kh_get_{{dtype}}(self.table, val)
if k != self.table.n_buckets:
Expand All @@ -650,6 +726,7 @@ cdef class {{name}}HashTable(HashTable):
arr_uniques = uniques.to_array()

return np.asarray(labels), arr_uniques
{{endif}}

{{endfor}}

Expand Down Expand Up @@ -698,7 +775,6 @@ cdef class StringHashTable(HashTable):
v = get_c_string(key)

k = kh_put_str(self.table, v, &ret)
self.table.keys[k] = v
jbrockmendel marked this conversation as resolved.
Show resolved Hide resolved
if kh_exist_str(self.table, k):
self.table.vals[k] = val
else:
Expand Down Expand Up @@ -1022,7 +1098,6 @@ cdef class PyObjectHashTable(HashTable):
hash(key)

k = kh_put_pymap(self.table, <PyObject*>key, &ret)
# self.table.keys[k] = key
if kh_exist_pymap(self.table, k):
self.table.vals[k] = val
else:
Expand Down
Loading