Skip to content

Commit

Permalink
add uint64 support for some libgroupby funcs (pandas-dev#28931)
Browse files: browse the repository at this point in the history
  • Loading branch information
jbrockmendel authored and Nico Cernek committed Jan 1, 2020
1 parent c3edfd3 commit 4bd8d6d
Show file tree
Hide file tree
Showing 3 changed files with 69 additions and 3 deletions.
62 changes: 60 additions & 2 deletions pandas/_libs/groupby_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ ctypedef fused rank_t:
float64_t
float32_t
int64_t
uint64_t
object


Expand All @@ -34,6 +35,7 @@ def group_last(rank_t[:, :] out,
rank_t val
ndarray[rank_t, ndim=2] resx
ndarray[int64_t, ndim=2] nobs
bint runtime_error = False

assert min_count == -1, "'min_count' only used in add and prod"

Expand Down Expand Up @@ -106,11 +108,20 @@ def group_last(rank_t[:, :] out,
if nobs[i, j] == 0:
if rank_t is int64_t:
out[i, j] = NPY_NAT
elif rank_t is uint64_t:
runtime_error = True
break
else:
out[i, j] = NAN
else:
out[i, j] = resx[i, j]

if runtime_error:
# We cannot raise directly above because that is within a nogil
# block.
raise RuntimeError("empty group with uint64_t")


group_last_float64 = group_last["float64_t"]
group_last_float32 = group_last["float32_t"]
group_last_int64 = group_last["int64_t"]
Expand All @@ -132,6 +143,7 @@ def group_nth(rank_t[:, :] out,
rank_t val
ndarray[rank_t, ndim=2] resx
ndarray[int64_t, ndim=2] nobs
bint runtime_error = False

assert min_count == -1, "'min_count' only used in add and prod"

Expand Down Expand Up @@ -199,11 +211,19 @@ def group_nth(rank_t[:, :] out,
if nobs[i, j] == 0:
if rank_t is int64_t:
out[i, j] = NPY_NAT
elif rank_t is uint64_t:
runtime_error = True
break
else:
out[i, j] = NAN
else:
out[i, j] = resx[i, j]

if runtime_error:
# We cannot raise directly above because that is within a nogil
# block.
raise RuntimeError("empty group with uint64_t")


group_nth_float64 = group_nth["float64_t"]
group_nth_float32 = group_nth["float32_t"]
Expand Down Expand Up @@ -282,12 +302,16 @@ def group_rank(float64_t[:, :] out,
if ascending ^ (na_option == 'top'):
if rank_t is int64_t:
nan_fill_val = np.iinfo(np.int64).max
elif rank_t is uint64_t:
nan_fill_val = np.iinfo(np.uint64).max
else:
nan_fill_val = np.inf
order = (masked_vals, mask, labels)
else:
if rank_t is int64_t:
nan_fill_val = np.iinfo(np.int64).min
elif rank_t is uint64_t:
nan_fill_val = 0
else:
nan_fill_val = -np.inf

Expand Down Expand Up @@ -397,6 +421,7 @@ def group_rank(float64_t[:, :] out,
group_rank_float64 = group_rank["float64_t"]
group_rank_float32 = group_rank["float32_t"]
group_rank_int64 = group_rank["int64_t"]
group_rank_uint64 = group_rank["uint64_t"]
# Note: we do not have a group_rank_object because that would require a
# not-nogil implementation, see GH#19560

Expand All @@ -410,6 +435,7 @@ ctypedef fused groupby_t:
float64_t
float32_t
int64_t
uint64_t


@cython.wraparound(False)
Expand All @@ -426,6 +452,7 @@ def group_max(groupby_t[:, :] out,
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
groupby_t val, count, nan_val
ndarray[groupby_t, ndim=2] maxx, nobs
bint runtime_error = False

assert min_count == -1, "'min_count' only used in add and prod"

Expand All @@ -439,6 +466,11 @@ def group_max(groupby_t[:, :] out,
# Note: evaluated at compile-time
maxx[:] = -_int64_max
nan_val = NPY_NAT
elif groupby_t is uint64_t:
# NB: We do not define nan_val because there is no such thing
# for uint64_t. We carefully avoid having to reference it in this
# case.
maxx[:] = 0
else:
maxx[:] = -np.inf
nan_val = NAN
Expand All @@ -462,18 +494,26 @@ def group_max(groupby_t[:, :] out,
if val > maxx[lab, j]:
maxx[lab, j] = val
else:
if val == val and val != nan_val:
if val == val:
nobs[lab, j] += 1
if val > maxx[lab, j]:
maxx[lab, j] = val

for i in range(ncounts):
for j in range(K):
if nobs[i, j] == 0:
if groupby_t is uint64_t:
runtime_error = True
break
out[i, j] = nan_val
else:
out[i, j] = maxx[i, j]

if runtime_error:
# We cannot raise directly above because that is within a nogil
# block.
raise RuntimeError("empty group with uint64_t")


@cython.wraparound(False)
@cython.boundscheck(False)
Expand All @@ -489,6 +529,7 @@ def group_min(groupby_t[:, :] out,
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
groupby_t val, count, nan_val
ndarray[groupby_t, ndim=2] minx, nobs
bint runtime_error = False

assert min_count == -1, "'min_count' only used in add and prod"

Expand All @@ -501,6 +542,11 @@ def group_min(groupby_t[:, :] out,
if groupby_t is int64_t:
minx[:] = _int64_max
nan_val = NPY_NAT
elif groupby_t is uint64_t:
# NB: We do not define nan_val because there is no such thing
# for uint64_t. We carefully avoid having to reference it in this
# case.
minx[:] = np.iinfo(np.uint64).max
else:
minx[:] = np.inf
nan_val = NAN
Expand All @@ -524,18 +570,26 @@ def group_min(groupby_t[:, :] out,
if val < minx[lab, j]:
minx[lab, j] = val
else:
if val == val and val != nan_val:
if val == val:
nobs[lab, j] += 1
if val < minx[lab, j]:
minx[lab, j] = val

for i in range(ncounts):
for j in range(K):
if nobs[i, j] == 0:
if groupby_t is uint64_t:
runtime_error = True
break
out[i, j] = nan_val
else:
out[i, j] = minx[i, j]

if runtime_error:
# We cannot raise directly above because that is within a nogil
# block.
raise RuntimeError("empty group with uint64_t")


@cython.boundscheck(False)
@cython.wraparound(False)
Expand Down Expand Up @@ -575,6 +629,8 @@ def group_cummin(groupby_t[:, :] out,
accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype)
if groupby_t is int64_t:
accum[:] = _int64_max
elif groupby_t is uint64_t:
accum[:] = np.iinfo(np.uint64).max
else:
accum[:] = np.inf

Expand Down Expand Up @@ -642,6 +698,8 @@ def group_cummax(groupby_t[:, :] out,
accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype)
if groupby_t is int64_t:
accum[:] = -_int64_max
elif groupby_t is uint64_t:
accum[:] = 0
else:
accum[:] = -np.inf

Expand Down
8 changes: 8 additions & 0 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1355,7 +1355,15 @@ def f(self, **kwargs):
return self._cython_agg_general(alias, alt=npfunc, **kwargs)
except AssertionError as e:
raise SpecificationError(str(e))
except DataError:
pass
except Exception:
# TODO: the remaining test cases that get here are from:
# - AttributeError from _cython_agg_blocks bug passing
# DataFrame to make_block; see GH#28275
# - TypeError in _cython_operation calling ensure_float64
# on object array containing complex numbers;
# see test_groupby_complex, test_max_nan_bug
pass

# apply a non-cython aggregation
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/groupby/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -378,7 +378,7 @@ def test_median_empty_bins(observed):


@pytest.mark.parametrize(
"dtype", ["int8", "int16", "int32", "int64", "float32", "float64"]
"dtype", ["int8", "int16", "int32", "int64", "float32", "float64", "uint64"]
)
@pytest.mark.parametrize(
"method,data",
Expand Down

0 comments on commit 4bd8d6d

Please sign in to comment.