diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 59e43ee22afde..27d279bb90a31 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -14,7 +14,7 @@ method_blacklist = { 'object': {'median', 'prod', 'sem', 'cumsum', 'sum', 'cummin', 'mean', 'max', 'skew', 'cumprod', 'cummax', 'rank', 'pct_change', 'min', - 'var', 'mad', 'describe', 'std'}, + 'var', 'mad', 'describe', 'std', 'quantile'}, 'datetime': {'median', 'prod', 'sem', 'cumsum', 'sum', 'mean', 'skew', 'cumprod', 'cummax', 'pct_change', 'var', 'mad', 'describe', 'std'} @@ -316,8 +316,9 @@ class GroupByMethods(object): ['all', 'any', 'bfill', 'count', 'cumcount', 'cummax', 'cummin', 'cumprod', 'cumsum', 'describe', 'ffill', 'first', 'head', 'last', 'mad', 'max', 'min', 'median', 'mean', 'nunique', - 'pct_change', 'prod', 'rank', 'sem', 'shift', 'size', 'skew', - 'std', 'sum', 'tail', 'unique', 'value_counts', 'var'], + 'pct_change', 'prod', 'quantile', 'rank', 'sem', 'shift', + 'size', 'skew', 'std', 'sum', 'tail', 'unique', 'value_counts', + 'var'], ['direct', 'transformation']] def setup(self, dtype, method, application): diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 170e7f14da397..e7389514a7ac5 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -98,6 +98,7 @@ Performance Improvements - `DataFrame.to_stata()` is now faster when outputting data with any string or non-native endian columns (:issue:`25045`) - Improved performance of :meth:`Series.searchsorted`. The speedup is especially large when the dtype is int8/int16/int32 and the searched key is within the integer bounds for the dtype (:issue:`22034`) +- Improved performance of :meth:`pandas.core.groupby.GroupBy.quantile` (:issue:`20405`) .. _whatsnew_0250.bug_fixes: diff --git a/pandas/_libs/groupby.pxd b/pandas/_libs/groupby.pxd new file mode 100644 index 0000000000000..70ad8a62871e9 --- /dev/null +++ b/pandas/_libs/groupby.pxd @@ -0,0 +1,6 @@ +cdef enum InterpolationEnumType: + INTERPOLATION_LINEAR, + INTERPOLATION_LOWER, + INTERPOLATION_HIGHER, + INTERPOLATION_NEAREST, + INTERPOLATION_MIDPOINT diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index e6b6e2c8a0055..71e25c3955a6d 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -644,5 +644,106 @@ def _group_ohlc(floating[:, :] out, group_ohlc_float32 = _group_ohlc['float'] group_ohlc_float64 = _group_ohlc['double'] + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_quantile(ndarray[float64_t] out, + ndarray[int64_t] labels, + numeric[:] values, + ndarray[uint8_t] mask, + float64_t q, + object interpolation): + """ + Calculate the quantile per group. + + Parameters + ---------- + out : ndarray + Array of aggregated values that will be written to. + labels : ndarray + Array containing the unique group labels. + values : ndarray + Array containing the values to apply the function against. + q : float + The quantile value to search for. + + Notes + ----- + Rather than explicitly returning a value, this function modifies the + provided `out` parameter. + """ + cdef: + Py_ssize_t i, N=len(labels), ngroups, grp_sz, non_na_sz + Py_ssize_t grp_start=0, idx=0 + int64_t lab + uint8_t interp + float64_t q_idx, frac, val, next_val + ndarray[int64_t] counts, non_na_counts, sort_arr + + assert values.shape[0] == N + inter_methods = { + 'linear': INTERPOLATION_LINEAR, + 'lower': INTERPOLATION_LOWER, + 'higher': INTERPOLATION_HIGHER, + 'nearest': INTERPOLATION_NEAREST, + 'midpoint': INTERPOLATION_MIDPOINT, + } + interp = inter_methods[interpolation] + + counts = np.zeros_like(out, dtype=np.int64) + non_na_counts = np.zeros_like(out, dtype=np.int64) + ngroups = len(counts) + + # First figure out the size of every group + with nogil: + for i in range(N): + lab = labels[i] + counts[lab] += 1 + if not mask[i]: + non_na_counts[lab] += 1 + + # Get an index of values sorted by labels and then values + order = (values, labels) + sort_arr = np.lexsort(order).astype(np.int64, copy=False) + + with nogil: + for i in range(ngroups): + # Figure out how many group elements there are + grp_sz = counts[i] + non_na_sz = non_na_counts[i] + + if non_na_sz == 0: + out[i] = NaN + else: + # Calculate where to retrieve the desired value + # Casting to int will intentionaly truncate result + idx = grp_start + (q * (non_na_sz - 1)) + + val = values[sort_arr[idx]] + # If requested quantile falls evenly on a particular index + # then write that index's value out. Otherwise interpolate + q_idx = q * (non_na_sz - 1) + frac = q_idx % 1 + + if frac == 0.0 or interp == INTERPOLATION_LOWER: + out[i] = val + else: + next_val = values[sort_arr[idx + 1]] + if interp == INTERPOLATION_LINEAR: + out[i] = val + (next_val - val) * frac + elif interp == INTERPOLATION_HIGHER: + out[i] = next_val + elif interp == INTERPOLATION_MIDPOINT: + out[i] = (val + next_val) / 2.0 + elif interp == INTERPOLATION_NEAREST: + if frac > .5 or (frac == .5 and q > .5): # Always OK? + out[i] = next_val + else: + out[i] = val + + # Increment the index reference in sorted_arr for the next group + grp_start += grp_sz + + # generated from template include "groupby_helper.pxi" diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index c63bc5164e25b..c364f069bf53d 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -29,6 +29,8 @@ class providing the base-class of operations. ensure_float, is_extension_array_dtype, is_numeric_dtype, is_scalar) from pandas.core.dtypes.missing import isna, notna +from pandas.api.types import ( + is_datetime64_dtype, is_integer_dtype, is_object_dtype) import pandas.core.algorithms as algorithms from pandas.core.base import ( DataError, GroupByError, PandasObject, SelectionMixin, SpecificationError) @@ -1024,15 +1026,17 @@ def _bool_agg(self, val_test, skipna): """ def objs_to_bool(vals): - try: - vals = vals.astype(np.bool) - except ValueError: # for objects + # type: np.ndarray -> (np.ndarray, typing.Type) + if is_object_dtype(vals): vals = np.array([bool(x) for x in vals]) + else: + vals = vals.astype(np.bool) - return vals.view(np.uint8) + return vals.view(np.uint8), np.bool - def result_to_bool(result): - return result.astype(np.bool, copy=False) + def result_to_bool(result, inference): + # type: (np.ndarray, typing.Type) -> np.ndarray + return result.astype(inference, copy=False) return self._get_cythonized_result('group_any_all', self.grouper, aggregate=True, @@ -1688,6 +1692,75 @@ def nth(self, n, dropna=None): return result + def quantile(self, q=0.5, interpolation='linear'): + """ + Return group values at the given quantile, a la numpy.percentile. + + Parameters + ---------- + q : float or array-like, default 0.5 (50% quantile) + Value(s) between 0 and 1 providing the quantile(s) to compute. + interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} + Method to use when the desired quantile falls between two points. + + Returns + ------- + Series or DataFrame + Return type determined by caller of GroupBy object. + + See Also + -------- + Series.quantile : Similar method for Series. + DataFrame.quantile : Similar method for DataFrame. + numpy.percentile : NumPy method to compute qth percentile. + + Examples + -------- + >>> df = pd.DataFrame([ + ... ['a', 1], ['a', 2], ['a', 3], + ... ['b', 1], ['b', 3], ['b', 5] + ... ], columns=['key', 'val']) + >>> df.groupby('key').quantile() + val + key + a 2.0 + b 3.0 + """ + + def pre_processor(vals): + # type: np.ndarray -> (np.ndarray, Optional[typing.Type]) + if is_object_dtype(vals): + raise TypeError("'quantile' cannot be performed against " + "'object' dtypes!") + + inference = None + if is_integer_dtype(vals): + inference = np.int64 + elif is_datetime64_dtype(vals): + inference = 'datetime64[ns]' + vals = vals.astype(np.float) + + return vals, inference + + def post_processor(vals, inference): + # type: (np.ndarray, Optional[typing.Type]) -> np.ndarray + if inference: + # Check for edge case + if not (is_integer_dtype(inference) and + interpolation in {'linear', 'midpoint'}): + vals = vals.astype(inference) + + return vals + + return self._get_cythonized_result('group_quantile', self.grouper, + aggregate=True, + needs_values=True, + needs_mask=True, + cython_dtype=np.float64, + pre_processing=pre_processor, + post_processing=post_processor, + q=q, interpolation=interpolation) + @Substitution(name='groupby') def ngroup(self, ascending=True): """ @@ -1924,10 +1997,16 @@ def _get_cythonized_result(self, how, grouper, aggregate=False, Whether the result of the Cython operation is an index of values to be retrieved, instead of the actual values themselves pre_processing : function, default None - Function to be applied to `values` prior to passing to Cython - Raises if `needs_values` is False + Function to be applied to `values` prior to passing to Cython. + Function should return a tuple where the first element is the + values to be passed to Cython and the second element is an optional + type which the values should be converted to after being returned + by the Cython operation. Raises if `needs_values` is False. post_processing : function, default None - Function to be applied to result of Cython function + Function to be applied to result of Cython function. Should accept + an array of values as the first argument and type inferences as its + second argument, i.e. the signature should be + (ndarray, typing.Type). **kwargs : dict Extra arguments to be passed back to Cython funcs @@ -1963,10 +2042,12 @@ def _get_cythonized_result(self, how, grouper, aggregate=False, result = np.zeros(result_sz, dtype=cython_dtype) func = partial(base_func, result, labels) + inferences = None + if needs_values: vals = obj.values if pre_processing: - vals = pre_processing(vals) + vals, inferences = pre_processing(vals) func = partial(func, vals) if needs_mask: @@ -1982,7 +2063,7 @@ def _get_cythonized_result(self, how, grouper, aggregate=False, result = algorithms.take_nd(obj.values, result) if post_processing: - result = post_processing(result) + result = post_processing(result, inferences) output[name] = result diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index a884a37840f8a..bddbe8d474726 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1060,6 +1060,55 @@ def test_size(df): tm.assert_series_equal(df.groupby('A').size(), out) +# quantile +# -------------------------------- +@pytest.mark.parametrize("interpolation", [ + "linear", "lower", "higher", "nearest", "midpoint"]) +@pytest.mark.parametrize("a_vals,b_vals", [ + # Ints + ([1, 2, 3, 4, 5], [5, 4, 3, 2, 1]), + ([1, 2, 3, 4], [4, 3, 2, 1]), + ([1, 2, 3, 4, 5], [4, 3, 2, 1]), + # Floats + ([1., 2., 3., 4., 5.], [5., 4., 3., 2., 1.]), + # Missing data + ([1., np.nan, 3., np.nan, 5.], [5., np.nan, 3., np.nan, 1.]), + ([np.nan, 4., np.nan, 2., np.nan], [np.nan, 4., np.nan, 2., np.nan]), + # Timestamps + ([x for x in pd.date_range('1/1/18', freq='D', periods=5)], + [x for x in pd.date_range('1/1/18', freq='D', periods=5)][::-1]), + # All NA + ([np.nan] * 5, [np.nan] * 5), +]) +@pytest.mark.parametrize('q', [0, .25, .5, .75, 1]) +def test_quantile(interpolation, a_vals, b_vals, q): + if interpolation == 'nearest' and q == 0.5 and b_vals == [4, 3, 2, 1]: + pytest.skip("Unclear numpy expectation for nearest result with " + "equidistant data") + + a_expected = pd.Series(a_vals).quantile(q, interpolation=interpolation) + b_expected = pd.Series(b_vals).quantile(q, interpolation=interpolation) + + df = DataFrame({ + 'key': ['a'] * len(a_vals) + ['b'] * len(b_vals), + 'val': a_vals + b_vals}) + + expected = DataFrame([a_expected, b_expected], columns=['val'], + index=Index(['a', 'b'], name='key')) + result = df.groupby('key').quantile(q, interpolation=interpolation) + + tm.assert_frame_equal(result, expected) + + +def test_quantile_raises(): + df = pd.DataFrame([ + ['foo', 'a'], ['foo', 'b'], ['foo', 'c']], columns=['key', 'val']) + + with pytest.raises(TypeError, match="cannot be performed against " + "'object' dtypes"): + df.groupby('key').quantile() + + # pipe # -------------------------------- diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 12a5d494648fc..6a11f0ae9b44a 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -208,7 +208,7 @@ def f(x, q=None, axis=0): trans_expected = ts_grouped.transform(g) assert_series_equal(apply_result, agg_expected) - assert_series_equal(agg_result, agg_expected, check_names=False) + assert_series_equal(agg_result, agg_expected) assert_series_equal(trans_result, trans_expected) agg_result = ts_grouped.agg(f, q=80) @@ -223,13 +223,13 @@ def f(x, q=None, axis=0): agg_result = df_grouped.agg(np.percentile, 80, axis=0) apply_result = df_grouped.apply(DataFrame.quantile, .8) expected = df_grouped.quantile(.8) - assert_frame_equal(apply_result, expected) - assert_frame_equal(agg_result, expected, check_names=False) + assert_frame_equal(apply_result, expected, check_names=False) + assert_frame_equal(agg_result, expected) agg_result = df_grouped.agg(f, q=80) apply_result = df_grouped.apply(DataFrame.quantile, q=.8) - assert_frame_equal(agg_result, expected, check_names=False) - assert_frame_equal(apply_result, expected) + assert_frame_equal(agg_result, expected) + assert_frame_equal(apply_result, expected, check_names=False) def test_len():