Skip to content

Commit

Permalink
Add DatasetGroupBy.quantile (#3527)
Browse files Browse the repository at this point in the history
* move the implementation of DataArrayGroupBy.quantile to GroupBy

* add tests for DatasetGroupBy

* update whats-new.rst

* move the item in whats-new.rst into New Features

* don't drop scalar quantile coords
  • Loading branch information
keewis authored and dcherian committed Nov 15, 2019
1 parent 68b004f commit 52d4845
Show file tree
Hide file tree
Showing 3 changed files with 184 additions and 68 deletions.
2 changes: 2 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,8 @@ New Features
invoked. (:issue:`3378`, :pull:`3446`, :pull:`3515`)
By `Deepak Cherian <https://github.com/dcherian>`_ and
`Guido Imperiale <https://github.com/crusaderky>`_.
- Add the documented-but-missing :py:meth:`xarray.core.groupby.DatasetGroupBy.quantile`.
(:issue:`3525`, :pull:`3527`). By `Justus Magin <https://github.com/keewis>`_.

Bug fixes
~~~~~~~~~
Expand Down
107 changes: 53 additions & 54 deletions xarray/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -557,6 +557,59 @@ def fillna(self, value):
out = ops.fillna(self, value)
return out

def quantile(self, q, dim=None, interpolation="linear", keep_attrs=None):
    """Compute the qth quantile of every group and combine the results.

    The ``quantile`` method of the grouped object's class is mapped over
    the groups, so the same implementation serves every kind of grouped
    object that provides such a method.

    Parameters
    ----------
    q : float in range of [0,1] (or sequence of floats)
        Quantile to compute, which must be between 0 and 1
        inclusive.
    dim : `...`, str or sequence of str, optional
        Dimension(s) over which to apply quantile.
        Defaults to the grouped dimension.
    interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
        This optional parameter specifies the interpolation method to
        use when the desired quantile lies between two data points
        ``i < j``:

            * linear: ``i + (j - i) * fraction``, where ``fraction`` is
              the fractional part of the index surrounded by ``i`` and
              ``j``.
            * lower: ``i``.
            * higher: ``j``.
            * nearest: ``i`` or ``j``, whichever is nearest.
            * midpoint: ``(i + j) / 2``.
    keep_attrs : optional
        Forwarded unchanged to the mapped ``quantile`` method.

    Returns
    -------
    quantiles
        The combined per-group result returned by ``self.map``. When
        `q` is a single quantile the result keeps it as a scalar
        ``quantile`` coordinate; when several quantiles are given the
        result has a ``quantile`` dimension. The other dimensions are
        the ones that remain after the reduction.

    See Also
    --------
    numpy.nanpercentile, pandas.Series.quantile, Dataset.quantile,
    DataArray.quantile
    """
    # Reduce over the grouped dimension unless the caller picked one.
    reduce_dim = self._group_dim if dim is None else dim

    # Unbound ``quantile`` of whatever class the grouped object has.
    per_group_quantile = self._obj.__class__.quantile

    return self.map(
        per_group_quantile,
        shortcut=False,
        q=q,
        dim=reduce_dim,
        interpolation=interpolation,
        keep_attrs=keep_attrs,
    )

def where(self, cond, other=dtypes.NA):
"""Return elements from `self` or `other` depending on `cond`.
Expand Down Expand Up @@ -737,60 +790,6 @@ def _combine(self, applied, restore_coord_dims=False, shortcut=False):
combined = self._maybe_unstack(combined)
return combined

def quantile(self, q, dim=None, interpolation="linear", keep_attrs=None):
    """Compute the qth quantile of every group and combine the results.

    Parameters
    ----------
    q : float in range of [0,1] (or sequence of floats)
        Quantile to compute, which must be between 0 and 1
        inclusive.
    dim : `...`, str or sequence of str, optional
        Dimension(s) over which to apply quantile.
        Defaults to the grouped dimension.
    interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
        This optional parameter specifies the interpolation method to
        use when the desired quantile lies between two data points
        ``i < j``:

            * linear: ``i + (j - i) * fraction``, where ``fraction`` is
              the fractional part of the index surrounded by ``i`` and
              ``j``.
            * lower: ``i``.
            * higher: ``j``.
            * nearest: ``i`` or ``j``, whichever is nearest.
            * midpoint: ``(i + j) / 2``.
    keep_attrs : optional
        Forwarded unchanged to the mapped ``quantile`` method.

    Returns
    -------
    quantiles
        The combined per-group result returned by ``self.map``. When
        `q` is a single quantile the ``quantile`` variable is dropped
        from the result; when several quantiles are given the result
        has a ``quantile`` dimension. The other dimensions are the ones
        that remain after the reduction.

    See Also
    --------
    numpy.nanpercentile, pandas.Series.quantile, Dataset.quantile,
    DataArray.quantile
    """
    # Reduce over the grouped dimension unless the caller picked one.
    reduce_dim = self._group_dim if dim is None else dim

    combined = self.map(
        self._obj.__class__.quantile,
        shortcut=False,
        q=q,
        dim=reduce_dim,
        interpolation=interpolation,
        keep_attrs=keep_attrs,
    )

    # A zero-dimensional ``q`` means a single quantile was requested; in
    # that case the "quantile" variable is removed from the result.
    q_is_scalar = np.asarray(q, dtype=np.float64).ndim == 0
    return combined.drop_vars("quantile") if q_is_scalar else combined

def reduce(
self, func, dim=None, axis=None, keep_attrs=None, shortcut=True, **kwargs
):
Expand Down
143 changes: 129 additions & 14 deletions xarray/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,57 +137,73 @@ def test_da_groupby_empty():

def test_da_groupby_quantile():

array = xr.DataArray([1, 2, 3, 4, 5, 6], [("x", [1, 1, 1, 2, 2, 2])])
array = xr.DataArray(
data=[1, 2, 3, 4, 5, 6], coords={"x": [1, 1, 1, 2, 2, 2]}, dims="x"
)

# Scalar quantile
expected = xr.DataArray([2, 5], [("x", [1, 2])])
expected = xr.DataArray(
data=[2, 5], coords={"x": [1, 2], "quantile": 0.5}, dims="x"
)
actual = array.groupby("x").quantile(0.5)
assert_identical(expected, actual)

# Vector quantile
expected = xr.DataArray([[1, 3], [4, 6]], [("x", [1, 2]), ("quantile", [0, 1])])
expected = xr.DataArray(
data=[[1, 3], [4, 6]],
coords={"x": [1, 2], "quantile": [0, 1]},
dims=("x", "quantile"),
)
actual = array.groupby("x").quantile([0, 1])
assert_identical(expected, actual)

# Multiple dimensions
array = xr.DataArray(
[[1, 11, 26], [2, 12, 22], [3, 13, 23], [4, 16, 24], [5, 15, 25]],
[("x", [1, 1, 1, 2, 2]), ("y", [0, 0, 1])],
data=[[1, 11, 26], [2, 12, 22], [3, 13, 23], [4, 16, 24], [5, 15, 25]],
coords={"x": [1, 1, 1, 2, 2], "y": [0, 0, 1]},
dims=("x", "y"),
)

actual_x = array.groupby("x").quantile(0, dim=...)
expected_x = xr.DataArray([1, 4], [("x", [1, 2])])
expected_x = xr.DataArray(
data=[1, 4], coords={"x": [1, 2], "quantile": 0}, dims="x"
)
assert_identical(expected_x, actual_x)

actual_y = array.groupby("y").quantile(0, dim=...)
expected_y = xr.DataArray([1, 22], [("y", [0, 1])])
expected_y = xr.DataArray(
data=[1, 22], coords={"y": [0, 1], "quantile": 0}, dims="y"
)
assert_identical(expected_y, actual_y)

actual_xx = array.groupby("x").quantile(0)
expected_xx = xr.DataArray(
[[1, 11, 22], [4, 15, 24]], [("x", [1, 2]), ("y", [0, 0, 1])]
data=[[1, 11, 22], [4, 15, 24]],
coords={"x": [1, 2], "y": [0, 0, 1], "quantile": 0},
dims=("x", "y"),
)
assert_identical(expected_xx, actual_xx)

actual_yy = array.groupby("y").quantile(0)
expected_yy = xr.DataArray(
[[1, 26], [2, 22], [3, 23], [4, 24], [5, 25]],
[("x", [1, 1, 1, 2, 2]), ("y", [0, 1])],
data=[[1, 26], [2, 22], [3, 23], [4, 24], [5, 25]],
coords={"x": [1, 1, 1, 2, 2], "y": [0, 1], "quantile": 0},
dims=("x", "y"),
)
assert_identical(expected_yy, actual_yy)

times = pd.date_range("2000-01-01", periods=365)
x = [0, 1]
foo = xr.DataArray(
np.reshape(np.arange(365 * 2), (365, 2)),
coords=dict(time=times, x=x),
coords={"time": times, "x": x},
dims=("time", "x"),
)
g = foo.groupby(foo.time.dt.month)

actual = g.quantile(0, dim=...)
expected = xr.DataArray(
[
data=[
0.0,
62.0,
120.0,
Expand All @@ -201,12 +217,111 @@ def test_da_groupby_quantile():
610.0,
670.0,
],
[("month", np.arange(1, 13))],
coords={"month": np.arange(1, 13), "quantile": 0},
dims="month",
)
assert_identical(expected, actual)

actual = g.quantile(0, dim="time")[:2]
expected = xr.DataArray([[0.0, 1], [62.0, 63]], [("month", [1, 2]), ("x", [0, 1])])
expected = xr.DataArray(
data=[[0.0, 1], [62.0, 63]],
coords={"month": [1, 2], "x": [0, 1], "quantile": 0},
dims=("month", "x"),
)
assert_identical(expected, actual)


def test_ds_groupby_quantile():
    """Exercise ``quantile`` on grouped Datasets.

    Covers a scalar and a vector ``q``, grouping a two-dimensional
    variable by either coordinate with and without ``dim=...``, and
    grouping a daily time series by calendar month.
    """
    flat = xr.Dataset(
        data_vars={"a": ("x", [1, 2, 3, 4, 5, 6])},
        coords={"x": [1, 1, 1, 2, 2, 2]},
    )

    # Scalar quantile: "quantile" is expected as a scalar coordinate.
    want = xr.Dataset(
        data_vars={"a": ("x", [2, 5])},
        coords={"quantile": 0.5, "x": [1, 2]},
    )
    got = flat.groupby("x").quantile(0.5)
    assert_identical(want, got)

    # Vector quantile: "quantile" is expected as a trailing dimension.
    want = xr.Dataset(
        data_vars={"a": (("x", "quantile"), [[1, 3], [4, 6]])},
        coords={"x": [1, 2], "quantile": [0, 1]},
    )
    got = flat.groupby("x").quantile([0, 1])
    assert_identical(want, got)

    # Multiple dimensions
    grid_values = [
        [1, 11, 26],
        [2, 12, 22],
        [3, 13, 23],
        [4, 16, 24],
        [5, 15, 25],
    ]
    grid = xr.Dataset(
        data_vars={"a": (("x", "y"), grid_values)},
        coords={"x": [1, 1, 1, 2, 2], "y": [0, 0, 1]},
    )

    # With dim=... only the grouped dimension is expected in the result.
    got_x = grid.groupby("x").quantile(0, dim=...)
    want_x = xr.Dataset(
        {"a": ("x", [1, 4])},
        coords={"x": [1, 2], "quantile": 0},
    )
    assert_identical(want_x, got_x)

    got_y = grid.groupby("y").quantile(0, dim=...)
    want_y = xr.Dataset(
        {"a": ("y", [1, 22])},
        coords={"y": [0, 1], "quantile": 0},
    )
    assert_identical(want_y, got_y)

    # Default dim: the other dimension is expected to be kept.
    got_xx = grid.groupby("x").quantile(0)
    want_xx = xr.Dataset(
        {"a": (("x", "y"), [[1, 11, 22], [4, 15, 24]])},
        coords={"x": [1, 2], "y": [0, 0, 1], "quantile": 0},
    )
    assert_identical(want_xx, got_xx)

    got_yy = grid.groupby("y").quantile(0)
    want_yy = xr.Dataset(
        {"a": (("x", "y"), [[1, 26], [2, 22], [3, 23], [4, 24], [5, 25]])},
        coords={"x": [1, 1, 1, 2, 2], "y": [0, 1], "quantile": 0},
    ).transpose()
    assert_identical(want_yy, got_yy)

    # Daily series of 365 days grouped by calendar month.
    times = pd.date_range("2000-01-01", periods=365)
    x = [0, 1]
    daily = xr.Dataset(
        {"a": (("time", "x"), np.reshape(np.arange(365 * 2), (365, 2)))},
        coords={"time": times, "x": x},
    )
    by_month = daily.groupby(daily.time.dt.month)

    monthly_minimum = [
        0.0,
        62.0,
        120.0,
        182.0,
        242.0,
        304.0,
        364.0,
        426.0,
        488.0,
        548.0,
        610.0,
        670.0,
    ]
    got = by_month.quantile(0, dim=...)
    want = xr.Dataset(
        {"a": ("month", monthly_minimum)},
        coords={"month": np.arange(1, 13), "quantile": 0},
    )
    assert_identical(want, got)

    # Reduce over "time" only and compare the first two months.
    got = by_month.quantile(0, dim="time").isel(month=slice(None, 2))
    want = xr.Dataset(
        data_vars={"a": (("month", "x"), [[0.0, 1], [62.0, 63]])},
        coords={"month": [1, 2], "x": [0, 1], "quantile": 0},
    )
    assert_identical(want, got)


Expand Down

0 comments on commit 52d4845

Please sign in to comment.