From a01460bd90e3ae31a32e40005f33c2919efba8bb Mon Sep 17 00:00:00 2001 From: Mathias Hauser Date: Thu, 3 Mar 2022 10:43:35 +0100 Subject: [PATCH] quantile: use skipna=None (#6303) * quantile: use skipna=None * better term * move whats new entry * remove duplicated entry --- doc/whats-new.rst | 4 +++- xarray/core/dataarray.py | 7 +++++-- xarray/core/dataset.py | 7 +++++-- xarray/core/groupby.py | 7 +++++-- xarray/core/variable.py | 12 ++++++++++-- xarray/tests/test_dataarray.py | 12 ++++++++---- xarray/tests/test_dataset.py | 3 ++- xarray/tests/test_groupby.py | 25 +++++++++++++++++++++++++ xarray/tests/test_variable.py | 12 ++++++++---- 9 files changed, 71 insertions(+), 18 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index fdedc18300f..bfd5f27cbec 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -34,6 +34,9 @@ Deprecations Bug fixes ~~~~~~~~~ +- Set ``skipna=None`` for all ``quantile`` methods (e.g. :py:meth:`Dataset.quantile`) and + ensure it skips missing values for float dtypes (consistent with other methods). This should + not change the behavior (:pull:`6303`). By `Mathias Hauser `_. Documentation ~~~~~~~~~~~~~ @@ -86,7 +89,6 @@ Deprecations Bug fixes ~~~~~~~~~ - - Variables which are chunked using dask in larger (but aligned) chunks than the target zarr chunk size can now be stored using `to_zarr()` (:pull:`6258`) By `Tobias Kölling `_. - Multi-file datasets containing encoded :py:class:`cftime.datetime` objects can be read in parallel again (:issue:`6226`, :pull:`6249`, :pull:`6305`). By `Martin Bergemann `_ and `Stan West `_. diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 3df9f7ca8a4..e04e5cb9c51 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -3440,7 +3440,7 @@ def quantile( dim: str | Sequence[Hashable] | None = None, method: QUANTILE_METHODS = "linear", keep_attrs: bool = None, - skipna: bool = True, + skipna: bool = None, interpolation: QUANTILE_METHODS = None, ) -> DataArray: """Compute the qth quantile of the data along the specified dimension. @@ -3486,7 +3486,10 @@ def quantile( the original object to the new one. If False (default), the new object will be returned without attributes. skipna : bool, optional - Whether to skip missing values when aggregating. + If True, skip missing values (as marked by NaN). By default, only + skips missing values for float dtypes; other dtypes either do not + have a sentinel missing value (int) or skipna=True has not been + implemented (object, datetime64 or timedelta64). Returns ------- diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index a1d7209bc75..b3112bdc7ab 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -6160,7 +6160,7 @@ def quantile( method: QUANTILE_METHODS = "linear", numeric_only: bool = False, keep_attrs: bool = None, - skipna: bool = True, + skipna: bool = None, interpolation: QUANTILE_METHODS = None, ): """Compute the qth quantile of the data along the specified dimension. @@ -6209,7 +6209,10 @@ def quantile( numeric_only : bool, optional If True, only apply ``func`` to variables with a numeric dtype. skipna : bool, optional - Whether to skip missing values when aggregating. + If True, skip missing values (as marked by NaN). By default, only + skips missing values for float dtypes; other dtypes either do not + have a sentinel missing value (int) or skipna=True has not been + implemented (object, datetime64 or timedelta64). Returns ------- diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py index d7aa6749592..d3ec824159c 100644 --- a/xarray/core/groupby.py +++ b/xarray/core/groupby.py @@ -554,7 +554,7 @@ def quantile( dim=None, method="linear", keep_attrs=None, - skipna=True, + skipna=None, interpolation=None, ): """Compute the qth quantile over each array in the groups and @@ -597,7 +597,10 @@ def quantile( version 1.22.0. skipna : bool, optional - Whether to skip missing values when aggregating. + If True, skip missing values (as marked by NaN). By default, only + skips missing values for float dtypes; other dtypes either do not + have a sentinel missing value (int) or skipna=True has not been + implemented (object, datetime64 or timedelta64). Returns ------- diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 74f394b68ca..c8d46d20d46 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1978,7 +1978,7 @@ def quantile( dim: str | Sequence[Hashable] | None = None, method: QUANTILE_METHODS = "linear", keep_attrs: bool = None, - skipna: bool = True, + skipna: bool = None, interpolation: QUANTILE_METHODS = None, ) -> Variable: """Compute the qth quantile of the data along the specified dimension. @@ -2024,6 +2024,11 @@ def quantile( If True, the variable's attributes (`attrs`) will be copied from the original object to the new one. If False (default), the new object will be returned without attributes. + skipna : bool, optional + If True, skip missing values (as marked by NaN). By default, only + skips missing values for float dtypes; other dtypes either do not + have a sentinel missing value (int) or skipna=True has not been + implemented (object, datetime64 or timedelta64). Returns ------- @@ -2059,7 +2064,10 @@ def quantile( method = interpolation - _quantile_func = np.nanquantile if skipna else np.quantile + if skipna or (skipna is None and self.dtype.kind in "cfO"): + _quantile_func = np.nanquantile + else: + _quantile_func = np.quantile if keep_attrs is None: keep_attrs = _get_keep_attrs(default=False) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 3939da08a67..55c68b7ff6b 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -2516,15 +2516,19 @@ def test_reduce_out(self): with pytest.raises(TypeError): orig.mean(out=np.ones(orig.shape)) - @pytest.mark.parametrize("skipna", [True, False]) + @pytest.mark.parametrize("skipna", [True, False, None]) @pytest.mark.parametrize("q", [0.25, [0.50], [0.25, 0.75]]) @pytest.mark.parametrize( "axis, dim", zip([None, 0, [0], [0, 1]], [None, "x", ["x"], ["x", "y"]]) ) def test_quantile(self, q, axis, dim, skipna) -> None: - actual = DataArray(self.va).quantile(q, dim=dim, keep_attrs=True, skipna=skipna) - _percentile_func = np.nanpercentile if skipna else np.percentile - expected = _percentile_func(self.dv.values, np.array(q) * 100, axis=axis) + + va = self.va.copy(deep=True) + va[0, 0] = np.NaN + + actual = DataArray(va).quantile(q, dim=dim, keep_attrs=True, skipna=skipna) + _percentile_func = np.nanpercentile if skipna in (True, None) else np.percentile + expected = _percentile_func(va.values, np.array(q) * 100, axis=axis) np.testing.assert_allclose(actual.values, expected) if is_scalar(q): assert "quantile" not in actual.dims diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index d726920acce..8d6c4f96857 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -4718,10 +4718,11 @@ def test_reduce_keepdims(self): ) assert_identical(expected, actual) - @pytest.mark.parametrize("skipna", [True, False]) + @pytest.mark.parametrize("skipna", [True, False, None]) @pytest.mark.parametrize("q", [0.25, [0.50], [0.25, 0.75]]) def test_quantile(self, q, skipna) -> None: ds = create_test_data(seed=123) + ds.var1.data[0, 0] = np.NaN for dim in [None, "dim1", ["dim1"]]: ds_quantile = ds.quantile(q, dim=dim, skipna=skipna) diff --git a/xarray/tests/test_groupby.py b/xarray/tests/test_groupby.py index 1ec2a53c131..4b6da82bdc7 100644 --- a/xarray/tests/test_groupby.py +++ b/xarray/tests/test_groupby.py @@ -203,6 +203,17 @@ def test_da_groupby_quantile() -> None: actual = array.groupby("x").quantile([0, 1]) assert_identical(expected, actual) + array = xr.DataArray( + data=[np.NaN, 2, 3, 4, 5, 6], coords={"x": [1, 1, 1, 2, 2, 2]}, dims="x" + ) + + for skipna in (True, False, None): + e = [np.NaN, 5] if skipna is False else [2.5, 5] + + expected = xr.DataArray(data=e, coords={"x": [1, 2], "quantile": 0.5}, dims="x") + actual = array.groupby("x").quantile(0.5, skipna=skipna) + assert_identical(expected, actual) + # Multiple dimensions array = xr.DataArray( data=[[1, 11, 26], [2, 12, 22], [3, 13, 23], [4, 16, 24], [5, 15, 25]], @@ -306,6 +317,20 @@ def test_ds_groupby_quantile() -> None: actual = ds.groupby("x").quantile([0, 1]) assert_identical(expected, actual) + ds = xr.Dataset( + data_vars={"a": ("x", [np.NaN, 2, 3, 4, 5, 6])}, + coords={"x": [1, 1, 1, 2, 2, 2]}, + ) + + for skipna in (True, False, None): + e = [np.NaN, 5] if skipna is False else [2.5, 5] + + expected = xr.Dataset( + data_vars={"a": ("x", e)}, coords={"quantile": 0.5, "x": [1, 2]} + ) + actual = ds.groupby("x").quantile(0.5, skipna=skipna) + assert_identical(expected, actual) + # Multiple dimensions ds = xr.Dataset( data_vars={ diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index a88d5a22c0d..b8e2f6f4582 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -1700,16 +1700,20 @@ def raise_if_called(*args, **kwargs): with set_options(use_bottleneck=False): v.min() - @pytest.mark.parametrize("skipna", [True, False]) + @pytest.mark.parametrize("skipna", [True, False, None]) @pytest.mark.parametrize("q", [0.25, [0.50], [0.25, 0.75]]) @pytest.mark.parametrize( "axis, dim", zip([None, 0, [0], [0, 1]], [None, "x", ["x"], ["x", "y"]]) ) def test_quantile(self, q, axis, dim, skipna): - v = Variable(["x", "y"], self.d) + + d = self.d.copy() + d[0, 0] = np.NaN + + v = Variable(["x", "y"], d) actual = v.quantile(q, dim=dim, skipna=skipna) - _percentile_func = np.nanpercentile if skipna else np.percentile - expected = _percentile_func(self.d, np.array(q) * 100, axis=axis) + _percentile_func = np.nanpercentile if skipna in (True, None) else np.percentile + expected = _percentile_func(d, np.array(q) * 100, axis=axis) np.testing.assert_allclose(actual.values, expected) @requires_dask