From ff4198865b42ee2f6f99f3f6f83fed68ef4ffbc7 Mon Sep 17 00:00:00 2001 From: Scott Wales Date: Sun, 23 Jun 2019 19:18:32 +1000 Subject: [PATCH] ENH: keepdims=True for xarray reductions (#3033) * ENH: keepdims=True for xarray reductions Addresses #2170 Add new option `keepdims` to xarray reduce operations, following the behaviour of NumPy. `keepdims` may be passed to reductions on either Datasets or DataArrays, and will result in the reduced dimensions still being present in the output with size 1. Coordinates that depend on the reduced dimensions will be removed from the Dataset/DataArray. * Set the default to be `False` * Correct lint error * Apply suggestions from code review Co-Authored-By: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> * Add test for dask and fix implementation * Move 'keepdims' up to where 'dims' is set * Fix lint, add test for scalar variable --- doc/whats-new.rst | 2 ++ xarray/core/dataarray.py | 18 +++++++++++++--- xarray/core/dataset.py | 9 ++++++-- xarray/core/variable.py | 20 +++++++++++++++--- xarray/tests/test_dataarray.py | 38 ++++++++++++++++++++++++++++++++++ xarray/tests/test_dataset.py | 19 +++++++++++++++++ xarray/tests/test_variable.py | 36 ++++++++++++++++++++++++++++++++++ 7 files changed, 134 insertions(+), 8 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 52fa102f7fa..373cb8d13dc 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -22,6 +22,8 @@ Enhancements ~~~~~~~~~~~~ +- Add ``keepdims`` argument for reduce operations (:issue:`2170`) + By `Scott Wales <https://github.com/ScottWales>`_. - netCDF chunksizes are now only dropped when original_shape is different, not when it isn't found. (:issue:`2207`) By `Karel van de Plassche <https://github.com/Karel-van-de-Plassche>`_. 
diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 4c3dcc2781a..ff77a6ab704 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -259,8 +259,14 @@ def _replace(self, variable=None, coords=None, name=__default): return type(self)(variable, coords, name=name, fastpath=True) def _replace_maybe_drop_dims(self, variable, name=__default): - if variable.dims == self.dims: + if variable.dims == self.dims and variable.shape == self.shape: coords = self._coords.copy() + elif variable.dims == self.dims: + # Shape has changed (e.g. from reduce(..., keepdims=True)) + new_sizes = dict(zip(self.dims, variable.shape)) + coords = OrderedDict((k, v) for k, v in self._coords.items() + if v.shape == tuple(new_sizes[d] + for d in v.dims)) else: allowed_dims = set(variable.dims) coords = OrderedDict((k, v) for k, v in self._coords.items() @@ -1642,7 +1648,8 @@ def combine_first(self, other): """ return ops.fillna(self, other, join="outer") - def reduce(self, func, dim=None, axis=None, keep_attrs=None, **kwargs): + def reduce(self, func, dim=None, axis=None, keep_attrs=None, + keepdims=False, **kwargs): """Reduce this array by applying `func` along some dimension(s). Parameters @@ -1662,6 +1669,10 @@ def reduce(self, func, dim=None, axis=None, keep_attrs=None, **kwargs): If True, the variable's attributes (`attrs`) will be copied from the original object to the new one. If False (default), the new object will be returned without attributes. + keepdims : bool, default False + If True, the dimensions which are reduced are left in the result + as dimensions of size one. Coordinates that use these dimensions + are removed. **kwargs : dict Additional keyword arguments passed on to `func`. @@ -1672,7 +1683,8 @@ def reduce(self, func, dim=None, axis=None, keep_attrs=None, **kwargs): summarized data and the indicated dimension(s) removed. 
""" - var = self.variable.reduce(func, dim, axis, keep_attrs, **kwargs) + var = self.variable.reduce(func, dim, axis, keep_attrs, keepdims, + **kwargs) return self._replace_maybe_drop_dims(var) def to_pandas(self): diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 13a6a6ee9b2..3e00640ba60 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -3152,8 +3152,8 @@ def combine_first(self, other): out = ops.fillna(self, other, join="outer", dataset_join="outer") return out - def reduce(self, func, dim=None, keep_attrs=None, numeric_only=False, - allow_lazy=False, **kwargs): + def reduce(self, func, dim=None, keep_attrs=None, keepdims=False, + numeric_only=False, allow_lazy=False, **kwargs): """Reduce this dataset by applying `func` along some dimension(s). Parameters @@ -3169,6 +3169,10 @@ def reduce(self, func, dim=None, keep_attrs=None, numeric_only=False, If True, the dataset's attributes (`attrs`) will be copied from the original object to the new one. If False (default), the new object will be returned without attributes. + keepdims : bool, default False + If True, the dimensions which are reduced are left in the result + as dimensions of size one. Coordinates that use these dimensions + are removed. numeric_only : bool, optional If True, only apply ``func`` to variables with a numeric dtype. 
**kwargs : dict @@ -3218,6 +3222,7 @@ def reduce(self, func, dim=None, keep_attrs=None, numeric_only=False, reduce_dims = None variables[name] = var.reduce(func, dim=reduce_dims, keep_attrs=keep_attrs, + keepdims=keepdims, allow_lazy=allow_lazy, **kwargs) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 41f8795b595..ab1be181e31 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1334,7 +1334,7 @@ def where(self, cond, other=dtypes.NA): return ops.where_method(self, cond, other) def reduce(self, func, dim=None, axis=None, - keep_attrs=None, allow_lazy=False, **kwargs): + keep_attrs=None, keepdims=False, allow_lazy=False, **kwargs): """Reduce this array by applying `func` along some dimension(s). Parameters @@ -1354,6 +1354,9 @@ def reduce(self, func, dim=None, axis=None, If True, the variable's attributes (`attrs`) will be copied from the original object to the new one. If False (default), the new object will be returned without attributes. + keepdims : bool, default False + If True, the dimensions which are reduced are left in the result + as dimensions of size one. **kwargs : dict Additional keyword arguments passed on to `func`. 
@@ -1381,8 +1384,19 @@ def reduce(self, func, dim=None, axis=None, else: removed_axes = (range(self.ndim) if axis is None else np.atleast_1d(axis) % self.ndim) - dims = [adim for n, adim in enumerate(self.dims) - if n not in removed_axes] + if keepdims: + # Insert np.newaxis for removed dims + slices = tuple(np.newaxis if i in removed_axes else + slice(None, None) for i in range(self.ndim)) + if getattr(data, 'shape', None) is None: + # Reduce has produced a scalar value, not an array-like + data = np.asanyarray(data)[slices] + else: + data = data[slices] + dims = self.dims + else: + dims = [adim for n, adim in enumerate(self.dims) + if n not in removed_axes] if keep_attrs is None: keep_attrs = _get_keep_attrs(default=False) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index a8825055479..47222194151 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -1991,6 +1991,44 @@ def test_reduce(self): dims=['x', 'y']).mean('x') assert_equal(actual, expected) + def test_reduce_keepdims(self): + coords = {'x': [-1, -2], 'y': ['ab', 'cd', 'ef'], + 'lat': (['x', 'y'], [[1, 2, 3], [-1, -2, -3]]), + 'c': -999} + orig = DataArray([[-1, 0, 1], [-3, 0, 3]], coords, dims=['x', 'y']) + + # Mean on all axes loses non-constant coordinates + actual = orig.mean(keepdims=True) + expected = DataArray(orig.data.mean(keepdims=True), dims=orig.dims, + coords={k: v for k, v in coords.items() + if k in ['c']}) + assert_equal(actual, expected) + + assert actual.sizes['x'] == 1 + assert actual.sizes['y'] == 1 + + # Mean on specific axes loses coordinates involving that axis + actual = orig.mean('y', keepdims=True) + expected = DataArray(orig.data.mean(axis=1, keepdims=True), + dims=orig.dims, + coords={k: v for k, v in coords.items() + if k not in ['y', 'lat']}) + assert_equal(actual, expected) + + @requires_bottleneck + def test_reduce_keepdims_bottleneck(self): + import bottleneck + + coords = {'x': [-1, -2], 'y': ['ab', 'cd', 
'ef'], + 'lat': (['x', 'y'], [[1, 2, 3], [-1, -2, -3]]), + 'c': -999} + orig = DataArray([[-1, 0, 1], [-3, 0, 3]], coords, dims=['x', 'y']) + + # Bottleneck does not have its own keepdims implementation + actual = orig.reduce(bottleneck.nanmean, keepdims=True) + expected = orig.mean(keepdims=True) + assert_equal(actual, expected) + def test_reduce_dtype(self): coords = {'x': [-1, -2], 'y': ['ab', 'cd', 'ef'], 'lat': (['x', 'y'], [[1, 2, 3], [-1, -2, -3]]), diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 8cd129e35de..e3a01bbd3a1 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -3898,6 +3898,25 @@ def total_sum(x): with raises_regex(TypeError, "unexpected keyword argument 'axis'"): ds.reduce(total_sum, dim='x') + def test_reduce_keepdims(self): + ds = Dataset({'a': (['x', 'y'], [[0, 1, 2, 3, 4]])}, + coords={'y': [0, 1, 2, 3, 4], 'x': [0], + 'lat': (['x', 'y'], [[0, 1, 2, 3, 4]]), + 'c': -999.0}) + + # Shape should match behaviour of numpy reductions with keepdims=True + # Coordinates involved in the reduction should be removed + actual = ds.mean(keepdims=True) + expected = Dataset({'a': (['x', 'y'], np.mean(ds.a, keepdims=True))}, + coords={'c': ds.c}) + assert_identical(expected, actual) + + actual = ds.mean('x', keepdims=True) + expected = Dataset({'a': (['x', 'y'], + np.mean(ds.a, axis=0, keepdims=True))}, + coords={'y': ds.y, 'c': ds.c}) + assert_identical(expected, actual) + def test_quantile(self): ds = create_test_data(seed=123) diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 4ddd114d767..5da83880539 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -1540,6 +1540,42 @@ def test_reduce_funcs(self): assert_identical( v.max(), Variable([], pd.Timestamp('2000-01-03'))) + def test_reduce_keepdims(self): + v = Variable(['x', 'y'], self.d) + + assert_identical(v.mean(keepdims=True), + Variable(v.dims, np.mean(self.d, keepdims=True))) + 
assert_identical(v.mean(dim='x', keepdims=True), + Variable(v.dims, np.mean(self.d, axis=0, + keepdims=True))) + assert_identical(v.mean(dim='y', keepdims=True), + Variable(v.dims, np.mean(self.d, axis=1, + keepdims=True))) + assert_identical(v.mean(dim=['y', 'x'], keepdims=True), + Variable(v.dims, np.mean(self.d, axis=(1, 0), + keepdims=True))) + + v = Variable([], 1.0) + assert_identical(v.mean(keepdims=True), + Variable([], np.mean(v.data, keepdims=True))) + + @requires_dask + def test_reduce_keepdims_dask(self): + import dask.array + v = Variable(['x', 'y'], self.d).chunk() + + actual = v.mean(keepdims=True) + assert isinstance(actual.data, dask.array.Array) + + expected = Variable(v.dims, np.mean(self.d, keepdims=True)) + assert_identical(actual, expected) + + actual = v.mean(dim='y', keepdims=True) + assert isinstance(actual.data, dask.array.Array) + + expected = Variable(v.dims, np.mean(self.d, axis=1, keepdims=True)) + assert_identical(actual, expected) + def test_reduce_keep_attrs(self): _attrs = {'units': 'test', 'long_name': 'testing'}