From 6ba980088fcb147be8cec29ddab73fa8f80b0526 Mon Sep 17 00:00:00 2001 From: Scott Wales Date: Tue, 18 Jun 2019 16:55:48 +1000 Subject: [PATCH 1/7] ENH: keepdims=True for xarray reductions Addresses #2170 Add new option `keepdims` to xarray reduce operations, following the behaviour of Numpy. `keepdims` may be passed to reductions on either Datasets or DataArrays, and will result in the reduced dimensions being still present in the output with size 1. Coordinates that depend on the reduced dimensions will be removed from the Dataset/DataArray --- doc/whats-new.rst | 2 ++ xarray/core/dataarray.py | 18 +++++++++++++--- xarray/core/dataset.py | 9 ++++++-- xarray/core/variable.py | 11 +++++++++- xarray/tests/test_dataarray.py | 38 ++++++++++++++++++++++++++++++++++ xarray/tests/test_dataset.py | 19 +++++++++++++++++ xarray/tests/test_variable.py | 15 ++++++++++++++ 7 files changed, 106 insertions(+), 6 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index e62c7e87d44..72417eed4ae 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -22,6 +22,8 @@ Enhancements ~~~~~~~~~~~~ +- Add ``keepdims`` argument for reduce operations (:issue:`2170`) + By `Scott Wales `_. - netCDF chunksizes are now only dropped when original_shape is different, not when it isn't found. (:issue:`2207`) By `Karel van de Plassche `_. diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 094b8615880..af224e8be53 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -258,8 +258,14 @@ def _replace(self, variable=None, coords=None, name=__default): return type(self)(variable, coords, name=name, fastpath=True) def _replace_maybe_drop_dims(self, variable, name=__default): - if variable.dims == self.dims: + if variable.dims == self.dims and variable.shape == self.shape: coords = self._coords.copy() + elif variable.dims == self.dims: + # Shape has changed (e.g. from reduce(..., keepdims=True) + new_sizes = dict(zip(self.dims, variable.shape)) + coords = OrderedDict((k, v) for k, v in self._coords.items() + if v.shape == tuple(new_sizes[d] + for d in v.dims)) else: allowed_dims = set(variable.dims) coords = OrderedDict((k, v) for k, v in self._coords.items() @@ -1637,7 +1643,8 @@ def combine_first(self, other): """ return ops.fillna(self, other, join="outer") - def reduce(self, func, dim=None, axis=None, keep_attrs=None, **kwargs): + def reduce(self, func, dim=None, axis=None, keep_attrs=None, keepdims=None, + **kwargs): """Reduce this array by applying `func` along some dimension(s). Parameters @@ -1657,6 +1664,10 @@ def reduce(self, func, dim=None, axis=None, keep_attrs=None, **kwargs): If True, the variable's attributes (`attrs`) will be copied from the original object to the new one. If False (default), the new object will be returned without attributes. + keepdims : bool, optional + If True, the dimensions which are reduced are left in the result + as dimensions of size one. Coordinates that use these dimensions + are removed. **kwargs : dict Additional keyword arguments passed on to `func`. @@ -1667,7 +1678,8 @@ def reduce(self, func, dim=None, axis=None, keep_attrs=None, **kwargs): summarized data and the indicated dimension(s) removed. """ - var = self.variable.reduce(func, dim, axis, keep_attrs, **kwargs) + var = self.variable.reduce(func, dim, axis, keep_attrs, keepdims, + **kwargs) return self._replace_maybe_drop_dims(var) def to_pandas(self): diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index ced1dba09e2..aec26c67d7e 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -3132,8 +3132,8 @@ def combine_first(self, other): out = ops.fillna(self, other, join="outer", dataset_join="outer") return out - def reduce(self, func, dim=None, keep_attrs=None, numeric_only=False, - allow_lazy=False, **kwargs): + def reduce(self, func, dim=None, keep_attrs=None, keepdims=None, + numeric_only=False, allow_lazy=False, **kwargs): """Reduce this dataset by applying `func` along some dimension(s). Parameters @@ -3149,6 +3149,10 @@ def reduce(self, func, dim=None, keep_attrs=None, numeric_only=False, If True, the dataset's attributes (`attrs`) will be copied from the original object to the new one. If False (default), the new object will be returned without attributes. + keepdims : bool, optional + If True, the dimensions which are reduced are left in the result + as dimensions of size one. Coordinates that use these dimensions + are removed. numeric_only : bool, optional If True, only apply ``func`` to variables with a numeric dtype. **kwargs : dict @@ -3198,6 +3202,7 @@ def reduce(self, func, dim=None, keep_attrs=None, numeric_only=False, reduce_dims = None variables[name] = var.reduce(func, dim=reduce_dims, keep_attrs=keep_attrs, + keepdims=keepdims, allow_lazy=allow_lazy, **kwargs) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 41f8795b595..aac012a6fc6 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1334,7 +1334,7 @@ def where(self, cond, other=dtypes.NA): return ops.where_method(self, cond, other) def reduce(self, func, dim=None, axis=None, - keep_attrs=None, allow_lazy=False, **kwargs): + keep_attrs=None, keepdims=None, allow_lazy=False, **kwargs): """Reduce this array by applying `func` along some dimension(s). Parameters @@ -1354,6 +1354,9 @@ def reduce(self, func, dim=None, axis=None, If True, the variable's attributes (`attrs`) will be copied from the original object to the new one. If False (default), the new object will be returned without attributes. + keepdims : bool, optional + If True, the dimensions which are reduced are left in the result + as dimensions of size one **kwargs : dict Additional keyword arguments passed on to `func`. @@ -1388,6 +1391,12 @@ def reduce(self, func, dim=None, axis=None, keep_attrs = _get_keep_attrs(default=False) attrs = self._attrs if keep_attrs else None + if keepdims: + for i, d in enumerate(self.dims): + if d not in dims: + dims.insert(i, d) + data = np.expand_dims(data, axis=i) + return Variable(dims, data, attrs=attrs) @classmethod diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index fd9076e7f65..bcf72e2a4a3 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -1976,6 +1976,44 @@ def test_reduce(self): dims=['x', 'y']).mean('x') assert_equal(actual, expected) + def test_reduce_keepdims(self): + coords = {'x': [-1, -2], 'y': ['ab', 'cd', 'ef'], + 'lat': (['x', 'y'], [[1, 2, 3], [-1, -2, -3]]), + 'c': -999} + orig = DataArray([[-1, 0, 1], [-3, 0, 3]], coords, dims=['x', 'y']) + + # Mean on all axes loses non-constant coordinates + actual = orig.mean(keepdims=True) + expected = DataArray(orig.data.mean(keepdims=True), dims=orig.dims, + coords={k: v for k, v in coords.items() + if k in ['c']}) + assert_equal(actual, expected) + + assert actual.sizes['x'] == 1 + assert actual.sizes['y'] == 1 + + # Mean on specific axes loses coordinates not involving that axis + actual = orig.mean('y', keepdims=True) + expected = DataArray(orig.data.mean(axis=1, keepdims=True), + dims=orig.dims, + coords={k: v for k, v in coords.items() + if k not in ['y', 'lat']}) + assert_equal(actual, expected) + + @requires_bottleneck + def test_reduce_keepdims_bottleneck(self): + import bottleneck + + coords = {'x': [-1, -2], 'y': ['ab', 'cd', 'ef'], + 'lat': (['x', 'y'], [[1, 2, 3], [-1, -2, -3]]), + 'c': -999} + orig = DataArray([[-1, 0, 1], [-3, 0, 3]], coords, dims=['x', 'y']) + + # Bottleneck does not have its own keepdims implementation + actual = orig.reduce(bottleneck.nanmean, keepdims=True) + expected = orig.mean(keepdims=True) + assert_equal(actual, expected) + def test_reduce_dtype(self): coords = {'x': [-1, -2], 'y': ['ab', 'cd', 'ef'], 'lat': (['x', 'y'], [[1, 2, 3], [-1, -2, -3]]), diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 812e2893db5..90e85406473 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -3858,6 +3858,25 @@ def total_sum(x): with raises_regex(TypeError, "unexpected keyword argument 'axis'"): ds.reduce(total_sum, dim='x') + def test_reduce_keepdims(self): + ds = Dataset({'a': (['x', 'y'], [[0, 1, 2, 3, 4]])}, + coords={'y': [0, 1, 2, 3, 4], 'x': [0], + 'lat': (['x', 'y'], [[0, 1, 2, 3, 4]]), + 'c': -999.0}) + + # Shape should match behaviour of numpy reductions with keepdims=True + # Coordinates involved in the reduction should be removed + actual = ds.mean(keepdims=True) + expected = Dataset({'a': (['x', 'y'], np.mean(ds.a, keepdims=True))}, + coords={'c': ds.c}) + assert_identical(expected, actual) + + actual = ds.mean('x', keepdims=True) + expected = Dataset({'a': (['x', 'y'], + np.mean(ds.a, axis=0, keepdims=True))}, + coords={'y': ds.y, 'c': ds.c}) + assert_identical(expected, actual) + def test_quantile(self): ds = create_test_data(seed=123) diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 4ddd114d767..10693a9dc1a 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -1540,6 +1540,21 @@ def test_reduce_funcs(self): assert_identical( v.max(), Variable([], pd.Timestamp('2000-01-03'))) + def test_reduce_keepdims(self): + v = Variable(['x', 'y'], self.d) + + assert_identical(v.mean(keepdims=True), + Variable(v.dims, np.mean(self.d, keepdims=True))) + assert_identical(v.mean(dim='x', keepdims=True), + Variable(v.dims, np.mean(self.d, axis=0, + keepdims=True))) + assert_identical(v.mean(dim='y', keepdims=True), + Variable(v.dims, np.mean(self.d, axis=1, + keepdims=True))) + assert_identical(v.mean(dim=['y', 'x'], keepdims=True), + Variable(v.dims, np.mean(self.d, axis=(1, 0), + keepdims=True))) + def test_reduce_keep_attrs(self): _attrs = {'units': 'test', 'long_name': 'testing'} From bd2dc61a8ff1062c5daf03c77f633d7e3219468c Mon Sep 17 00:00:00 2001 From: Scott Wales Date: Wed, 19 Jun 2019 12:54:36 +1000 Subject: [PATCH 2/7] Set the default to be `False` --- xarray/core/dataarray.py | 2 +- xarray/core/dataset.py | 2 +- xarray/core/variable.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index af224e8be53..4ad3efa3329 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -1643,7 +1643,7 @@ def combine_first(self, other): """ return ops.fillna(self, other, join="outer") - def reduce(self, func, dim=None, axis=None, keep_attrs=None, keepdims=None, + def reduce(self, func, dim=None, axis=None, keep_attrs=None, keepdims=False, **kwargs): """Reduce this array by applying `func` along some dimension(s). diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index aec26c67d7e..b425adc690f 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -3132,7 +3132,7 @@ def combine_first(self, other): out = ops.fillna(self, other, join="outer", dataset_join="outer") return out - def reduce(self, func, dim=None, keep_attrs=None, keepdims=None, + def reduce(self, func, dim=None, keep_attrs=None, keepdims=False, numeric_only=False, allow_lazy=False, **kwargs): """Reduce this dataset by applying `func` along some dimension(s). diff --git a/xarray/core/variable.py b/xarray/core/variable.py index aac012a6fc6..3058f9876fc 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1334,7 +1334,7 @@ def where(self, cond, other=dtypes.NA): return ops.where_method(self, cond, other) def reduce(self, func, dim=None, axis=None, - keep_attrs=None, keepdims=None, allow_lazy=False, **kwargs): + keep_attrs=None, keepdims=False, allow_lazy=False, **kwargs): """Reduce this array by applying `func` along some dimension(s). Parameters From 96611fa9aebdae1f79f3a8ea5a464ce3679dd7db Mon Sep 17 00:00:00 2001 From: Scott Wales Date: Wed, 19 Jun 2019 14:20:58 +1000 Subject: [PATCH 3/7] Correct lint error --- xarray/core/dataarray.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 4ad3efa3329..b659c2c82cc 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -1643,8 +1643,8 @@ def combine_first(self, other): """ return ops.fillna(self, other, join="outer") - def reduce(self, func, dim=None, axis=None, keep_attrs=None, keepdims=False, - **kwargs): + def reduce(self, func, dim=None, axis=None, keep_attrs=None, + keepdims=False, **kwargs): """Reduce this array by applying `func` along some dimension(s). Parameters From 25e679d85a5e742b89c78653cace033352fbdad0 Mon Sep 17 00:00:00 2001 From: Scott Wales Date: Thu, 20 Jun 2019 14:55:10 +1000 Subject: [PATCH 4/7] Apply suggestions from code review Co-Authored-By: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> --- xarray/core/dataarray.py | 2 +- xarray/core/dataset.py | 2 +- xarray/core/variable.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index b659c2c82cc..8c549157a6a 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -1664,7 +1664,7 @@ def reduce(self, func, dim=None, axis=None, keep_attrs=None, If True, the variable's attributes (`attrs`) will be copied from the original object to the new one. If False (default), the new object will be returned without attributes. - keepdims : bool, optional + keepdims : bool, default False If True, the dimensions which are reduced are left in the result as dimensions of size one. Coordinates that use these dimensions are removed. diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index b425adc690f..9b307529bbb 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -3149,7 +3149,7 @@ def reduce(self, func, dim=None, keep_attrs=None, keepdims=False, If True, the dataset's attributes (`attrs`) will be copied from the original object to the new one. If False (default), the new object will be returned without attributes. - keepdims : bool, optional + keepdims : bool, default False If True, the dimensions which are reduced are left in the result as dimensions of size one. Coordinates that use these dimensions are removed. diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 3058f9876fc..e2e8b69cb06 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1354,7 +1354,7 @@ def reduce(self, func, dim=None, axis=None, If True, the variable's attributes (`attrs`) will be copied from the original object to the new one. If False (default), the new object will be returned without attributes. - keepdims : bool, optional + keepdims : bool, default False If True, the dimensions which are reduced are left in the result as dimensions of size one **kwargs : dict From b832a545365e08176f1773439420215a1a08780f Mon Sep 17 00:00:00 2001 From: Scott Wales Date: Fri, 21 Jun 2019 12:07:56 +1000 Subject: [PATCH 5/7] Add test for dask and fix implementation --- xarray/core/variable.py | 4 +++- xarray/tests/test_variable.py | 19 +++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index e2e8b69cb06..ff5c92527c0 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1392,10 +1392,12 @@ def reduce(self, func, dim=None, axis=None, attrs = self._attrs if keep_attrs else None if keepdims: + slices = [slice(None, None) for d in dims] for i, d in enumerate(self.dims): if d not in dims: dims.insert(i, d) - data = np.expand_dims(data, axis=i) + slices.insert(i, np.newaxis) + data = data[tuple(slices)] return Variable(dims, data, attrs=attrs) diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 10693a9dc1a..661a83ee9d3 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -1555,6 +1555,25 @@ def test_reduce_keepdims(self): Variable(v.dims, np.mean(self.d, axis=(1, 0), keepdims=True))) + @requires_dask + def test_reduce_keepdims_dask(self): + import dask.array + v = Variable(['x', 'y'], self.d).chunk() + + actual = v.mean(keepdims=True) + assert isinstance(actual.data, dask.array.Array) + + expected = Variable(v.dims, np.mean(self.d, keepdims=True)) + assert_identical(actual, expected) + + actual = v.mean(dim='y', keepdims=True) + assert isinstance(actual.data, dask.array.Array) + + expected = Variable(v.dims, np.mean(self.d, axis=1, keepdims=True)) + assert_identical(actual, expected) + + + def test_reduce_keep_attrs(self): _attrs = {'units': 'test', 'long_name': 'testing'} From fd7a3e32b00ca0825640eedd61c0e007086748cd Mon Sep 17 00:00:00 2001 From: Scott Wales Date: Fri, 21 Jun 2019 12:36:10 +1000 Subject: [PATCH 6/7] Move 'keepdims' up to where 'dims' is set --- xarray/core/variable.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index ff5c92527c0..ab1be181e31 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1384,21 +1384,24 @@ def reduce(self, func, dim=None, axis=None, else: removed_axes = (range(self.ndim) if axis is None else np.atleast_1d(axis) % self.ndim) - dims = [adim for n, adim in enumerate(self.dims) - if n not in removed_axes] + if keepdims: + # Insert np.newaxis for removed dims + slices = tuple(np.newaxis if i in removed_axes else + slice(None, None) for i in range(self.ndim)) + if getattr(data, 'shape', None) is None: + # Reduce has produced a scalar value, not an array-like + data = np.asanyarray(data)[slices] + else: + data = data[slices] + dims = self.dims + else: + dims = [adim for n, adim in enumerate(self.dims) + if n not in removed_axes] if keep_attrs is None: keep_attrs = _get_keep_attrs(default=False) attrs = self._attrs if keep_attrs else None - if keepdims: - slices = [slice(None, None) for d in dims] - for i, d in enumerate(self.dims): - if d not in dims: - dims.insert(i, d) - slices.insert(i, np.newaxis) - data = data[tuple(slices)] - return Variable(dims, data, attrs=attrs) @classmethod From 413a1e1a878b754e2d249eeebb4074ddaefeeb40 Mon Sep 17 00:00:00 2001 From: Scott Wales Date: Fri, 21 Jun 2019 13:19:21 +1000 Subject: [PATCH 7/7] Fix lint, add test for scalar variable --- xarray/tests/test_variable.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 661a83ee9d3..5da83880539 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -1555,6 +1555,10 @@ def test_reduce_keepdims(self): Variable(v.dims, np.mean(self.d, axis=(1, 0), keepdims=True))) + v = Variable([], 1.0) + assert_identical(v.mean(keepdims=True), + Variable([], np.mean(v.data, keepdims=True))) + @requires_dask def test_reduce_keepdims_dask(self): import dask.array @@ -1572,8 +1576,6 @@ def test_reduce_keepdims_dask(self): expected = Variable(v.dims, np.mean(self.d, axis=1, keepdims=True)) assert_identical(actual, expected) - - def test_reduce_keep_attrs(self): _attrs = {'units': 'test', 'long_name': 'testing'}