From 0f8619d8b58841019f53f02bff5cb23b043b1040 Mon Sep 17 00:00:00 2001 From: Maximilian Roos Date: Fri, 1 Sep 2023 15:07:15 -0700 Subject: [PATCH 01/12] Allow `apply_ufunc` to ignore missing core dims --- xarray/core/computation.py | 43 +++++++++++++++++++++++++++++--- xarray/tests/test_computation.py | 38 ++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+), 4 deletions(-) diff --git a/xarray/core/computation.py b/xarray/core/computation.py index 685307fc8c3..2e7e6f32e17 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -32,6 +32,8 @@ from xarray.core.dataset import Dataset from xarray.core.types import CombineAttrsOptions, JoinOptions + MissingCoreDimOptions = Literal["raise", "copy", "drop"] + _NO_FILL_VALUE = utils.ReprObject("") _DEFAULT_NAME = utils.ReprObject("") _JOINS_WITHOUT_FILL_VALUES = frozenset({"inner", "exact"}) @@ -42,6 +44,7 @@ def _first_of_type(args, kind): for arg in args: if isinstance(arg, kind): return arg + raise ValueError("This should be unreachable.") @@ -376,7 +379,7 @@ def collect_dict_values( ] -def _as_variables_or_variable(arg): +def _as_variables_or_variable(arg) -> Variable | tuple[Variable]: try: return arg.variables except AttributeError: @@ -397,7 +400,12 @@ def _unpack_dict_tuples( def apply_dict_of_variables_vfunc( - func, *args, signature: _UFuncSignature, join="inner", fill_value=None + func, + *args, + signature: _UFuncSignature, + join="inner", + fill_value=None, + missing_core_dim: MissingCoreDimOptions = "raise", ): """Apply a variable level function over dicts of DataArray, DataArray, Variable and ndarray objects. @@ -408,7 +416,26 @@ def apply_dict_of_variables_vfunc( result_vars = {} for name, variable_args in zip(names, grouped_by_name): - result_vars[name] = func(*variable_args) + # TODO: is this a reasonable to check for missing core dims? We check for a dims + # property to protect against the case where a numpy array is passed in. + if hasattr(variable_args[0], "dims") and set( + signature.all_input_core_dims + ) - set(variable_args[0].dims): + if missing_core_dim == "raise": + raise ValueError( + f"Missing core dimension on {name!r}. Either add the core dimension, or set `missing_core_dim` to `copy` or `drop`." + ) + elif missing_core_dim == "copy": + # TODO: is it correct to copy the first variable here? + result_vars[name] = variable_args[0] + elif missing_core_dim == "drop": + pass + else: + raise ValueError( + f"Invalid value for `missing_core_dim`: {missing_core_dim!r}" + ) + else: + result_vars[name] = func(*variable_args) if signature.num_outputs > 1: return _unpack_dict_tuples(result_vars, signature.num_outputs) @@ -441,6 +468,7 @@ def apply_dataset_vfunc( fill_value=_NO_FILL_VALUE, exclude_dims=frozenset(), keep_attrs="override", + missing_core_dim: MissingCoreDimOptions = "raise", ) -> Dataset | tuple[Dataset, ...]: """Apply a variable level function over Dataset, dict of DataArray, DataArray, Variable and/or ndarray objects. @@ -467,7 +495,12 @@ def apply_dataset_vfunc( args = tuple(getattr(arg, "data_vars", arg) for arg in args) result_vars = apply_dict_of_variables_vfunc( - func, *args, signature=signature, join=dataset_join, fill_value=fill_value + func, + *args, + signature=signature, + join=dataset_join, + fill_value=fill_value, + missing_core_dim=missing_core_dim, ) out: Dataset | tuple[Dataset, ...] @@ -850,6 +883,7 @@ def apply_ufunc( output_sizes: Mapping[Any, int] | None = None, meta: Any = None, dask_gufunc_kwargs: dict[str, Any] | None = None, + missing_core_dim: MissingCoreDimOptions = "raise", ) -> Any: """Apply a vectorized function for unlabeled arrays on xarray objects. @@ -1191,6 +1225,7 @@ def apply_ufunc( dataset_join=dataset_join, fill_value=dataset_fill_value, keep_attrs=keep_attrs, + missing_core_dim=missing_core_dim, ) # feed DataArray apply_variable_ufunc through apply_dataarray_vfunc elif any(isinstance(a, DataArray) for a in args): diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py index 3c10e3f27ab..8a9eb628f8e 100644 --- a/xarray/tests/test_computation.py +++ b/xarray/tests/test_computation.py @@ -257,6 +257,44 @@ def func(x): assert_identical(out1, dataset) +def test_apply_missing_dims() -> None: + def add_one(a, core_dims, missing_core_dim): + return apply_ufunc( + lambda x: x + 1, + a, + input_core_dims=core_dims, + output_core_dims=core_dims, + missing_core_dim=missing_core_dim, + ) + + array = np.arange(6).reshape(2, 3) + variable = xr.Variable(["x", "y"], array) + variable_no_y = xr.Variable(["x", "z"], array) + + dataset = xr.Dataset({"matching": variable, "missing": variable_no_y}) + + # Check the standard stuff works OK + assert_identical( + add_one(dataset[["matching"]], core_dims=[["y"]], missing_core_dim="raise"), + dataset[["matching"]] + 1, + ) + + # `raise` — should raise on a missing dim + with pytest.raises(ValueError): + add_one(dataset, core_dims=[["y"]], missing_core_dim="raise"), + + # `drop` — should drop the var with the missing dim + assert_identical( + add_one(dataset, core_dims=[["y"]], missing_core_dim="drop"), + (dataset + 1).drop_vars("missing"), + ) + + # `copy` — should not add one to the missing with `copy` + copy_result = add_one(dataset, core_dims=[["y"]], missing_core_dim="copy") + assert_identical(copy_result["matching"], (dataset + 1)["matching"]) + assert_identical(copy_result["missing"], dataset["missing"]) + + @requires_dask def test_apply_dask_parallelized_two_outputs() -> None: data_array = xr.DataArray([[0, 1, 2], [1, 2, 3]], dims=("x", "y")) From ebe707ba5824551776b4bd0a68bcb060a1ecdecb Mon Sep 17 00:00:00 2001 From: Maximilian Roos Date: Mon, 11 Sep 2023 17:52:04 -0700 Subject: [PATCH 02/12] --- xarray/core/computation.py | 47 +++++++++++-------- xarray/tests/test_computation.py | 80 +++++++++++++++++++++++++++++--- 2 files changed, 102 insertions(+), 25 deletions(-) diff --git a/xarray/core/computation.py b/xarray/core/computation.py index 0c63a4da22e..0264e613192 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -416,25 +416,34 @@ def apply_dict_of_variables_vfunc( result_vars = {} for name, variable_args in zip(names, grouped_by_name): - # TODO: is this a reasonable to check for missing core dims? We check for a dims - # property to protect against the case where a numpy array is passed in. - if hasattr(variable_args[0], "dims") and set( - signature.all_input_core_dims - ) - set(variable_args[0].dims): - if missing_core_dim == "raise": - raise ValueError( - f"Missing core dimension on {name!r}. Either add the core dimension, or set `missing_core_dim` to `copy` or `drop`." - ) - elif missing_core_dim == "copy": - # TODO: is it correct to copy the first variable here? - result_vars[name] = variable_args[0] - elif missing_core_dim == "drop": - pass - else: - raise ValueError( - f"Invalid value for `missing_core_dim`: {missing_core_dim!r}" - ) - else: + for core_dims, variable_arg in zip(signature.input_core_dims, variable_args): + # (if there's a more elegant way to do this than using a temporary, that'd + # be nice. But we need to have context of the specific object failing in + # order to produce a good error message; and need to copy the variable if + # necessary, skipping the intended evaluation of `func`.) + all_vars_have_all_core_dims = True + # Check whether all the dims are on the variable. Note that we need the + # `hasattr` to check for a dims property, to protect against the case where + # a numpy array is passed in. + if hasattr(variable_arg, "dims") and set(core_dims) - set( + variable_arg.dims + ): + if missing_core_dim == "raise": + raise ValueError( + f"Missing core dimension(s) {set(core_dims) - set(variable_arg.dims)} on `{name}` (object below). " + "Either add the core dimension, or set `missing_core_dim` to `copy` or `drop`." + f"\n\n{variable_arg}" + ) + elif missing_core_dim == "copy": + result_vars[name] = variable_args[0] + all_vars_have_all_core_dims = False + elif missing_core_dim == "drop": + all_vars_have_all_core_dims = False + else: + raise ValueError( + f"Invalid value for `missing_core_dim`: {missing_core_dim!r}" + ) + if all_vars_have_all_core_dims: result_vars[name] = func(*variable_args) if signature.num_outputs > 1: diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py index 8a9eb628f8e..c3db1352ca8 100644 --- a/xarray/tests/test_computation.py +++ b/xarray/tests/test_computation.py @@ -258,6 +258,8 @@ def func(x): def test_apply_missing_dims() -> None: + ## Single arg + def add_one(a, core_dims, missing_core_dim): return apply_ufunc( lambda x: x + 1, @@ -271,12 +273,12 @@ def add_one(a, core_dims, missing_core_dim): variable = xr.Variable(["x", "y"], array) variable_no_y = xr.Variable(["x", "z"], array) - dataset = xr.Dataset({"matching": variable, "missing": variable_no_y}) + dataset = xr.Dataset({"x_y": variable, "x_z": variable_no_y}) # Check the standard stuff works OK assert_identical( - add_one(dataset[["matching"]], core_dims=[["y"]], missing_core_dim="raise"), - dataset[["matching"]] + 1, + add_one(dataset[["x_y"]], core_dims=[["y"]], missing_core_dim="raise"), + dataset[["x_y"]] + 1, ) # `raise` — should raise on a missing dim @@ -286,13 +288,79 @@ def add_one(a, core_dims, missing_core_dim): # `drop` — should drop the var with the missing dim assert_identical( add_one(dataset, core_dims=[["y"]], missing_core_dim="drop"), - (dataset + 1).drop_vars("missing"), + (dataset + 1).drop_vars("x_z"), ) # `copy` — should not add one to the missing with `copy` copy_result = add_one(dataset, core_dims=[["y"]], missing_core_dim="copy") - assert_identical(copy_result["matching"], (dataset + 1)["matching"]) - assert_identical(copy_result["missing"], dataset["missing"]) + assert_identical(copy_result["x_y"], (dataset + 1)["x_y"]) + assert_identical(copy_result["x_z"], dataset["x_z"]) + + ## Multiple args + + def sum_add(a, b, core_dims, missing_core_dim): + return apply_ufunc( + lambda a, b: a.sum() + b.sum(), + a, + b, + input_core_dims=core_dims, + missing_core_dim=missing_core_dim, + ) + + # Check the standard stuff works OK + assert_identical( + sum_add( + dataset[["x_y"]], + dataset[["x_y"]], + core_dims=[["x", "y"], ["x", "y"]], + missing_core_dim="raise", + ), + dataset[["x_y"]].sum() * 2, + ) + + # `raise` — should raise on a missing dim + with pytest.raises( + ValueError, match=r".*Missing core dimension\(s\) \{'y'\} on `x_z`.*" + ): + sum_add( + dataset[["x_z"]], + dataset[["x_z"]], + core_dims=[["x", "y"], ["x", "z"]], + missing_core_dim="raise", + ) + + # `raise` on a missing dim on a non-first arg + with pytest.raises( + ValueError, match=r".*Missing core dimension\(s\) \{'y'\} on `x_z`.*" + ): + sum_add( + dataset[["x_z"]], + dataset[["x_z"]], + core_dims=[["x", "z"], ["x", "y"]], + missing_core_dim="raise", + ) + + # `drop` — should drop the var with the missing dim + assert_identical( + sum_add( + dataset[["x_z"]], + dataset[["x_z"]], + core_dims=[["x", "y"], ["x", "y"]], + missing_core_dim="drop", + ), + dataset[[]], + ) + + # `copy` — should drop the var with the missing dim + assert_identical( + sum_add( + dataset[["x_z"]], + dataset[["x_z"]], + core_dims=[["x", "y"], ["x", "y"]], + missing_core_dim="copy", + ), + dataset[["x_z"]], + ) @requires_dask From 49a686dfa654409febe4d399bb8b13d41d2b0185 Mon Sep 17 00:00:00 2001 From: Maximilian Roos Date: Mon, 11 Sep 2023 20:00:10 -0700 Subject: [PATCH 03/12] --- xarray/tests/test_computation.py | 56 +++++++++++++++++++------------- 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py index c3db1352ca8..a9135ab4d9f 100644 --- a/xarray/tests/test_computation.py +++ b/xarray/tests/test_computation.py @@ -273,28 +273,28 @@ def add_one(a, core_dims, missing_core_dim): variable = xr.Variable(["x", "y"], array) variable_no_y = xr.Variable(["x", "z"], array) - dataset = xr.Dataset({"x_y": variable, "x_z": variable_no_y}) + ds = xr.Dataset({"x_y": variable, "x_z": variable_no_y}) # Check the standard stuff works OK assert_identical( - add_one(dataset[["x_y"]], core_dims=[["y"]], missing_core_dim="raise"), - dataset[["x_y"]] + 1, + add_one(ds[["x_y"]], core_dims=[["y"]], missing_core_dim="raise"), + ds[["x_y"]] + 1, ) # `raise` — should raise on a missing dim with pytest.raises(ValueError): - add_one(dataset, core_dims=[["y"]], missing_core_dim="raise"), + add_one(ds, core_dims=[["y"]], missing_core_dim="raise"), # `drop` — should drop the var with the missing dim assert_identical( - add_one(dataset, core_dims=[["y"]], missing_core_dim="drop"), - (dataset + 1).drop_vars("x_z"), + add_one(ds, core_dims=[["y"]], missing_core_dim="drop"), + (ds + 1).drop_vars("x_z"), ) # `copy` — should not add one to the missing with `copy` - copy_result = add_one(dataset, core_dims=[["y"]], missing_core_dim="copy") - assert_identical(copy_result["x_y"], (dataset + 1)["x_y"]) - assert_identical(copy_result["x_z"], dataset["x_z"]) + copy_result = add_one(ds, core_dims=[["y"]], missing_core_dim="copy") + assert_identical(copy_result["x_y"], (ds + 1)["x_y"]) + assert_identical(copy_result["x_z"], ds["x_z"]) ## Multiple args @@ -310,12 +310,12 @@ def sum_add(a, b, core_dims, missing_core_dim): # Check the standard stuff works OK assert_identical( sum_add( - dataset[["x_y"]], - dataset[["x_y"]], + ds[["x_y"]], + ds[["x_y"]], core_dims=[["x", "y"], ["x", "y"]], missing_core_dim="raise", ), - dataset[["x_y"]].sum() * 2, + ds[["x_y"]].sum() * 2, ) # `raise` — should raise on a missing dim @@ -323,8 +323,8 @@ def sum_add(a, b, core_dims, missing_core_dim): ValueError, match=r".*Missing core dimension\(s\) \{'y'\} on `x_z`.*" ): sum_add( - dataset[["x_z"]], - dataset[["x_z"]], + ds[["x_z"]], + ds[["x_z"]], core_dims=[["x", "y"], ["x", "z"]], missing_core_dim="raise", ) @@ -334,8 +334,8 @@ def sum_add(a, b, core_dims, missing_core_dim): ValueError, match=r".*Missing core dimension\(s\) \{'y'\} on `x_z`.*" ): sum_add( - dataset[["x_z"]], - dataset[["x_z"]], + ds[["x_z"]], + ds[["x_z"]], core_dims=[["x", "z"], ["x", "y"]], missing_core_dim="raise", ) @@ -343,25 +343,37 @@ def sum_add(a, b, core_dims, missing_core_dim): # `drop` — should drop the var with the missing dim assert_identical( sum_add( - dataset[["x_z"]], - dataset[["x_z"]], + ds[["x_z"]], + ds[["x_z"]], core_dims=[["x", "y"], ["x", "y"]], missing_core_dim="drop", ), - dataset[[]], + ds[[]], ) # `copy` — should drop the var with the missing dim assert_identical( sum_add( - dataset[["x_z"]], - dataset[["x_z"]], + ds[["x_z"]], + ds[["x_z"]], core_dims=[["x", "y"], ["x", "y"]], missing_core_dim="copy", ), - dataset[["x_z"]], + ds[["x_z"]], ) + # TODO: need to add tests for passing in datasets with multiple vars + # ## Multiple vars per arg + # assert_identical( + # sum_add( + # ds, + # ds, + # core_dims=[["x", "z"], ["x", "y"]], + # missing_core_dim="drop", + # ), + # ds, + # ) + @requires_dask def test_apply_dask_parallelized_two_outputs() -> None: From bf754880019aa28fb7d34d914c5cdeff7443c91d Mon Sep 17 00:00:00 2001 From: Maximilian Roos Date: Wed, 13 Sep 2023 13:32:48 -0700 Subject: [PATCH 04/12] Display data returned in ufunc error message This makes debugging much easier! --- xarray/core/computation.py | 22 ++++++++++------------ xarray/tests/test_computation.py | 10 ++++++++-- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/xarray/core/computation.py b/xarray/core/computation.py index 235b52402f1..ed9e8acf971 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -766,11 +766,9 @@ def func(*arrays): not isinstance(result_data, tuple) or len(result_data) != signature.num_outputs ): raise ValueError( - "applied function does not have the number of " - "outputs specified in the ufunc signature. " - "Result is not a tuple of {} elements: {!r}".format( - signature.num_outputs, result_data - ) + f"applied function does not have the number of " + f"outputs specified in the ufunc signature. " + f"Result is not a tuple of {signature.num_outputs} elements:\n\n{result_data}" ) objs = _all_of_type(args, Variable) @@ -784,21 +782,21 @@ def func(*arrays): data = as_compatible_data(data) if data.ndim != len(dims): raise ValueError( - "applied function returned data with unexpected " + "applied function returned data with an unexpected " f"number of dimensions. Received {data.ndim} dimension(s) but " - f"expected {len(dims)} dimensions with names: {dims!r}" + f"expected {len(dims)} dimensions with names: {dims!r}. The data returned " + f"was:\n\n{data!r}" ) var = Variable(dims, data, fastpath=True) for dim, new_size in var.sizes.items(): if dim in dim_sizes and new_size != dim_sizes[dim]: raise ValueError( - "size of dimension {!r} on inputs was unexpectedly " - "changed by applied function from {} to {}. Only " + f"size of dimension '{dim}' on inputs was unexpectedly " + f"changed by applied function from {dim_sizes[dim]} to {new_size}. Only " "dimensions specified in ``exclude_dims`` with " - "xarray.apply_ufunc are allowed to change size.".format( - dim, dim_sizes[dim], new_size - ) + "xarray.apply_ufunc are allowed to change size. " + "The data returned was:\n\n{data!r}" ) var.attrs = attrs diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py index 3c10e3f27ab..204e55b5b75 100644 --- a/xarray/tests/test_computation.py +++ b/xarray/tests/test_computation.py @@ -1666,7 +1666,10 @@ def identity(x): def tuple3x(x): return (x, x, x) - with pytest.raises(ValueError, match=r"number of outputs"): + with pytest.raises( + ValueError, + match=r"number of outputs.*Result is not a tuple of 2 elements:\n\n\[0 1 2 3 4 5 6 7 8 9\]", + ): apply_ufunc(identity, variable, output_core_dims=[(), ()]) with pytest.raises(ValueError, match=r"number of outputs"): @@ -1682,7 +1685,10 @@ def add_dim(x): def remove_dim(x): return x[..., 0] - with pytest.raises(ValueError, match=r"unexpected number of dimensions"): + with pytest.raises( + ValueError, + match=r"unexpected number of dimensions.*The data returned was:\n\n.*array\(\[\[0", + ): apply_ufunc(add_dim, variable, output_core_dims=[("y", "z")]) with pytest.raises(ValueError, match=r"unexpected number of dimensions"): From 66278ee018dbe1cfc5668169d5802424092ad0f9 Mon Sep 17 00:00:00 2001 From: Maximilian Roos Date: Wed, 13 Sep 2023 19:54:20 -0700 Subject: [PATCH 05/12] Add tests for multiple var dataset --- xarray/core/computation.py | 71 +++++++++++++++----------------- xarray/tests/test_computation.py | 66 +++++++++++++++++++++++------ 2 files changed, 87 insertions(+), 50 deletions(-) diff --git a/xarray/core/computation.py b/xarray/core/computation.py index 10fe33f5103..70e62e037e9 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -350,7 +350,7 @@ def assert_and_return_exact_match(all_keys): if keys != first_keys: raise ValueError( "exact match required for all data variable names, " - f"but {keys!r} != {first_keys!r}" + f"but {list(keys)} != {list(first_keys)}: {set(keys) ^ set(first_keys)} are not in both." ) return first_keys @@ -399,6 +399,24 @@ def _unpack_dict_tuples( return out +def _check_core_dims(signature, variable_args, name): + for core_dims, variable_arg in zip(signature.input_core_dims, variable_args): + # Check whether all the dims are on the variable. Note that we need the + # `hasattr` to check for a dims property, to protect against the case where + # a numpy array is passed in. + if hasattr(variable_arg, "dims") and set(core_dims) - set(variable_arg.dims): + # Slightly awkward design, of returning the error message. But we want to + # give a detailed error message, which requires inspecting the variable in + # the inner loop. + return ( + f"Missing core dimension(s) {set(core_dims) - set(variable_arg.dims)} on `{name}`. " + "Either add the core dimension, or set `missing_core_dim` to `copy` or `drop`. " + "The object:" + f"\n\n{variable_arg}" + ) + return True + + def apply_dict_of_variables_vfunc( func, *args, @@ -416,35 +434,20 @@ def apply_dict_of_variables_vfunc( result_vars = {} for name, variable_args in zip(names, grouped_by_name): - for core_dims, variable_arg in zip(signature.input_core_dims, variable_args): - # (if there's a more elegant way to do this than using a temporary, that'd - # be nice. But we need to have context of the specific object failing in - # order to produce a good error message; and need to copy the variable if - # necessary, skipping the intended evaluation of `func`.) - all_vars_have_all_core_dims = True - # Check whether all the dims are on the variable. Note that we need the - # `hasattr` to check for a dims property, to protect against the case where - # a numpy array is passed in. - if hasattr(variable_arg, "dims") and set(core_dims) - set( - variable_arg.dims - ): - if missing_core_dim == "raise": - raise ValueError( - f"Missing core dimension(s) {set(core_dims) - set(variable_arg.dims)} on `{name}` (object below). " - "Either add the core dimension, or set `missing_core_dim` to `copy` or `drop`." - f"\n\n{variable_arg}" - ) - elif missing_core_dim == "copy": - result_vars[name] = variable_args[0] - all_vars_have_all_core_dims = False - elif missing_core_dim == "drop": - all_vars_have_all_core_dims = False - else: - raise ValueError( - f"Invalid value for `missing_core_dim`: {missing_core_dim!r}" - ) - if all_vars_have_all_core_dims: + core_dim_check = _check_core_dims(signature, variable_args, name) + if core_dim_check is True: result_vars[name] = func(*variable_args) + else: + if missing_core_dim == "raise": + raise ValueError(core_dim_check) + elif missing_core_dim == "copy": + result_vars[name] = variable_args[0] + elif missing_core_dim == "drop": + pass + else: + raise ValueError( + f"Invalid value for `missing_core_dim`: {missing_core_dim!r}" + ) if signature.num_outputs > 1: return _unpack_dict_tuples(result_vars, signature.num_outputs) @@ -637,17 +640,9 @@ def broadcast_compat_data( return data set_old_dims = set(old_dims) - missing_core_dims = [d for d in core_dims if d not in set_old_dims] - if missing_core_dims: - raise ValueError( - "operand to apply_ufunc has required core dimensions {}, but " - "some of these dimensions are absent on an input variable: {}".format( - list(core_dims), missing_core_dims - ) - ) - set_new_dims = set(new_dims) unexpected_dims = [d for d in old_dims if d not in set_new_dims] + if unexpected_dims: raise ValueError( "operand to apply_ufunc encountered unexpected " diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py index 775ac7c0d5d..5fe0087136d 100644 --- a/xarray/tests/test_computation.py +++ b/xarray/tests/test_computation.py @@ -300,7 +300,7 @@ def add_one(a, core_dims, missing_core_dim): def sum_add(a, b, core_dims, missing_core_dim): return apply_ufunc( - lambda a, b: a.sum() + b.sum(), + lambda a, b, axis=None: a.sum(axis) + b.sum(axis), a, b, input_core_dims=core_dims, @@ -362,17 +362,59 @@ def sum_add(a, b, core_dims, missing_core_dim): ds[["x_z"]], ) - # TODO: need to add tests for passing in datasets with multiple vars - # ## Multiple vars per arg - # assert_identical( - # sum_add( - # ds, - # ds, - # core_dims=[["x", "z"], ["x", "y"]], - # missing_core_dim="drop", - # ), - # ds, - # ) + ## Multiple vars per arg + assert_identical( + sum_add( + ds[["x_y", "x_y"]], + ds[["x_y", "x_y"]], + core_dims=[["x", "y"], ["x", "y"]], + missing_core_dim="raise", + ), + ds[["x_y", "x_y"]].sum() * 2, + ) + + assert_identical( + sum_add( + ds, + ds, + core_dims=[["x", "y"], ["x", "y"]], + missing_core_dim="drop", + ), + ds[["x_y"]].sum() * 2, + ) + + assert_identical( + sum_add( + # The first one has the wrong dims — using `z` for the `x_t` var + ds.assign(x_t=ds["x_z"])[["x_y", "x_t"]], + ds.assign(x_t=ds["x_y"])[["x_y", "x_t"]], + core_dims=[["x", "y"], ["x", "y"]], + missing_core_dim="drop", + ), + ds[["x_y"]].sum() * 2, + ) + + assert_identical( + sum_add( + ds.assign(x_t=ds["x_y"])[["x_y", "x_t"]], + # The second one has the wrong dims — using `z` for the `x_t` var (seems + # duplicative but this was a bug in the initial impl...) + ds.assign(x_t=ds["x_z"])[["x_y", "x_t"]], + core_dims=[["x", "y"], ["x", "y"]], + missing_core_dim="drop", + ), + ds[["x_y"]].sum() * 2, + ) + + assert_identical( + sum_add( + ds.assign(x_t=ds["x_y"])[["x_y", "x_t"]], + ds.assign(x_t=ds["x_z"])[["x_y", "x_t"]], + core_dims=[["x", "y"], ["x", "y"]], + missing_core_dim="copy", + ), + ds.drop_vars("x_z").assign(x_y=30, x_t=ds["x_y"]), + ) @requires_dask From 519dd7130767411ca144042646425f3fdabf15ff Mon Sep 17 00:00:00 2001 From: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> Date: Fri, 15 Sep 2023 12:19:13 -0700 Subject: [PATCH 06/12] Update xarray/core/computation.py Co-authored-by: Deepak Cherian --- xarray/core/computation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/core/computation.py b/xarray/core/computation.py index 8e3b385dfde..47e04c71a66 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -434,8 +434,8 @@ def apply_dict_of_variables_vfunc( result_vars = {} for name, variable_args in zip(names, grouped_by_name): - core_dim_check = _check_core_dims(signature, variable_args, name) - if core_dim_check is True: + core_dim_present = _check_core_dims(signature, variable_args, name) + if core_dim_present is True: result_vars[name] = func(*variable_args) else: if missing_core_dim == "raise": From ca9ef5a6d6ac37fb9cf149f4837272a1f43c964f Mon Sep 17 00:00:00 2001 From: Maximilian Roos Date: Fri, 15 Sep 2023 12:22:36 -0700 Subject: [PATCH 07/12] --- xarray/core/computation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/computation.py b/xarray/core/computation.py index 47e04c71a66..3d75a4d3fce 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -439,7 +439,7 @@ def apply_dict_of_variables_vfunc( result_vars[name] = func(*variable_args) else: if missing_core_dim == "raise": - raise ValueError(core_dim_check) + raise ValueError(core_dim_present) elif missing_core_dim == "copy": result_vars[name] = variable_args[0] elif missing_core_dim == "drop": From e65d146a89a3ccdacf13b023c9bc92395d476888 Mon Sep 17 00:00:00 2001 From: Maximilian Roos Date: Fri, 15 Sep 2023 12:41:57 -0700 Subject: [PATCH 08/12] --- xarray/core/computation.py | 42 ++++++++++++++++++-------------- xarray/tests/test_computation.py | 42 +++++++++++++++++--------------- 2 files changed, 46 insertions(+), 38 deletions(-) diff --git a/xarray/core/computation.py b/xarray/core/computation.py index 3d75a4d3fce..e2c0f22e778 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -400,20 +400,26 @@ def _unpack_dict_tuples( def _check_core_dims(signature, variable_args, name): + """ + Chcek if an arg has all the core dims required by the signature. + + Slightly awkward design, of returning the error message. But we want to + give a detailed error message, which requires inspecting the variable in + the inner loop. + """ + missing = [] for core_dims, variable_arg in zip(signature.input_core_dims, variable_args): # Check whether all the dims are on the variable. Note that we need the # `hasattr` to check for a dims property, to protect against the case where # a numpy array is passed in. if hasattr(variable_arg, "dims") and set(core_dims) - set(variable_arg.dims): - # Slightly awkward design, of returning the error message. But we want to - # give a detailed error message, which requires inspecting the variable in - # the inner loop. - return ( - f"Missing core dimension(s) {set(core_dims) - set(variable_arg.dims)} on `{name}`. " - "Either add the core dimension, or set `missing_core_dim` to `copy` or `drop`. " - "The object:" - f"\n\n{variable_arg}" - ) + missing += [[variable_arg, core_dims]] + if missing: + message = f"Missing core dims from `{name}` on variable(s):\n\n" + for variable_arg, core_dims in missing: + message += f"Missing {set(core_dims) - set(variable_arg.dims)} on\n{variable_arg}\n\n" + message += "Either add the core dimension, or if passing a dataset alternatively pass `on_missing_core_dim` as `copy` or `drop`. " + return message return True @@ -423,7 +429,7 @@ def apply_dict_of_variables_vfunc( signature: _UFuncSignature, join="inner", fill_value=None, - missing_core_dim: MissingCoreDimOptions = "raise", + on_missing_core_dim: MissingCoreDimOptions = "raise", ): """Apply a variable level function over dicts of DataArray, DataArray, Variable and ndarray objects. @@ -438,15 +444,15 @@ def apply_dict_of_variables_vfunc( if core_dim_present is True: result_vars[name] = func(*variable_args) else: - if missing_core_dim == "raise": + if on_missing_core_dim == "raise": raise ValueError(core_dim_present) - elif missing_core_dim == "copy": + elif on_missing_core_dim == "copy": result_vars[name] = variable_args[0] - elif missing_core_dim == "drop": + elif on_missing_core_dim == "drop": pass else: raise ValueError( - f"Invalid value for `missing_core_dim`: {missing_core_dim!r}" + f"Invalid value for `on_missing_core_dim`: {on_missing_core_dim!r}" ) if signature.num_outputs > 1: @@ -480,7 +486,7 @@ def apply_dataset_vfunc( fill_value=_NO_FILL_VALUE, exclude_dims=frozenset(), keep_attrs="override", - missing_core_dim: MissingCoreDimOptions = "raise", + on_missing_core_dim: MissingCoreDimOptions = "raise", ) -> Dataset | tuple[Dataset, ...]: """Apply a variable level function over Dataset, dict of DataArray, DataArray, Variable and/or ndarray objects. @@ -512,7 +518,7 @@ def apply_dataset_vfunc( signature=signature, join=dataset_join, fill_value=fill_value, - missing_core_dim=missing_core_dim, + on_missing_core_dim=on_missing_core_dim, ) out: Dataset | tuple[Dataset, ...] @@ -888,7 +894,7 @@ def apply_ufunc( output_sizes: Mapping[Any, int] | None = None, meta: Any = None, dask_gufunc_kwargs: dict[str, Any] | None = None, - missing_core_dim: MissingCoreDimOptions = "raise", + on_missing_core_dim: MissingCoreDimOptions = "raise", ) -> Any: """Apply a vectorized function for unlabeled arrays on xarray objects. @@ -1230,7 +1236,7 @@ def apply_ufunc( dataset_join=dataset_join, fill_value=dataset_fill_value, keep_attrs=keep_attrs, - missing_core_dim=missing_core_dim, + on_missing_core_dim=on_missing_core_dim, ) # feed DataArray apply_variable_ufunc through apply_dataarray_vfunc elif any(isinstance(a, DataArray) for a in args): diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py index 47c8da23129..6540c49d707 100644 --- a/xarray/tests/test_computation.py +++ b/xarray/tests/test_computation.py @@ -260,13 +260,13 @@ def func(x): def test_apply_missing_dims() -> None: ## Single arg - def add_one(a, core_dims, missing_core_dim): + def add_one(a, core_dims, on_missing_core_dim): return apply_ufunc( lambda x: x + 1, a, input_core_dims=core_dims, output_core_dims=core_dims, - missing_core_dim=missing_core_dim, + on_missing_core_dim=on_missing_core_dim, ) array = np.arange(6).reshape(2, 3) @@ -277,34 +277,34 @@ def add_one(a, core_dims, missing_core_dim): # Check the standard stuff works OK assert_identical( - add_one(ds[["x_y"]], core_dims=[["y"]], missing_core_dim="raise"), + add_one(ds[["x_y"]], core_dims=[["y"]], on_missing_core_dim="raise"), ds[["x_y"]] + 1, ) # `raise` — should raise on a missing dim with pytest.raises(ValueError): - add_one(ds, core_dims=[["y"]], missing_core_dim="raise"), + add_one(ds, core_dims=[["y"]], on_missing_core_dim="raise"), # `drop` — should drop the var with the missing dim assert_identical( - add_one(ds, core_dims=[["y"]], missing_core_dim="drop"), + add_one(ds, core_dims=[["y"]], on_missing_core_dim="drop"), (ds + 1).drop_vars("x_z"), ) # `copy` — should not add one to the missing with `copy` - copy_result = add_one(ds, core_dims=[["y"]], missing_core_dim="copy") + copy_result = add_one(ds, core_dims=[["y"]], on_missing_core_dim="copy") assert_identical(copy_result["x_y"], (ds + 1)["x_y"]) assert_identical(copy_result["x_z"], ds["x_z"]) ## Multiple args - def sum_add(a, b, core_dims, missing_core_dim): + def sum_add(a, b, core_dims, on_missing_core_dim): return apply_ufunc( lambda a, b, axis=None: a.sum(axis) + b.sum(axis), a, b, input_core_dims=core_dims, - missing_core_dim=missing_core_dim, + on_missing_core_dim=on_missing_core_dim, ) # Check the standard stuff works OK @@ -313,31 +313,33 @@ def sum_add(a, b, core_dims, missing_core_dim): ds[["x_y"]], ds[["x_y"]], core_dims=[["x", "y"], ["x", "y"]], - missing_core_dim="raise", + on_missing_core_dim="raise", ), ds[["x_y"]].sum() * 2, ) # `raise` — should raise on a missing dim with pytest.raises( - ValueError, match=r".*Missing core dimension\(s\) \{'y'\} on `x_z`.*" + ValueError, + match=r".*Missing core dims from `x_z`.*\n\nMissing \{'y'\} on.*\n.* Date: Fri, 15 Sep 2023 12:44:55 -0700 Subject: [PATCH 09/12] --- xarray/core/computation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/xarray/core/computation.py b/xarray/core/computation.py index e2c0f22e778..580037877d9 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -1008,6 +1008,8 @@ def apply_ufunc( :py:func:`dask.array.apply_gufunc`. ``meta`` should be given in the ``dask_gufunc_kwargs`` parameter . It will be removed as direct parameter a future version. + on_missing_core_dim : {"raise", "copy", "drop"}, default: "raise" + How to handle missing core dimensions on input variables. Returns ------- From 1486b72e54cdb0f19d110938964ce00a1e2992b8 Mon Sep 17 00:00:00 2001 From: Maximilian Roos Date: Fri, 15 Sep 2023 12:47:29 -0700 Subject: [PATCH 10/12] --- doc/whats-new.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index f60941033f4..9598581895e 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -30,6 +30,10 @@ New Features By `Martin Raspaud `_. - Improved static typing of reduction methods (:pull:`6746`). By `Richard Kleijn `_. +- Added `on_missing_core_dims` to :py:meth:`apply_ufunc` to allow for copying or + dropping a :py:class:`Dataset`'s variables with missing core dimensions. + (:pull:`8138`) + By `Maximilian Roos `_. Breaking changes ~~~~~~~~~~~~~~~~ From d027413c08de2c45a91b7eec9f866718b6a22e2d Mon Sep 17 00:00:00 2001 From: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> Date: Sat, 16 Sep 2023 11:55:00 -0700 Subject: [PATCH 11/12] Update xarray/core/computation.py Co-authored-by: Deepak Cherian --- xarray/core/computation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/computation.py b/xarray/core/computation.py index 580037877d9..60d7502dd02 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -415,7 +415,7 @@ def _check_core_dims(signature, variable_args, name): if hasattr(variable_arg, "dims") and set(core_dims) - set(variable_arg.dims): missing += [[variable_arg, core_dims]] if missing: - message = f"Missing core dims from `{name}` on variable(s):\n\n" + message = f"Missing core dims on some variables named `{name}`:\n\n" for variable_arg, core_dims in missing: message += f"Missing {set(core_dims) - set(variable_arg.dims)} on\n{variable_arg}\n\n" message += "Either add the core dimension, or if passing a dataset alternatively pass `on_missing_core_dim` as `copy` or `drop`. " From b682018fde17d1d818b0c4b6015e14079b0d8893 Mon Sep 17 00:00:00 2001 From: Maximilian Roos Date: Sat, 16 Sep 2023 12:28:50 -0700 Subject: [PATCH 12/12] more error message changes --- xarray/core/computation.py | 12 +++++++----- xarray/tests/test_computation.py | 4 ++-- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/xarray/core/computation.py b/xarray/core/computation.py index 60d7502dd02..6dfff2a24bf 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -408,16 +408,18 @@ def _check_core_dims(signature, variable_args, name): the inner loop. """ missing = [] - for core_dims, variable_arg in zip(signature.input_core_dims, variable_args): + for i, (core_dims, variable_arg) in enumerate( + zip(signature.input_core_dims, variable_args) + ): # Check whether all the dims are on the variable. Note that we need the # `hasattr` to check for a dims property, to protect against the case where # a numpy array is passed in. if hasattr(variable_arg, "dims") and set(core_dims) - set(variable_arg.dims): - missing += [[variable_arg, core_dims]] + missing += [[i, variable_arg, core_dims]] if missing: - message = f"Missing core dims on some variables named `{name}`:\n\n" - for variable_arg, core_dims in missing: - message += f"Missing {set(core_dims) - set(variable_arg.dims)} on\n{variable_arg}\n\n" + message = "" + for i, variable_arg, core_dims in missing: + message += f"Missing core dims {set(core_dims) - set(variable_arg.dims)} from arg number {i + 1} on a variable named `{name}`:\n{variable_arg}\n\n" message += "Either add the core dimension, or if passing a dataset alternatively pass `on_missing_core_dim` as `copy` or `drop`. " return message return True diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py index 6540c49d707..052672efb32 100644 --- a/xarray/tests/test_computation.py +++ b/xarray/tests/test_computation.py @@ -321,7 +321,7 @@ def sum_add(a, b, core_dims, on_missing_core_dim): # `raise` — should raise on a missing dim with pytest.raises( ValueError, - match=r".*Missing core dims from `x_z`.*\n\nMissing \{'y'\} on.*\n.*