From 829a74401649f79be94ab3950e9b9f3dc9d5a341 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Thu, 22 Dec 2022 14:39:06 +0100 Subject: [PATCH 01/39] ENH: fill missing variables during concat by reindexing --- xarray/core/concat.py | 62 ++++++++++++++++++++++++++++++------- xarray/tests/test_concat.py | 6 ++-- 2 files changed, 53 insertions(+), 15 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 2eea2ecb3ee..d709b296e28 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -378,7 +378,9 @@ def process_subset_opt(opt, subset): elif opt == "all": concat_over.update( - set(getattr(datasets[0], subset)) - set(datasets[0].dims) + set().union( + *list((set(getattr(d, subset)) - set(d.dims) for d in datasets)) + ) ) elif opt == "minimal": pass @@ -553,16 +555,35 @@ def get_indexes(name): data = var.set_dims(dim).values yield PandasIndex(data, dim, coord_dtype=var.dtype) + # preserve variable order for variables in first dataset + data_var_order = list(datasets[0].variables) + # append additional variables to the end + data_var_order += [e for e in data_names if e not in data_var_order] + # create concatenation index, needed for later reindexing + concat_index = list(range(sum(concat_dim_lengths))) + # stack up each variable and/or index to fill-out the dataset (in order) # n.b. this loop preserves variable order, needed for groupby. - for name in datasets[0].variables: + for name in data_var_order: if name in concat_over and name not in result_indexes: - try: - vars = ensure_common_dims([ds[name].variable for ds in datasets]) - except KeyError: - raise ValueError(f"{name!r} is not present in all datasets.") - - # Try concatenate the indexes, concatenate the variables when no index + variables = [] + variable_index = [] + for i, ds in enumerate(datasets): + if name in ds.variables: + variables.append(ds.variables[name]) + # add to variable index, needed for reindexing + variable_index.extend( + [sum(concat_dim_lengths[:i]) + k for k in range(concat_dim_lengths[i])] + ) + else: + # raise if coordinate not in all datasets + if name in coord_names: + raise ValueError( + f"coordinate {name!r} not present in all datasets." + ) + vars = list(ensure_common_dims(variables)) + + # Try to concatenate the indexes, concatenate the variables when no index # is found on all datasets. 
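
The concat_index and variable_index built above drive the fill logic further down in this hunk: a variable missing from some datasets is concatenated only from the datasets that carry it, then reindexed from variable_index onto the full concat_index so the uncovered positions pick up fill_value. A minimal standalone sketch of that mechanism (hypothetical names and data, not part of the patch):

import numpy as np
import xarray as xr

# Two datasets of length 2 concatenated along "time" give four positions overall,
concat_index = [0, 1, 2, 3]
# but the variable only exists in the first dataset, covering positions 0 and 1.
variable_index = [0, 1]
var = xr.Variable(("time",), np.array([10.0, 11.0]))  # hypothetical data

filled = (
    xr.DataArray(data=var, name="humidity")
    .assign_coords({"time": variable_index})
    .reindex({"time": concat_index}, fill_value=np.nan)
    .variable
)
print(filled.values)  # [10. 11. nan nan]
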
indexes: list[Index] = list(get_indexes(name)) if indexes: @@ -586,9 +607,28 @@ def get_indexes(name): ) result_vars[k] = v else: - combined_var = concat_vars( - vars, dim, positions, combine_attrs=combine_attrs - ) + # if variable is only present in one dataset of multiple datasets, + # then do not concat + if len(variables) == 1 and len(datasets) > 1: + combined_var = variables[0] + # only concat if variable is in multiple datasets + # or if single dataset (GH1988) + else: + combined_var = concat_vars( + vars, dim, positions, combine_attrs=combine_attrs + ) + # reindex if variable is not present in all datasets + if len(variable_index) < len(concat_index): + try: + fill = fill_value[name] + except (TypeError, KeyError): + fill = fill_value + combined_var = ( + DataArray(data=combined_var, name=name) + .assign_coords({dim: variable_index}) + .reindex({dim: concat_index}, fill_value=fill) + .variable + ) result_vars[name] = combined_var elif name in result_vars: diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index e0e0038cd89..186ad5b0865 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -50,8 +50,6 @@ def test_concat_compat() -> None: ValueError, match=r"coordinates in some datasets but not others" ): concat([ds1, ds2], dim="q") - with pytest.raises(ValueError, match=r"'q' is not present in all datasets"): - concat([ds2, ds1], dim="q") class TestConcatDataset: @@ -776,7 +774,7 @@ def test_concat_merge_single_non_dim_coord(): actual = concat([da1, da2], "x", coords=coords) assert_identical(actual, expected) - with pytest.raises(ValueError, match=r"'y' is not present in all datasets."): + with pytest.raises(ValueError, match=r"'y' not present in all datasets."): concat([da1, da2], dim="x", coords="all") da1 = DataArray([1, 2, 3], dims="x", coords={"x": [1, 2, 3], "y": 1}) @@ -784,7 +782,7 @@ def test_concat_merge_single_non_dim_coord(): da3 = DataArray([7, 8, 9], dims="x", coords={"x": [7, 8, 9], "y": 1}) for coords in ["different", "all"]: with pytest.raises(ValueError, match=r"'y' not present in all datasets"): - concat([da1, da2, da3], dim="x") + concat([da1, da2, da3], dim="x", coords=coords) def test_concat_preserve_coordinate_order() -> None: From 871d2dbd25dc7a5bb1e5bae7033eb15bc957a793 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 22 Dec 2022 14:43:09 +0000 Subject: [PATCH 02/39] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/core/concat.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index d709b296e28..6b83782f411 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -379,7 +379,7 @@ def process_subset_opt(opt, subset): elif opt == "all": concat_over.update( set().union( - *list((set(getattr(d, subset)) - set(d.dims) for d in datasets)) + *list(set(getattr(d, subset)) - set(d.dims) for d in datasets) ) ) elif opt == "minimal": @@ -573,7 +573,10 @@ def get_indexes(name): variables.append(ds.variables[name]) # add to variable index, needed for reindexing variable_index.extend( - [sum(concat_dim_lengths[:i]) + k for k in range(concat_dim_lengths[i])] + [ + sum(concat_dim_lengths[:i]) + k + for k in range(concat_dim_lengths[i]) + ] ) else: # raise if coordinate not in all datasets From 14bb7799e9c568dffd1d2bf2569ac8e9ab530a78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Wed, 4 Jan 2023 
14:44:42 +0100 Subject: [PATCH 03/39] FIX: use `Any` for type of `fill_value` as this seems consistent with other places --- xarray/core/concat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 6b83782f411..ffd3bd338a4 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -441,7 +441,7 @@ def _dataset_concat( coords: str | list[str], compat: CompatOptions, positions: Iterable[Iterable[int]] | None, - fill_value: object = dtypes.NA, + fill_value: Any = dtypes.NA, join: JoinOptions = "outer", combine_attrs: CombineAttrsOptions = "override", ) -> T_Dataset: From 6b497134dd93743701ea55106ee393993f7b362f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Wed, 4 Jan 2023 13:32:15 +0100 Subject: [PATCH 04/39] ENH: add tests --- xarray/tests/test_concat.py | 51 +++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 186ad5b0865..ea52b2a6da3 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -571,6 +571,57 @@ def test_concat_str_dtype(self, dtype, dim) -> None: assert np.issubdtype(actual.x2.dtype, dtype) + @pytest.mark.parametrize("dim", [True, False]) + @pytest.mark.parametrize("coord", [True, False]) + def test_concat_fill_missing_variables(self, dim, coord): + # create var names list with one missing value + def get_var_names(var_cnt=10, list_cnt=10): + orig = [f'd{i:02d}' for i in range(var_cnt)] + var_names = [] + for i in range(0, list_cnt): + l1 = orig.copy() + var_names.append(l1) + return var_names + + def create_ds(var_names, dim=False, coord=False, drop_idx=False): + out_ds = [] + ds = Dataset() + ds = ds.assign_coords({"x": np.arange(2)}) + ds = ds.assign_coords({"y": np.arange(3)}) + ds = ds.assign_coords({"z": np.arange(4)}) + for i, dsl in enumerate(var_names): + vlist = dsl.copy() + if drop_idx: + vlist.pop(drop_idx[i]) + foo_data = np.arange(48, dtype=float).reshape(2, 2, 3, 4) + dsi = ds.copy() + if coord: + dsi = ds.assign({"time": (["time"], [i * 2, i * 2 + 1])}) + for k in vlist: + dsi = dsi.assign({k: (["time", "x", "y", "z"], foo_data.copy())}) + if not dim: + dsi = dsi.isel(time=0) + out_ds.append(dsi) + return out_ds + var_names = get_var_names() + + import random + random.seed(42) + drop_idx = [random.randrange(len(vlist)) for vlist in var_names] + expected = concat(create_ds(var_names, dim=dim, coord=coord), dim="time", data_vars="all") + for i, idx in enumerate(drop_idx): + if dim: + expected[var_names[0][idx]][i * 2: i * 2 + 2] = np.nan + else: + expected[var_names[0][idx]][i] = np.nan + + concat_ds = create_ds(var_names, dim=dim, coord=coord, drop_idx=drop_idx) + actual = concat(concat_ds, dim="time", data_vars="all") + + for name in var_names[0]: + assert_equal(expected[name], actual[name]) + assert_equal(expected, actual) + class TestConcatDataArray: def test_concat(self) -> None: From 43862a18c119a801ca1b225e5c0b4638e13bc6f5 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 5 Jan 2023 07:49:38 +0000 Subject: [PATCH 05/39] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/tests/test_concat.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index ea52b2a6da3..31ff8841fd5 100644 --- a/xarray/tests/test_concat.py +++ 
b/xarray/tests/test_concat.py @@ -576,7 +576,7 @@ def test_concat_str_dtype(self, dtype, dim) -> None: def test_concat_fill_missing_variables(self, dim, coord): # create var names list with one missing value def get_var_names(var_cnt=10, list_cnt=10): - orig = [f'd{i:02d}' for i in range(var_cnt)] + orig = [f"d{i:02d}" for i in range(var_cnt)] var_names = [] for i in range(0, list_cnt): l1 = orig.copy() @@ -603,15 +603,19 @@ def create_ds(var_names, dim=False, coord=False, drop_idx=False): dsi = dsi.isel(time=0) out_ds.append(dsi) return out_ds + var_names = get_var_names() import random + random.seed(42) drop_idx = [random.randrange(len(vlist)) for vlist in var_names] - expected = concat(create_ds(var_names, dim=dim, coord=coord), dim="time", data_vars="all") + expected = concat( + create_ds(var_names, dim=dim, coord=coord), dim="time", data_vars="all" + ) for i, idx in enumerate(drop_idx): if dim: - expected[var_names[0][idx]][i * 2: i * 2 + 2] = np.nan + expected[var_names[0][idx]][i * 2 : i * 2 + 2] = np.nan else: expected[var_names[0][idx]][i] = np.nan From 5dd3e78dbe9fb873f72b4a93e83a340d0c3c680d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sun, 8 Jan 2023 14:33:21 +0100 Subject: [PATCH 06/39] typing Co-authored-by: Illviljan <14371165+Illviljan@users.noreply.github.com> --- xarray/tests/test_concat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 31ff8841fd5..473994eac67 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -573,7 +573,7 @@ def test_concat_str_dtype(self, dtype, dim) -> None: @pytest.mark.parametrize("dim", [True, False]) @pytest.mark.parametrize("coord", [True, False]) - def test_concat_fill_missing_variables(self, dim, coord): + def test_concat_fill_missing_variables(self, dim: bool, coord: bool) -> None: # create var names list with one missing value def get_var_names(var_cnt=10, list_cnt=10): orig = [f"d{i:02d}" for i in range(var_cnt)] From aed0f6e8209dcff16fd5922cea78e4d02040686a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sun, 8 Jan 2023 14:33:35 +0100 Subject: [PATCH 07/39] typing Co-authored-by: Illviljan <14371165+Illviljan@users.noreply.github.com> --- xarray/tests/test_concat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 473994eac67..54c9a9b0d63 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -575,7 +575,7 @@ def test_concat_str_dtype(self, dtype, dim) -> None: @pytest.mark.parametrize("coord", [True, False]) def test_concat_fill_missing_variables(self, dim: bool, coord: bool) -> None: # create var names list with one missing value - def get_var_names(var_cnt=10, list_cnt=10): + def get_var_names(var_cnt: int=10, list_cnt: int=10) -> list[str]: orig = [f"d{i:02d}" for i in range(var_cnt)] var_names = [] for i in range(0, list_cnt): From fa1aba54709553cdf66a6fd984c348e80db7a794 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sun, 8 Jan 2023 14:33:56 +0100 Subject: [PATCH 08/39] typing Co-authored-by: Illviljan <14371165+Illviljan@users.noreply.github.com> --- xarray/tests/test_concat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 54c9a9b0d63..16d57b4eae7 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -583,7 +583,7 @@ def get_var_names(var_cnt: 
int=10, list_cnt: int=10) -> list[str]: var_names.append(l1) return var_names - def create_ds(var_names, dim=False, coord=False, drop_idx=False): + def create_ds(var_names: list[str], dim: bool =False, coord: bool =False, drop_idx: list[int] |None =None) -> list[Dataset]: out_ds = [] ds = Dataset() ds = ds.assign_coords({"x": np.arange(2)}) From 308c0094c976503c00f1463c01de34eae81efae8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sun, 8 Jan 2023 14:34:21 +0100 Subject: [PATCH 09/39] use None instead of False Co-authored-by: Illviljan <14371165+Illviljan@users.noreply.github.com> --- xarray/tests/test_concat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 16d57b4eae7..ebd05853646 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -591,7 +591,7 @@ def create_ds(var_names: list[str], dim: bool =False, coord: bool =False, drop_i ds = ds.assign_coords({"z": np.arange(4)}) for i, dsl in enumerate(var_names): vlist = dsl.copy() - if drop_idx: + if drop_idx is not None: vlist.pop(drop_idx[i]) foo_data = np.arange(48, dtype=float).reshape(2, 2, 3, 4) dsi = ds.copy() From 58813e382e67b908bf8f93c6944ad5a54fd002e3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 8 Jan 2023 13:35:30 +0000 Subject: [PATCH 10/39] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/tests/test_concat.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index ebd05853646..7e5be610b79 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -575,7 +575,7 @@ def test_concat_str_dtype(self, dtype, dim) -> None: @pytest.mark.parametrize("coord", [True, False]) def test_concat_fill_missing_variables(self, dim: bool, coord: bool) -> None: # create var names list with one missing value - def get_var_names(var_cnt: int=10, list_cnt: int=10) -> list[str]: + def get_var_names(var_cnt: int = 10, list_cnt: int = 10) -> list[str]: orig = [f"d{i:02d}" for i in range(var_cnt)] var_names = [] for i in range(0, list_cnt): @@ -583,7 +583,12 @@ def get_var_names(var_cnt: int=10, list_cnt: int=10) -> list[str]: var_names.append(l1) return var_names - def create_ds(var_names: list[str], dim: bool =False, coord: bool =False, drop_idx: list[int] |None =None) -> list[Dataset]: + def create_ds( + var_names: list[str], + dim: bool = False, + coord: bool = False, + drop_idx: list[int] | None = None, + ) -> list[Dataset]: out_ds = [] ds = Dataset() ds = ds.assign_coords({"x": np.arange(2)}) From a248d5b66e5f926f0b993fec21831559faf64fca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sun, 8 Jan 2023 16:04:29 +0100 Subject: [PATCH 11/39] concatenate variable in any case if variable has concat_dim --- xarray/core/concat.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index ffd3bd338a4..f492269b916 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -610,9 +610,10 @@ def get_indexes(name): ) result_vars[k] = v else: - # if variable is only present in one dataset of multiple datasets, - # then do not concat - if len(variables) == 1 and len(datasets) > 1: + # do not concat if: + # 1. variable is only present in one dataset of multiple datasets and + # 2. 
dim is not a dimension of variable + if dim not in variables[0].dims and len(variables) == 1 and len(datasets) > 1: combined_var = variables[0] # only concat if variable is in multiple datasets # or if single dataset (GH1988) From 1a6ad5bff9e57379c38422b2df51d9c27c7dacf9 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 8 Jan 2023 15:05:34 +0000 Subject: [PATCH 12/39] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/core/concat.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index f492269b916..67023c800f1 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -613,7 +613,11 @@ def get_indexes(name): # do not concat if: # 1. variable is only present in one dataset of multiple datasets and # 2. dim is not a dimension of variable - if dim not in variables[0].dims and len(variables) == 1 and len(datasets) > 1: + if ( + dim not in variables[0].dims + and len(variables) == 1 + and len(datasets) > 1 + ): combined_var = variables[0] # only concat if variable is in multiple datasets # or if single dataset (GH1988) From b9d0d024b0b6866d178f00d96e1b40bf4c9c35e5 Mon Sep 17 00:00:00 2001 From: Scott Chamberlin Date: Sun, 8 Jan 2023 16:19:17 +0100 Subject: [PATCH 13/39] add tests from @scottcha #3545 --- xarray/tests/test_concat.py | 593 ++++++++++++++++++++++++++++++++++++ 1 file changed, 593 insertions(+) diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 7e5be610b79..b7e2bb943cc 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -1,5 +1,6 @@ from __future__ import annotations +import random from copy import deepcopy from typing import TYPE_CHECKING, Any @@ -23,6 +24,86 @@ from xarray.core.types import CombineAttrsOptions, JoinOptions +# helper method to create multiple tests datasets to concat +def create_concat_datasets(num_datasets=2, seed=None, include_day=True): + random.seed(seed) + result = [] + lat = np.random.randn(1, 4) + lon = np.random.randn(1, 4) + for i in range(num_datasets): + if include_day: + result.append( + Dataset( + data_vars={ + "temperature": (["x", "y", "day"], np.random.randn(1, 4, 2)), + "pressure": (["x", "y", "day"], np.random.randn(1, 4, 2)), + "humidity": (["x", "y", "day"], np.random.randn(1, 4, 2)), + "precipitation": (["x", "y", "day"], np.random.randn(1, 4, 2)), + "cloud cover": (["x", "y", "day"], np.random.randn(1, 4, 2)), + }, + coords={ + "lat": (["x", "y"], lat), + "lon": (["x", "y"], lon), + "day": ["day" + str(i * 2 + 1), "day" + str(i * 2 + 2)], + }, + ) + ) + else: + result.append( + Dataset( + data_vars={ + "temperature": (["x", "y"], np.random.randn(1, 4)), + "pressure": (["x", "y"], np.random.randn(1, 4)), + "humidity": (["x", "y"], np.random.randn(1, 4)), + "precipitation": (["x", "y"], np.random.randn(1, 4)), + "cloud cover": (["x", "y"], np.random.randn(1, 4)), + }, + coords={"lat": (["x", "y"], lat), "lon": (["x", "y"], lon)}, + ) + ) + + return result + + +# helper method to create multiple tests datasets to concat with specific types +def create_typed_datasets(num_datasets=2, seed=None): + random.seed(seed) + var_strings = ["a", "b", "c", "d", "e", "f", "g", "h"] + result = [] + lat = np.random.randn(1, 4) + lon = np.random.randn(1, 4) + for i in range(num_datasets): + result.append( + Dataset( + data_vars={ + "float": (["x", "y", "day"], np.random.randn(1, 4, 2)), + "float2": (["x", 
"y", "day"], np.random.randn(1, 4, 2)), + "string": ( + ["x", "y", "day"], + np.random.choice(var_strings, (1, 4, 2)), + ), + "int": (["x", "y", "day"], np.random.randint(0, 10, (1, 4, 2))), + "datetime64": ( + ["x", "y", "day"], + np.arange( + np.datetime64("2017-01-01"), np.datetime64("2017-01-09") + ).reshape(1, 4, 2), + ), + "timedelta64": ( + ["x", "y", "day"], + np.reshape([pd.Timedelta(days=i) for i in range(8)], [1, 4, 2]), + ), + }, + coords={ + "lat": (["x", "y"], lat), + "lon": (["x", "y"], lon), + "day": ["day" + str(i * 2 + 1), "day" + str(i * 2 + 2)], + }, + ) + ) + return result + + def test_concat_compat() -> None: ds1 = Dataset( { @@ -52,6 +133,518 @@ def test_concat_compat() -> None: concat([ds1, ds2], dim="q") +def test_concat_missing_var(): + datasets = create_concat_datasets(2, 123) + vars_to_drop = ["humidity", "precipitation", "cloud cover"] + datasets[0] = datasets[0].drop_vars(vars_to_drop) + datasets[1] = datasets[1].drop_vars(vars_to_drop + ["pressure"]) + + temperature_result = np.concatenate( + (datasets[0].temperature.values, datasets[1].temperature.values), axis=2 + ) + pressure_result = np.concatenate( + (datasets[0].pressure.values, np.full([1, 4, 2], np.nan)), axis=2 + ) + ds_result = Dataset( + data_vars={ + "temperature": (["x", "y", "day"], temperature_result), + "pressure": (["x", "y", "day"], pressure_result), + }, + coords={ + "lat": (["x", "y"], datasets[0].lat.values), + "lon": (["x", "y"], datasets[0].lon.values), + "day": ["day1", "day2", "day3", "day4"], + }, + ) + result = concat(datasets, dim="day") + + r1 = list(result.data_vars.keys()) + r2 = list(ds_result.data_vars.keys()) + assert r1 == r2 # check the variables orders are the same + + assert_equal(result, ds_result) + + +def test_concat_missing_multiple_consecutive_var(): + datasets = create_concat_datasets(3, 123) + vars_to_drop = ["pressure", "humidity"] + datasets[0] = datasets[0].drop_vars(vars_to_drop) + datasets[1] = datasets[1].drop_vars(vars_to_drop) + + temperature_result = np.concatenate( + ( + datasets[0].temperature.values, + datasets[1].temperature.values, + datasets[2].temperature.values, + ), + axis=2, + ) + pressure_result = np.concatenate( + ( + np.full([1, 4, 2], np.nan), + np.full([1, 4, 2], np.nan), + datasets[2].pressure.values, + ), + axis=2, + ) + humidity_result = np.concatenate( + ( + np.full([1, 4, 2], np.nan), + np.full([1, 4, 2], np.nan), + datasets[2].humidity.values, + ), + axis=2, + ) + precipitation_result = np.concatenate( + ( + datasets[0].precipitation.values, + datasets[1].precipitation.values, + datasets[2].precipitation.values, + ), + axis=2, + ) + cloudcover_result = np.concatenate( + ( + datasets[0]["cloud cover"].values, + datasets[1]["cloud cover"].values, + datasets[2]["cloud cover"].values, + ), + axis=2, + ) + + ds_result = Dataset( + data_vars={ + "temperature": (["x", "y", "day"], temperature_result), + "precipitation": (["x", "y", "day"], precipitation_result), + "cloud cover": (["x", "y", "day"], cloudcover_result), + "humidity": (["x", "y", "day"], humidity_result), + "pressure": (["x", "y", "day"], pressure_result), + }, + coords={ + "lat": (["x", "y"], datasets[0].lat.values), + "lon": (["x", "y"], datasets[0].lon.values), + "day": ["day1", "day2", "day3", "day4", "day5", "day6"], + }, + ) + # assign here, as adding above gave switched pressure/humidity-order every once in a while + ds_result = ds_result.assign({"humidity": (["x", "y", "day"], humidity_result)}) + ds_result = ds_result.assign({"pressure": (["x", "y", "day"], 
pressure_result)}) + result = concat(datasets, dim="day") + r1 = [var for var in result.data_vars] + r2 = [var for var in ds_result.data_vars] + assert r1 == r2 # check the variables orders are the same + assert_equal(result, ds_result) + + +def test_concat_all_empty(): + ds1 = Dataset() + ds2 = Dataset() + result = concat([ds1, ds2], dim="new_dim") + + assert_equal(result, Dataset()) + + +def test_concat_second_empty(): + ds1 = Dataset(data_vars={"a": ("y", [0.1])}, coords={"x": 0.1}) + ds2 = Dataset(coords={"x": 0.1}) + + ds_result = Dataset(data_vars={"a": ("y", [0.1, np.nan])}, coords={"x": 0.1}) + result = concat([ds1, ds2], dim="y") + + assert_equal(result, ds_result) + + +def test_multiple_missing_variables(): + datasets = create_concat_datasets(2, 123) + vars_to_drop = ["pressure", "cloud cover"] + datasets[1] = datasets[1].drop_vars(vars_to_drop) + + temperature_result = np.concatenate( + (datasets[0].temperature.values, datasets[1].temperature.values), axis=2 + ) + pressure_result = np.concatenate( + (datasets[0].pressure.values, np.full([1, 4, 2], np.nan)), axis=2 + ) + humidity_result = np.concatenate( + (datasets[0].humidity.values, datasets[1].humidity.values), axis=2 + ) + precipitation_result = np.concatenate( + (datasets[0].precipitation.values, datasets[1].precipitation.values), axis=2 + ) + cloudcover_result = np.concatenate( + (datasets[0]["cloud cover"].values, np.full([1, 4, 2], np.nan)), axis=2 + ) + ds_result = Dataset( + data_vars={ + "temperature": (["x", "y", "day"], temperature_result), + "pressure": (["x", "y", "day"], pressure_result), + "humidity": (["x", "y", "day"], humidity_result), + "precipitation": (["x", "y", "day"], precipitation_result), + "cloud cover": (["x", "y", "day"], cloudcover_result), + }, + coords={ + "lat": (["x", "y"], datasets[0].lat.values), + "lon": (["x", "y"], datasets[0].lon.values), + "day": ["day1", "day2", "day3", "day4"], + }, + ) + result = concat(datasets, dim="day") + + r1 = list(result.data_vars.keys()) + r2 = list(ds_result.data_vars.keys()) + assert r1 == r2 # check the variables orders are the same + + assert_equal(result, ds_result) + + +@pytest.mark.xfail(strict=True) +def test_concat_multiple_datasets_missing_vars_and_new_dim(): + vars_to_drop = [ + "temperature", + "pressure", + "humidity", + "precipitation", + "cloud cover", + ] + datasets = create_concat_datasets(len(vars_to_drop), 123, include_day=False) + # set up the test data + datasets = [datasets[i].drop_vars(vars_to_drop[i]) for i in range(len(datasets))] + + # set up the validation data + # the below code just drops one var per dataset depending on the location of the + # dataset in the list and allows us to quickly catch any boundaries cases across + # the three equivalence classes of beginning, middle and end of the concat list + result_vars = dict.fromkeys(vars_to_drop) + for i in range(len(vars_to_drop)): + for d in range(len(datasets)): + if d != i: + if result_vars[vars_to_drop[i]] is None: + result_vars[vars_to_drop[i]] = datasets[d][vars_to_drop[i]].values + else: + result_vars[vars_to_drop[i]] = np.concatenate( + ( + result_vars[vars_to_drop[i]], + datasets[d][vars_to_drop[i]].values, + ), + axis=1, + ) + else: + if result_vars[vars_to_drop[i]] is None: + result_vars[vars_to_drop[i]] = np.full([1, 4], np.nan) + else: + result_vars[vars_to_drop[i]] = np.concatenate( + (result_vars[vars_to_drop[i]], np.full([1, 4], np.nan)), + axis=1, + ) + # TODO: this test still has two unexpected errors: + + # 1: concat throws a mergeerror expecting the temperature 
values to be the same, this doesn't seem to be correct in this case + # as we are concating on new dims + # 2: if the values are the same for a variable (working around #1) then it will likely not correct add the new dim to the first variable + # the resulting set + + ds_result = Dataset( + data_vars={ + # pressure will be first in this since the first dataset is missing this var + # and there isn't a good way to determine that this should be first + # this also means temperature will be last as the first data vars will + # determine the order for all that exist in that dataset + "pressure": (["x", "y", "day"], result_vars["pressure"]), + "humidity": (["x", "y", "day"], result_vars["humidity"]), + "precipitation": (["x", "y", "day"], result_vars["precipitation"]), + "cloud cover": (["x", "y", "day"], result_vars["cloud cover"]), + "temperature": (["x", "y", "day"], result_vars["temperature"]), + }, + coords={ + "lat": (["x", "y"], datasets[0].lat.values), + "lon": (["x", "y"], datasets[0].lon.values), + # "day": ["day" + str(d + 1) for d in range(2 * len(vars_to_drop))], + }, + ) + + result = concat(datasets, dim="day") + r1 = list(result.data_vars.keys()) + r2 = list(ds_result.data_vars.keys()) + assert r1 == r2 # check the variables orders are the same + + assert_equal(result, ds_result) + + +def test_multiple_datasets_with_missing_variables(): + vars_to_drop = [ + "temperature", + "pressure", + "humidity", + "precipitation", + "cloud cover", + ] + datasets = create_concat_datasets(len(vars_to_drop), 123) + # set up the test data + datasets = [datasets[i].drop_vars(vars_to_drop[i]) for i in range(len(datasets))] + + # set up the validation data + # the below code just drops one var per dataset depending on the location of the + # dataset in the list and allows us to quickly catch any boundaries cases across + # the three equivalence classes of beginning, middle and end of the concat list + result_vars = dict.fromkeys(vars_to_drop) + for i in range(len(vars_to_drop)): + for d in range(len(datasets)): + if d != i: + if result_vars[vars_to_drop[i]] is None: + result_vars[vars_to_drop[i]] = datasets[d][vars_to_drop[i]].values + else: + result_vars[vars_to_drop[i]] = np.concatenate( + ( + result_vars[vars_to_drop[i]], + datasets[d][vars_to_drop[i]].values, + ), + axis=2, + ) + else: + if result_vars[vars_to_drop[i]] is None: + result_vars[vars_to_drop[i]] = np.full([1, 4, 2], np.nan) + else: + result_vars[vars_to_drop[i]] = np.concatenate( + (result_vars[vars_to_drop[i]], np.full([1, 4, 2], np.nan)), + axis=2, + ) + + ds_result = Dataset( + data_vars={ + # pressure will be first in this since the first dataset is missing this var + # and there isn't a good way to determine that this should be first + # this also means temperature will be last as the first data vars will + # determine the order for all that exist in that dataset + "pressure": (["x", "y", "day"], result_vars["pressure"]), + "humidity": (["x", "y", "day"], result_vars["humidity"]), + "precipitation": (["x", "y", "day"], result_vars["precipitation"]), + "cloud cover": (["x", "y", "day"], result_vars["cloud cover"]), + "temperature": (["x", "y", "day"], result_vars["temperature"]), + }, + coords={ + "lat": (["x", "y"], datasets[0].lat.values), + "lon": (["x", "y"], datasets[0].lon.values), + "day": ["day" + str(d + 1) for d in range(2 * len(vars_to_drop))], + }, + ) + result = concat(datasets, dim="day") + + r1 = list(result.data_vars.keys()) + r2 = list(ds_result.data_vars.keys()) + assert r1 == r2 # check the variables orders 
are the same + + assert_equal(result, ds_result) + + +def test_multiple_datasets_with_multiple_missing_variables(): + vars_to_drop_in_first = ["temperature", "pressure"] + vars_to_drop_in_second = ["humidity", "precipitation", "cloud cover"] + datasets = create_concat_datasets(2, 123) + # set up the test data + datasets[0] = datasets[0].drop_vars(vars_to_drop_in_first) + datasets[1] = datasets[1].drop_vars(vars_to_drop_in_second) + + temperature_result = np.concatenate( + (np.full([1, 4, 2], np.nan), datasets[1].temperature.values), axis=2 + ) + pressure_result = np.concatenate( + (np.full([1, 4, 2], np.nan), datasets[1].pressure.values), axis=2 + ) + humidity_result = np.concatenate( + (datasets[0].humidity.values, np.full([1, 4, 2], np.nan)), axis=2 + ) + precipitation_result = np.concatenate( + (datasets[0].precipitation.values, np.full([1, 4, 2], np.nan)), axis=2 + ) + cloudcover_result = np.concatenate( + (datasets[0]["cloud cover"].values, np.full([1, 4, 2], np.nan)), axis=2 + ) + ds_result = Dataset( + data_vars={ + "humidity": (["x", "y", "day"], humidity_result), + "precipitation": (["x", "y", "day"], precipitation_result), + "cloud cover": (["x", "y", "day"], cloudcover_result), + # these two are at the end of the expected as they are missing from the first + # dataset in the concat list + "temperature": (["x", "y", "day"], temperature_result), + "pressure": (["x", "y", "day"], pressure_result), + }, + coords={ + "lat": (["x", "y"], datasets[0].lat.values), + "lon": (["x", "y"], datasets[0].lon.values), + "day": ["day1", "day2", "day3", "day4"], + }, + ) + result = concat(datasets, dim="day") + + r1 = list(result.data_vars.keys()) + r2 = list(ds_result.data_vars.keys()) + assert r1 == r2 # check the variables orders are the same + + assert_equal(result, ds_result) + + +def test_type_of_missing_fill(): + datasets = create_typed_datasets(2, 123) + + vars = ["float", "float2", "string", "int", "datetime64", "timedelta64"] + + # set up the test data + datasets[1] = datasets[1].drop_vars(vars[1:]) + + float_result = np.concatenate( + (datasets[0].float.values, datasets[1].float.values), axis=2 + ) + float2_result = np.concatenate( + (datasets[0].float2.values, np.full([1, 4, 2], np.nan)), axis=2 + ) + # to correctly create the expected dataset we need to ensure we promote the string array to + # object type before filling as it will be promoted to that in the concat case. 
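
The dtype promotion that this comment describes comes straight from NumPy's casting rules: NaN fills force integers to float64 and strings to object, while datetime64/timedelta64 keep their dtype and take NaT. A standalone illustration (plain NumPy, not part of the patch):

import numpy as np

print(np.concatenate([np.arange(4), np.full(4, np.nan)]).dtype)  # float64: ints promote

strings = np.array(["a", "b"], dtype=object)
print(np.concatenate([strings, np.full(2, np.nan)]).dtype)  # object: NaN mixes with str

dates = np.array(["2017-01-01"], dtype="datetime64[ns]")
nat = np.full(1, np.datetime64("NaT"), dtype="datetime64[ns]")
print(np.concatenate([dates, nat]).dtype)  # datetime64[ns]: missing values become NaT
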
+ # this matches the behavior of pandas + string_values = datasets[0].string.values + string_values = string_values.astype(object) + string_result = np.concatenate((string_values, np.full([1, 4, 2], np.nan)), axis=2) + datetime_result = np.concatenate( + (datasets[0].datetime64.values, np.full([1, 4, 2], np.datetime64("NaT"))), + axis=2, + ) + timedelta_result = np.concatenate( + (datasets[0].timedelta64.values, np.full([1, 4, 2], np.timedelta64("NaT"))), + axis=2, + ) + # string_result = string_result.astype(object) + int_result = np.concatenate( + (datasets[0].int.values, np.full([1, 4, 2], np.nan)), axis=2 + ) + ds_result = Dataset( + data_vars={ + "float": (["x", "y", "day"], float_result), + "float2": (["x", "y", "day"], float2_result), + "string": (["x", "y", "day"], string_result), + "int": (["x", "y", "day"], int_result), + "datetime64": (["x", "y", "day"], datetime_result), + "timedelta64": (["x", "y", "day"], timedelta_result), + }, + coords={ + "lat": (["x", "y"], datasets[0].lat.values), + "lon": (["x", "y"], datasets[0].lon.values), + "day": ["day1", "day2", "day3", "day4"], + }, + ) + result = concat(datasets, dim="day", fill_value=dtypes.NA) + + assert_equal(result, ds_result) + + # test in the reverse order + float_result_rev = np.concatenate( + (datasets[1].float.values, datasets[0].float.values), axis=2 + ) + float2_result_rev = np.concatenate( + (np.full([1, 4, 2], np.nan), datasets[0].float2.values), axis=2 + ) + string_result_rev = np.concatenate( + (np.full([1, 4, 2], np.nan), string_values), axis=2 + ) + datetime_result_rev = np.concatenate( + (np.full([1, 4, 2], np.datetime64("NaT")), datasets[0].datetime64.values), + axis=2, + ) + timedelta_result_rev = np.concatenate( + (np.full([1, 4, 2], np.timedelta64("NaT")), datasets[0].timedelta64.values), + axis=2, + ) + int_result_rev = np.concatenate( + (np.full([1, 4, 2], np.nan), datasets[0].int.values), axis=2 + ) + ds_result_rev = Dataset( + data_vars={ + "float": (["x", "y", "day"], float_result_rev), + "float2": (["x", "y", "day"], float2_result_rev), + "string": (["x", "y", "day"], string_result_rev), + "int": (["x", "y", "day"], int_result_rev), + "datetime64": (["x", "y", "day"], datetime_result_rev), + "timedelta64": (["x", "y", "day"], timedelta_result_rev), + }, + coords={ + "lat": (["x", "y"], datasets[0].lat.values), + "lon": (["x", "y"], datasets[0].lon.values), + "day": ["day3", "day4", "day1", "day2"], + }, + ) + result_rev = concat(datasets[::-1], dim="day", fill_value=dtypes.NA) + + assert_equal(result_rev, ds_result_rev) + + +def test_order_when_filling_missing(): + vars_to_drop_in_first = [] + # drop middle + vars_to_drop_in_second = ["humidity"] + datasets = create_concat_datasets(2, 123) + # set up the test data + datasets[0] = datasets[0].drop_vars(vars_to_drop_in_first) + datasets[1] = datasets[1].drop_vars(vars_to_drop_in_second) + + temperature_result = np.concatenate( + (datasets[0].temperature.values, datasets[1].temperature.values), axis=2 + ) + pressure_result = np.concatenate( + (datasets[0].pressure.values, datasets[1].pressure.values), axis=2 + ) + humidity_result = np.concatenate( + (datasets[0].humidity.values, np.full([1, 4, 2], np.nan)), axis=2 + ) + precipitation_result = np.concatenate( + (datasets[0].precipitation.values, datasets[1].precipitation.values), axis=2 + ) + cloudcover_result = np.concatenate( + (datasets[0]["cloud cover"].values, datasets[1]["cloud cover"].values), axis=2 + ) + ds_result = Dataset( + data_vars={ + "temperature": (["x", "y", "day"], temperature_result), 
+ "pressure": (["x", "y", "day"], pressure_result), + "precipitation": (["x", "y", "day"], precipitation_result), + "cloud cover": (["x", "y", "day"], cloudcover_result), + "humidity": (["x", "y", "day"], humidity_result), + }, + coords={ + "lat": (["x", "y"], datasets[0].lat.values), + "lon": (["x", "y"], datasets[0].lon.values), + "day": ["day1", "day2", "day3", "day4"], + }, + ) + result = concat(datasets, dim="day") + + assert_equal(result, ds_result) + + result_keys = [ + "temperature", + "pressure", + "humidity", + "precipitation", + "cloud cover", + ] + result_index = 0 + for k in result.data_vars.keys(): + assert k == result_keys[result_index] + result_index += 1 + + result_keys_rev = [ + "temperature", + "pressure", + "precipitation", + "cloud cover", + "humidity", + ] + # test order when concat in reversed order + rev_result = concat(datasets[::-1], dim="day") + result_index = 0 + for k in rev_result.data_vars.keys(): + assert k == result_keys_rev[result_index] + result_index += 1 + + class TestConcatDataset: @pytest.fixture def data(self) -> Dataset: From 2db43b9a76086b08d9dfd0852e40e49b89dc8661 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sun, 8 Jan 2023 16:44:29 +0100 Subject: [PATCH 14/39] typing --- xarray/tests/test_concat.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index b7e2bb943cc..9a1c4b02f08 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -25,7 +25,9 @@ # helper method to create multiple tests datasets to concat -def create_concat_datasets(num_datasets=2, seed=None, include_day=True): +def create_concat_datasets( + num_datasets: int = 2, seed: [int | None] = None, include_day: bool = True +) -> list[Dataset]: random.seed(seed) result = [] lat = np.random.randn(1, 4) @@ -66,7 +68,9 @@ def create_concat_datasets(num_datasets=2, seed=None, include_day=True): # helper method to create multiple tests datasets to concat with specific types -def create_typed_datasets(num_datasets=2, seed=None): +def create_typed_datasets( + num_datasets: int = 2, seed: [int | None] = None +) -> list[Dataset]: random.seed(seed) var_strings = ["a", "b", "c", "d", "e", "f", "g", "h"] result = [] @@ -133,7 +137,7 @@ def test_concat_compat() -> None: concat([ds1, ds2], dim="q") -def test_concat_missing_var(): +def test_concat_missing_var() -> None: datasets = create_concat_datasets(2, 123) vars_to_drop = ["humidity", "precipitation", "cloud cover"] datasets[0] = datasets[0].drop_vars(vars_to_drop) @@ -165,7 +169,7 @@ def test_concat_missing_var(): assert_equal(result, ds_result) -def test_concat_missing_multiple_consecutive_var(): +def test_concat_missing_multiple_consecutive_var() -> None: datasets = create_concat_datasets(3, 123) vars_to_drop = ["pressure", "humidity"] datasets[0] = datasets[0].drop_vars(vars_to_drop) @@ -236,7 +240,7 @@ def test_concat_missing_multiple_consecutive_var(): assert_equal(result, ds_result) -def test_concat_all_empty(): +def test_concat_all_empty() -> None: ds1 = Dataset() ds2 = Dataset() result = concat([ds1, ds2], dim="new_dim") @@ -244,7 +248,7 @@ def test_concat_all_empty(): assert_equal(result, Dataset()) -def test_concat_second_empty(): +def test_concat_second_empty() -> None: ds1 = Dataset(data_vars={"a": ("y", [0.1])}, coords={"x": 0.1}) ds2 = Dataset(coords={"x": 0.1}) @@ -254,7 +258,7 @@ def test_concat_second_empty(): assert_equal(result, ds_result) -def test_multiple_missing_variables(): 
+def test_multiple_missing_variables() -> None: datasets = create_concat_datasets(2, 123) vars_to_drop = ["pressure", "cloud cover"] datasets[1] = datasets[1].drop_vars(vars_to_drop) @@ -298,7 +302,7 @@ def test_multiple_missing_variables(): @pytest.mark.xfail(strict=True) -def test_concat_multiple_datasets_missing_vars_and_new_dim(): +def test_concat_multiple_datasets_missing_vars_and_new_dim() -> None: vars_to_drop = [ "temperature", "pressure", @@ -370,7 +374,7 @@ def test_concat_multiple_datasets_missing_vars_and_new_dim(): assert_equal(result, ds_result) -def test_multiple_datasets_with_missing_variables(): +def test_multiple_datasets_with_missing_variables() -> None: vars_to_drop = [ "temperature", "pressure", @@ -436,7 +440,7 @@ def test_multiple_datasets_with_missing_variables(): assert_equal(result, ds_result) -def test_multiple_datasets_with_multiple_missing_variables(): +def test_multiple_datasets_with_multiple_missing_variables() -> None: vars_to_drop_in_first = ["temperature", "pressure"] vars_to_drop_in_second = ["humidity", "precipitation", "cloud cover"] datasets = create_concat_datasets(2, 123) @@ -484,7 +488,7 @@ def test_multiple_datasets_with_multiple_missing_variables(): assert_equal(result, ds_result) -def test_type_of_missing_fill(): +def test_type_of_missing_fill() -> None: datasets = create_typed_datasets(2, 123) vars = ["float", "float2", "string", "int", "datetime64", "timedelta64"] @@ -576,7 +580,7 @@ def test_type_of_missing_fill(): assert_equal(result_rev, ds_result_rev) -def test_order_when_filling_missing(): +def test_order_when_filling_missing() -> None: vars_to_drop_in_first = [] # drop middle vars_to_drop_in_second = ["humidity"] From e54d423b4f9f823906a94d0d5ea8d01decb32166 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sun, 8 Jan 2023 17:20:48 +0100 Subject: [PATCH 15/39] fix typing --- xarray/tests/test_concat.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 9a1c4b02f08..db15486b971 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -26,7 +26,7 @@ # helper method to create multiple tests datasets to concat def create_concat_datasets( - num_datasets: int = 2, seed: [int | None] = None, include_day: bool = True + num_datasets: int = 2, seed: int | None = None, include_day: bool = True ) -> list[Dataset]: random.seed(seed) result = [] @@ -69,7 +69,7 @@ def create_concat_datasets( # helper method to create multiple tests datasets to concat with specific types def create_typed_datasets( - num_datasets: int = 2, seed: [int | None] = None + num_datasets: int = 2, seed: int | None = None ) -> list[Dataset]: random.seed(seed) var_strings = ["a", "b", "c", "d", "e", "f", "g", "h"] @@ -1172,7 +1172,7 @@ def test_concat_str_dtype(self, dtype, dim) -> None: @pytest.mark.parametrize("coord", [True, False]) def test_concat_fill_missing_variables(self, dim: bool, coord: bool) -> None: # create var names list with one missing value - def get_var_names(var_cnt: int = 10, list_cnt: int = 10) -> list[str]: + def get_var_names(var_cnt: int = 10, list_cnt: int = 10) -> list[list[str]]: orig = [f"d{i:02d}" for i in range(var_cnt)] var_names = [] for i in range(0, list_cnt): @@ -1181,7 +1181,7 @@ def get_var_names(var_cnt: int = 10, list_cnt: int = 10) -> list[str]: return var_names def create_ds( - var_names: list[str], + var_names: list[list[str]], dim: bool = False, coord: bool = False, drop_idx: list[int] | None = None, From 
7fe0ce2f1957cd19abcb3858eb230e8ee988f5ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 9 Jan 2023 09:25:26 +0100 Subject: [PATCH 16/39] fix tests with, finalize typing --- xarray/tests/test_concat.py | 205 ++++++++++++++++++++---------------- 1 file changed, 113 insertions(+), 92 deletions(-) diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index db15486b971..260ed7357ea 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -2,7 +2,7 @@ import random from copy import deepcopy -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Callable import numpy as np import pandas as pd @@ -230,13 +230,12 @@ def test_concat_missing_multiple_consecutive_var() -> None: "day": ["day1", "day2", "day3", "day4", "day5", "day6"], }, ) - # assign here, as adding above gave switched pressure/humidity-order every once in a while - ds_result = ds_result.assign({"humidity": (["x", "y", "day"], humidity_result)}) - ds_result = ds_result.assign({"pressure": (["x", "y", "day"], pressure_result)}) result = concat(datasets, dim="day") r1 = [var for var in result.data_vars] r2 = [var for var in ds_result.data_vars] - assert r1 == r2 # check the variables orders are the same + # check the variables orders are the same for the first three variables + assert r1[:3] == r2[:3] + assert set(r1[3:]) == set(r2[3:]) # just check availability for the remaining vars assert_equal(result, ds_result) @@ -301,8 +300,8 @@ def test_multiple_missing_variables() -> None: assert_equal(result, ds_result) -@pytest.mark.xfail(strict=True) -def test_concat_multiple_datasets_missing_vars_and_new_dim() -> None: +@pytest.mark.parametrize("include_day", [True, False]) +def test_concat_multiple_datasets_missing_vars_and_new_dim(include_day: bool) -> None: vars_to_drop = [ "temperature", "pressure", @@ -310,47 +309,51 @@ def test_concat_multiple_datasets_missing_vars_and_new_dim() -> None: "precipitation", "cloud cover", ] - datasets = create_concat_datasets(len(vars_to_drop), 123, include_day=False) + + datasets = create_concat_datasets(len(vars_to_drop), 123, include_day=include_day) # set up the test data datasets = [datasets[i].drop_vars(vars_to_drop[i]) for i in range(len(datasets))] + dim_size = 2 if include_day else 1 + # set up the validation data # the below code just drops one var per dataset depending on the location of the # dataset in the list and allows us to quickly catch any boundaries cases across # the three equivalence classes of beginning, middle and end of the concat list - result_vars = dict.fromkeys(vars_to_drop) + result_vars = dict.fromkeys(vars_to_drop, np.array([])) for i in range(len(vars_to_drop)): for d in range(len(datasets)): if d != i: - if result_vars[vars_to_drop[i]] is None: - result_vars[vars_to_drop[i]] = datasets[d][vars_to_drop[i]].values + if include_day: + ds_vals = datasets[d][vars_to_drop[i]].values + else: + ds_vals = datasets[d][vars_to_drop[i]].values[..., None] + if not result_vars[vars_to_drop[i]].size: + result_vars[vars_to_drop[i]] = ds_vals else: result_vars[vars_to_drop[i]] = np.concatenate( ( result_vars[vars_to_drop[i]], - datasets[d][vars_to_drop[i]].values, + ds_vals, ), - axis=1, + axis=-1, ) else: - if result_vars[vars_to_drop[i]] is None: - result_vars[vars_to_drop[i]] = np.full([1, 4], np.nan) + if not result_vars[vars_to_drop[i]].size: + result_vars[vars_to_drop[i]] = np.full([1, 4, dim_size], np.nan) else: result_vars[vars_to_drop[i]] = np.concatenate( - (result_vars[vars_to_drop[i]], 
np.full([1, 4], np.nan)), - axis=1, + ( + result_vars[vars_to_drop[i]], + np.full([1, 4, dim_size], np.nan), + ), + axis=-1, ) - # TODO: this test still has two unexpected errors: - - # 1: concat throws a mergeerror expecting the temperature values to be the same, this doesn't seem to be correct in this case - # as we are concating on new dims - # 2: if the values are the same for a variable (working around #1) then it will likely not correct add the new dim to the first variable - # the resulting set ds_result = Dataset( data_vars={ - # pressure will be first in this since the first dataset is missing this var - # and there isn't a good way to determine that this should be first + # pressure will be first here since it is first in first dataset and + # there isn't a good way to determine that temperature should be first # this also means temperature will be last as the first data vars will # determine the order for all that exist in that dataset "pressure": (["x", "y", "day"], result_vars["pressure"]), @@ -362,11 +365,17 @@ def test_concat_multiple_datasets_missing_vars_and_new_dim() -> None: coords={ "lat": (["x", "y"], datasets[0].lat.values), "lon": (["x", "y"], datasets[0].lon.values), - # "day": ["day" + str(d + 1) for d in range(2 * len(vars_to_drop))], }, ) + if include_day: + ds_result = ds_result.assign_coords( + {"day": ["day" + str(d + 1) for d in range(2 * len(vars_to_drop))]} + ) + else: + ds_result = ds_result.transpose("day", "x", "y") result = concat(datasets, dim="day") + r1 = list(result.data_vars.keys()) r2 = list(ds_result.data_vars.keys()) assert r1 == r2 # check the variables orders are the same @@ -390,11 +399,11 @@ def test_multiple_datasets_with_missing_variables() -> None: # the below code just drops one var per dataset depending on the location of the # dataset in the list and allows us to quickly catch any boundaries cases across # the three equivalence classes of beginning, middle and end of the concat list - result_vars = dict.fromkeys(vars_to_drop) + result_vars = dict.fromkeys(vars_to_drop, np.array([])) for i in range(len(vars_to_drop)): for d in range(len(datasets)): if d != i: - if result_vars[vars_to_drop[i]] is None: + if not result_vars[vars_to_drop[i]].size: result_vars[vars_to_drop[i]] = datasets[d][vars_to_drop[i]].values else: result_vars[vars_to_drop[i]] = np.concatenate( @@ -405,7 +414,7 @@ def test_multiple_datasets_with_missing_variables() -> None: axis=2, ) else: - if result_vars[vars_to_drop[i]] is None: + if not result_vars[vars_to_drop[i]].size: result_vars[vars_to_drop[i]] = np.full([1, 4, 2], np.nan) else: result_vars[vars_to_drop[i]] = np.concatenate( @@ -483,8 +492,9 @@ def test_multiple_datasets_with_multiple_missing_variables() -> None: r1 = list(result.data_vars.keys()) r2 = list(ds_result.data_vars.keys()) - assert r1 == r2 # check the variables orders are the same - + # check the variables orders are the same for the first three variables + assert r1[:3] == r2[:3] + assert set(r1[3:]) == set(r2[3:]) # just check availability for the remaining vars assert_equal(result, ds_result) @@ -581,7 +591,7 @@ def test_type_of_missing_fill() -> None: def test_order_when_filling_missing() -> None: - vars_to_drop_in_first = [] + vars_to_drop_in_first: list[str] = [] # drop middle vars_to_drop_in_second = ["humidity"] datasets = create_concat_datasets(2, 123) @@ -649,6 +659,77 @@ def test_order_when_filling_missing() -> None: result_index += 1 +@pytest.fixture +def concat_var_names() -> Callable: + # create var names list with one missing value 
+ def get_varnames(var_cnt: int = 10, list_cnt: int = 10) -> list[list[str]]: + orig = [f"d{i:02d}" for i in range(var_cnt)] + var_names = [] + for i in range(0, list_cnt): + l1 = orig.copy() + var_names.append(l1) + return var_names + + return get_varnames + + +@pytest.fixture +def create_concat_ds() -> Callable: + def create_ds( + var_names: list[list[str]], + dim: bool = False, + coord: bool = False, + drop_idx: list[int] | None = None, + ) -> list[Dataset]: + out_ds = [] + ds = Dataset() + ds = ds.assign_coords({"x": np.arange(2)}) + ds = ds.assign_coords({"y": np.arange(3)}) + ds = ds.assign_coords({"z": np.arange(4)}) + for i, dsl in enumerate(var_names): + vlist = dsl.copy() + if drop_idx is not None: + vlist.pop(drop_idx[i]) + foo_data = np.arange(48, dtype=float).reshape(2, 2, 3, 4) + dsi = ds.copy() + if coord: + dsi = ds.assign({"time": (["time"], [i * 2, i * 2 + 1])}) + for k in vlist: + dsi = dsi.assign({k: (["time", "x", "y", "z"], foo_data.copy())}) + if not dim: + dsi = dsi.isel(time=0) + out_ds.append(dsi) + return out_ds + + return create_ds + + +@pytest.mark.parametrize("dim", [True, False]) +@pytest.mark.parametrize("coord", [True, False]) +def test_concat_fill_missing_variables( + concat_var_names, create_concat_ds, dim: bool, coord: bool +) -> None: + var_names = concat_var_names() + + random.seed(42) + drop_idx = [random.randrange(len(vlist)) for vlist in var_names] + expected = concat( + create_concat_ds(var_names, dim=dim, coord=coord), dim="time", data_vars="all" + ) + for i, idx in enumerate(drop_idx): + if dim: + expected[var_names[0][idx]][i * 2 : i * 2 + 2] = np.nan + else: + expected[var_names[0][idx]][i] = np.nan + + concat_ds = create_concat_ds(var_names, dim=dim, coord=coord, drop_idx=drop_idx) + actual = concat(concat_ds, dim="time", data_vars="all") + + for name in var_names[0]: + assert_equal(expected[name], actual[name]) + assert_equal(expected, actual) + + class TestConcatDataset: @pytest.fixture def data(self) -> Dataset: @@ -1168,66 +1249,6 @@ def test_concat_str_dtype(self, dtype, dim) -> None: assert np.issubdtype(actual.x2.dtype, dtype) - @pytest.mark.parametrize("dim", [True, False]) - @pytest.mark.parametrize("coord", [True, False]) - def test_concat_fill_missing_variables(self, dim: bool, coord: bool) -> None: - # create var names list with one missing value - def get_var_names(var_cnt: int = 10, list_cnt: int = 10) -> list[list[str]]: - orig = [f"d{i:02d}" for i in range(var_cnt)] - var_names = [] - for i in range(0, list_cnt): - l1 = orig.copy() - var_names.append(l1) - return var_names - - def create_ds( - var_names: list[list[str]], - dim: bool = False, - coord: bool = False, - drop_idx: list[int] | None = None, - ) -> list[Dataset]: - out_ds = [] - ds = Dataset() - ds = ds.assign_coords({"x": np.arange(2)}) - ds = ds.assign_coords({"y": np.arange(3)}) - ds = ds.assign_coords({"z": np.arange(4)}) - for i, dsl in enumerate(var_names): - vlist = dsl.copy() - if drop_idx is not None: - vlist.pop(drop_idx[i]) - foo_data = np.arange(48, dtype=float).reshape(2, 2, 3, 4) - dsi = ds.copy() - if coord: - dsi = ds.assign({"time": (["time"], [i * 2, i * 2 + 1])}) - for k in vlist: - dsi = dsi.assign({k: (["time", "x", "y", "z"], foo_data.copy())}) - if not dim: - dsi = dsi.isel(time=0) - out_ds.append(dsi) - return out_ds - - var_names = get_var_names() - - import random - - random.seed(42) - drop_idx = [random.randrange(len(vlist)) for vlist in var_names] - expected = concat( - create_ds(var_names, dim=dim, coord=coord), dim="time", data_vars="all" 
- ) - for i, idx in enumerate(drop_idx): - if dim: - expected[var_names[0][idx]][i * 2 : i * 2 + 2] = np.nan - else: - expected[var_names[0][idx]][i] = np.nan - - concat_ds = create_ds(var_names, dim=dim, coord=coord, drop_idx=drop_idx) - actual = concat(concat_ds, dim="time", data_vars="all") - - for name in var_names[0]: - assert_equal(expected[name], actual[name]) - assert_equal(expected, actual) - class TestConcatDataArray: def test_concat(self) -> None: From ffd13f684d6a71e514e2ecb02e92713a99629bcf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 9 Jan 2023 13:14:47 +0100 Subject: [PATCH 17/39] add whats-new.rst entry --- doc/whats-new.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 4b03e8d676e..d7127deee1c 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -75,6 +75,9 @@ Bug fixes By `Benoît Bovy `_. - Preserve original dtype on accessing MultiIndex levels (:issue:`7250`, :pull:`7393`). By `Ian Carroll `_. +- Make :py:func:`xarray.concat` more robust when concatenating variables present in some datasets but + not others (:issue:`508`, :pull:`7400`). + By `Kai Mühlbauer `_ and `Scott Chamberlin `_. Internal Changes ~~~~~~~~~~~~~~~~ From 146b740e3714aa63db0c908b2448a0042eafb0ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 9 Jan 2023 15:57:50 +0100 Subject: [PATCH 18/39] Update xarray/tests/test_concat.py Co-authored-by: Illviljan <14371165+Illviljan@users.noreply.github.com> --- xarray/tests/test_concat.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 260ed7357ea..1f9eaf7c2fd 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -28,10 +28,10 @@ def create_concat_datasets( num_datasets: int = 2, seed: int | None = None, include_day: bool = True ) -> list[Dataset]: - random.seed(seed) + rng = default_rng(seed) + lat = rng.standard_normal(size=(1, 4)) + lon = rng.standard_normal(size=(1, 4)) result = [] - lat = np.random.randn(1, 4) - lon = np.random.randn(1, 4) for i in range(num_datasets): if include_day: result.append( From 3196e83be35cdb156c5cd083ab624e2c10b3ef37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 9 Jan 2023 15:58:01 +0100 Subject: [PATCH 19/39] Update xarray/tests/test_concat.py Co-authored-by: Illviljan <14371165+Illviljan@users.noreply.github.com> --- xarray/tests/test_concat.py | 1 + 1 file changed, 1 insertion(+) diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 1f9eaf7c2fd..973f4b60e2b 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -234,6 +234,7 @@ def test_concat_missing_multiple_consecutive_var() -> None: r1 = [var for var in result.data_vars] r2 = [var for var in ds_result.data_vars] # check the variables orders are the same for the first three variables + # TODO: Can all variables become deterministic? 
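
Only the leading variables can be order-checked here because of how _dataset_concat derives the result order: it starts from the first dataset's variables and appends the names missing there from data_names, which is a set, so the appended tail has no guaranteed relative order. A pure-Python sketch of that ordering logic (hypothetical names, mirroring the loop added in the first patch of this series):

first_ds_vars = ["temperature", "precipitation", "cloud cover"]  # order of datasets[0]
data_names = {"temperature", "pressure", "humidity", "precipitation", "cloud cover"}

data_var_order = list(first_ds_vars)
data_var_order += [name for name in data_names if name not in data_var_order]
# The first three entries are deterministic; whether "pressure" or "humidity"
# comes next depends on set iteration order.
print(data_var_order)
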
assert r1[:3] == r2[:3] assert set(r1[3:]) == set(r2[3:]) # just check availability for the remaining vars assert_equal(result, ds_result) From e53cfcccc41c06a0617fbc6a8b33d0a23e244d73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 9 Jan 2023 16:09:10 +0100 Subject: [PATCH 20/39] add TODO, fix numpy.random.default_rng --- xarray/core/concat.py | 3 +++ xarray/tests/test_concat.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 67023c800f1..25f6534f89a 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -558,6 +558,9 @@ def get_indexes(name): # preserve variable order for variables in first dataset data_var_order = list(datasets[0].variables) # append additional variables to the end + # TODO: since data_names is a set the ordering for the appended variables + # is not deterministic, see also discussion in + # https://github.com/pydata/xarray/pull/3545#pullrequestreview-347543738 data_var_order += [e for e in data_names if e not in data_var_order] # create concatenation index, needed for later reindexing concat_index = list(range(sum(concat_dim_lengths))) diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 973f4b60e2b..3ca98deedfe 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -28,7 +28,7 @@ def create_concat_datasets( num_datasets: int = 2, seed: int | None = None, include_day: bool = True ) -> list[Dataset]: - rng = default_rng(seed) + rng = np.random.default_rng(seed) lat = rng.standard_normal(size=(1, 4)) lon = rng.standard_normal(size=(1, 4)) result = [] From c41419f43619ff45e3f14d722797075a737c8ff8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 10 Jan 2023 08:11:34 +0100 Subject: [PATCH 21/39] change np.random to use Generator --- xarray/tests/test_concat.py | 62 +++++++++++++++++++++++++------------ 1 file changed, 42 insertions(+), 20 deletions(-) diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 3ca98deedfe..35b3a826e9c 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -1,6 +1,5 @@ from __future__ import annotations -import random from copy import deepcopy from typing import TYPE_CHECKING, Any, Callable @@ -37,11 +36,26 @@ def create_concat_datasets( result.append( Dataset( data_vars={ - "temperature": (["x", "y", "day"], np.random.randn(1, 4, 2)), - "pressure": (["x", "y", "day"], np.random.randn(1, 4, 2)), - "humidity": (["x", "y", "day"], np.random.randn(1, 4, 2)), - "precipitation": (["x", "y", "day"], np.random.randn(1, 4, 2)), - "cloud cover": (["x", "y", "day"], np.random.randn(1, 4, 2)), + "temperature": ( + ["x", "y", "day"], + rng.standard_normal(size=(1, 4, 2)), + ), + "pressure": ( + ["x", "y", "day"], + rng.standard_normal(size=(1, 4, 2)), + ), + "humidity": ( + ["x", "y", "day"], + rng.standard_normal(size=(1, 4, 2)), + ), + "precipitation": ( + ["x", "y", "day"], + rng.standard_normal(size=(1, 4, 2)), + ), + "cloud cover": ( + ["x", "y", "day"], + rng.standard_normal(size=(1, 4, 2)), + ), }, coords={ "lat": (["x", "y"], lat), @@ -54,11 +68,11 @@ def create_concat_datasets( result.append( Dataset( data_vars={ - "temperature": (["x", "y"], np.random.randn(1, 4)), - "pressure": (["x", "y"], np.random.randn(1, 4)), - "humidity": (["x", "y"], np.random.randn(1, 4)), - "precipitation": (["x", "y"], np.random.randn(1, 4)), - "cloud cover": (["x", "y"], np.random.randn(1, 4)), + "temperature": (["x", "y"], 
rng.standard_normal(size=(1, 4))), + "pressure": (["x", "y"], rng.standard_normal(size=(1, 4))), + "humidity": (["x", "y"], rng.standard_normal(size=(1, 4))), + "precipitation": (["x", "y"], rng.standard_normal(size=(1, 4))), + "cloud cover": (["x", "y"], rng.standard_normal(size=(1, 4))), }, coords={"lat": (["x", "y"], lat), "lon": (["x", "y"], lon)}, ) @@ -71,22 +85,22 @@ def create_concat_datasets( def create_typed_datasets( num_datasets: int = 2, seed: int | None = None ) -> list[Dataset]: - random.seed(seed) var_strings = ["a", "b", "c", "d", "e", "f", "g", "h"] result = [] - lat = np.random.randn(1, 4) - lon = np.random.randn(1, 4) + rng = np.random.default_rng(seed) + lat = rng.standard_normal(size=(1, 4)) + lon = rng.standard_normal(size=(1, 4)) for i in range(num_datasets): result.append( Dataset( data_vars={ - "float": (["x", "y", "day"], np.random.randn(1, 4, 2)), - "float2": (["x", "y", "day"], np.random.randn(1, 4, 2)), + "float": (["x", "y", "day"], rng.standard_normal(size=(1, 4, 2))), + "float2": (["x", "y", "day"], rng.standard_normal(size=(1, 4, 2))), "string": ( ["x", "y", "day"], - np.random.choice(var_strings, (1, 4, 2)), + rng.choice(var_strings, size=(1, 4, 2)), ), - "int": (["x", "y", "day"], np.random.randint(0, 10, (1, 4, 2))), + "int": (["x", "y", "day"], rng.integers(0, 10, size=(1, 4, 2))), "datetime64": ( ["x", "y", "day"], np.arange( @@ -710,10 +724,11 @@ def create_ds( def test_concat_fill_missing_variables( concat_var_names, create_concat_ds, dim: bool, coord: bool ) -> None: + # random single variables missing in each dataset var_names = concat_var_names() - random.seed(42) - drop_idx = [random.randrange(len(vlist)) for vlist in var_names] + rng = np.random.default_rng(seed=42) + drop_idx = [rng.integers(len(vlist)) for vlist in var_names] expected = concat( create_concat_ds(var_names, dim=dim, coord=coord), dim="time", data_vars="all" ) @@ -726,6 +741,13 @@ def test_concat_fill_missing_variables( concat_ds = create_concat_ds(var_names, dim=dim, coord=coord, drop_idx=drop_idx) actual = concat(concat_ds, dim="time", data_vars="all") + r1 = list(actual.data_vars.keys()) + r2 = list(expected.data_vars.keys()) + # move element missing in first dataset at the end + # TODO: Can all variables become deterministic? 
+ r2 = r2 + [r2.pop(drop_idx[0])] + assert r1 == r2 # check the variables orders are the same + for name in var_names[0]: assert_equal(expected[name], actual[name]) assert_equal(expected, actual) From b2b0b184d10d5f3c19be5282499a3d6be0ec3c8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 10 Jan 2023 15:19:01 +0100 Subject: [PATCH 22/39] move code for variable order into dedicated function, merge with _parse_datasets, provide fast lane for variable order estimation --- xarray/core/concat.py | 55 ++++++++++++++++++++++++++++--------- xarray/tests/test_concat.py | 14 +++++----- 2 files changed, 49 insertions(+), 20 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 25f6534f89a..f6824306bfd 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -406,20 +406,51 @@ def process_subset_opt(opt, subset): return concat_over, equals, concat_dim_lengths +# search for dataset containing all wanted variables +# if no matching dataset is found return order from first dataset +# and append missing variables at the end +def _get_var_order( + datasets: list[T_Dataset], data_names: set[Hashable], max_vars_index: int +) -> list[Hashable]: + # check if first dataset contains all wanted variables + data_var_order = list(datasets[0].variables) + if not data_names.issubset(set(data_var_order)): + # find dataset with max count variables and check it + max_var_order = list(datasets[max_vars_index].variables) + if data_names.issubset(set(max_var_order)): + data_var_order = max_var_order + else: + # TODO: since data_names is a set, ordering for the appended variables + # is not deterministic, see also discussion in + # https://github.com/pydata/xarray/pull/3545#pullrequestreview-347543738 + data_var_order += [e for e in data_names if e not in data_var_order] + return data_var_order + + # determine dimensional coordinate names and a dict mapping name to DataArray def _parse_datasets( - datasets: Iterable[T_Dataset], -) -> tuple[dict[Hashable, Variable], dict[Hashable, int], set[Hashable], set[Hashable]]: + datasets: list[T_Dataset], +) -> tuple[ + dict[Hashable, Variable], + dict[Hashable, int], + set[Hashable], + set[Hashable], + list[Hashable], +]: dims: set[Hashable] = set() all_coord_names: set[Hashable] = set() data_vars: set[Hashable] = set() # list of data_vars dim_coords: dict[Hashable, Variable] = {} # maps dim name to variable dims_sizes: dict[Hashable, int] = {} # shared dimension sizes to expand variables + data_vars_order: list[Hashable] # dataset index of maximum count of variables - for ds in datasets: + data_vars_count = [] + + for i, ds in enumerate(datasets): dims_sizes.update(ds.dims) all_coord_names.update(ds.coords) + data_vars_count.append(len(ds.data_vars)) data_vars.update(ds.data_vars) # preserves ordering of dimensions @@ -431,7 +462,10 @@ def _parse_datasets( dim_coords[dim] = ds.coords[dim].variable dims = dims | set(ds.dims) - return dim_coords, dims_sizes, all_coord_names, data_vars + max_vars_index = data_vars_count.index(max(data_vars_count)) + data_vars_order = _get_var_order(datasets, data_vars, max_vars_index) + + return dim_coords, dims_sizes, all_coord_names, data_vars, data_vars_order def _dataset_concat( @@ -473,7 +507,9 @@ def _dataset_concat( align(*datasets, join=join, copy=False, exclude=[dim], fill_value=fill_value) ) - dim_coords, dims_sizes, coord_names, data_names = _parse_datasets(datasets) + dim_coords, dims_sizes, coord_names, data_names, data_vars_order = _parse_datasets( + datasets + ) dim_names = 
set(dim_coords) unlabeled_dims = dim_names - coord_names @@ -555,19 +591,12 @@ def get_indexes(name): data = var.set_dims(dim).values yield PandasIndex(data, dim, coord_dtype=var.dtype) - # preserve variable order for variables in first dataset - data_var_order = list(datasets[0].variables) - # append additional variables to the end - # TODO: since data_names is a set the ordering for the appended variables - # is not deterministic, see also discussion in - # https://github.com/pydata/xarray/pull/3545#pullrequestreview-347543738 - data_var_order += [e for e in data_names if e not in data_var_order] # create concatenation index, needed for later reindexing concat_index = list(range(sum(concat_dim_lengths))) # stack up each variable and/or index to fill-out the dataset (in order) # n.b. this loop preserves variable order, needed for groupby. - for name in data_var_order: + for name in data_vars_order: if name in concat_over and name not in result_indexes: variables = [] variable_index = [] diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 35b3a826e9c..a6eb936b8ef 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -149,6 +149,8 @@ def test_concat_compat() -> None: ValueError, match=r"coordinates in some datasets but not others" ): concat([ds1, ds2], dim="q") + with pytest.raises(ValueError, match=r"'q' not present in all datasets"): + concat([ds2, ds1], dim="q") def test_concat_missing_var() -> None: @@ -247,10 +249,7 @@ def test_concat_missing_multiple_consecutive_var() -> None: result = concat(datasets, dim="day") r1 = [var for var in result.data_vars] r2 = [var for var in ds_result.data_vars] - # check the variables orders are the same for the first three variables - # TODO: Can all variables become deterministic? - assert r1[:3] == r2[:3] - assert set(r1[3:]) == set(r2[3:]) # just check availability for the remaining vars + r1 == r2 assert_equal(result, ds_result) @@ -439,8 +438,8 @@ def test_multiple_datasets_with_missing_variables() -> None: ds_result = Dataset( data_vars={ - # pressure will be first in this since the first dataset is missing this var - # and there isn't a good way to determine that this should be first + # pressure will be first here since it is first in first dataset and + # there isn't a good way to determine that temperature should be first # this also means temperature will be last as the first data vars will # determine the order for all that exist in that dataset "pressure": (["x", "y", "day"], result_vars["pressure"]), @@ -508,6 +507,7 @@ def test_multiple_datasets_with_multiple_missing_variables() -> None: r1 = list(result.data_vars.keys()) r2 = list(ds_result.data_vars.keys()) # check the variables orders are the same for the first three variables + # TODO: Can all variables become deterministic? 
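
The fill behaviour this test pins down can be reproduced standalone; a minimal
sketch with two made-up datasets (the NaN-filling shown is what this branch
adds, where released versions raise a ValueError instead):

    import xarray as xr

    a = xr.Dataset({"temperature": ("day", [1.0, 2.0]),
                    "pressure": ("day", [5.0, 6.0])})
    b = xr.Dataset({"temperature": ("day", [3.0, 4.0])})  # no "pressure"

    out = xr.concat([a, b], dim="day", data_vars="all")
    print(out["pressure"].values)  # [ 5.  6. nan nan]
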
assert r1[:3] == r2[:3] assert set(r1[3:]) == set(r2[3:]) # just check availability for the remaining vars assert_equal(result, ds_result) @@ -662,9 +662,9 @@ def test_order_when_filling_missing() -> None: result_keys_rev = [ "temperature", "pressure", + "humidity", "precipitation", "cloud cover", - "humidity", ] # test order when concat in reversed order rev_result = concat(datasets[::-1], dim="day") From bb0a8ae91dd37a0a95a14acfb9b8e4a41f6be048 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 10 Jan 2023 15:43:16 +0100 Subject: [PATCH 23/39] fix comment --- xarray/core/concat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index f6824306bfd..d1bf31ea98b 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -443,7 +443,7 @@ def _parse_datasets( data_vars: set[Hashable] = set() # list of data_vars dim_coords: dict[Hashable, Variable] = {} # maps dim name to variable dims_sizes: dict[Hashable, int] = {} # shared dimension sizes to expand variables - data_vars_order: list[Hashable] # dataset index of maximum count of variables + data_vars_order: list[Hashable] # ordered list of variables data_vars_count = [] From e266fe535802d3628102a436d5271c8c939bece9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 16 Jan 2023 10:33:54 +0100 Subject: [PATCH 24/39] Use order from first dataset, append missing variables to the end --- xarray/core/concat.py | 29 +++++++++-------------------- xarray/tests/test_concat.py | 2 +- 2 files changed, 10 insertions(+), 21 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index d1bf31ea98b..508e71c10b9 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -406,24 +406,17 @@ def process_subset_opt(opt, subset): return concat_over, equals, concat_dim_lengths -# search for dataset containing all wanted variables -# if no matching dataset is found return order from first dataset -# and append missing variables at the end def _get_var_order( - datasets: list[T_Dataset], data_names: set[Hashable], max_vars_index: int + datasets: list[T_Dataset], data_names: set[Hashable] ) -> list[Hashable]: - # check if first dataset contains all wanted variables + # get ordering from first dataset + # if first_dataset contains all wanted variables, we are safe + # otherwise we append missing variables to the end data_var_order = list(datasets[0].variables) - if not data_names.issubset(set(data_var_order)): - # find dataset with max count variables and check it - max_var_order = list(datasets[max_vars_index].variables) - if data_names.issubset(set(max_var_order)): - data_var_order = max_var_order - else: - # TODO: since data_names is a set, ordering for the appended variables - # is not deterministic, see also discussion in - # https://github.com/pydata/xarray/pull/3545#pullrequestreview-347543738 - data_var_order += [e for e in data_names if e not in data_var_order] + # TODO: since data_names is a set, ordering for the appended variables + # is not deterministic, see also discussion in + # https://github.com/pydata/xarray/pull/3545#pullrequestreview-347543738 + data_var_order += [e for e in data_names if e not in data_var_order] return data_var_order @@ -445,12 +438,9 @@ def _parse_datasets( dims_sizes: dict[Hashable, int] = {} # shared dimension sizes to expand variables data_vars_order: list[Hashable] # ordered list of variables - data_vars_count = [] - for i, ds in enumerate(datasets): dims_sizes.update(ds.dims) 
all_coord_names.update(ds.coords) - data_vars_count.append(len(ds.data_vars)) data_vars.update(ds.data_vars) # preserves ordering of dimensions @@ -462,8 +452,7 @@ def _parse_datasets( dim_coords[dim] = ds.coords[dim].variable dims = dims | set(ds.dims) - max_vars_index = data_vars_count.index(max(data_vars_count)) - data_vars_order = _get_var_order(datasets, data_vars, max_vars_index) + data_vars_order = _get_var_order(datasets, data_vars) return dim_coords, dims_sizes, all_coord_names, data_vars, data_vars_order diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index a6eb936b8ef..89ab191e8fe 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -662,9 +662,9 @@ def test_order_when_filling_missing() -> None: result_keys_rev = [ "temperature", "pressure", - "humidity", "precipitation", "cloud cover", + "humidity", ] # test order when concat in reversed order rev_result = concat(datasets[::-1], dim="day") From 6120796b01e9ecbb6c147a4ffbcff844f9eb2545 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 16 Jan 2023 10:44:10 +0100 Subject: [PATCH 25/39] ensure fill_value is dict --- xarray/core/concat.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 508e71c10b9..474d83aa4ee 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -1,5 +1,6 @@ from __future__ import annotations +from collections import defaultdict from typing import TYPE_CHECKING, Any, Hashable, Iterable, cast, overload import pandas as pd @@ -496,6 +497,12 @@ def _dataset_concat( align(*datasets, join=join, copy=False, exclude=[dim], fill_value=fill_value) ) + # ensure dictionary for fill_value + if isinstance(fill_value, dict): + fill_value_ = defaultdict(lambda: dtypes.NA, **fill_value) + else: + fill_value_ = defaultdict(lambda: fill_value) + dim_coords, dims_sizes, coord_names, data_names, data_vars_order = _parse_datasets( datasets ) @@ -648,14 +655,10 @@ def get_indexes(name): ) # reindex if variable is not present in all datasets if len(variable_index) < len(concat_index): - try: - fill = fill_value[name] - except (TypeError, KeyError): - fill = fill_value combined_var = ( DataArray(data=combined_var, name=name) .assign_coords({dim: variable_index}) - .reindex({dim: concat_index}, fill_value=fill) + .reindex({dim: concat_index}, fill_value=fill_value_[name]) .variable ) result_vars[name] = combined_var From 5733d931feaddccc6cb404a2edc6ae5a79ef1cd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 16 Jan 2023 15:14:24 +0100 Subject: [PATCH 26/39] ensure fill_value in align --- xarray/core/alignment.py | 2 +- xarray/core/concat.py | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/xarray/core/alignment.py b/xarray/core/alignment.py index 38978a5e4f3..051ad305904 100644 --- a/xarray/core/alignment.py +++ b/xarray/core/alignment.py @@ -71,7 +71,7 @@ def reindex_variables( for name, var in variables.items(): if isinstance(fill_value, dict): - fill_value_ = fill_value.get(name, dtypes.NA) + fill_value_ = fill_value[name] else: fill_value_ = fill_value diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 474d83aa4ee..2cf4bace388 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -491,18 +491,18 @@ def _dataset_concat( dim, index = _calc_concat_dim_index(dim) - # Make sure we're working on a copy (we'll be loading variables) - datasets = [ds.copy() for ds in datasets] - datasets = list( 
- align(*datasets, join=join, copy=False, exclude=[dim], fill_value=fill_value) - ) - # ensure dictionary for fill_value if isinstance(fill_value, dict): - fill_value_ = defaultdict(lambda: dtypes.NA, **fill_value) + fill_value_ = fill_value.copy() else: fill_value_ = defaultdict(lambda: fill_value) + # Make sure we're working on a copy (we'll be loading variables) + datasets = [ds.copy() for ds in datasets] + datasets = list( + align(*datasets, join=join, copy=False, exclude=[dim], fill_value=fill_value_) + ) + dim_coords, dims_sizes, coord_names, data_names, data_vars_order = _parse_datasets( datasets ) From 98914392be66bb8ce8c1238444dd6b7e1e2b68fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 16 Jan 2023 15:15:01 +0100 Subject: [PATCH 27/39] simplify combined_var, fix test --- xarray/core/concat.py | 34 +++++++++++----------------------- xarray/tests/test_concat.py | 2 +- 2 files changed, 12 insertions(+), 24 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 2cf4bace388..468f7ade657 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -598,7 +598,7 @@ def get_indexes(name): variable_index = [] for i, ds in enumerate(datasets): if name in ds.variables: - variables.append(ds.variables[name]) + variables.append(ds[name].variable) # add to variable index, needed for reindexing variable_index.extend( [ @@ -638,29 +638,17 @@ def get_indexes(name): ) result_vars[k] = v else: - # do not concat if: - # 1. variable is only present in one dataset of multiple datasets and - # 2. dim is not a dimension of variable - if ( - dim not in variables[0].dims - and len(variables) == 1 - and len(datasets) > 1 - ): - combined_var = variables[0] - # only concat if variable is in multiple datasets - # or if single dataset (GH1988) - else: - combined_var = concat_vars( - vars, dim, positions, combine_attrs=combine_attrs + combined_var = concat_vars( + vars, dim, positions, combine_attrs=combine_attrs + ) + # reindex if variable is not present in all datasets + if len(variable_index) < len(concat_index): + combined_var = ( + DataArray(data=combined_var, name=name) + .assign_coords({dim: variable_index}) + .reindex({dim: concat_index}, fill_value=fill_value_[name]) + .variable ) - # reindex if variable is not present in all datasets - if len(variable_index) < len(concat_index): - combined_var = ( - DataArray(data=combined_var, name=name) - .assign_coords({dim: variable_index}) - .reindex({dim: concat_index}, fill_value=fill_value_[name]) - .variable - ) result_vars[name] = combined_var elif name in result_vars: diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 89ab191e8fe..0bed5786464 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -785,7 +785,7 @@ def test_concat_merge_variables_present_in_some_datasets(self, data) -> None: split_data = [data.isel(dim1=slice(3)), data.isel(dim1=slice(3, None))] data0, data1 = deepcopy(split_data) data1["foo"] = ("bar", np.random.randn(10)) - actual = concat([data0, data1], "dim1") + actual = concat([data0, data1], "dim1", data_vars="minimal") expected = data.copy().assign(foo=data1.foo) assert_identical(expected, actual) From 94b9ba95e1d76bac2cc553dff745b96c9d141bfb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 16 Jan 2023 15:53:49 +0100 Subject: [PATCH 28/39] revert fill_value for alignment.py --- xarray/core/alignment.py | 2 +- xarray/core/concat.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git 
a/xarray/core/alignment.py b/xarray/core/alignment.py index 051ad305904..38978a5e4f3 100644 --- a/xarray/core/alignment.py +++ b/xarray/core/alignment.py @@ -71,7 +71,7 @@ def reindex_variables( for name, var in variables.items(): if isinstance(fill_value, dict): - fill_value_ = fill_value[name] + fill_value_ = fill_value.get(name, dtypes.NA) else: fill_value_ = fill_value diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 468f7ade657..421fb5f737f 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -500,7 +500,7 @@ def _dataset_concat( # Make sure we're working on a copy (we'll be loading variables) datasets = [ds.copy() for ds in datasets] datasets = list( - align(*datasets, join=join, copy=False, exclude=[dim], fill_value=fill_value_) + align(*datasets, join=join, copy=False, exclude=[dim], fill_value=fill_value) ) dim_coords, dims_sizes, coord_names, data_names, data_vars_order = _parse_datasets( From 70be70f6b3758f491e2390fdd84838615688b94b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 17 Jan 2023 09:46:11 +0100 Subject: [PATCH 29/39] derive variable order in order of appearance as suggested per review --- xarray/core/concat.py | 22 +++------------------- xarray/tests/test_concat.py | 4 +--- 2 files changed, 4 insertions(+), 22 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 421fb5f737f..6ca4f02543e 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -407,20 +407,6 @@ def process_subset_opt(opt, subset): return concat_over, equals, concat_dim_lengths -def _get_var_order( - datasets: list[T_Dataset], data_names: set[Hashable] -) -> list[Hashable]: - # get ordering from first dataset - # if first_dataset contains all wanted variables, we are safe - # otherwise we append missing variables to the end - data_var_order = list(datasets[0].variables) - # TODO: since data_names is a set, ordering for the appended variables - # is not deterministic, see also discussion in - # https://github.com/pydata/xarray/pull/3545#pullrequestreview-347543738 - data_var_order += [e for e in data_names if e not in data_var_order] - return data_var_order - - # determine dimensional coordinate names and a dict mapping name to DataArray def _parse_datasets( datasets: list[T_Dataset], @@ -431,18 +417,18 @@ def _parse_datasets( set[Hashable], list[Hashable], ]: - dims: set[Hashable] = set() all_coord_names: set[Hashable] = set() data_vars: set[Hashable] = set() # list of data_vars dim_coords: dict[Hashable, Variable] = {} # maps dim name to variable dims_sizes: dict[Hashable, int] = {} # shared dimension sizes to expand variables - data_vars_order: list[Hashable] # ordered list of variables + variables_order: dict[Hashable, Variable] = {} # variables in order of appearance for i, ds in enumerate(datasets): dims_sizes.update(ds.dims) all_coord_names.update(ds.coords) data_vars.update(ds.data_vars) + variables_order.update(ds.variables) # preserves ordering of dimensions for dim in ds.dims: @@ -453,9 +439,7 @@ def _parse_datasets( dim_coords[dim] = ds.coords[dim].variable dims = dims | set(ds.dims) - data_vars_order = _get_var_order(datasets, data_vars) - - return dim_coords, dims_sizes, all_coord_names, data_vars, data_vars_order + return dim_coords, dims_sizes, all_coord_names, data_vars, list(variables_order) def _dataset_concat( diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 0bed5786464..0e1c576079d 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -145,9 
+145,7 @@ def test_concat_compat() -> None: for var in ["has_x", "no_x_y"]: assert "y" not in result[var].dims and "y" not in result[var].coords - with pytest.raises( - ValueError, match=r"coordinates in some datasets but not others" - ): + with pytest.raises(ValueError, match=r"'q' not present in all datasets"): concat([ds1, ds2], dim="q") with pytest.raises(ValueError, match=r"'q' not present in all datasets"): concat([ds2, ds1], dim="q") From c3eda8f5359e0781b3c758db724e2228a9bb5a68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 17 Jan 2023 09:49:25 +0100 Subject: [PATCH 30/39] remove unneeded enumerate --- xarray/core/concat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 6ca4f02543e..77c784513ec 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -424,7 +424,7 @@ def _parse_datasets( dims_sizes: dict[Hashable, int] = {} # shared dimension sizes to expand variables variables_order: dict[Hashable, Variable] = {} # variables in order of appearance - for i, ds in enumerate(datasets): + for ds in datasets: dims_sizes.update(ds.dims) all_coord_names.update(ds.coords) data_vars.update(ds.data_vars) From 70f38abc2a850da7472c69323a9a1001dc8df2c3 Mon Sep 17 00:00:00 2001 From: dcherian Date: Tue, 17 Jan 2023 08:23:33 -0700 Subject: [PATCH 31/39] Use alignment.reindex_variables instead. This also removes the need to handle fill_value --- xarray/core/concat.py | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 77c784513ec..10db61612d0 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -1,12 +1,11 @@ from __future__ import annotations -from collections import defaultdict from typing import TYPE_CHECKING, Any, Hashable, Iterable, cast, overload import pandas as pd from xarray.core import dtypes, utils -from xarray.core.alignment import align +from xarray.core.alignment import align, reindex_variables from xarray.core.duck_array_ops import lazy_array_equiv from xarray.core.indexes import Index, PandasIndex from xarray.core.merge import ( @@ -475,12 +474,6 @@ def _dataset_concat( dim, index = _calc_concat_dim_index(dim) - # ensure dictionary for fill_value - if isinstance(fill_value, dict): - fill_value_ = fill_value.copy() - else: - fill_value_ = defaultdict(lambda: fill_value) - # Make sure we're working on a copy (we'll be loading variables) datasets = [ds.copy() for ds in datasets] datasets = list( @@ -627,12 +620,13 @@ def get_indexes(name): ) # reindex if variable is not present in all datasets if len(variable_index) < len(concat_index): - combined_var = ( - DataArray(data=combined_var, name=name) - .assign_coords({dim: variable_index}) - .reindex({dim: concat_index}, fill_value=fill_value_[name]) - .variable - ) + combined_var = reindex_variables( + variables={name: combined_var}, + dim_pos_indexers={ + dim: pd.Index(variable_index).get_indexer(concat_index) + }, + fill_value=fill_value, + )[name] result_vars[name] = combined_var elif name in result_vars: From 4825c9406377c0348980448c74a83d15e6fccfd6 Mon Sep 17 00:00:00 2001 From: dcherian Date: Tue, 17 Jan 2023 08:26:54 -0700 Subject: [PATCH 32/39] small cleanup --- xarray/core/concat.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 10db61612d0..c464c50ce59 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -480,7 +480,7 @@ def 
_dataset_concat( align(*datasets, join=join, copy=False, exclude=[dim], fill_value=fill_value) ) - dim_coords, dims_sizes, coord_names, data_names, data_vars_order = _parse_datasets( + dim_coords, dims_sizes, coord_names, data_names, vars_order = _parse_datasets( datasets ) dim_names = set(dim_coords) @@ -569,7 +569,7 @@ def get_indexes(name): # stack up each variable and/or index to fill-out the dataset (in order) # n.b. this loop preserves variable order, needed for groupby. - for name in data_vars_order: + for name in vars_order: if name in concat_over and name not in result_indexes: variables = [] variable_index = [] @@ -589,7 +589,7 @@ def get_indexes(name): raise ValueError( f"coordinate {name!r} not present in all datasets." ) - vars = list(ensure_common_dims(variables)) + vars = ensure_common_dims(variables) # Try to concatenate the indexes, concatenate the variables when no index # is found on all datasets. From a71f633a52b50cbcbd5a05717cc3662c64e525b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 17 Jan 2023 18:07:56 +0100 Subject: [PATCH 33/39] Update doc/whats-new.rst Co-authored-by: Deepak Cherian --- doc/whats-new.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index d7127deee1c..25d803ec5a1 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -75,7 +75,7 @@ Bug fixes By `Benoît Bovy `_. - Preserve original dtype on accessing MultiIndex levels (:issue:`7250`, :pull:`7393`). By `Ian Carroll `_. -- Make :py:func:`xarray.concat` more robust when concatenating variables present in some datasets but +- :py:func:`xarray.concat` can now concatenate variables present in some datasets but not others (:issue:`508`, :pull:`7400`). By `Kai Mühlbauer `_ and `Scott Chamberlin `_. From 92e6108484331819fa5e58844de41056f51c5f09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Wed, 18 Jan 2023 12:14:39 +0100 Subject: [PATCH 34/39] adapt tests as per review request, fix ensure_common_dims --- xarray/core/concat.py | 17 +- xarray/tests/test_concat.py | 615 ++++++++++-------------------------- 2 files changed, 172 insertions(+), 460 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index c464c50ce59..95a2199f9c5 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -536,7 +536,7 @@ def _dataset_concat( # we've already verified everything is consistent; now, calculate # shared dimension sizes so we can expand the necessary variables - def ensure_common_dims(vars): + def ensure_common_dims(vars, concat_dim_lengths): # ensure each variable with the given name shares the same # dimensions and the same shape for all of them except along the # concat dimension @@ -573,23 +573,24 @@ def get_indexes(name): if name in concat_over and name not in result_indexes: variables = [] variable_index = [] + var_concat_dim_length = [] for i, ds in enumerate(datasets): if name in ds.variables: variables.append(ds[name].variable) # add to variable index, needed for reindexing - variable_index.extend( - [ - sum(concat_dim_lengths[:i]) + k - for k in range(concat_dim_lengths[i]) - ] - ) + var_idx = [ + sum(concat_dim_lengths[:i]) + k + for k in range(concat_dim_lengths[i]) + ] + variable_index.extend(var_idx) + var_concat_dim_length.append(len(var_idx)) else: # raise if coordinate not in all datasets if name in coord_names: raise ValueError( f"coordinate {name!r} not present in all datasets." 
) - vars = ensure_common_dims(variables) + vars = ensure_common_dims(variables, var_concat_dim_length) # Try to concatenate the indexes, concatenate the variables when no index # is found on all datasets. diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 0e1c576079d..64e3ca30f36 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -31,32 +31,17 @@ def create_concat_datasets( lat = rng.standard_normal(size=(1, 4)) lon = rng.standard_normal(size=(1, 4)) result = [] + variables = ["temperature", "pressure", "humidity", "precipitation", "cloud_cover"] for i in range(num_datasets): if include_day: + data_tuple = ( + ["x", "y", "day"], + rng.standard_normal(size=(1, 4, 2)), + ) + data_vars = {v: data_tuple for v in variables} result.append( Dataset( - data_vars={ - "temperature": ( - ["x", "y", "day"], - rng.standard_normal(size=(1, 4, 2)), - ), - "pressure": ( - ["x", "y", "day"], - rng.standard_normal(size=(1, 4, 2)), - ), - "humidity": ( - ["x", "y", "day"], - rng.standard_normal(size=(1, 4, 2)), - ), - "precipitation": ( - ["x", "y", "day"], - rng.standard_normal(size=(1, 4, 2)), - ), - "cloud cover": ( - ["x", "y", "day"], - rng.standard_normal(size=(1, 4, 2)), - ), - }, + data_vars=data_vars, coords={ "lat": (["x", "y"], lat), "lon": (["x", "y"], lon), @@ -65,15 +50,14 @@ def create_concat_datasets( ) ) else: + data_tuple = ( + ["x", "y"], + rng.standard_normal(size=(1, 4)), + ) + data_vars = {v: data_tuple for v in variables} result.append( Dataset( - data_vars={ - "temperature": (["x", "y"], rng.standard_normal(size=(1, 4))), - "pressure": (["x", "y"], rng.standard_normal(size=(1, 4))), - "humidity": (["x", "y"], rng.standard_normal(size=(1, 4))), - "precipitation": (["x", "y"], rng.standard_normal(size=(1, 4))), - "cloud cover": (["x", "y"], rng.standard_normal(size=(1, 4))), - }, + data_vars=data_vars, coords={"lat": (["x", "y"], lat), "lon": (["x", "y"], lon)}, ) ) @@ -153,163 +137,82 @@ def test_concat_compat() -> None: def test_concat_missing_var() -> None: datasets = create_concat_datasets(2, 123) - vars_to_drop = ["humidity", "precipitation", "cloud cover"] - datasets[0] = datasets[0].drop_vars(vars_to_drop) - datasets[1] = datasets[1].drop_vars(vars_to_drop + ["pressure"]) + expected = concat(datasets, dim="day") + vars_to_drop = ["humidity", "precipitation", "cloud_cover"] - temperature_result = np.concatenate( - (datasets[0].temperature.values, datasets[1].temperature.values), axis=2 - ) - pressure_result = np.concatenate( - (datasets[0].pressure.values, np.full([1, 4, 2], np.nan)), axis=2 - ) - ds_result = Dataset( - data_vars={ - "temperature": (["x", "y", "day"], temperature_result), - "pressure": (["x", "y", "day"], pressure_result), - }, - coords={ - "lat": (["x", "y"], datasets[0].lat.values), - "lon": (["x", "y"], datasets[0].lon.values), - "day": ["day1", "day2", "day3", "day4"], - }, - ) - result = concat(datasets, dim="day") + expected = expected.drop_vars(vars_to_drop) + expected["pressure"][..., 2:] = np.nan - r1 = list(result.data_vars.keys()) - r2 = list(ds_result.data_vars.keys()) - assert r1 == r2 # check the variables orders are the same + datasets[0] = datasets[0].drop_vars(vars_to_drop) + datasets[1] = datasets[1].drop_vars(vars_to_drop + ["pressure"]) + actual = concat(datasets, dim="day") - assert_equal(result, ds_result) + assert list(actual.data_vars.keys()) == ["temperature", "pressure"] + assert_identical(actual, expected) def test_concat_missing_multiple_consecutive_var() -> None: datasets = 
create_concat_datasets(3, 123) - vars_to_drop = ["pressure", "humidity"] + expected = concat(datasets, dim="day") + vars_to_drop = ["humidity", "pressure"] + + expected["pressure"][..., :4] = np.nan + expected["humidity"][..., :4] = np.nan + datasets[0] = datasets[0].drop_vars(vars_to_drop) datasets[1] = datasets[1].drop_vars(vars_to_drop) + actual = concat(datasets, dim="day") - temperature_result = np.concatenate( - ( - datasets[0].temperature.values, - datasets[1].temperature.values, - datasets[2].temperature.values, - ), - axis=2, - ) - pressure_result = np.concatenate( - ( - np.full([1, 4, 2], np.nan), - np.full([1, 4, 2], np.nan), - datasets[2].pressure.values, - ), - axis=2, - ) - humidity_result = np.concatenate( - ( - np.full([1, 4, 2], np.nan), - np.full([1, 4, 2], np.nan), - datasets[2].humidity.values, - ), - axis=2, - ) - precipitation_result = np.concatenate( - ( - datasets[0].precipitation.values, - datasets[1].precipitation.values, - datasets[2].precipitation.values, - ), - axis=2, - ) - cloudcover_result = np.concatenate( - ( - datasets[0]["cloud cover"].values, - datasets[1]["cloud cover"].values, - datasets[2]["cloud cover"].values, - ), - axis=2, - ) - - ds_result = Dataset( - data_vars={ - "temperature": (["x", "y", "day"], temperature_result), - "precipitation": (["x", "y", "day"], precipitation_result), - "cloud cover": (["x", "y", "day"], cloudcover_result), - "humidity": (["x", "y", "day"], humidity_result), - "pressure": (["x", "y", "day"], pressure_result), - }, - coords={ - "lat": (["x", "y"], datasets[0].lat.values), - "lon": (["x", "y"], datasets[0].lon.values), - "day": ["day1", "day2", "day3", "day4", "day5", "day6"], - }, - ) - result = concat(datasets, dim="day") - r1 = [var for var in result.data_vars] - r2 = [var for var in ds_result.data_vars] - r1 == r2 - assert_equal(result, ds_result) + assert list(actual.data_vars.keys()) == [ + "temperature", + "precipitation", + "cloud_cover", + "pressure", + "humidity", + ] + assert_identical(actual, expected) def test_concat_all_empty() -> None: ds1 = Dataset() ds2 = Dataset() - result = concat([ds1, ds2], dim="new_dim") + expected = Dataset() + actual = concat([ds1, ds2], dim="new_dim") - assert_equal(result, Dataset()) + assert_identical(actual, expected) def test_concat_second_empty() -> None: ds1 = Dataset(data_vars={"a": ("y", [0.1])}, coords={"x": 0.1}) ds2 = Dataset(coords={"x": 0.1}) - ds_result = Dataset(data_vars={"a": ("y", [0.1, np.nan])}, coords={"x": 0.1}) - result = concat([ds1, ds2], dim="y") + expected = Dataset(data_vars={"a": ("y", [0.1, np.nan])}, coords={"x": 0.1}) + actual = concat([ds1, ds2], dim="y") - assert_equal(result, ds_result) + assert_identical(actual, expected) def test_multiple_missing_variables() -> None: datasets = create_concat_datasets(2, 123) - vars_to_drop = ["pressure", "cloud cover"] - datasets[1] = datasets[1].drop_vars(vars_to_drop) + expected = concat(datasets, dim="day") + vars_to_drop = ["pressure", "cloud_cover"] - temperature_result = np.concatenate( - (datasets[0].temperature.values, datasets[1].temperature.values), axis=2 - ) - pressure_result = np.concatenate( - (datasets[0].pressure.values, np.full([1, 4, 2], np.nan)), axis=2 - ) - humidity_result = np.concatenate( - (datasets[0].humidity.values, datasets[1].humidity.values), axis=2 - ) - precipitation_result = np.concatenate( - (datasets[0].precipitation.values, datasets[1].precipitation.values), axis=2 - ) - cloudcover_result = np.concatenate( - (datasets[0]["cloud cover"].values, np.full([1, 4, 2], 
np.nan)), axis=2 - ) - ds_result = Dataset( - data_vars={ - "temperature": (["x", "y", "day"], temperature_result), - "pressure": (["x", "y", "day"], pressure_result), - "humidity": (["x", "y", "day"], humidity_result), - "precipitation": (["x", "y", "day"], precipitation_result), - "cloud cover": (["x", "y", "day"], cloudcover_result), - }, - coords={ - "lat": (["x", "y"], datasets[0].lat.values), - "lon": (["x", "y"], datasets[0].lon.values), - "day": ["day1", "day2", "day3", "day4"], - }, - ) - result = concat(datasets, dim="day") + expected["pressure"][..., 2:] = np.nan + expected["cloud_cover"][..., 2:] = np.nan - r1 = list(result.data_vars.keys()) - r2 = list(ds_result.data_vars.keys()) - assert r1 == r2 # check the variables orders are the same + datasets[1] = datasets[1].drop_vars(vars_to_drop) + actual = concat(datasets, dim="day") + + # check the variables orders are the same + assert list(actual.data_vars.keys()) == [ + "temperature", + "pressure", + "humidity", + "precipitation", + "cloud_cover", + ] - assert_equal(result, ds_result) + assert_identical(actual, expected) @pytest.mark.parametrize("include_day", [True, False]) @@ -319,80 +222,31 @@ def test_concat_multiple_datasets_missing_vars_and_new_dim(include_day: bool) -> "pressure", "humidity", "precipitation", - "cloud cover", + "cloud_cover", ] datasets = create_concat_datasets(len(vars_to_drop), 123, include_day=include_day) - # set up the test data - datasets = [datasets[i].drop_vars(vars_to_drop[i]) for i in range(len(datasets))] + expected = concat(datasets, dim="day") - dim_size = 2 if include_day else 1 - - # set up the validation data - # the below code just drops one var per dataset depending on the location of the - # dataset in the list and allows us to quickly catch any boundaries cases across - # the three equivalence classes of beginning, middle and end of the concat list - result_vars = dict.fromkeys(vars_to_drop, np.array([])) - for i in range(len(vars_to_drop)): - for d in range(len(datasets)): - if d != i: - if include_day: - ds_vals = datasets[d][vars_to_drop[i]].values - else: - ds_vals = datasets[d][vars_to_drop[i]].values[..., None] - if not result_vars[vars_to_drop[i]].size: - result_vars[vars_to_drop[i]] = ds_vals - else: - result_vars[vars_to_drop[i]] = np.concatenate( - ( - result_vars[vars_to_drop[i]], - ds_vals, - ), - axis=-1, - ) - else: - if not result_vars[vars_to_drop[i]].size: - result_vars[vars_to_drop[i]] = np.full([1, 4, dim_size], np.nan) - else: - result_vars[vars_to_drop[i]] = np.concatenate( - ( - result_vars[vars_to_drop[i]], - np.full([1, 4, dim_size], np.nan), - ), - axis=-1, - ) - - ds_result = Dataset( - data_vars={ - # pressure will be first here since it is first in first dataset and - # there isn't a good way to determine that temperature should be first - # this also means temperature will be last as the first data vars will - # determine the order for all that exist in that dataset - "pressure": (["x", "y", "day"], result_vars["pressure"]), - "humidity": (["x", "y", "day"], result_vars["humidity"]), - "precipitation": (["x", "y", "day"], result_vars["precipitation"]), - "cloud cover": (["x", "y", "day"], result_vars["cloud cover"]), - "temperature": (["x", "y", "day"], result_vars["temperature"]), - }, - coords={ - "lat": (["x", "y"], datasets[0].lat.values), - "lon": (["x", "y"], datasets[0].lon.values), - }, - ) - if include_day: - ds_result = ds_result.assign_coords( - {"day": ["day" + str(d + 1) for d in range(2 * len(vars_to_drop))]} - ) - else: - ds_result = 
ds_result.transpose("day", "x", "y") + for i, name in enumerate(vars_to_drop): + if include_day: + expected[name][..., i * 2 : (i + 1) * 2] = np.nan + else: + expected[name][i : i + 1, ...] = np.nan - result = concat(datasets, dim="day") + # set up the test data + datasets = [datasets[i].drop_vars(vars_to_drop[i]) for i in range(len(datasets))] - r1 = list(result.data_vars.keys()) - r2 = list(ds_result.data_vars.keys()) - assert r1 == r2 # check the variables orders are the same + actual = concat(datasets, dim="day") - assert_equal(result, ds_result) + assert list(actual.data_vars.keys()) == [ + "pressure", + "humidity", + "precipitation", + "cloud_cover", + "temperature", + ] + assert_identical(actual, expected) def test_multiple_datasets_with_missing_variables() -> None: @@ -401,206 +255,83 @@ def test_multiple_datasets_with_missing_variables() -> None: "pressure", "humidity", "precipitation", - "cloud cover", + "cloud_cover", ] datasets = create_concat_datasets(len(vars_to_drop), 123) - # set up the test data - datasets = [datasets[i].drop_vars(vars_to_drop[i]) for i in range(len(datasets))] - # set up the validation data - # the below code just drops one var per dataset depending on the location of the - # dataset in the list and allows us to quickly catch any boundaries cases across - # the three equivalence classes of beginning, middle and end of the concat list - result_vars = dict.fromkeys(vars_to_drop, np.array([])) - for i in range(len(vars_to_drop)): - for d in range(len(datasets)): - if d != i: - if not result_vars[vars_to_drop[i]].size: - result_vars[vars_to_drop[i]] = datasets[d][vars_to_drop[i]].values - else: - result_vars[vars_to_drop[i]] = np.concatenate( - ( - result_vars[vars_to_drop[i]], - datasets[d][vars_to_drop[i]].values, - ), - axis=2, - ) - else: - if not result_vars[vars_to_drop[i]].size: - result_vars[vars_to_drop[i]] = np.full([1, 4, 2], np.nan) - else: - result_vars[vars_to_drop[i]] = np.concatenate( - (result_vars[vars_to_drop[i]], np.full([1, 4, 2], np.nan)), - axis=2, - ) - - ds_result = Dataset( - data_vars={ - # pressure will be first here since it is first in first dataset and - # there isn't a good way to determine that temperature should be first - # this also means temperature will be last as the first data vars will - # determine the order for all that exist in that dataset - "pressure": (["x", "y", "day"], result_vars["pressure"]), - "humidity": (["x", "y", "day"], result_vars["humidity"]), - "precipitation": (["x", "y", "day"], result_vars["precipitation"]), - "cloud cover": (["x", "y", "day"], result_vars["cloud cover"]), - "temperature": (["x", "y", "day"], result_vars["temperature"]), - }, - coords={ - "lat": (["x", "y"], datasets[0].lat.values), - "lon": (["x", "y"], datasets[0].lon.values), - "day": ["day" + str(d + 1) for d in range(2 * len(vars_to_drop))], - }, - ) - result = concat(datasets, dim="day") + expected = concat(datasets, dim="day") + for i, name in enumerate(vars_to_drop): + expected[name][..., i * 2 : (i + 1) * 2] = np.nan - r1 = list(result.data_vars.keys()) - r2 = list(ds_result.data_vars.keys()) - assert r1 == r2 # check the variables orders are the same + # set up the test data + datasets = [datasets[i].drop_vars(vars_to_drop[i]) for i in range(len(datasets))] + actual = concat(datasets, dim="day") - assert_equal(result, ds_result) + assert list(actual.data_vars.keys()) == [ + "pressure", + "humidity", + "precipitation", + "cloud_cover", + "temperature", + ] + assert_identical(actual, expected) def 
test_multiple_datasets_with_multiple_missing_variables() -> None: vars_to_drop_in_first = ["temperature", "pressure"] - vars_to_drop_in_second = ["humidity", "precipitation", "cloud cover"] + vars_to_drop_in_second = ["humidity", "precipitation", "cloud_cover"] datasets = create_concat_datasets(2, 123) + expected = concat(datasets, dim="day") + for name in vars_to_drop_in_first: + expected[name][..., :2] = np.nan + for name in vars_to_drop_in_second: + expected[name][..., 2:] = np.nan + # set up the test data datasets[0] = datasets[0].drop_vars(vars_to_drop_in_first) datasets[1] = datasets[1].drop_vars(vars_to_drop_in_second) - temperature_result = np.concatenate( - (np.full([1, 4, 2], np.nan), datasets[1].temperature.values), axis=2 - ) - pressure_result = np.concatenate( - (np.full([1, 4, 2], np.nan), datasets[1].pressure.values), axis=2 - ) - humidity_result = np.concatenate( - (datasets[0].humidity.values, np.full([1, 4, 2], np.nan)), axis=2 - ) - precipitation_result = np.concatenate( - (datasets[0].precipitation.values, np.full([1, 4, 2], np.nan)), axis=2 - ) - cloudcover_result = np.concatenate( - (datasets[0]["cloud cover"].values, np.full([1, 4, 2], np.nan)), axis=2 - ) - ds_result = Dataset( - data_vars={ - "humidity": (["x", "y", "day"], humidity_result), - "precipitation": (["x", "y", "day"], precipitation_result), - "cloud cover": (["x", "y", "day"], cloudcover_result), - # these two are at the end of the expected as they are missing from the first - # dataset in the concat list - "temperature": (["x", "y", "day"], temperature_result), - "pressure": (["x", "y", "day"], pressure_result), - }, - coords={ - "lat": (["x", "y"], datasets[0].lat.values), - "lon": (["x", "y"], datasets[0].lon.values), - "day": ["day1", "day2", "day3", "day4"], - }, - ) - result = concat(datasets, dim="day") + actual = concat(datasets, dim="day") - r1 = list(result.data_vars.keys()) - r2 = list(ds_result.data_vars.keys()) - # check the variables orders are the same for the first three variables - # TODO: Can all variables become deterministic? - assert r1[:3] == r2[:3] - assert set(r1[3:]) == set(r2[3:]) # just check availability for the remaining vars - assert_equal(result, ds_result) + assert list(actual.data_vars.keys()) == [ + "humidity", + "precipitation", + "cloud_cover", + "temperature", + "pressure", + ] + assert_identical(actual, expected) def test_type_of_missing_fill() -> None: datasets = create_typed_datasets(2, 123) - + expected1 = concat(datasets, dim="day", fill_value=dtypes.NA) + expected2 = concat(datasets[::-1], dim="day", fill_value=dtypes.NA) vars = ["float", "float2", "string", "int", "datetime64", "timedelta64"] + expected = [expected2, expected1] + for i, exp in enumerate(expected): + sl = slice(i * 2, (i + 1) * 2) + exp["float2"][..., sl] = np.nan + exp["datetime64"][..., sl] = np.nan + exp["timedelta64"][..., sl] = np.nan + var = exp["int"] * 1.0 + var[..., sl] = np.nan + exp["int"] = var + var = exp["string"].astype(object) + var[..., sl] = np.nan + exp["string"] = var # set up the test data datasets[1] = datasets[1].drop_vars(vars[1:]) - float_result = np.concatenate( - (datasets[0].float.values, datasets[1].float.values), axis=2 - ) - float2_result = np.concatenate( - (datasets[0].float2.values, np.full([1, 4, 2], np.nan)), axis=2 - ) - # to correctly create the expected dataset we need to ensure we promote the string array to - # object type before filling as it will be promoted to that in the concat case. 
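
The promotion described in the comments being removed here is easy to show in
isolation; a small sketch, assuming standard numpy/xarray promotion rules:

    import numpy as np
    import xarray as xr

    s = xr.DataArray(np.array(["a", "b"]), dims="day", coords={"day": [0, 1]})
    i = xr.DataArray(np.array([1, 2]), dims="day", coords={"day": [0, 1]})

    # Reindexing onto a longer index introduces missing values; fixed-width
    # string and integer arrays cannot hold NaN, so xarray promotes them.
    print(s.reindex(day=[0, 1, 2]).dtype)  # object
    print(i.reindex(day=[0, 1, 2]).dtype)  # float64
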
- # this matches the behavior of pandas - string_values = datasets[0].string.values - string_values = string_values.astype(object) - string_result = np.concatenate((string_values, np.full([1, 4, 2], np.nan)), axis=2) - datetime_result = np.concatenate( - (datasets[0].datetime64.values, np.full([1, 4, 2], np.datetime64("NaT"))), - axis=2, - ) - timedelta_result = np.concatenate( - (datasets[0].timedelta64.values, np.full([1, 4, 2], np.timedelta64("NaT"))), - axis=2, - ) - # string_result = string_result.astype(object) - int_result = np.concatenate( - (datasets[0].int.values, np.full([1, 4, 2], np.nan)), axis=2 - ) - ds_result = Dataset( - data_vars={ - "float": (["x", "y", "day"], float_result), - "float2": (["x", "y", "day"], float2_result), - "string": (["x", "y", "day"], string_result), - "int": (["x", "y", "day"], int_result), - "datetime64": (["x", "y", "day"], datetime_result), - "timedelta64": (["x", "y", "day"], timedelta_result), - }, - coords={ - "lat": (["x", "y"], datasets[0].lat.values), - "lon": (["x", "y"], datasets[0].lon.values), - "day": ["day1", "day2", "day3", "day4"], - }, - ) - result = concat(datasets, dim="day", fill_value=dtypes.NA) + actual = concat(datasets, dim="day", fill_value=dtypes.NA) - assert_equal(result, ds_result) + assert_identical(actual, expected[1]) - # test in the reverse order - float_result_rev = np.concatenate( - (datasets[1].float.values, datasets[0].float.values), axis=2 - ) - float2_result_rev = np.concatenate( - (np.full([1, 4, 2], np.nan), datasets[0].float2.values), axis=2 - ) - string_result_rev = np.concatenate( - (np.full([1, 4, 2], np.nan), string_values), axis=2 - ) - datetime_result_rev = np.concatenate( - (np.full([1, 4, 2], np.datetime64("NaT")), datasets[0].datetime64.values), - axis=2, - ) - timedelta_result_rev = np.concatenate( - (np.full([1, 4, 2], np.timedelta64("NaT")), datasets[0].timedelta64.values), - axis=2, - ) - int_result_rev = np.concatenate( - (np.full([1, 4, 2], np.nan), datasets[0].int.values), axis=2 - ) - ds_result_rev = Dataset( - data_vars={ - "float": (["x", "y", "day"], float_result_rev), - "float2": (["x", "y", "day"], float2_result_rev), - "string": (["x", "y", "day"], string_result_rev), - "int": (["x", "y", "day"], int_result_rev), - "datetime64": (["x", "y", "day"], datetime_result_rev), - "timedelta64": (["x", "y", "day"], timedelta_result_rev), - }, - coords={ - "lat": (["x", "y"], datasets[0].lat.values), - "lon": (["x", "y"], datasets[0].lon.values), - "day": ["day3", "day4", "day1", "day2"], - }, - ) - result_rev = concat(datasets[::-1], dim="day", fill_value=dtypes.NA) + # reversed + actual = concat(datasets[::-1], dim="day", fill_value=dtypes.NA) - assert_equal(result_rev, ds_result_rev) + assert_identical(actual, expected[0]) def test_order_when_filling_missing() -> None: @@ -608,68 +339,38 @@ def test_order_when_filling_missing() -> None: # drop middle vars_to_drop_in_second = ["humidity"] datasets = create_concat_datasets(2, 123) + expected1 = concat(datasets, dim="day") + for name in vars_to_drop_in_second: + expected1[name][..., 2:] = np.nan + expected2 = concat(datasets[::-1], dim="day") + for name in vars_to_drop_in_second: + expected2[name][..., :2] = np.nan + # set up the test data datasets[0] = datasets[0].drop_vars(vars_to_drop_in_first) datasets[1] = datasets[1].drop_vars(vars_to_drop_in_second) - temperature_result = np.concatenate( - (datasets[0].temperature.values, datasets[1].temperature.values), axis=2 - ) - pressure_result = np.concatenate( - (datasets[0].pressure.values, 
datasets[1].pressure.values), axis=2 - ) - humidity_result = np.concatenate( - (datasets[0].humidity.values, np.full([1, 4, 2], np.nan)), axis=2 - ) - precipitation_result = np.concatenate( - (datasets[0].precipitation.values, datasets[1].precipitation.values), axis=2 - ) - cloudcover_result = np.concatenate( - (datasets[0]["cloud cover"].values, datasets[1]["cloud cover"].values), axis=2 - ) - ds_result = Dataset( - data_vars={ - "temperature": (["x", "y", "day"], temperature_result), - "pressure": (["x", "y", "day"], pressure_result), - "precipitation": (["x", "y", "day"], precipitation_result), - "cloud cover": (["x", "y", "day"], cloudcover_result), - "humidity": (["x", "y", "day"], humidity_result), - }, - coords={ - "lat": (["x", "y"], datasets[0].lat.values), - "lon": (["x", "y"], datasets[0].lon.values), - "day": ["day1", "day2", "day3", "day4"], - }, - ) - result = concat(datasets, dim="day") - - assert_equal(result, ds_result) + actual = concat(datasets, dim="day") - result_keys = [ + assert list(actual.data_vars.keys()) == [ "temperature", "pressure", "humidity", "precipitation", - "cloud cover", + "cloud_cover", ] - result_index = 0 - for k in result.data_vars.keys(): - assert k == result_keys[result_index] - result_index += 1 + assert_identical(actual, expected1) - result_keys_rev = [ + actual = concat(datasets[::-1], dim="day") + + assert list(actual.data_vars.keys()) == [ "temperature", "pressure", "precipitation", - "cloud cover", + "cloud_cover", "humidity", ] - # test order when concat in reversed order - rev_result = concat(datasets[::-1], dim="day") - result_index = 0 - for k in rev_result.data_vars.keys(): - assert k == result_keys_rev[result_index] - result_index += 1 + assert_identical(actual, expected2) @pytest.fixture @@ -739,16 +440,19 @@ def test_concat_fill_missing_variables( concat_ds = create_concat_ds(var_names, dim=dim, coord=coord, drop_idx=drop_idx) actual = concat(concat_ds, dim="time", data_vars="all") - r1 = list(actual.data_vars.keys()) - r2 = list(expected.data_vars.keys()) - # move element missing in first dataset at the end - # TODO: Can all variables become deterministic? - r2 = r2 + [r2.pop(drop_idx[0])] - assert r1 == r2 # check the variables orders are the same - - for name in var_names[0]: - assert_equal(expected[name], actual[name]) - assert_equal(expected, actual) + assert list(actual.data_vars.keys()) == [ + "d01", + "d02", + "d03", + "d04", + "d05", + "d06", + "d07", + "d08", + "d09", + "d00", + ] + assert_identical(actual, expected) class TestConcatDataset: @@ -787,6 +491,13 @@ def test_concat_merge_variables_present_in_some_datasets(self, data) -> None: expected = data.copy().assign(foo=data1.foo) assert_identical(expected, actual) + # expand foo + actual = concat([data0, data1], "dim1") + foo = np.ones((8, 10), dtype=data1.foo.dtype) * np.nan + foo[3:] = data1.foo.values[None, ...] 
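+        # data1.foo has shape (10,); [None, ...] views it as (1, 10) so the
+        # assignment broadcasts it across rows 3..7 of the (8, 10) array.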
+ expected = data.copy().assign(foo=(["dim1", "bar"], foo)) + assert_identical(expected, actual) + def test_concat_2(self, data) -> None: dim = "dim2" datasets = [g for _, g in data.groupby(dim, squeeze=True)] From 8610397c647bc01d757bc04354e8d43b1c8b19d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Thu, 19 Jan 2023 09:33:42 +0100 Subject: [PATCH 35/39] adapt tests as per review request --- xarray/tests/test_concat.py | 67 +++++++++++++------------------------ 1 file changed, 23 insertions(+), 44 deletions(-) diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 64e3ca30f36..d3d8eeeea4c 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -136,7 +136,7 @@ def test_concat_compat() -> None: def test_concat_missing_var() -> None: - datasets = create_concat_datasets(2, 123) + datasets = create_concat_datasets(2, seed=123) expected = concat(datasets, dim="day") vars_to_drop = ["humidity", "precipitation", "cloud_cover"] @@ -152,7 +152,7 @@ def test_concat_missing_var() -> None: def test_concat_missing_multiple_consecutive_var() -> None: - datasets = create_concat_datasets(3, 123) + datasets = create_concat_datasets(3, seed=123) expected = concat(datasets, dim="day") vars_to_drop = ["humidity", "pressure"] @@ -191,9 +191,16 @@ def test_concat_second_empty() -> None: assert_identical(actual, expected) + expected = Dataset( + data_vars={"a": ("y", [0.1, np.nan])}, coords={"x": ("y", [0.1, 0.1])} + ) + actual = concat([ds1, ds2], dim="y", coords="all") + + assert_identical(actual, expected) -def test_multiple_missing_variables() -> None: - datasets = create_concat_datasets(2, 123) + +def test_concat_multiple_missing_variables() -> None: + datasets = create_concat_datasets(2, seed=123) expected = concat(datasets, dim="day") vars_to_drop = ["pressure", "cloud_cover"] @@ -216,7 +223,7 @@ def test_multiple_missing_variables() -> None: @pytest.mark.parametrize("include_day", [True, False]) -def test_concat_multiple_datasets_missing_vars_and_new_dim(include_day: bool) -> None: +def test_concat_multiple_datasets_missing_vars(include_day: bool) -> None: vars_to_drop = [ "temperature", "pressure", @@ -225,7 +232,9 @@ def test_concat_multiple_datasets_missing_vars_and_new_dim(include_day: bool) -> "cloud_cover", ] - datasets = create_concat_datasets(len(vars_to_drop), 123, include_day=include_day) + datasets = create_concat_datasets( + len(vars_to_drop), seed=123, include_day=include_day + ) expected = concat(datasets, dim="day") for i, name in enumerate(vars_to_drop): @@ -235,36 +244,8 @@ def test_concat_multiple_datasets_missing_vars_and_new_dim(include_day: bool) -> expected[name][i : i + 1, ...] 
= np.nan # set up the test data - datasets = [datasets[i].drop_vars(vars_to_drop[i]) for i in range(len(datasets))] - - actual = concat(datasets, dim="day") - - assert list(actual.data_vars.keys()) == [ - "pressure", - "humidity", - "precipitation", - "cloud_cover", - "temperature", - ] - assert_identical(actual, expected) + datasets = [ds.drop_vars(varname) for ds, varname in zip(datasets, vars_to_drop)] - -def test_multiple_datasets_with_missing_variables() -> None: - vars_to_drop = [ - "temperature", - "pressure", - "humidity", - "precipitation", - "cloud_cover", - ] - datasets = create_concat_datasets(len(vars_to_drop), 123) - - expected = concat(datasets, dim="day") - for i, name in enumerate(vars_to_drop): - expected[name][..., i * 2 : (i + 1) * 2] = np.nan - - # set up the test data - datasets = [datasets[i].drop_vars(vars_to_drop[i]) for i in range(len(datasets))] actual = concat(datasets, dim="day") assert list(actual.data_vars.keys()) == [ @@ -277,10 +258,10 @@ def test_multiple_datasets_with_missing_variables() -> None: assert_identical(actual, expected) -def test_multiple_datasets_with_multiple_missing_variables() -> None: +def test_concat_multiple_datasets_with_multiple_missing_variables() -> None: vars_to_drop_in_first = ["temperature", "pressure"] vars_to_drop_in_second = ["humidity", "precipitation", "cloud_cover"] - datasets = create_concat_datasets(2, 123) + datasets = create_concat_datasets(2, seed=123) expected = concat(datasets, dim="day") for name in vars_to_drop_in_first: expected[name][..., :2] = np.nan @@ -303,8 +284,8 @@ def test_multiple_datasets_with_multiple_missing_variables() -> None: assert_identical(actual, expected) -def test_type_of_missing_fill() -> None: - datasets = create_typed_datasets(2, 123) +def test_concat_type_of_missing_fill() -> None: + datasets = create_typed_datasets(2, seed=123) expected1 = concat(datasets, dim="day", fill_value=dtypes.NA) expected2 = concat(datasets[::-1], dim="day", fill_value=dtypes.NA) vars = ["float", "float2", "string", "int", "datetime64", "timedelta64"] @@ -334,11 +315,11 @@ def test_type_of_missing_fill() -> None: assert_identical(actual, expected[0]) -def test_order_when_filling_missing() -> None: +def test_concat_order_when_filling_missing() -> None: vars_to_drop_in_first: list[str] = [] # drop middle vars_to_drop_in_second = ["humidity"] - datasets = create_concat_datasets(2, 123) + datasets = create_concat_datasets(2, seed=123) expected1 = concat(datasets, dim="day") for name in vars_to_drop_in_second: expected1[name][..., 2:] = np.nan @@ -423,11 +404,9 @@ def create_ds( def test_concat_fill_missing_variables( concat_var_names, create_concat_ds, dim: bool, coord: bool ) -> None: - # random single variables missing in each dataset var_names = concat_var_names() + drop_idx = [0, 7, 6, 4, 4, 8, 0, 6, 2, 0] - rng = np.random.default_rng(seed=42) - drop_idx = [rng.integers(len(vlist)) for vlist in var_names] expected = concat( create_concat_ds(var_names, dim=dim, coord=coord), dim="time", data_vars="all" ) From 6cb163eda58962b34230a41de1f1a66b82c11de5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Thu, 19 Jan 2023 15:05:52 +0100 Subject: [PATCH 36/39] fix whats-new.rst --- doc/whats-new.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 25d803ec5a1..b0f6a07841b 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -35,6 +35,9 @@ Deprecations Bug fixes ~~~~~~~~~ +- :py:func:`xarray.concat` can now concatenate variables 
present in some datasets but
+  not others (:issue:`508`, :pull:`7400`).
+  By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_ and `Scott Chamberlin <https://github.com/scottcha>`_.
 
 Documentation
 ~~~~~~~~~~~~~
@@ -75,9 +78,6 @@ Bug fixes
   By `Benoît Bovy <https://github.com/benbovy>`_.
 - Preserve original dtype on accessing MultiIndex levels (:issue:`7250`, :pull:`7393`).
   By `Ian Carroll <https://github.com/itcarroll>`_.
-- :py:func:`xarray.concat` can now concatenate variables present in some datasets but
-  not others (:issue:`508`, :pull:`7400`).
-  By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_ and `Scott Chamberlin <https://github.com/scottcha>`_.
 
 Internal Changes
 ~~~~~~~~~~~~~~~~

From 070c0fbca77a30846cd024443e93ffdb1ace6eb9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kai=20M=C3=BChlbauer?=
Date: Mon, 9 Jan 2023 13:14:47 +0100
Subject: [PATCH 37/39] add whats-new.rst entry

---
 doc/whats-new.rst | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index b0f6a07841b..6c949a2367e 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -78,6 +78,9 @@ Bug fixes
   By `Benoît Bovy <https://github.com/benbovy>`_.
 - Preserve original dtype on accessing MultiIndex levels (:issue:`7250`, :pull:`7393`).
   By `Ian Carroll <https://github.com/itcarroll>`_.
+- Make :py:func:`xarray.concat` more robust when concatenating variables present in some datasets but
+  not others (:issue:`508`, :pull:`7400`).
+  By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_ and `Scott Chamberlin <https://github.com/scottcha>`_.
 
 Internal Changes
 ~~~~~~~~~~~~~~~~

From de608904199a800d986d1c9638c2cac86dee1d23 Mon Sep 17 00:00:00 2001
From: dcherian
Date: Thu, 19 Jan 2023 10:00:46 -0700
Subject: [PATCH 38/39] Add additional test with scalar data_var

---
 xarray/tests/test_concat.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py
index d3d8eeeea4c..eb3cd7fccc8 100644
--- a/xarray/tests/test_concat.py
+++ b/xarray/tests/test_concat.py
@@ -188,14 +188,27 @@ def test_concat_second_empty() -> None:
     expected = Dataset(data_vars={"a": ("y", [0.1, np.nan])}, coords={"x": 0.1})
     actual = concat([ds1, ds2], dim="y")
-
     assert_identical(actual, expected)
 
     expected = Dataset(
         data_vars={"a": ("y", [0.1, np.nan])}, coords={"x": ("y", [0.1, 0.1])}
     )
     actual = concat([ds1, ds2], dim="y", coords="all")
+    assert_identical(actual, expected)
+
+    # Check concatenating scalar data_var only present in ds1
+    ds1["b"] = 0.1
+    expected = Dataset(
+        data_vars={"a": ("y", [0.1, np.nan]), "b": ("y", [0.1, np.nan])},
+        coords={"x": ("y", [0.1, 0.1])},
+    )
+    actual = concat([ds1, ds2], dim="y", coords="all", data_vars="all")
+    assert_identical(actual, expected)
 
+    expected = Dataset(
+        data_vars={"a": ("y", [0.1, np.nan]), "b": 0.1}, coords={"x": 0.1}
+    )
+    actual = concat([ds1, ds2], dim="y", coords="different", data_vars="different")
     assert_identical(actual, expected)

From b14c8f68a72d6e4bffef4416d554af569e613275 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kai=20M=C3=BChlbauer?=
Date: Fri, 20 Jan 2023 08:02:41 +0100
Subject: [PATCH 39/39] remove erroneous content from whats-new.rst

---
 doc/whats-new.rst | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 6c949a2367e..b0f6a07841b 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -78,9 +78,6 @@ Bug fixes
   By `Benoît Bovy <https://github.com/benbovy>`_.
 - Preserve original dtype on accessing MultiIndex levels (:issue:`7250`, :pull:`7393`).
   By `Ian Carroll <https://github.com/itcarroll>`_.
-- Make :py:func:`xarray.concat` more robust when concatenating variables present in some datasets but
-  not others (:issue:`508`, :pull:`7400`).
-  By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_ and `Scott Chamberlin <https://github.com/scottcha>`_.
 
 Internal Changes
 ~~~~~~~~~~~~~~~~
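
For orientation, a minimal sketch of the behavior this patch series enables. The data below is hypothetical (it is not taken from the test suite); with the patches applied, ``concat`` reindexes variables that are missing from some of the input datasets instead of raising::

    import numpy as np
    import xarray as xr

    ds1 = xr.Dataset(
        {"temperature": ("day", [10.0, 11.0]), "humidity": ("day", [0.5, 0.6])}
    )
    ds2 = xr.Dataset({"temperature": ("day", [12.0, 13.0])})  # no "humidity"

    # Before this series: ValueError, since "humidity" is not present in all
    # datasets. Now the missing variable is reindexed onto the concatenation
    # dimension and padded with the fill value.
    combined = xr.concat([ds1, ds2], dim="day", data_vars="all", fill_value=np.nan)
    print(combined["humidity"].values)  # [0.5 0.6 nan nan]

As the reordered ``assert list(actual.data_vars.keys())`` checks above verify, the result keeps the variable order of the first dataset, with variables missing from it appended at the end.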