From 829a74401649f79be94ab3950e9b9f3dc9d5a341 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Thu, 22 Dec 2022 14:39:06 +0100 Subject: [PATCH 01/39] ENH: fill missing variables during concat by reindexing --- xarray/core/concat.py | 62 ++++++++++++++++++++++++++++++------- xarray/tests/test_concat.py | 6 ++-- 2 files changed, 53 insertions(+), 15 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 2eea2ecb3ee..d709b296e28 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -378,7 +378,9 @@ def process_subset_opt(opt, subset): elif opt == "all": concat_over.update( - set(getattr(datasets[0], subset)) - set(datasets[0].dims) + set().union( + *list((set(getattr(d, subset)) - set(d.dims) for d in datasets)) + ) ) elif opt == "minimal": pass @@ -553,16 +555,35 @@ def get_indexes(name): data = var.set_dims(dim).values yield PandasIndex(data, dim, coord_dtype=var.dtype) + # preserve variable order for variables in first dataset + data_var_order = list(datasets[0].variables) + # append additional variables to the end + data_var_order += [e for e in data_names if e not in data_var_order] + # create concatenation index, needed for later reindexing + concat_index = list(range(sum(concat_dim_lengths))) + # stack up each variable and/or index to fill-out the dataset (in order) # n.b. this loop preserves variable order, needed for groupby. - for name in datasets[0].variables: + for name in data_var_order: if name in concat_over and name not in result_indexes: - try: - vars = ensure_common_dims([ds[name].variable for ds in datasets]) - except KeyError: - raise ValueError(f"{name!r} is not present in all datasets.") - - # Try concatenate the indexes, concatenate the variables when no index + variables = [] + variable_index = [] + for i, ds in enumerate(datasets): + if name in ds.variables: + variables.append(ds.variables[name]) + # add to variable index, needed for reindexing + variable_index.extend( + [sum(concat_dim_lengths[:i]) + k for k in range(concat_dim_lengths[i])] + ) + else: + # raise if coordinate not in all datasets + if name in coord_names: + raise ValueError( + f"coordinate {name!r} not present in all datasets." + ) + vars = list(ensure_common_dims(variables)) + + # Try to concatenate the indexes, concatenate the variables when no index # is found on all datasets. 
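
The concat_index and variable_index built above drive the fill logic further down in this hunk: a variable missing from some datasets is concatenated only from the datasets that carry it, then reindexed from variable_index onto the full concat_index so the uncovered positions pick up fill_value. A minimal standalone sketch of that mechanism (hypothetical names and data, not part of the patch):

import numpy as np
import xarray as xr

# Two datasets of length 2 concatenated along "time" give four positions overall,
concat_index = [0, 1, 2, 3]
# but the variable only exists in the first dataset, covering positions 0 and 1.
variable_index = [0, 1]
var = xr.Variable(("time",), np.array([10.0, 11.0]))  # hypothetical data

filled = (
    xr.DataArray(data=var, name="humidity")
    .assign_coords({"time": variable_index})
    .reindex({"time": concat_index}, fill_value=np.nan)
    .variable
)
print(filled.values)  # [10. 11. nan nan]
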
indexes: list[Index] = list(get_indexes(name)) if indexes: @@ -586,9 +607,28 @@ def get_indexes(name): ) result_vars[k] = v else: - combined_var = concat_vars( - vars, dim, positions, combine_attrs=combine_attrs - ) + # if variable is only present in one dataset of multiple datasets, + # then do not concat + if len(variables) == 1 and len(datasets) > 1: + combined_var = variables[0] + # only concat if variable is in multiple datasets + # or if single dataset (GH1988) + else: + combined_var = concat_vars( + vars, dim, positions, combine_attrs=combine_attrs + ) + # reindex if variable is not present in all datasets + if len(variable_index) < len(concat_index): + try: + fill = fill_value[name] + except (TypeError, KeyError): + fill = fill_value + combined_var = ( + DataArray(data=combined_var, name=name) + .assign_coords({dim: variable_index}) + .reindex({dim: concat_index}, fill_value=fill) + .variable + ) result_vars[name] = combined_var elif name in result_vars: diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index e0e0038cd89..186ad5b0865 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -50,8 +50,6 @@ def test_concat_compat() -> None: ValueError, match=r"coordinates in some datasets but not others" ): concat([ds1, ds2], dim="q") - with pytest.raises(ValueError, match=r"'q' is not present in all datasets"): - concat([ds2, ds1], dim="q") class TestConcatDataset: @@ -776,7 +774,7 @@ def test_concat_merge_single_non_dim_coord(): actual = concat([da1, da2], "x", coords=coords) assert_identical(actual, expected) - with pytest.raises(ValueError, match=r"'y' is not present in all datasets."): + with pytest.raises(ValueError, match=r"'y' not present in all datasets."): concat([da1, da2], dim="x", coords="all") da1 = DataArray([1, 2, 3], dims="x", coords={"x": [1, 2, 3], "y": 1}) @@ -784,7 +782,7 @@ def test_concat_merge_single_non_dim_coord(): da3 = DataArray([7, 8, 9], dims="x", coords={"x": [7, 8, 9], "y": 1}) for coords in ["different", "all"]: with pytest.raises(ValueError, match=r"'y' not present in all datasets"): - concat([da1, da2, da3], dim="x") + concat([da1, da2, da3], dim="x", coords=coords) def test_concat_preserve_coordinate_order() -> None: From 871d2dbd25dc7a5bb1e5bae7033eb15bc957a793 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 22 Dec 2022 14:43:09 +0000 Subject: [PATCH 02/39] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/core/concat.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index d709b296e28..6b83782f411 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -379,7 +379,7 @@ def process_subset_opt(opt, subset): elif opt == "all": concat_over.update( set().union( - *list((set(getattr(d, subset)) - set(d.dims) for d in datasets)) + *list(set(getattr(d, subset)) - set(d.dims) for d in datasets) ) ) elif opt == "minimal": @@ -573,7 +573,10 @@ def get_indexes(name): variables.append(ds.variables[name]) # add to variable index, needed for reindexing variable_index.extend( - [sum(concat_dim_lengths[:i]) + k for k in range(concat_dim_lengths[i])] + [ + sum(concat_dim_lengths[:i]) + k + for k in range(concat_dim_lengths[i]) + ] ) else: # raise if coordinate not in all datasets From 14bb7799e9c568dffd1d2bf2569ac8e9ab530a78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Wed, 4 Jan 2023 
14:44:42 +0100 Subject: [PATCH 03/39] FIX: use `Any` for type of `fill_value` as this seems consistent with other places --- xarray/core/concat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 6b83782f411..ffd3bd338a4 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -441,7 +441,7 @@ def _dataset_concat( coords: str | list[str], compat: CompatOptions, positions: Iterable[Iterable[int]] | None, - fill_value: object = dtypes.NA, + fill_value: Any = dtypes.NA, join: JoinOptions = "outer", combine_attrs: CombineAttrsOptions = "override", ) -> T_Dataset: From 6b497134dd93743701ea55106ee393993f7b362f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Wed, 4 Jan 2023 13:32:15 +0100 Subject: [PATCH 04/39] ENH: add tests --- xarray/tests/test_concat.py | 51 +++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 186ad5b0865..ea52b2a6da3 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -571,6 +571,57 @@ def test_concat_str_dtype(self, dtype, dim) -> None: assert np.issubdtype(actual.x2.dtype, dtype) + @pytest.mark.parametrize("dim", [True, False]) + @pytest.mark.parametrize("coord", [True, False]) + def test_concat_fill_missing_variables(self, dim, coord): + # create var names list with one missing value + def get_var_names(var_cnt=10, list_cnt=10): + orig = [f'd{i:02d}' for i in range(var_cnt)] + var_names = [] + for i in range(0, list_cnt): + l1 = orig.copy() + var_names.append(l1) + return var_names + + def create_ds(var_names, dim=False, coord=False, drop_idx=False): + out_ds = [] + ds = Dataset() + ds = ds.assign_coords({"x": np.arange(2)}) + ds = ds.assign_coords({"y": np.arange(3)}) + ds = ds.assign_coords({"z": np.arange(4)}) + for i, dsl in enumerate(var_names): + vlist = dsl.copy() + if drop_idx: + vlist.pop(drop_idx[i]) + foo_data = np.arange(48, dtype=float).reshape(2, 2, 3, 4) + dsi = ds.copy() + if coord: + dsi = ds.assign({"time": (["time"], [i * 2, i * 2 + 1])}) + for k in vlist: + dsi = dsi.assign({k: (["time", "x", "y", "z"], foo_data.copy())}) + if not dim: + dsi = dsi.isel(time=0) + out_ds.append(dsi) + return out_ds + var_names = get_var_names() + + import random + random.seed(42) + drop_idx = [random.randrange(len(vlist)) for vlist in var_names] + expected = concat(create_ds(var_names, dim=dim, coord=coord), dim="time", data_vars="all") + for i, idx in enumerate(drop_idx): + if dim: + expected[var_names[0][idx]][i * 2: i * 2 + 2] = np.nan + else: + expected[var_names[0][idx]][i] = np.nan + + concat_ds = create_ds(var_names, dim=dim, coord=coord, drop_idx=drop_idx) + actual = concat(concat_ds, dim="time", data_vars="all") + + for name in var_names[0]: + assert_equal(expected[name], actual[name]) + assert_equal(expected, actual) + class TestConcatDataArray: def test_concat(self) -> None: From 43862a18c119a801ca1b225e5c0b4638e13bc6f5 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 5 Jan 2023 07:49:38 +0000 Subject: [PATCH 05/39] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/tests/test_concat.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index ea52b2a6da3..31ff8841fd5 100644 --- a/xarray/tests/test_concat.py +++ 
b/xarray/tests/test_concat.py @@ -576,7 +576,7 @@ def test_concat_str_dtype(self, dtype, dim) -> None: def test_concat_fill_missing_variables(self, dim, coord): # create var names list with one missing value def get_var_names(var_cnt=10, list_cnt=10): - orig = [f'd{i:02d}' for i in range(var_cnt)] + orig = [f"d{i:02d}" for i in range(var_cnt)] var_names = [] for i in range(0, list_cnt): l1 = orig.copy() @@ -603,15 +603,19 @@ def create_ds(var_names, dim=False, coord=False, drop_idx=False): dsi = dsi.isel(time=0) out_ds.append(dsi) return out_ds + var_names = get_var_names() import random + random.seed(42) drop_idx = [random.randrange(len(vlist)) for vlist in var_names] - expected = concat(create_ds(var_names, dim=dim, coord=coord), dim="time", data_vars="all") + expected = concat( + create_ds(var_names, dim=dim, coord=coord), dim="time", data_vars="all" + ) for i, idx in enumerate(drop_idx): if dim: - expected[var_names[0][idx]][i * 2: i * 2 + 2] = np.nan + expected[var_names[0][idx]][i * 2 : i * 2 + 2] = np.nan else: expected[var_names[0][idx]][i] = np.nan From 5dd3e78dbe9fb873f72b4a93e83a340d0c3c680d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sun, 8 Jan 2023 14:33:21 +0100 Subject: [PATCH 06/39] typing Co-authored-by: Illviljan <14371165+Illviljan@users.noreply.github.com> --- xarray/tests/test_concat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 31ff8841fd5..473994eac67 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -573,7 +573,7 @@ def test_concat_str_dtype(self, dtype, dim) -> None: @pytest.mark.parametrize("dim", [True, False]) @pytest.mark.parametrize("coord", [True, False]) - def test_concat_fill_missing_variables(self, dim, coord): + def test_concat_fill_missing_variables(self, dim: bool, coord: bool) -> None: # create var names list with one missing value def get_var_names(var_cnt=10, list_cnt=10): orig = [f"d{i:02d}" for i in range(var_cnt)] From aed0f6e8209dcff16fd5922cea78e4d02040686a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sun, 8 Jan 2023 14:33:35 +0100 Subject: [PATCH 07/39] typing Co-authored-by: Illviljan <14371165+Illviljan@users.noreply.github.com> --- xarray/tests/test_concat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 473994eac67..54c9a9b0d63 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -575,7 +575,7 @@ def test_concat_str_dtype(self, dtype, dim) -> None: @pytest.mark.parametrize("coord", [True, False]) def test_concat_fill_missing_variables(self, dim: bool, coord: bool) -> None: # create var names list with one missing value - def get_var_names(var_cnt=10, list_cnt=10): + def get_var_names(var_cnt: int=10, list_cnt: int=10) -> list[str]: orig = [f"d{i:02d}" for i in range(var_cnt)] var_names = [] for i in range(0, list_cnt): From fa1aba54709553cdf66a6fd984c348e80db7a794 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sun, 8 Jan 2023 14:33:56 +0100 Subject: [PATCH 08/39] typing Co-authored-by: Illviljan <14371165+Illviljan@users.noreply.github.com> --- xarray/tests/test_concat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 54c9a9b0d63..16d57b4eae7 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -583,7 +583,7 @@ def get_var_names(var_cnt: 
int=10, list_cnt: int=10) -> list[str]: var_names.append(l1) return var_names - def create_ds(var_names, dim=False, coord=False, drop_idx=False): + def create_ds(var_names: list[str], dim: bool =False, coord: bool =False, drop_idx: list[int] |None =None) -> list[Dataset]: out_ds = [] ds = Dataset() ds = ds.assign_coords({"x": np.arange(2)}) From 308c0094c976503c00f1463c01de34eae81efae8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sun, 8 Jan 2023 14:34:21 +0100 Subject: [PATCH 09/39] use None instead of False Co-authored-by: Illviljan <14371165+Illviljan@users.noreply.github.com> --- xarray/tests/test_concat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 16d57b4eae7..ebd05853646 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -591,7 +591,7 @@ def create_ds(var_names: list[str], dim: bool =False, coord: bool =False, drop_i ds = ds.assign_coords({"z": np.arange(4)}) for i, dsl in enumerate(var_names): vlist = dsl.copy() - if drop_idx: + if drop_idx is not None: vlist.pop(drop_idx[i]) foo_data = np.arange(48, dtype=float).reshape(2, 2, 3, 4) dsi = ds.copy() From 58813e382e67b908bf8f93c6944ad5a54fd002e3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 8 Jan 2023 13:35:30 +0000 Subject: [PATCH 10/39] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/tests/test_concat.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index ebd05853646..7e5be610b79 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -575,7 +575,7 @@ def test_concat_str_dtype(self, dtype, dim) -> None: @pytest.mark.parametrize("coord", [True, False]) def test_concat_fill_missing_variables(self, dim: bool, coord: bool) -> None: # create var names list with one missing value - def get_var_names(var_cnt: int=10, list_cnt: int=10) -> list[str]: + def get_var_names(var_cnt: int = 10, list_cnt: int = 10) -> list[str]: orig = [f"d{i:02d}" for i in range(var_cnt)] var_names = [] for i in range(0, list_cnt): @@ -583,7 +583,12 @@ def get_var_names(var_cnt: int=10, list_cnt: int=10) -> list[str]: var_names.append(l1) return var_names - def create_ds(var_names: list[str], dim: bool =False, coord: bool =False, drop_idx: list[int] |None =None) -> list[Dataset]: + def create_ds( + var_names: list[str], + dim: bool = False, + coord: bool = False, + drop_idx: list[int] | None = None, + ) -> list[Dataset]: out_ds = [] ds = Dataset() ds = ds.assign_coords({"x": np.arange(2)}) From a248d5b66e5f926f0b993fec21831559faf64fca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sun, 8 Jan 2023 16:04:29 +0100 Subject: [PATCH 11/39] concatenate variable in any case if variable has concat_dim --- xarray/core/concat.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index ffd3bd338a4..f492269b916 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -610,9 +610,10 @@ def get_indexes(name): ) result_vars[k] = v else: - # if variable is only present in one dataset of multiple datasets, - # then do not concat - if len(variables) == 1 and len(datasets) > 1: + # do not concat if: + # 1. variable is only present in one dataset of multiple datasets and + # 2. 
dim is not a dimension of variable + if dim not in variables[0].dims and len(variables) == 1 and len(datasets) > 1: combined_var = variables[0] # only concat if variable is in multiple datasets # or if single dataset (GH1988) From 1a6ad5bff9e57379c38422b2df51d9c27c7dacf9 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 8 Jan 2023 15:05:34 +0000 Subject: [PATCH 12/39] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/core/concat.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index f492269b916..67023c800f1 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -613,7 +613,11 @@ def get_indexes(name): # do not concat if: # 1. variable is only present in one dataset of multiple datasets and # 2. dim is not a dimension of variable - if dim not in variables[0].dims and len(variables) == 1 and len(datasets) > 1: + if ( + dim not in variables[0].dims + and len(variables) == 1 + and len(datasets) > 1 + ): combined_var = variables[0] # only concat if variable is in multiple datasets # or if single dataset (GH1988) From b9d0d024b0b6866d178f00d96e1b40bf4c9c35e5 Mon Sep 17 00:00:00 2001 From: Scott Chamberlin Date: Sun, 8 Jan 2023 16:19:17 +0100 Subject: [PATCH 13/39] add tests from @scottcha #3545 --- xarray/tests/test_concat.py | 593 ++++++++++++++++++++++++++++++++++++ 1 file changed, 593 insertions(+) diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 7e5be610b79..b7e2bb943cc 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -1,5 +1,6 @@ from __future__ import annotations +import random from copy import deepcopy from typing import TYPE_CHECKING, Any @@ -23,6 +24,86 @@ from xarray.core.types import CombineAttrsOptions, JoinOptions +# helper method to create multiple tests datasets to concat +def create_concat_datasets(num_datasets=2, seed=None, include_day=True): + random.seed(seed) + result = [] + lat = np.random.randn(1, 4) + lon = np.random.randn(1, 4) + for i in range(num_datasets): + if include_day: + result.append( + Dataset( + data_vars={ + "temperature": (["x", "y", "day"], np.random.randn(1, 4, 2)), + "pressure": (["x", "y", "day"], np.random.randn(1, 4, 2)), + "humidity": (["x", "y", "day"], np.random.randn(1, 4, 2)), + "precipitation": (["x", "y", "day"], np.random.randn(1, 4, 2)), + "cloud cover": (["x", "y", "day"], np.random.randn(1, 4, 2)), + }, + coords={ + "lat": (["x", "y"], lat), + "lon": (["x", "y"], lon), + "day": ["day" + str(i * 2 + 1), "day" + str(i * 2 + 2)], + }, + ) + ) + else: + result.append( + Dataset( + data_vars={ + "temperature": (["x", "y"], np.random.randn(1, 4)), + "pressure": (["x", "y"], np.random.randn(1, 4)), + "humidity": (["x", "y"], np.random.randn(1, 4)), + "precipitation": (["x", "y"], np.random.randn(1, 4)), + "cloud cover": (["x", "y"], np.random.randn(1, 4)), + }, + coords={"lat": (["x", "y"], lat), "lon": (["x", "y"], lon)}, + ) + ) + + return result + + +# helper method to create multiple tests datasets to concat with specific types +def create_typed_datasets(num_datasets=2, seed=None): + random.seed(seed) + var_strings = ["a", "b", "c", "d", "e", "f", "g", "h"] + result = [] + lat = np.random.randn(1, 4) + lon = np.random.randn(1, 4) + for i in range(num_datasets): + result.append( + Dataset( + data_vars={ + "float": (["x", "y", "day"], np.random.randn(1, 4, 2)), + "float2": (["x", 
"y", "day"], np.random.randn(1, 4, 2)), + "string": ( + ["x", "y", "day"], + np.random.choice(var_strings, (1, 4, 2)), + ), + "int": (["x", "y", "day"], np.random.randint(0, 10, (1, 4, 2))), + "datetime64": ( + ["x", "y", "day"], + np.arange( + np.datetime64("2017-01-01"), np.datetime64("2017-01-09") + ).reshape(1, 4, 2), + ), + "timedelta64": ( + ["x", "y", "day"], + np.reshape([pd.Timedelta(days=i) for i in range(8)], [1, 4, 2]), + ), + }, + coords={ + "lat": (["x", "y"], lat), + "lon": (["x", "y"], lon), + "day": ["day" + str(i * 2 + 1), "day" + str(i * 2 + 2)], + }, + ) + ) + return result + + def test_concat_compat() -> None: ds1 = Dataset( { @@ -52,6 +133,518 @@ def test_concat_compat() -> None: concat([ds1, ds2], dim="q") +def test_concat_missing_var(): + datasets = create_concat_datasets(2, 123) + vars_to_drop = ["humidity", "precipitation", "cloud cover"] + datasets[0] = datasets[0].drop_vars(vars_to_drop) + datasets[1] = datasets[1].drop_vars(vars_to_drop + ["pressure"]) + + temperature_result = np.concatenate( + (datasets[0].temperature.values, datasets[1].temperature.values), axis=2 + ) + pressure_result = np.concatenate( + (datasets[0].pressure.values, np.full([1, 4, 2], np.nan)), axis=2 + ) + ds_result = Dataset( + data_vars={ + "temperature": (["x", "y", "day"], temperature_result), + "pressure": (["x", "y", "day"], pressure_result), + }, + coords={ + "lat": (["x", "y"], datasets[0].lat.values), + "lon": (["x", "y"], datasets[0].lon.values), + "day": ["day1", "day2", "day3", "day4"], + }, + ) + result = concat(datasets, dim="day") + + r1 = list(result.data_vars.keys()) + r2 = list(ds_result.data_vars.keys()) + assert r1 == r2 # check the variables orders are the same + + assert_equal(result, ds_result) + + +def test_concat_missing_multiple_consecutive_var(): + datasets = create_concat_datasets(3, 123) + vars_to_drop = ["pressure", "humidity"] + datasets[0] = datasets[0].drop_vars(vars_to_drop) + datasets[1] = datasets[1].drop_vars(vars_to_drop) + + temperature_result = np.concatenate( + ( + datasets[0].temperature.values, + datasets[1].temperature.values, + datasets[2].temperature.values, + ), + axis=2, + ) + pressure_result = np.concatenate( + ( + np.full([1, 4, 2], np.nan), + np.full([1, 4, 2], np.nan), + datasets[2].pressure.values, + ), + axis=2, + ) + humidity_result = np.concatenate( + ( + np.full([1, 4, 2], np.nan), + np.full([1, 4, 2], np.nan), + datasets[2].humidity.values, + ), + axis=2, + ) + precipitation_result = np.concatenate( + ( + datasets[0].precipitation.values, + datasets[1].precipitation.values, + datasets[2].precipitation.values, + ), + axis=2, + ) + cloudcover_result = np.concatenate( + ( + datasets[0]["cloud cover"].values, + datasets[1]["cloud cover"].values, + datasets[2]["cloud cover"].values, + ), + axis=2, + ) + + ds_result = Dataset( + data_vars={ + "temperature": (["x", "y", "day"], temperature_result), + "precipitation": (["x", "y", "day"], precipitation_result), + "cloud cover": (["x", "y", "day"], cloudcover_result), + "humidity": (["x", "y", "day"], humidity_result), + "pressure": (["x", "y", "day"], pressure_result), + }, + coords={ + "lat": (["x", "y"], datasets[0].lat.values), + "lon": (["x", "y"], datasets[0].lon.values), + "day": ["day1", "day2", "day3", "day4", "day5", "day6"], + }, + ) + # assign here, as adding above gave switched pressure/humidity-order every once in a while + ds_result = ds_result.assign({"humidity": (["x", "y", "day"], humidity_result)}) + ds_result = ds_result.assign({"pressure": (["x", "y", "day"], 
pressure_result)}) + result = concat(datasets, dim="day") + r1 = [var for var in result.data_vars] + r2 = [var for var in ds_result.data_vars] + assert r1 == r2 # check the variables orders are the same + assert_equal(result, ds_result) + + +def test_concat_all_empty(): + ds1 = Dataset() + ds2 = Dataset() + result = concat([ds1, ds2], dim="new_dim") + + assert_equal(result, Dataset()) + + +def test_concat_second_empty(): + ds1 = Dataset(data_vars={"a": ("y", [0.1])}, coords={"x": 0.1}) + ds2 = Dataset(coords={"x": 0.1}) + + ds_result = Dataset(data_vars={"a": ("y", [0.1, np.nan])}, coords={"x": 0.1}) + result = concat([ds1, ds2], dim="y") + + assert_equal(result, ds_result) + + +def test_multiple_missing_variables(): + datasets = create_concat_datasets(2, 123) + vars_to_drop = ["pressure", "cloud cover"] + datasets[1] = datasets[1].drop_vars(vars_to_drop) + + temperature_result = np.concatenate( + (datasets[0].temperature.values, datasets[1].temperature.values), axis=2 + ) + pressure_result = np.concatenate( + (datasets[0].pressure.values, np.full([1, 4, 2], np.nan)), axis=2 + ) + humidity_result = np.concatenate( + (datasets[0].humidity.values, datasets[1].humidity.values), axis=2 + ) + precipitation_result = np.concatenate( + (datasets[0].precipitation.values, datasets[1].precipitation.values), axis=2 + ) + cloudcover_result = np.concatenate( + (datasets[0]["cloud cover"].values, np.full([1, 4, 2], np.nan)), axis=2 + ) + ds_result = Dataset( + data_vars={ + "temperature": (["x", "y", "day"], temperature_result), + "pressure": (["x", "y", "day"], pressure_result), + "humidity": (["x", "y", "day"], humidity_result), + "precipitation": (["x", "y", "day"], precipitation_result), + "cloud cover": (["x", "y", "day"], cloudcover_result), + }, + coords={ + "lat": (["x", "y"], datasets[0].lat.values), + "lon": (["x", "y"], datasets[0].lon.values), + "day": ["day1", "day2", "day3", "day4"], + }, + ) + result = concat(datasets, dim="day") + + r1 = list(result.data_vars.keys()) + r2 = list(ds_result.data_vars.keys()) + assert r1 == r2 # check the variables orders are the same + + assert_equal(result, ds_result) + + +@pytest.mark.xfail(strict=True) +def test_concat_multiple_datasets_missing_vars_and_new_dim(): + vars_to_drop = [ + "temperature", + "pressure", + "humidity", + "precipitation", + "cloud cover", + ] + datasets = create_concat_datasets(len(vars_to_drop), 123, include_day=False) + # set up the test data + datasets = [datasets[i].drop_vars(vars_to_drop[i]) for i in range(len(datasets))] + + # set up the validation data + # the below code just drops one var per dataset depending on the location of the + # dataset in the list and allows us to quickly catch any boundaries cases across + # the three equivalence classes of beginning, middle and end of the concat list + result_vars = dict.fromkeys(vars_to_drop) + for i in range(len(vars_to_drop)): + for d in range(len(datasets)): + if d != i: + if result_vars[vars_to_drop[i]] is None: + result_vars[vars_to_drop[i]] = datasets[d][vars_to_drop[i]].values + else: + result_vars[vars_to_drop[i]] = np.concatenate( + ( + result_vars[vars_to_drop[i]], + datasets[d][vars_to_drop[i]].values, + ), + axis=1, + ) + else: + if result_vars[vars_to_drop[i]] is None: + result_vars[vars_to_drop[i]] = np.full([1, 4], np.nan) + else: + result_vars[vars_to_drop[i]] = np.concatenate( + (result_vars[vars_to_drop[i]], np.full([1, 4], np.nan)), + axis=1, + ) + # TODO: this test still has two unexpected errors: + + # 1: concat throws a mergeerror expecting the temperature 
values to be the same, this doesn't seem to be correct in this case + # as we are concating on new dims + # 2: if the values are the same for a variable (working around #1) then it will likely not correct add the new dim to the first variable + # the resulting set + + ds_result = Dataset( + data_vars={ + # pressure will be first in this since the first dataset is missing this var + # and there isn't a good way to determine that this should be first + # this also means temperature will be last as the first data vars will + # determine the order for all that exist in that dataset + "pressure": (["x", "y", "day"], result_vars["pressure"]), + "humidity": (["x", "y", "day"], result_vars["humidity"]), + "precipitation": (["x", "y", "day"], result_vars["precipitation"]), + "cloud cover": (["x", "y", "day"], result_vars["cloud cover"]), + "temperature": (["x", "y", "day"], result_vars["temperature"]), + }, + coords={ + "lat": (["x", "y"], datasets[0].lat.values), + "lon": (["x", "y"], datasets[0].lon.values), + # "day": ["day" + str(d + 1) for d in range(2 * len(vars_to_drop))], + }, + ) + + result = concat(datasets, dim="day") + r1 = list(result.data_vars.keys()) + r2 = list(ds_result.data_vars.keys()) + assert r1 == r2 # check the variables orders are the same + + assert_equal(result, ds_result) + + +def test_multiple_datasets_with_missing_variables(): + vars_to_drop = [ + "temperature", + "pressure", + "humidity", + "precipitation", + "cloud cover", + ] + datasets = create_concat_datasets(len(vars_to_drop), 123) + # set up the test data + datasets = [datasets[i].drop_vars(vars_to_drop[i]) for i in range(len(datasets))] + + # set up the validation data + # the below code just drops one var per dataset depending on the location of the + # dataset in the list and allows us to quickly catch any boundaries cases across + # the three equivalence classes of beginning, middle and end of the concat list + result_vars = dict.fromkeys(vars_to_drop) + for i in range(len(vars_to_drop)): + for d in range(len(datasets)): + if d != i: + if result_vars[vars_to_drop[i]] is None: + result_vars[vars_to_drop[i]] = datasets[d][vars_to_drop[i]].values + else: + result_vars[vars_to_drop[i]] = np.concatenate( + ( + result_vars[vars_to_drop[i]], + datasets[d][vars_to_drop[i]].values, + ), + axis=2, + ) + else: + if result_vars[vars_to_drop[i]] is None: + result_vars[vars_to_drop[i]] = np.full([1, 4, 2], np.nan) + else: + result_vars[vars_to_drop[i]] = np.concatenate( + (result_vars[vars_to_drop[i]], np.full([1, 4, 2], np.nan)), + axis=2, + ) + + ds_result = Dataset( + data_vars={ + # pressure will be first in this since the first dataset is missing this var + # and there isn't a good way to determine that this should be first + # this also means temperature will be last as the first data vars will + # determine the order for all that exist in that dataset + "pressure": (["x", "y", "day"], result_vars["pressure"]), + "humidity": (["x", "y", "day"], result_vars["humidity"]), + "precipitation": (["x", "y", "day"], result_vars["precipitation"]), + "cloud cover": (["x", "y", "day"], result_vars["cloud cover"]), + "temperature": (["x", "y", "day"], result_vars["temperature"]), + }, + coords={ + "lat": (["x", "y"], datasets[0].lat.values), + "lon": (["x", "y"], datasets[0].lon.values), + "day": ["day" + str(d + 1) for d in range(2 * len(vars_to_drop))], + }, + ) + result = concat(datasets, dim="day") + + r1 = list(result.data_vars.keys()) + r2 = list(ds_result.data_vars.keys()) + assert r1 == r2 # check the variables orders 
are the same + + assert_equal(result, ds_result) + + +def test_multiple_datasets_with_multiple_missing_variables(): + vars_to_drop_in_first = ["temperature", "pressure"] + vars_to_drop_in_second = ["humidity", "precipitation", "cloud cover"] + datasets = create_concat_datasets(2, 123) + # set up the test data + datasets[0] = datasets[0].drop_vars(vars_to_drop_in_first) + datasets[1] = datasets[1].drop_vars(vars_to_drop_in_second) + + temperature_result = np.concatenate( + (np.full([1, 4, 2], np.nan), datasets[1].temperature.values), axis=2 + ) + pressure_result = np.concatenate( + (np.full([1, 4, 2], np.nan), datasets[1].pressure.values), axis=2 + ) + humidity_result = np.concatenate( + (datasets[0].humidity.values, np.full([1, 4, 2], np.nan)), axis=2 + ) + precipitation_result = np.concatenate( + (datasets[0].precipitation.values, np.full([1, 4, 2], np.nan)), axis=2 + ) + cloudcover_result = np.concatenate( + (datasets[0]["cloud cover"].values, np.full([1, 4, 2], np.nan)), axis=2 + ) + ds_result = Dataset( + data_vars={ + "humidity": (["x", "y", "day"], humidity_result), + "precipitation": (["x", "y", "day"], precipitation_result), + "cloud cover": (["x", "y", "day"], cloudcover_result), + # these two are at the end of the expected as they are missing from the first + # dataset in the concat list + "temperature": (["x", "y", "day"], temperature_result), + "pressure": (["x", "y", "day"], pressure_result), + }, + coords={ + "lat": (["x", "y"], datasets[0].lat.values), + "lon": (["x", "y"], datasets[0].lon.values), + "day": ["day1", "day2", "day3", "day4"], + }, + ) + result = concat(datasets, dim="day") + + r1 = list(result.data_vars.keys()) + r2 = list(ds_result.data_vars.keys()) + assert r1 == r2 # check the variables orders are the same + + assert_equal(result, ds_result) + + +def test_type_of_missing_fill(): + datasets = create_typed_datasets(2, 123) + + vars = ["float", "float2", "string", "int", "datetime64", "timedelta64"] + + # set up the test data + datasets[1] = datasets[1].drop_vars(vars[1:]) + + float_result = np.concatenate( + (datasets[0].float.values, datasets[1].float.values), axis=2 + ) + float2_result = np.concatenate( + (datasets[0].float2.values, np.full([1, 4, 2], np.nan)), axis=2 + ) + # to correctly create the expected dataset we need to ensure we promote the string array to + # object type before filling as it will be promoted to that in the concat case. 
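
The dtype promotion that this comment describes comes straight from NumPy's casting rules: NaN fills force integers to float64 and strings to object, while datetime64/timedelta64 keep their dtype and take NaT. A standalone illustration (plain NumPy, not part of the patch):

import numpy as np

print(np.concatenate([np.arange(4), np.full(4, np.nan)]).dtype)  # float64: ints promote

strings = np.array(["a", "b"], dtype=object)
print(np.concatenate([strings, np.full(2, np.nan)]).dtype)  # object: NaN mixes with str

dates = np.array(["2017-01-01"], dtype="datetime64[ns]")
nat = np.full(1, np.datetime64("NaT"), dtype="datetime64[ns]")
print(np.concatenate([dates, nat]).dtype)  # datetime64[ns]: missing values become NaT
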
+ # this matches the behavior of pandas + string_values = datasets[0].string.values + string_values = string_values.astype(object) + string_result = np.concatenate((string_values, np.full([1, 4, 2], np.nan)), axis=2) + datetime_result = np.concatenate( + (datasets[0].datetime64.values, np.full([1, 4, 2], np.datetime64("NaT"))), + axis=2, + ) + timedelta_result = np.concatenate( + (datasets[0].timedelta64.values, np.full([1, 4, 2], np.timedelta64("NaT"))), + axis=2, + ) + # string_result = string_result.astype(object) + int_result = np.concatenate( + (datasets[0].int.values, np.full([1, 4, 2], np.nan)), axis=2 + ) + ds_result = Dataset( + data_vars={ + "float": (["x", "y", "day"], float_result), + "float2": (["x", "y", "day"], float2_result), + "string": (["x", "y", "day"], string_result), + "int": (["x", "y", "day"], int_result), + "datetime64": (["x", "y", "day"], datetime_result), + "timedelta64": (["x", "y", "day"], timedelta_result), + }, + coords={ + "lat": (["x", "y"], datasets[0].lat.values), + "lon": (["x", "y"], datasets[0].lon.values), + "day": ["day1", "day2", "day3", "day4"], + }, + ) + result = concat(datasets, dim="day", fill_value=dtypes.NA) + + assert_equal(result, ds_result) + + # test in the reverse order + float_result_rev = np.concatenate( + (datasets[1].float.values, datasets[0].float.values), axis=2 + ) + float2_result_rev = np.concatenate( + (np.full([1, 4, 2], np.nan), datasets[0].float2.values), axis=2 + ) + string_result_rev = np.concatenate( + (np.full([1, 4, 2], np.nan), string_values), axis=2 + ) + datetime_result_rev = np.concatenate( + (np.full([1, 4, 2], np.datetime64("NaT")), datasets[0].datetime64.values), + axis=2, + ) + timedelta_result_rev = np.concatenate( + (np.full([1, 4, 2], np.timedelta64("NaT")), datasets[0].timedelta64.values), + axis=2, + ) + int_result_rev = np.concatenate( + (np.full([1, 4, 2], np.nan), datasets[0].int.values), axis=2 + ) + ds_result_rev = Dataset( + data_vars={ + "float": (["x", "y", "day"], float_result_rev), + "float2": (["x", "y", "day"], float2_result_rev), + "string": (["x", "y", "day"], string_result_rev), + "int": (["x", "y", "day"], int_result_rev), + "datetime64": (["x", "y", "day"], datetime_result_rev), + "timedelta64": (["x", "y", "day"], timedelta_result_rev), + }, + coords={ + "lat": (["x", "y"], datasets[0].lat.values), + "lon": (["x", "y"], datasets[0].lon.values), + "day": ["day3", "day4", "day1", "day2"], + }, + ) + result_rev = concat(datasets[::-1], dim="day", fill_value=dtypes.NA) + + assert_equal(result_rev, ds_result_rev) + + +def test_order_when_filling_missing(): + vars_to_drop_in_first = [] + # drop middle + vars_to_drop_in_second = ["humidity"] + datasets = create_concat_datasets(2, 123) + # set up the test data + datasets[0] = datasets[0].drop_vars(vars_to_drop_in_first) + datasets[1] = datasets[1].drop_vars(vars_to_drop_in_second) + + temperature_result = np.concatenate( + (datasets[0].temperature.values, datasets[1].temperature.values), axis=2 + ) + pressure_result = np.concatenate( + (datasets[0].pressure.values, datasets[1].pressure.values), axis=2 + ) + humidity_result = np.concatenate( + (datasets[0].humidity.values, np.full([1, 4, 2], np.nan)), axis=2 + ) + precipitation_result = np.concatenate( + (datasets[0].precipitation.values, datasets[1].precipitation.values), axis=2 + ) + cloudcover_result = np.concatenate( + (datasets[0]["cloud cover"].values, datasets[1]["cloud cover"].values), axis=2 + ) + ds_result = Dataset( + data_vars={ + "temperature": (["x", "y", "day"], temperature_result), 
+ "pressure": (["x", "y", "day"], pressure_result), + "precipitation": (["x", "y", "day"], precipitation_result), + "cloud cover": (["x", "y", "day"], cloudcover_result), + "humidity": (["x", "y", "day"], humidity_result), + }, + coords={ + "lat": (["x", "y"], datasets[0].lat.values), + "lon": (["x", "y"], datasets[0].lon.values), + "day": ["day1", "day2", "day3", "day4"], + }, + ) + result = concat(datasets, dim="day") + + assert_equal(result, ds_result) + + result_keys = [ + "temperature", + "pressure", + "humidity", + "precipitation", + "cloud cover", + ] + result_index = 0 + for k in result.data_vars.keys(): + assert k == result_keys[result_index] + result_index += 1 + + result_keys_rev = [ + "temperature", + "pressure", + "precipitation", + "cloud cover", + "humidity", + ] + # test order when concat in reversed order + rev_result = concat(datasets[::-1], dim="day") + result_index = 0 + for k in rev_result.data_vars.keys(): + assert k == result_keys_rev[result_index] + result_index += 1 + + class TestConcatDataset: @pytest.fixture def data(self) -> Dataset: From 2db43b9a76086b08d9dfd0852e40e49b89dc8661 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sun, 8 Jan 2023 16:44:29 +0100 Subject: [PATCH 14/39] typing --- xarray/tests/test_concat.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index b7e2bb943cc..9a1c4b02f08 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -25,7 +25,9 @@ # helper method to create multiple tests datasets to concat -def create_concat_datasets(num_datasets=2, seed=None, include_day=True): +def create_concat_datasets( + num_datasets: int = 2, seed: [int | None] = None, include_day: bool = True +) -> list[Dataset]: random.seed(seed) result = [] lat = np.random.randn(1, 4) @@ -66,7 +68,9 @@ def create_concat_datasets(num_datasets=2, seed=None, include_day=True): # helper method to create multiple tests datasets to concat with specific types -def create_typed_datasets(num_datasets=2, seed=None): +def create_typed_datasets( + num_datasets: int = 2, seed: [int | None] = None +) -> list[Dataset]: random.seed(seed) var_strings = ["a", "b", "c", "d", "e", "f", "g", "h"] result = [] @@ -133,7 +137,7 @@ def test_concat_compat() -> None: concat([ds1, ds2], dim="q") -def test_concat_missing_var(): +def test_concat_missing_var() -> None: datasets = create_concat_datasets(2, 123) vars_to_drop = ["humidity", "precipitation", "cloud cover"] datasets[0] = datasets[0].drop_vars(vars_to_drop) @@ -165,7 +169,7 @@ def test_concat_missing_var(): assert_equal(result, ds_result) -def test_concat_missing_multiple_consecutive_var(): +def test_concat_missing_multiple_consecutive_var() -> None: datasets = create_concat_datasets(3, 123) vars_to_drop = ["pressure", "humidity"] datasets[0] = datasets[0].drop_vars(vars_to_drop) @@ -236,7 +240,7 @@ def test_concat_missing_multiple_consecutive_var(): assert_equal(result, ds_result) -def test_concat_all_empty(): +def test_concat_all_empty() -> None: ds1 = Dataset() ds2 = Dataset() result = concat([ds1, ds2], dim="new_dim") @@ -244,7 +248,7 @@ def test_concat_all_empty(): assert_equal(result, Dataset()) -def test_concat_second_empty(): +def test_concat_second_empty() -> None: ds1 = Dataset(data_vars={"a": ("y", [0.1])}, coords={"x": 0.1}) ds2 = Dataset(coords={"x": 0.1}) @@ -254,7 +258,7 @@ def test_concat_second_empty(): assert_equal(result, ds_result) -def test_multiple_missing_variables(): 
+def test_multiple_missing_variables() -> None: datasets = create_concat_datasets(2, 123) vars_to_drop = ["pressure", "cloud cover"] datasets[1] = datasets[1].drop_vars(vars_to_drop) @@ -298,7 +302,7 @@ def test_multiple_missing_variables(): @pytest.mark.xfail(strict=True) -def test_concat_multiple_datasets_missing_vars_and_new_dim(): +def test_concat_multiple_datasets_missing_vars_and_new_dim() -> None: vars_to_drop = [ "temperature", "pressure", @@ -370,7 +374,7 @@ def test_concat_multiple_datasets_missing_vars_and_new_dim(): assert_equal(result, ds_result) -def test_multiple_datasets_with_missing_variables(): +def test_multiple_datasets_with_missing_variables() -> None: vars_to_drop = [ "temperature", "pressure", @@ -436,7 +440,7 @@ def test_multiple_datasets_with_missing_variables(): assert_equal(result, ds_result) -def test_multiple_datasets_with_multiple_missing_variables(): +def test_multiple_datasets_with_multiple_missing_variables() -> None: vars_to_drop_in_first = ["temperature", "pressure"] vars_to_drop_in_second = ["humidity", "precipitation", "cloud cover"] datasets = create_concat_datasets(2, 123) @@ -484,7 +488,7 @@ def test_multiple_datasets_with_multiple_missing_variables(): assert_equal(result, ds_result) -def test_type_of_missing_fill(): +def test_type_of_missing_fill() -> None: datasets = create_typed_datasets(2, 123) vars = ["float", "float2", "string", "int", "datetime64", "timedelta64"] @@ -576,7 +580,7 @@ def test_type_of_missing_fill(): assert_equal(result_rev, ds_result_rev) -def test_order_when_filling_missing(): +def test_order_when_filling_missing() -> None: vars_to_drop_in_first = [] # drop middle vars_to_drop_in_second = ["humidity"] From e54d423b4f9f823906a94d0d5ea8d01decb32166 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sun, 8 Jan 2023 17:20:48 +0100 Subject: [PATCH 15/39] fix typing --- xarray/tests/test_concat.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 9a1c4b02f08..db15486b971 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -26,7 +26,7 @@ # helper method to create multiple tests datasets to concat def create_concat_datasets( - num_datasets: int = 2, seed: [int | None] = None, include_day: bool = True + num_datasets: int = 2, seed: int | None = None, include_day: bool = True ) -> list[Dataset]: random.seed(seed) result = [] @@ -69,7 +69,7 @@ def create_concat_datasets( # helper method to create multiple tests datasets to concat with specific types def create_typed_datasets( - num_datasets: int = 2, seed: [int | None] = None + num_datasets: int = 2, seed: int | None = None ) -> list[Dataset]: random.seed(seed) var_strings = ["a", "b", "c", "d", "e", "f", "g", "h"] @@ -1172,7 +1172,7 @@ def test_concat_str_dtype(self, dtype, dim) -> None: @pytest.mark.parametrize("coord", [True, False]) def test_concat_fill_missing_variables(self, dim: bool, coord: bool) -> None: # create var names list with one missing value - def get_var_names(var_cnt: int = 10, list_cnt: int = 10) -> list[str]: + def get_var_names(var_cnt: int = 10, list_cnt: int = 10) -> list[list[str]]: orig = [f"d{i:02d}" for i in range(var_cnt)] var_names = [] for i in range(0, list_cnt): @@ -1181,7 +1181,7 @@ def get_var_names(var_cnt: int = 10, list_cnt: int = 10) -> list[str]: return var_names def create_ds( - var_names: list[str], + var_names: list[list[str]], dim: bool = False, coord: bool = False, drop_idx: list[int] | None = None, From 
7fe0ce2f1957cd19abcb3858eb230e8ee988f5ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 9 Jan 2023 09:25:26 +0100 Subject: [PATCH 16/39] fix tests with, finalize typing --- xarray/tests/test_concat.py | 205 ++++++++++++++++++++---------------- 1 file changed, 113 insertions(+), 92 deletions(-) diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index db15486b971..260ed7357ea 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -2,7 +2,7 @@ import random from copy import deepcopy -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Callable import numpy as np import pandas as pd @@ -230,13 +230,12 @@ def test_concat_missing_multiple_consecutive_var() -> None: "day": ["day1", "day2", "day3", "day4", "day5", "day6"], }, ) - # assign here, as adding above gave switched pressure/humidity-order every once in a while - ds_result = ds_result.assign({"humidity": (["x", "y", "day"], humidity_result)}) - ds_result = ds_result.assign({"pressure": (["x", "y", "day"], pressure_result)}) result = concat(datasets, dim="day") r1 = [var for var in result.data_vars] r2 = [var for var in ds_result.data_vars] - assert r1 == r2 # check the variables orders are the same + # check the variables orders are the same for the first three variables + assert r1[:3] == r2[:3] + assert set(r1[3:]) == set(r2[3:]) # just check availability for the remaining vars assert_equal(result, ds_result) @@ -301,8 +300,8 @@ def test_multiple_missing_variables() -> None: assert_equal(result, ds_result) -@pytest.mark.xfail(strict=True) -def test_concat_multiple_datasets_missing_vars_and_new_dim() -> None: +@pytest.mark.parametrize("include_day", [True, False]) +def test_concat_multiple_datasets_missing_vars_and_new_dim(include_day: bool) -> None: vars_to_drop = [ "temperature", "pressure", @@ -310,47 +309,51 @@ def test_concat_multiple_datasets_missing_vars_and_new_dim() -> None: "precipitation", "cloud cover", ] - datasets = create_concat_datasets(len(vars_to_drop), 123, include_day=False) + + datasets = create_concat_datasets(len(vars_to_drop), 123, include_day=include_day) # set up the test data datasets = [datasets[i].drop_vars(vars_to_drop[i]) for i in range(len(datasets))] + dim_size = 2 if include_day else 1 + # set up the validation data # the below code just drops one var per dataset depending on the location of the # dataset in the list and allows us to quickly catch any boundaries cases across # the three equivalence classes of beginning, middle and end of the concat list - result_vars = dict.fromkeys(vars_to_drop) + result_vars = dict.fromkeys(vars_to_drop, np.array([])) for i in range(len(vars_to_drop)): for d in range(len(datasets)): if d != i: - if result_vars[vars_to_drop[i]] is None: - result_vars[vars_to_drop[i]] = datasets[d][vars_to_drop[i]].values + if include_day: + ds_vals = datasets[d][vars_to_drop[i]].values + else: + ds_vals = datasets[d][vars_to_drop[i]].values[..., None] + if not result_vars[vars_to_drop[i]].size: + result_vars[vars_to_drop[i]] = ds_vals else: result_vars[vars_to_drop[i]] = np.concatenate( ( result_vars[vars_to_drop[i]], - datasets[d][vars_to_drop[i]].values, + ds_vals, ), - axis=1, + axis=-1, ) else: - if result_vars[vars_to_drop[i]] is None: - result_vars[vars_to_drop[i]] = np.full([1, 4], np.nan) + if not result_vars[vars_to_drop[i]].size: + result_vars[vars_to_drop[i]] = np.full([1, 4, dim_size], np.nan) else: result_vars[vars_to_drop[i]] = np.concatenate( - (result_vars[vars_to_drop[i]], 
np.full([1, 4], np.nan)), - axis=1, + ( + result_vars[vars_to_drop[i]], + np.full([1, 4, dim_size], np.nan), + ), + axis=-1, ) - # TODO: this test still has two unexpected errors: - - # 1: concat throws a mergeerror expecting the temperature values to be the same, this doesn't seem to be correct in this case - # as we are concating on new dims - # 2: if the values are the same for a variable (working around #1) then it will likely not correct add the new dim to the first variable - # the resulting set ds_result = Dataset( data_vars={ - # pressure will be first in this since the first dataset is missing this var - # and there isn't a good way to determine that this should be first + # pressure will be first here since it is first in first dataset and + # there isn't a good way to determine that temperature should be first # this also means temperature will be last as the first data vars will # determine the order for all that exist in that dataset "pressure": (["x", "y", "day"], result_vars["pressure"]), @@ -362,11 +365,17 @@ def test_concat_multiple_datasets_missing_vars_and_new_dim() -> None: coords={ "lat": (["x", "y"], datasets[0].lat.values), "lon": (["x", "y"], datasets[0].lon.values), - # "day": ["day" + str(d + 1) for d in range(2 * len(vars_to_drop))], }, ) + if include_day: + ds_result = ds_result.assign_coords( + {"day": ["day" + str(d + 1) for d in range(2 * len(vars_to_drop))]} + ) + else: + ds_result = ds_result.transpose("day", "x", "y") result = concat(datasets, dim="day") + r1 = list(result.data_vars.keys()) r2 = list(ds_result.data_vars.keys()) assert r1 == r2 # check the variables orders are the same @@ -390,11 +399,11 @@ def test_multiple_datasets_with_missing_variables() -> None: # the below code just drops one var per dataset depending on the location of the # dataset in the list and allows us to quickly catch any boundaries cases across # the three equivalence classes of beginning, middle and end of the concat list - result_vars = dict.fromkeys(vars_to_drop) + result_vars = dict.fromkeys(vars_to_drop, np.array([])) for i in range(len(vars_to_drop)): for d in range(len(datasets)): if d != i: - if result_vars[vars_to_drop[i]] is None: + if not result_vars[vars_to_drop[i]].size: result_vars[vars_to_drop[i]] = datasets[d][vars_to_drop[i]].values else: result_vars[vars_to_drop[i]] = np.concatenate( @@ -405,7 +414,7 @@ def test_multiple_datasets_with_missing_variables() -> None: axis=2, ) else: - if result_vars[vars_to_drop[i]] is None: + if not result_vars[vars_to_drop[i]].size: result_vars[vars_to_drop[i]] = np.full([1, 4, 2], np.nan) else: result_vars[vars_to_drop[i]] = np.concatenate( @@ -483,8 +492,9 @@ def test_multiple_datasets_with_multiple_missing_variables() -> None: r1 = list(result.data_vars.keys()) r2 = list(ds_result.data_vars.keys()) - assert r1 == r2 # check the variables orders are the same - + # check the variables orders are the same for the first three variables + assert r1[:3] == r2[:3] + assert set(r1[3:]) == set(r2[3:]) # just check availability for the remaining vars assert_equal(result, ds_result) @@ -581,7 +591,7 @@ def test_type_of_missing_fill() -> None: def test_order_when_filling_missing() -> None: - vars_to_drop_in_first = [] + vars_to_drop_in_first: list[str] = [] # drop middle vars_to_drop_in_second = ["humidity"] datasets = create_concat_datasets(2, 123) @@ -649,6 +659,77 @@ def test_order_when_filling_missing() -> None: result_index += 1 +@pytest.fixture +def concat_var_names() -> Callable: + # create var names list with one missing value 
+ def get_varnames(var_cnt: int = 10, list_cnt: int = 10) -> list[list[str]]: + orig = [f"d{i:02d}" for i in range(var_cnt)] + var_names = [] + for i in range(0, list_cnt): + l1 = orig.copy() + var_names.append(l1) + return var_names + + return get_varnames + + +@pytest.fixture +def create_concat_ds() -> Callable: + def create_ds( + var_names: list[list[str]], + dim: bool = False, + coord: bool = False, + drop_idx: list[int] | None = None, + ) -> list[Dataset]: + out_ds = [] + ds = Dataset() + ds = ds.assign_coords({"x": np.arange(2)}) + ds = ds.assign_coords({"y": np.arange(3)}) + ds = ds.assign_coords({"z": np.arange(4)}) + for i, dsl in enumerate(var_names): + vlist = dsl.copy() + if drop_idx is not None: + vlist.pop(drop_idx[i]) + foo_data = np.arange(48, dtype=float).reshape(2, 2, 3, 4) + dsi = ds.copy() + if coord: + dsi = ds.assign({"time": (["time"], [i * 2, i * 2 + 1])}) + for k in vlist: + dsi = dsi.assign({k: (["time", "x", "y", "z"], foo_data.copy())}) + if not dim: + dsi = dsi.isel(time=0) + out_ds.append(dsi) + return out_ds + + return create_ds + + +@pytest.mark.parametrize("dim", [True, False]) +@pytest.mark.parametrize("coord", [True, False]) +def test_concat_fill_missing_variables( + concat_var_names, create_concat_ds, dim: bool, coord: bool +) -> None: + var_names = concat_var_names() + + random.seed(42) + drop_idx = [random.randrange(len(vlist)) for vlist in var_names] + expected = concat( + create_concat_ds(var_names, dim=dim, coord=coord), dim="time", data_vars="all" + ) + for i, idx in enumerate(drop_idx): + if dim: + expected[var_names[0][idx]][i * 2 : i * 2 + 2] = np.nan + else: + expected[var_names[0][idx]][i] = np.nan + + concat_ds = create_concat_ds(var_names, dim=dim, coord=coord, drop_idx=drop_idx) + actual = concat(concat_ds, dim="time", data_vars="all") + + for name in var_names[0]: + assert_equal(expected[name], actual[name]) + assert_equal(expected, actual) + + class TestConcatDataset: @pytest.fixture def data(self) -> Dataset: @@ -1168,66 +1249,6 @@ def test_concat_str_dtype(self, dtype, dim) -> None: assert np.issubdtype(actual.x2.dtype, dtype) - @pytest.mark.parametrize("dim", [True, False]) - @pytest.mark.parametrize("coord", [True, False]) - def test_concat_fill_missing_variables(self, dim: bool, coord: bool) -> None: - # create var names list with one missing value - def get_var_names(var_cnt: int = 10, list_cnt: int = 10) -> list[list[str]]: - orig = [f"d{i:02d}" for i in range(var_cnt)] - var_names = [] - for i in range(0, list_cnt): - l1 = orig.copy() - var_names.append(l1) - return var_names - - def create_ds( - var_names: list[list[str]], - dim: bool = False, - coord: bool = False, - drop_idx: list[int] | None = None, - ) -> list[Dataset]: - out_ds = [] - ds = Dataset() - ds = ds.assign_coords({"x": np.arange(2)}) - ds = ds.assign_coords({"y": np.arange(3)}) - ds = ds.assign_coords({"z": np.arange(4)}) - for i, dsl in enumerate(var_names): - vlist = dsl.copy() - if drop_idx is not None: - vlist.pop(drop_idx[i]) - foo_data = np.arange(48, dtype=float).reshape(2, 2, 3, 4) - dsi = ds.copy() - if coord: - dsi = ds.assign({"time": (["time"], [i * 2, i * 2 + 1])}) - for k in vlist: - dsi = dsi.assign({k: (["time", "x", "y", "z"], foo_data.copy())}) - if not dim: - dsi = dsi.isel(time=0) - out_ds.append(dsi) - return out_ds - - var_names = get_var_names() - - import random - - random.seed(42) - drop_idx = [random.randrange(len(vlist)) for vlist in var_names] - expected = concat( - create_ds(var_names, dim=dim, coord=coord), dim="time", data_vars="all" 
- ) - for i, idx in enumerate(drop_idx): - if dim: - expected[var_names[0][idx]][i * 2 : i * 2 + 2] = np.nan - else: - expected[var_names[0][idx]][i] = np.nan - - concat_ds = create_ds(var_names, dim=dim, coord=coord, drop_idx=drop_idx) - actual = concat(concat_ds, dim="time", data_vars="all") - - for name in var_names[0]: - assert_equal(expected[name], actual[name]) - assert_equal(expected, actual) - class TestConcatDataArray: def test_concat(self) -> None: From ffd13f684d6a71e514e2ecb02e92713a99629bcf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 9 Jan 2023 13:14:47 +0100 Subject: [PATCH 17/39] add whats-new.rst entry --- doc/whats-new.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 4b03e8d676e..d7127deee1c 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -75,6 +75,9 @@ Bug fixes By `Benoît Bovy `_. - Preserve original dtype on accessing MultiIndex levels (:issue:`7250`, :pull:`7393`). By `Ian Carroll `_. +- Make :py:func:`xarray.concat` more robust when concatenating variables present in some datasets but + not others (:issue:`508`, :pull:`7400`). + By `Kai Mühlbauer `_ and `Scott Chamberlin `_. Internal Changes ~~~~~~~~~~~~~~~~ From 146b740e3714aa63db0c908b2448a0042eafb0ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 9 Jan 2023 15:57:50 +0100 Subject: [PATCH 18/39] Update xarray/tests/test_concat.py Co-authored-by: Illviljan <14371165+Illviljan@users.noreply.github.com> --- xarray/tests/test_concat.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 260ed7357ea..1f9eaf7c2fd 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -28,10 +28,10 @@ def create_concat_datasets( num_datasets: int = 2, seed: int | None = None, include_day: bool = True ) -> list[Dataset]: - random.seed(seed) + rng = default_rng(seed) + lat = rng.standard_normal(size=(1, 4)) + lon = rng.standard_normal(size=(1, 4)) result = [] - lat = np.random.randn(1, 4) - lon = np.random.randn(1, 4) for i in range(num_datasets): if include_day: result.append( From 3196e83be35cdb156c5cd083ab624e2c10b3ef37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 9 Jan 2023 15:58:01 +0100 Subject: [PATCH 19/39] Update xarray/tests/test_concat.py Co-authored-by: Illviljan <14371165+Illviljan@users.noreply.github.com> --- xarray/tests/test_concat.py | 1 + 1 file changed, 1 insertion(+) diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 1f9eaf7c2fd..973f4b60e2b 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -234,6 +234,7 @@ def test_concat_missing_multiple_consecutive_var() -> None: r1 = [var for var in result.data_vars] r2 = [var for var in ds_result.data_vars] # check the variables orders are the same for the first three variables + # TODO: Can all variables become deterministic? 
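
Only the leading variables can be order-checked here because of how _dataset_concat derives the result order: it starts from the first dataset's variables and appends the names missing there from data_names, which is a set, so the appended tail has no guaranteed relative order. A pure-Python sketch of that ordering logic (hypothetical names, mirroring the loop added in the first patch of this series):

first_ds_vars = ["temperature", "precipitation", "cloud cover"]  # order of datasets[0]
data_names = {"temperature", "pressure", "humidity", "precipitation", "cloud cover"}

data_var_order = list(first_ds_vars)
data_var_order += [name for name in data_names if name not in data_var_order]
# The first three entries are deterministic; whether "pressure" or "humidity"
# comes next depends on set iteration order.
print(data_var_order)
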
assert r1[:3] == r2[:3] assert set(r1[3:]) == set(r2[3:]) # just check availability for the remaining vars assert_equal(result, ds_result) From e53cfcccc41c06a0617fbc6a8b33d0a23e244d73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 9 Jan 2023 16:09:10 +0100 Subject: [PATCH 20/39] add TODO, fix numpy.random.default_rng --- xarray/core/concat.py | 3 +++ xarray/tests/test_concat.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 67023c800f1..25f6534f89a 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -558,6 +558,9 @@ def get_indexes(name): # preserve variable order for variables in first dataset data_var_order = list(datasets[0].variables) # append additional variables to the end + # TODO: since data_names is a set the ordering for the appended variables + # is not deterministic, see also discussion in + # https://github.com/pydata/xarray/pull/3545#pullrequestreview-347543738 data_var_order += [e for e in data_names if e not in data_var_order] # create concatenation index, needed for later reindexing concat_index = list(range(sum(concat_dim_lengths))) diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 973f4b60e2b..3ca98deedfe 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -28,7 +28,7 @@ def create_concat_datasets( num_datasets: int = 2, seed: int | None = None, include_day: bool = True ) -> list[Dataset]: - rng = default_rng(seed) + rng = np.random.default_rng(seed) lat = rng.standard_normal(size=(1, 4)) lon = rng.standard_normal(size=(1, 4)) result = [] From c41419f43619ff45e3f14d722797075a737c8ff8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 10 Jan 2023 08:11:34 +0100 Subject: [PATCH 21/39] change np.random to use Generator --- xarray/tests/test_concat.py | 62 +++++++++++++++++++++++++------------ 1 file changed, 42 insertions(+), 20 deletions(-) diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 3ca98deedfe..35b3a826e9c 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -1,6 +1,5 @@ from __future__ import annotations -import random from copy import deepcopy from typing import TYPE_CHECKING, Any, Callable @@ -37,11 +36,26 @@ def create_concat_datasets( result.append( Dataset( data_vars={ - "temperature": (["x", "y", "day"], np.random.randn(1, 4, 2)), - "pressure": (["x", "y", "day"], np.random.randn(1, 4, 2)), - "humidity": (["x", "y", "day"], np.random.randn(1, 4, 2)), - "precipitation": (["x", "y", "day"], np.random.randn(1, 4, 2)), - "cloud cover": (["x", "y", "day"], np.random.randn(1, 4, 2)), + "temperature": ( + ["x", "y", "day"], + rng.standard_normal(size=(1, 4, 2)), + ), + "pressure": ( + ["x", "y", "day"], + rng.standard_normal(size=(1, 4, 2)), + ), + "humidity": ( + ["x", "y", "day"], + rng.standard_normal(size=(1, 4, 2)), + ), + "precipitation": ( + ["x", "y", "day"], + rng.standard_normal(size=(1, 4, 2)), + ), + "cloud cover": ( + ["x", "y", "day"], + rng.standard_normal(size=(1, 4, 2)), + ), }, coords={ "lat": (["x", "y"], lat), @@ -54,11 +68,11 @@ def create_concat_datasets( result.append( Dataset( data_vars={ - "temperature": (["x", "y"], np.random.randn(1, 4)), - "pressure": (["x", "y"], np.random.randn(1, 4)), - "humidity": (["x", "y"], np.random.randn(1, 4)), - "precipitation": (["x", "y"], np.random.randn(1, 4)), - "cloud cover": (["x", "y"], np.random.randn(1, 4)), + "temperature": (["x", "y"], 
rng.standard_normal(size=(1, 4))), + "pressure": (["x", "y"], rng.standard_normal(size=(1, 4))), + "humidity": (["x", "y"], rng.standard_normal(size=(1, 4))), + "precipitation": (["x", "y"], rng.standard_normal(size=(1, 4))), + "cloud cover": (["x", "y"], rng.standard_normal(size=(1, 4))), }, coords={"lat": (["x", "y"], lat), "lon": (["x", "y"], lon)}, ) @@ -71,22 +85,22 @@ def create_concat_datasets( def create_typed_datasets( num_datasets: int = 2, seed: int | None = None ) -> list[Dataset]: - random.seed(seed) var_strings = ["a", "b", "c", "d", "e", "f", "g", "h"] result = [] - lat = np.random.randn(1, 4) - lon = np.random.randn(1, 4) + rng = np.random.default_rng(seed) + lat = rng.standard_normal(size=(1, 4)) + lon = rng.standard_normal(size=(1, 4)) for i in range(num_datasets): result.append( Dataset( data_vars={ - "float": (["x", "y", "day"], np.random.randn(1, 4, 2)), - "float2": (["x", "y", "day"], np.random.randn(1, 4, 2)), + "float": (["x", "y", "day"], rng.standard_normal(size=(1, 4, 2))), + "float2": (["x", "y", "day"], rng.standard_normal(size=(1, 4, 2))), "string": ( ["x", "y", "day"], - np.random.choice(var_strings, (1, 4, 2)), + rng.choice(var_strings, size=(1, 4, 2)), ), - "int": (["x", "y", "day"], np.random.randint(0, 10, (1, 4, 2))), + "int": (["x", "y", "day"], rng.integers(0, 10, size=(1, 4, 2))), "datetime64": ( ["x", "y", "day"], np.arange( @@ -710,10 +724,11 @@ def create_ds( def test_concat_fill_missing_variables( concat_var_names, create_concat_ds, dim: bool, coord: bool ) -> None: + # random single variables missing in each dataset var_names = concat_var_names() - random.seed(42) - drop_idx = [random.randrange(len(vlist)) for vlist in var_names] + rng = np.random.default_rng(seed=42) + drop_idx = [rng.integers(len(vlist)) for vlist in var_names] expected = concat( create_concat_ds(var_names, dim=dim, coord=coord), dim="time", data_vars="all" ) @@ -726,6 +741,13 @@ def test_concat_fill_missing_variables( concat_ds = create_concat_ds(var_names, dim=dim, coord=coord, drop_idx=drop_idx) actual = concat(concat_ds, dim="time", data_vars="all") + r1 = list(actual.data_vars.keys()) + r2 = list(expected.data_vars.keys()) + # move element missing in first dataset at the end + # TODO: Can all variables become deterministic? 
+ r2 = r2 + [r2.pop(drop_idx[0])] + assert r1 == r2 # check the variables orders are the same + for name in var_names[0]: assert_equal(expected[name], actual[name]) assert_equal(expected, actual) From b2b0b184d10d5f3c19be5282499a3d6be0ec3c8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 10 Jan 2023 15:19:01 +0100 Subject: [PATCH 22/39] move code for variable order into dedicated function, merge with _parse_datasets, provide fast lane for variable order estimation --- xarray/core/concat.py | 55 ++++++++++++++++++++++++++++--------- xarray/tests/test_concat.py | 14 +++++----- 2 files changed, 49 insertions(+), 20 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 25f6534f89a..f6824306bfd 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -406,20 +406,51 @@ def process_subset_opt(opt, subset): return concat_over, equals, concat_dim_lengths +# search for dataset containing all wanted variables +# if no matching dataset is found return order from first dataset +# and append missing variables at the end +def _get_var_order( + datasets: list[T_Dataset], data_names: set[Hashable], max_vars_index: int +) -> list[Hashable]: + # check if first dataset contains all wanted variables + data_var_order = list(datasets[0].variables) + if not data_names.issubset(set(data_var_order)): + # find dataset with max count variables and check it + max_var_order = list(datasets[max_vars_index].variables) + if data_names.issubset(set(max_var_order)): + data_var_order = max_var_order + else: + # TODO: since data_names is a set, ordering for the appended variables + # is not deterministic, see also discussion in + # https://github.com/pydata/xarray/pull/3545#pullrequestreview-347543738 + data_var_order += [e for e in data_names if e not in data_var_order] + return data_var_order + + # determine dimensional coordinate names and a dict mapping name to DataArray def _parse_datasets( - datasets: Iterable[T_Dataset], -) -> tuple[dict[Hashable, Variable], dict[Hashable, int], set[Hashable], set[Hashable]]: + datasets: list[T_Dataset], +) -> tuple[ + dict[Hashable, Variable], + dict[Hashable, int], + set[Hashable], + set[Hashable], + list[Hashable], +]: dims: set[Hashable] = set() all_coord_names: set[Hashable] = set() data_vars: set[Hashable] = set() # list of data_vars dim_coords: dict[Hashable, Variable] = {} # maps dim name to variable dims_sizes: dict[Hashable, int] = {} # shared dimension sizes to expand variables + data_vars_order: list[Hashable] # dataset index of maximum count of variables - for ds in datasets: + data_vars_count = [] + + for i, ds in enumerate(datasets): dims_sizes.update(ds.dims) all_coord_names.update(ds.coords) + data_vars_count.append(len(ds.data_vars)) data_vars.update(ds.data_vars) # preserves ordering of dimensions @@ -431,7 +462,10 @@ def _parse_datasets( dim_coords[dim] = ds.coords[dim].variable dims = dims | set(ds.dims) - return dim_coords, dims_sizes, all_coord_names, data_vars + max_vars_index = data_vars_count.index(max(data_vars_count)) + data_vars_order = _get_var_order(datasets, data_vars, max_vars_index) + + return dim_coords, dims_sizes, all_coord_names, data_vars, data_vars_order def _dataset_concat( @@ -473,7 +507,9 @@ def _dataset_concat( align(*datasets, join=join, copy=False, exclude=[dim], fill_value=fill_value) ) - dim_coords, dims_sizes, coord_names, data_names = _parse_datasets(datasets) + dim_coords, dims_sizes, coord_names, data_names, data_vars_order = _parse_datasets( + datasets + ) dim_names = 
set(dim_coords) unlabeled_dims = dim_names - coord_names @@ -555,19 +591,12 @@ def get_indexes(name): data = var.set_dims(dim).values yield PandasIndex(data, dim, coord_dtype=var.dtype) - # preserve variable order for variables in first dataset - data_var_order = list(datasets[0].variables) - # append additional variables to the end - # TODO: since data_names is a set the ordering for the appended variables - # is not deterministic, see also discussion in - # https://github.com/pydata/xarray/pull/3545#pullrequestreview-347543738 - data_var_order += [e for e in data_names if e not in data_var_order] # create concatenation index, needed for later reindexing concat_index = list(range(sum(concat_dim_lengths))) # stack up each variable and/or index to fill-out the dataset (in order) # n.b. this loop preserves variable order, needed for groupby. - for name in data_var_order: + for name in data_vars_order: if name in concat_over and name not in result_indexes: variables = [] variable_index = [] diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 35b3a826e9c..a6eb936b8ef 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -149,6 +149,8 @@ def test_concat_compat() -> None: ValueError, match=r"coordinates in some datasets but not others" ): concat([ds1, ds2], dim="q") + with pytest.raises(ValueError, match=r"'q' not present in all datasets"): + concat([ds2, ds1], dim="q") def test_concat_missing_var() -> None: @@ -247,10 +249,7 @@ def test_concat_missing_multiple_consecutive_var() -> None: result = concat(datasets, dim="day") r1 = [var for var in result.data_vars] r2 = [var for var in ds_result.data_vars] - # check the variables orders are the same for the first three variables - # TODO: Can all variables become deterministic? - assert r1[:3] == r2[:3] - assert set(r1[3:]) == set(r2[3:]) # just check availability for the remaining vars + r1 == r2 assert_equal(result, ds_result) @@ -439,8 +438,8 @@ def test_multiple_datasets_with_missing_variables() -> None: ds_result = Dataset( data_vars={ - # pressure will be first in this since the first dataset is missing this var - # and there isn't a good way to determine that this should be first + # pressure will be first here since it is first in first dataset and + # there isn't a good way to determine that temperature should be first # this also means temperature will be last as the first data vars will # determine the order for all that exist in that dataset "pressure": (["x", "y", "day"], result_vars["pressure"]), @@ -508,6 +507,7 @@ def test_multiple_datasets_with_multiple_missing_variables() -> None: r1 = list(result.data_vars.keys()) r2 = list(ds_result.data_vars.keys()) # check the variables orders are the same for the first three variables + # TODO: Can all variables become deterministic? 
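
The fill behaviour this test pins down can be reproduced standalone; a minimal
sketch with two made-up datasets (the NaN-filling shown is what this branch
adds, where released versions raise a ValueError instead):

    import xarray as xr

    a = xr.Dataset({"temperature": ("day", [1.0, 2.0]),
                    "pressure": ("day", [5.0, 6.0])})
    b = xr.Dataset({"temperature": ("day", [3.0, 4.0])})  # no "pressure"

    out = xr.concat([a, b], dim="day", data_vars="all")
    print(out["pressure"].values)  # [ 5.  6. nan nan]
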
assert r1[:3] == r2[:3] assert set(r1[3:]) == set(r2[3:]) # just check availability for the remaining vars assert_equal(result, ds_result) @@ -662,9 +662,9 @@ def test_order_when_filling_missing() -> None: result_keys_rev = [ "temperature", "pressure", + "humidity", "precipitation", "cloud cover", - "humidity", ] # test order when concat in reversed order rev_result = concat(datasets[::-1], dim="day") From bb0a8ae91dd37a0a95a14acfb9b8e4a41f6be048 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 10 Jan 2023 15:43:16 +0100 Subject: [PATCH 23/39] fix comment --- xarray/core/concat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index f6824306bfd..d1bf31ea98b 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -443,7 +443,7 @@ def _parse_datasets( data_vars: set[Hashable] = set() # list of data_vars dim_coords: dict[Hashable, Variable] = {} # maps dim name to variable dims_sizes: dict[Hashable, int] = {} # shared dimension sizes to expand variables - data_vars_order: list[Hashable] # dataset index of maximum count of variables + data_vars_order: list[Hashable] # ordered list of variables data_vars_count = [] From e266fe535802d3628102a436d5271c8c939bece9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 16 Jan 2023 10:33:54 +0100 Subject: [PATCH 24/39] Use order from first dataset, append missing variables to the end --- xarray/core/concat.py | 29 +++++++++-------------------- xarray/tests/test_concat.py | 2 +- 2 files changed, 10 insertions(+), 21 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index d1bf31ea98b..508e71c10b9 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -406,24 +406,17 @@ def process_subset_opt(opt, subset): return concat_over, equals, concat_dim_lengths -# search for dataset containing all wanted variables -# if no matching dataset is found return order from first dataset -# and append missing variables at the end def _get_var_order( - datasets: list[T_Dataset], data_names: set[Hashable], max_vars_index: int + datasets: list[T_Dataset], data_names: set[Hashable] ) -> list[Hashable]: - # check if first dataset contains all wanted variables + # get ordering from first dataset + # if first_dataset contains all wanted variables, we are safe + # otherwise we append missing variables to the end data_var_order = list(datasets[0].variables) - if not data_names.issubset(set(data_var_order)): - # find dataset with max count variables and check it - max_var_order = list(datasets[max_vars_index].variables) - if data_names.issubset(set(max_var_order)): - data_var_order = max_var_order - else: - # TODO: since data_names is a set, ordering for the appended variables - # is not deterministic, see also discussion in - # https://github.com/pydata/xarray/pull/3545#pullrequestreview-347543738 - data_var_order += [e for e in data_names if e not in data_var_order] + # TODO: since data_names is a set, ordering for the appended variables + # is not deterministic, see also discussion in + # https://github.com/pydata/xarray/pull/3545#pullrequestreview-347543738 + data_var_order += [e for e in data_names if e not in data_var_order] return data_var_order @@ -445,12 +438,9 @@ def _parse_datasets( dims_sizes: dict[Hashable, int] = {} # shared dimension sizes to expand variables data_vars_order: list[Hashable] # ordered list of variables - data_vars_count = [] - for i, ds in enumerate(datasets): dims_sizes.update(ds.dims) 
all_coord_names.update(ds.coords) - data_vars_count.append(len(ds.data_vars)) data_vars.update(ds.data_vars) # preserves ordering of dimensions @@ -462,8 +452,7 @@ def _parse_datasets( dim_coords[dim] = ds.coords[dim].variable dims = dims | set(ds.dims) - max_vars_index = data_vars_count.index(max(data_vars_count)) - data_vars_order = _get_var_order(datasets, data_vars, max_vars_index) + data_vars_order = _get_var_order(datasets, data_vars) return dim_coords, dims_sizes, all_coord_names, data_vars, data_vars_order diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index a6eb936b8ef..89ab191e8fe 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -662,9 +662,9 @@ def test_order_when_filling_missing() -> None: result_keys_rev = [ "temperature", "pressure", - "humidity", "precipitation", "cloud cover", + "humidity", ] # test order when concat in reversed order rev_result = concat(datasets[::-1], dim="day") From 6120796b01e9ecbb6c147a4ffbcff844f9eb2545 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 16 Jan 2023 10:44:10 +0100 Subject: [PATCH 25/39] ensure fill_value is dict --- xarray/core/concat.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 508e71c10b9..474d83aa4ee 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -1,5 +1,6 @@ from __future__ import annotations +from collections import defaultdict from typing import TYPE_CHECKING, Any, Hashable, Iterable, cast, overload import pandas as pd @@ -496,6 +497,12 @@ def _dataset_concat( align(*datasets, join=join, copy=False, exclude=[dim], fill_value=fill_value) ) + # ensure dictionary for fill_value + if isinstance(fill_value, dict): + fill_value_ = defaultdict(lambda: dtypes.NA, **fill_value) + else: + fill_value_ = defaultdict(lambda: fill_value) + dim_coords, dims_sizes, coord_names, data_names, data_vars_order = _parse_datasets( datasets ) @@ -648,14 +655,10 @@ def get_indexes(name): ) # reindex if variable is not present in all datasets if len(variable_index) < len(concat_index): - try: - fill = fill_value[name] - except (TypeError, KeyError): - fill = fill_value combined_var = ( DataArray(data=combined_var, name=name) .assign_coords({dim: variable_index}) - .reindex({dim: concat_index}, fill_value=fill) + .reindex({dim: concat_index}, fill_value=fill_value_[name]) .variable ) result_vars[name] = combined_var From 5733d931feaddccc6cb404a2edc6ae5a79ef1cd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 16 Jan 2023 15:14:24 +0100 Subject: [PATCH 26/39] ensure fill_value in align --- xarray/core/alignment.py | 2 +- xarray/core/concat.py | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/xarray/core/alignment.py b/xarray/core/alignment.py index 38978a5e4f3..051ad305904 100644 --- a/xarray/core/alignment.py +++ b/xarray/core/alignment.py @@ -71,7 +71,7 @@ def reindex_variables( for name, var in variables.items(): if isinstance(fill_value, dict): - fill_value_ = fill_value.get(name, dtypes.NA) + fill_value_ = fill_value[name] else: fill_value_ = fill_value diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 474d83aa4ee..2cf4bace388 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -491,18 +491,18 @@ def _dataset_concat( dim, index = _calc_concat_dim_index(dim) - # Make sure we're working on a copy (we'll be loading variables) - datasets = [ds.copy() for ds in datasets] - datasets = list( 
- align(*datasets, join=join, copy=False, exclude=[dim], fill_value=fill_value) - ) - # ensure dictionary for fill_value if isinstance(fill_value, dict): - fill_value_ = defaultdict(lambda: dtypes.NA, **fill_value) + fill_value_ = fill_value.copy() else: fill_value_ = defaultdict(lambda: fill_value) + # Make sure we're working on a copy (we'll be loading variables) + datasets = [ds.copy() for ds in datasets] + datasets = list( + align(*datasets, join=join, copy=False, exclude=[dim], fill_value=fill_value_) + ) + dim_coords, dims_sizes, coord_names, data_names, data_vars_order = _parse_datasets( datasets ) From 98914392be66bb8ce8c1238444dd6b7e1e2b68fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 16 Jan 2023 15:15:01 +0100 Subject: [PATCH 27/39] simplify combined_var, fix test --- xarray/core/concat.py | 34 +++++++++++----------------------- xarray/tests/test_concat.py | 2 +- 2 files changed, 12 insertions(+), 24 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 2cf4bace388..468f7ade657 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -598,7 +598,7 @@ def get_indexes(name): variable_index = [] for i, ds in enumerate(datasets): if name in ds.variables: - variables.append(ds.variables[name]) + variables.append(ds[name].variable) # add to variable index, needed for reindexing variable_index.extend( [ @@ -638,29 +638,17 @@ def get_indexes(name): ) result_vars[k] = v else: - # do not concat if: - # 1. variable is only present in one dataset of multiple datasets and - # 2. dim is not a dimension of variable - if ( - dim not in variables[0].dims - and len(variables) == 1 - and len(datasets) > 1 - ): - combined_var = variables[0] - # only concat if variable is in multiple datasets - # or if single dataset (GH1988) - else: - combined_var = concat_vars( - vars, dim, positions, combine_attrs=combine_attrs + combined_var = concat_vars( + vars, dim, positions, combine_attrs=combine_attrs + ) + # reindex if variable is not present in all datasets + if len(variable_index) < len(concat_index): + combined_var = ( + DataArray(data=combined_var, name=name) + .assign_coords({dim: variable_index}) + .reindex({dim: concat_index}, fill_value=fill_value_[name]) + .variable ) - # reindex if variable is not present in all datasets - if len(variable_index) < len(concat_index): - combined_var = ( - DataArray(data=combined_var, name=name) - .assign_coords({dim: variable_index}) - .reindex({dim: concat_index}, fill_value=fill_value_[name]) - .variable - ) result_vars[name] = combined_var elif name in result_vars: diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 89ab191e8fe..0bed5786464 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -785,7 +785,7 @@ def test_concat_merge_variables_present_in_some_datasets(self, data) -> None: split_data = [data.isel(dim1=slice(3)), data.isel(dim1=slice(3, None))] data0, data1 = deepcopy(split_data) data1["foo"] = ("bar", np.random.randn(10)) - actual = concat([data0, data1], "dim1") + actual = concat([data0, data1], "dim1", data_vars="minimal") expected = data.copy().assign(foo=data1.foo) assert_identical(expected, actual) From 94b9ba95e1d76bac2cc553dff745b96c9d141bfb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 16 Jan 2023 15:53:49 +0100 Subject: [PATCH 28/39] revert fill_value for alignment.py --- xarray/core/alignment.py | 2 +- xarray/core/concat.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git 
a/xarray/core/alignment.py b/xarray/core/alignment.py index 051ad305904..38978a5e4f3 100644 --- a/xarray/core/alignment.py +++ b/xarray/core/alignment.py @@ -71,7 +71,7 @@ def reindex_variables( for name, var in variables.items(): if isinstance(fill_value, dict): - fill_value_ = fill_value[name] + fill_value_ = fill_value.get(name, dtypes.NA) else: fill_value_ = fill_value diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 468f7ade657..421fb5f737f 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -500,7 +500,7 @@ def _dataset_concat( # Make sure we're working on a copy (we'll be loading variables) datasets = [ds.copy() for ds in datasets] datasets = list( - align(*datasets, join=join, copy=False, exclude=[dim], fill_value=fill_value_) + align(*datasets, join=join, copy=False, exclude=[dim], fill_value=fill_value) ) dim_coords, dims_sizes, coord_names, data_names, data_vars_order = _parse_datasets( From 70be70f6b3758f491e2390fdd84838615688b94b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 17 Jan 2023 09:46:11 +0100 Subject: [PATCH 29/39] derive variable order in order of appearance as suggested per review --- xarray/core/concat.py | 22 +++------------------- xarray/tests/test_concat.py | 4 +--- 2 files changed, 4 insertions(+), 22 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 421fb5f737f..6ca4f02543e 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -407,20 +407,6 @@ def process_subset_opt(opt, subset): return concat_over, equals, concat_dim_lengths -def _get_var_order( - datasets: list[T_Dataset], data_names: set[Hashable] -) -> list[Hashable]: - # get ordering from first dataset - # if first_dataset contains all wanted variables, we are safe - # otherwise we append missing variables to the end - data_var_order = list(datasets[0].variables) - # TODO: since data_names is a set, ordering for the appended variables - # is not deterministic, see also discussion in - # https://github.com/pydata/xarray/pull/3545#pullrequestreview-347543738 - data_var_order += [e for e in data_names if e not in data_var_order] - return data_var_order - - # determine dimensional coordinate names and a dict mapping name to DataArray def _parse_datasets( datasets: list[T_Dataset], @@ -431,18 +417,18 @@ def _parse_datasets( set[Hashable], list[Hashable], ]: - dims: set[Hashable] = set() all_coord_names: set[Hashable] = set() data_vars: set[Hashable] = set() # list of data_vars dim_coords: dict[Hashable, Variable] = {} # maps dim name to variable dims_sizes: dict[Hashable, int] = {} # shared dimension sizes to expand variables - data_vars_order: list[Hashable] # ordered list of variables + variables_order: dict[Hashable, Variable] = {} # variables in order of appearance for i, ds in enumerate(datasets): dims_sizes.update(ds.dims) all_coord_names.update(ds.coords) data_vars.update(ds.data_vars) + variables_order.update(ds.variables) # preserves ordering of dimensions for dim in ds.dims: @@ -453,9 +439,7 @@ def _parse_datasets( dim_coords[dim] = ds.coords[dim].variable dims = dims | set(ds.dims) - data_vars_order = _get_var_order(datasets, data_vars) - - return dim_coords, dims_sizes, all_coord_names, data_vars, data_vars_order + return dim_coords, dims_sizes, all_coord_names, data_vars, list(variables_order) def _dataset_concat( diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 0bed5786464..0e1c576079d 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -145,9 
+145,7 @@ def test_concat_compat() -> None: for var in ["has_x", "no_x_y"]: assert "y" not in result[var].dims and "y" not in result[var].coords - with pytest.raises( - ValueError, match=r"coordinates in some datasets but not others" - ): + with pytest.raises(ValueError, match=r"'q' not present in all datasets"): concat([ds1, ds2], dim="q") with pytest.raises(ValueError, match=r"'q' not present in all datasets"): concat([ds2, ds1], dim="q") From c3eda8f5359e0781b3c758db724e2228a9bb5a68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 17 Jan 2023 09:49:25 +0100 Subject: [PATCH 30/39] remove unneeded enumerate --- xarray/core/concat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 6ca4f02543e..77c784513ec 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -424,7 +424,7 @@ def _parse_datasets( dims_sizes: dict[Hashable, int] = {} # shared dimension sizes to expand variables variables_order: dict[Hashable, Variable] = {} # variables in order of appearance - for i, ds in enumerate(datasets): + for ds in datasets: dims_sizes.update(ds.dims) all_coord_names.update(ds.coords) data_vars.update(ds.data_vars) From 70f38abc2a850da7472c69323a9a1001dc8df2c3 Mon Sep 17 00:00:00 2001 From: dcherian Date: Tue, 17 Jan 2023 08:23:33 -0700 Subject: [PATCH 31/39] Use alignment.reindex_variables instead. This also removes the need to handle fill_value --- xarray/core/concat.py | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 77c784513ec..10db61612d0 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -1,12 +1,11 @@ from __future__ import annotations -from collections import defaultdict from typing import TYPE_CHECKING, Any, Hashable, Iterable, cast, overload import pandas as pd from xarray.core import dtypes, utils -from xarray.core.alignment import align +from xarray.core.alignment import align, reindex_variables from xarray.core.duck_array_ops import lazy_array_equiv from xarray.core.indexes import Index, PandasIndex from xarray.core.merge import ( @@ -475,12 +474,6 @@ def _dataset_concat( dim, index = _calc_concat_dim_index(dim) - # ensure dictionary for fill_value - if isinstance(fill_value, dict): - fill_value_ = fill_value.copy() - else: - fill_value_ = defaultdict(lambda: fill_value) - # Make sure we're working on a copy (we'll be loading variables) datasets = [ds.copy() for ds in datasets] datasets = list( @@ -627,12 +620,13 @@ def get_indexes(name): ) # reindex if variable is not present in all datasets if len(variable_index) < len(concat_index): - combined_var = ( - DataArray(data=combined_var, name=name) - .assign_coords({dim: variable_index}) - .reindex({dim: concat_index}, fill_value=fill_value_[name]) - .variable - ) + combined_var = reindex_variables( + variables={name: combined_var}, + dim_pos_indexers={ + dim: pd.Index(variable_index).get_indexer(concat_index) + }, + fill_value=fill_value, + )[name] result_vars[name] = combined_var elif name in result_vars: From 4825c9406377c0348980448c74a83d15e6fccfd6 Mon Sep 17 00:00:00 2001 From: dcherian Date: Tue, 17 Jan 2023 08:26:54 -0700 Subject: [PATCH 32/39] small cleanup --- xarray/core/concat.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 10db61612d0..c464c50ce59 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -480,7 +480,7 @@ def 
_dataset_concat( align(*datasets, join=join, copy=False, exclude=[dim], fill_value=fill_value) ) - dim_coords, dims_sizes, coord_names, data_names, data_vars_order = _parse_datasets( + dim_coords, dims_sizes, coord_names, data_names, vars_order = _parse_datasets( datasets ) dim_names = set(dim_coords) @@ -569,7 +569,7 @@ def get_indexes(name): # stack up each variable and/or index to fill-out the dataset (in order) # n.b. this loop preserves variable order, needed for groupby. - for name in data_vars_order: + for name in vars_order: if name in concat_over and name not in result_indexes: variables = [] variable_index = [] @@ -589,7 +589,7 @@ def get_indexes(name): raise ValueError( f"coordinate {name!r} not present in all datasets." ) - vars = list(ensure_common_dims(variables)) + vars = ensure_common_dims(variables) # Try to concatenate the indexes, concatenate the variables when no index # is found on all datasets. From a71f633a52b50cbcbd5a05717cc3662c64e525b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 17 Jan 2023 18:07:56 +0100 Subject: [PATCH 33/39] Update doc/whats-new.rst Co-authored-by: Deepak Cherian --- doc/whats-new.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index d7127deee1c..25d803ec5a1 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -75,7 +75,7 @@ Bug fixes By `Benoît Bovy `_. - Preserve original dtype on accessing MultiIndex levels (:issue:`7250`, :pull:`7393`). By `Ian Carroll `_. -- Make :py:func:`xarray.concat` more robust when concatenating variables present in some datasets but +- :py:func:`xarray.concat` can now concatenate variables present in some datasets but not others (:issue:`508`, :pull:`7400`). By `Kai Mühlbauer `_ and `Scott Chamberlin `_. From 92e6108484331819fa5e58844de41056f51c5f09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Wed, 18 Jan 2023 12:14:39 +0100 Subject: [PATCH 34/39] adapt tests as per review request, fix ensure_common_dims --- xarray/core/concat.py | 17 +- xarray/tests/test_concat.py | 615 ++++++++++-------------------------- 2 files changed, 172 insertions(+), 460 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index c464c50ce59..95a2199f9c5 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -536,7 +536,7 @@ def _dataset_concat( # we've already verified everything is consistent; now, calculate # shared dimension sizes so we can expand the necessary variables - def ensure_common_dims(vars): + def ensure_common_dims(vars, concat_dim_lengths): # ensure each variable with the given name shares the same # dimensions and the same shape for all of them except along the # concat dimension @@ -573,23 +573,24 @@ def get_indexes(name): if name in concat_over and name not in result_indexes: variables = [] variable_index = [] + var_concat_dim_length = [] for i, ds in enumerate(datasets): if name in ds.variables: variables.append(ds[name].variable) # add to variable index, needed for reindexing - variable_index.extend( - [ - sum(concat_dim_lengths[:i]) + k - for k in range(concat_dim_lengths[i]) - ] - ) + var_idx = [ + sum(concat_dim_lengths[:i]) + k + for k in range(concat_dim_lengths[i]) + ] + variable_index.extend(var_idx) + var_concat_dim_length.append(len(var_idx)) else: # raise if coordinate not in all datasets if name in coord_names: raise ValueError( f"coordinate {name!r} not present in all datasets." 
) - vars = ensure_common_dims(variables) + vars = ensure_common_dims(variables, var_concat_dim_length) # Try to concatenate the indexes, concatenate the variables when no index # is found on all datasets. diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 0e1c576079d..64e3ca30f36 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -31,32 +31,17 @@ def create_concat_datasets( lat = rng.standard_normal(size=(1, 4)) lon = rng.standard_normal(size=(1, 4)) result = [] + variables = ["temperature", "pressure", "humidity", "precipitation", "cloud_cover"] for i in range(num_datasets): if include_day: + data_tuple = ( + ["x", "y", "day"], + rng.standard_normal(size=(1, 4, 2)), + ) + data_vars = {v: data_tuple for v in variables} result.append( Dataset( - data_vars={ - "temperature": ( - ["x", "y", "day"], - rng.standard_normal(size=(1, 4, 2)), - ), - "pressure": ( - ["x", "y", "day"], - rng.standard_normal(size=(1, 4, 2)), - ), - "humidity": ( - ["x", "y", "day"], - rng.standard_normal(size=(1, 4, 2)), - ), - "precipitation": ( - ["x", "y", "day"], - rng.standard_normal(size=(1, 4, 2)), - ), - "cloud cover": ( - ["x", "y", "day"], - rng.standard_normal(size=(1, 4, 2)), - ), - }, + data_vars=data_vars, coords={ "lat": (["x", "y"], lat), "lon": (["x", "y"], lon), @@ -65,15 +50,14 @@ def create_concat_datasets( ) ) else: + data_tuple = ( + ["x", "y"], + rng.standard_normal(size=(1, 4)), + ) + data_vars = {v: data_tuple for v in variables} result.append( Dataset( - data_vars={ - "temperature": (["x", "y"], rng.standard_normal(size=(1, 4))), - "pressure": (["x", "y"], rng.standard_normal(size=(1, 4))), - "humidity": (["x", "y"], rng.standard_normal(size=(1, 4))), - "precipitation": (["x", "y"], rng.standard_normal(size=(1, 4))), - "cloud cover": (["x", "y"], rng.standard_normal(size=(1, 4))), - }, + data_vars=data_vars, coords={"lat": (["x", "y"], lat), "lon": (["x", "y"], lon)}, ) ) @@ -153,163 +137,82 @@ def test_concat_compat() -> None: def test_concat_missing_var() -> None: datasets = create_concat_datasets(2, 123) - vars_to_drop = ["humidity", "precipitation", "cloud cover"] - datasets[0] = datasets[0].drop_vars(vars_to_drop) - datasets[1] = datasets[1].drop_vars(vars_to_drop + ["pressure"]) + expected = concat(datasets, dim="day") + vars_to_drop = ["humidity", "precipitation", "cloud_cover"] - temperature_result = np.concatenate( - (datasets[0].temperature.values, datasets[1].temperature.values), axis=2 - ) - pressure_result = np.concatenate( - (datasets[0].pressure.values, np.full([1, 4, 2], np.nan)), axis=2 - ) - ds_result = Dataset( - data_vars={ - "temperature": (["x", "y", "day"], temperature_result), - "pressure": (["x", "y", "day"], pressure_result), - }, - coords={ - "lat": (["x", "y"], datasets[0].lat.values), - "lon": (["x", "y"], datasets[0].lon.values), - "day": ["day1", "day2", "day3", "day4"], - }, - ) - result = concat(datasets, dim="day") + expected = expected.drop_vars(vars_to_drop) + expected["pressure"][..., 2:] = np.nan - r1 = list(result.data_vars.keys()) - r2 = list(ds_result.data_vars.keys()) - assert r1 == r2 # check the variables orders are the same + datasets[0] = datasets[0].drop_vars(vars_to_drop) + datasets[1] = datasets[1].drop_vars(vars_to_drop + ["pressure"]) + actual = concat(datasets, dim="day") - assert_equal(result, ds_result) + assert list(actual.data_vars.keys()) == ["temperature", "pressure"] + assert_identical(actual, expected) def test_concat_missing_multiple_consecutive_var() -> None: datasets = 
create_concat_datasets(3, 123) - vars_to_drop = ["pressure", "humidity"] + expected = concat(datasets, dim="day") + vars_to_drop = ["humidity", "pressure"] + + expected["pressure"][..., :4] = np.nan + expected["humidity"][..., :4] = np.nan + datasets[0] = datasets[0].drop_vars(vars_to_drop) datasets[1] = datasets[1].drop_vars(vars_to_drop) + actual = concat(datasets, dim="day") - temperature_result = np.concatenate( - ( - datasets[0].temperature.values, - datasets[1].temperature.values, - datasets[2].temperature.values, - ), - axis=2, - ) - pressure_result = np.concatenate( - ( - np.full([1, 4, 2], np.nan), - np.full([1, 4, 2], np.nan), - datasets[2].pressure.values, - ), - axis=2, - ) - humidity_result = np.concatenate( - ( - np.full([1, 4, 2], np.nan), - np.full([1, 4, 2], np.nan), - datasets[2].humidity.values, - ), - axis=2, - ) - precipitation_result = np.concatenate( - ( - datasets[0].precipitation.values, - datasets[1].precipitation.values, - datasets[2].precipitation.values, - ), - axis=2, - ) - cloudcover_result = np.concatenate( - ( - datasets[0]["cloud cover"].values, - datasets[1]["cloud cover"].values, - datasets[2]["cloud cover"].values, - ), - axis=2, - ) - - ds_result = Dataset( - data_vars={ - "temperature": (["x", "y", "day"], temperature_result), - "precipitation": (["x", "y", "day"], precipitation_result), - "cloud cover": (["x", "y", "day"], cloudcover_result), - "humidity": (["x", "y", "day"], humidity_result), - "pressure": (["x", "y", "day"], pressure_result), - }, - coords={ - "lat": (["x", "y"], datasets[0].lat.values), - "lon": (["x", "y"], datasets[0].lon.values), - "day": ["day1", "day2", "day3", "day4", "day5", "day6"], - }, - ) - result = concat(datasets, dim="day") - r1 = [var for var in result.data_vars] - r2 = [var for var in ds_result.data_vars] - r1 == r2 - assert_equal(result, ds_result) + assert list(actual.data_vars.keys()) == [ + "temperature", + "precipitation", + "cloud_cover", + "pressure", + "humidity", + ] + assert_identical(actual, expected) def test_concat_all_empty() -> None: ds1 = Dataset() ds2 = Dataset() - result = concat([ds1, ds2], dim="new_dim") + expected = Dataset() + actual = concat([ds1, ds2], dim="new_dim") - assert_equal(result, Dataset()) + assert_identical(actual, expected) def test_concat_second_empty() -> None: ds1 = Dataset(data_vars={"a": ("y", [0.1])}, coords={"x": 0.1}) ds2 = Dataset(coords={"x": 0.1}) - ds_result = Dataset(data_vars={"a": ("y", [0.1, np.nan])}, coords={"x": 0.1}) - result = concat([ds1, ds2], dim="y") + expected = Dataset(data_vars={"a": ("y", [0.1, np.nan])}, coords={"x": 0.1}) + actual = concat([ds1, ds2], dim="y") - assert_equal(result, ds_result) + assert_identical(actual, expected) def test_multiple_missing_variables() -> None: datasets = create_concat_datasets(2, 123) - vars_to_drop = ["pressure", "cloud cover"] - datasets[1] = datasets[1].drop_vars(vars_to_drop) + expected = concat(datasets, dim="day") + vars_to_drop = ["pressure", "cloud_cover"] - temperature_result = np.concatenate( - (datasets[0].temperature.values, datasets[1].temperature.values), axis=2 - ) - pressure_result = np.concatenate( - (datasets[0].pressure.values, np.full([1, 4, 2], np.nan)), axis=2 - ) - humidity_result = np.concatenate( - (datasets[0].humidity.values, datasets[1].humidity.values), axis=2 - ) - precipitation_result = np.concatenate( - (datasets[0].precipitation.values, datasets[1].precipitation.values), axis=2 - ) - cloudcover_result = np.concatenate( - (datasets[0]["cloud cover"].values, np.full([1, 4, 2], 
np.nan)), axis=2 - ) - ds_result = Dataset( - data_vars={ - "temperature": (["x", "y", "day"], temperature_result), - "pressure": (["x", "y", "day"], pressure_result), - "humidity": (["x", "y", "day"], humidity_result), - "precipitation": (["x", "y", "day"], precipitation_result), - "cloud cover": (["x", "y", "day"], cloudcover_result), - }, - coords={ - "lat": (["x", "y"], datasets[0].lat.values), - "lon": (["x", "y"], datasets[0].lon.values), - "day": ["day1", "day2", "day3", "day4"], - }, - ) - result = concat(datasets, dim="day") + expected["pressure"][..., 2:] = np.nan + expected["cloud_cover"][..., 2:] = np.nan - r1 = list(result.data_vars.keys()) - r2 = list(ds_result.data_vars.keys()) - assert r1 == r2 # check the variables orders are the same + datasets[1] = datasets[1].drop_vars(vars_to_drop) + actual = concat(datasets, dim="day") + + # check the variables orders are the same + assert list(actual.data_vars.keys()) == [ + "temperature", + "pressure", + "humidity", + "precipitation", + "cloud_cover", + ] - assert_equal(result, ds_result) + assert_identical(actual, expected) @pytest.mark.parametrize("include_day", [True, False]) @@ -319,80 +222,31 @@ def test_concat_multiple_datasets_missing_vars_and_new_dim(include_day: bool) -> "pressure", "humidity", "precipitation", - "cloud cover", + "cloud_cover", ] datasets = create_concat_datasets(len(vars_to_drop), 123, include_day=include_day) - # set up the test data - datasets = [datasets[i].drop_vars(vars_to_drop[i]) for i in range(len(datasets))] + expected = concat(datasets, dim="day") - dim_size = 2 if include_day else 1 - - # set up the validation data - # the below code just drops one var per dataset depending on the location of the - # dataset in the list and allows us to quickly catch any boundaries cases across - # the three equivalence classes of beginning, middle and end of the concat list - result_vars = dict.fromkeys(vars_to_drop, np.array([])) - for i in range(len(vars_to_drop)): - for d in range(len(datasets)): - if d != i: - if include_day: - ds_vals = datasets[d][vars_to_drop[i]].values - else: - ds_vals = datasets[d][vars_to_drop[i]].values[..., None] - if not result_vars[vars_to_drop[i]].size: - result_vars[vars_to_drop[i]] = ds_vals - else: - result_vars[vars_to_drop[i]] = np.concatenate( - ( - result_vars[vars_to_drop[i]], - ds_vals, - ), - axis=-1, - ) - else: - if not result_vars[vars_to_drop[i]].size: - result_vars[vars_to_drop[i]] = np.full([1, 4, dim_size], np.nan) - else: - result_vars[vars_to_drop[i]] = np.concatenate( - ( - result_vars[vars_to_drop[i]], - np.full([1, 4, dim_size], np.nan), - ), - axis=-1, - ) - - ds_result = Dataset( - data_vars={ - # pressure will be first here since it is first in first dataset and - # there isn't a good way to determine that temperature should be first - # this also means temperature will be last as the first data vars will - # determine the order for all that exist in that dataset - "pressure": (["x", "y", "day"], result_vars["pressure"]), - "humidity": (["x", "y", "day"], result_vars["humidity"]), - "precipitation": (["x", "y", "day"], result_vars["precipitation"]), - "cloud cover": (["x", "y", "day"], result_vars["cloud cover"]), - "temperature": (["x", "y", "day"], result_vars["temperature"]), - }, - coords={ - "lat": (["x", "y"], datasets[0].lat.values), - "lon": (["x", "y"], datasets[0].lon.values), - }, - ) - if include_day: - ds_result = ds_result.assign_coords( - {"day": ["day" + str(d + 1) for d in range(2 * len(vars_to_drop))]} - ) - else: - ds_result = 
ds_result.transpose("day", "x", "y") + for i, name in enumerate(vars_to_drop): + if include_day: + expected[name][..., i * 2 : (i + 1) * 2] = np.nan + else: + expected[name][i : i + 1, ...] = np.nan - result = concat(datasets, dim="day") + # set up the test data + datasets = [datasets[i].drop_vars(vars_to_drop[i]) for i in range(len(datasets))] - r1 = list(result.data_vars.keys()) - r2 = list(ds_result.data_vars.keys()) - assert r1 == r2 # check the variables orders are the same + actual = concat(datasets, dim="day") - assert_equal(result, ds_result) + assert list(actual.data_vars.keys()) == [ + "pressure", + "humidity", + "precipitation", + "cloud_cover", + "temperature", + ] + assert_identical(actual, expected) def test_multiple_datasets_with_missing_variables() -> None: @@ -401,206 +255,83 @@ def test_multiple_datasets_with_missing_variables() -> None: "pressure", "humidity", "precipitation", - "cloud cover", + "cloud_cover", ] datasets = create_concat_datasets(len(vars_to_drop), 123) - # set up the test data - datasets = [datasets[i].drop_vars(vars_to_drop[i]) for i in range(len(datasets))] - # set up the validation data - # the below code just drops one var per dataset depending on the location of the - # dataset in the list and allows us to quickly catch any boundaries cases across - # the three equivalence classes of beginning, middle and end of the concat list - result_vars = dict.fromkeys(vars_to_drop, np.array([])) - for i in range(len(vars_to_drop)): - for d in range(len(datasets)): - if d != i: - if not result_vars[vars_to_drop[i]].size: - result_vars[vars_to_drop[i]] = datasets[d][vars_to_drop[i]].values - else: - result_vars[vars_to_drop[i]] = np.concatenate( - ( - result_vars[vars_to_drop[i]], - datasets[d][vars_to_drop[i]].values, - ), - axis=2, - ) - else: - if not result_vars[vars_to_drop[i]].size: - result_vars[vars_to_drop[i]] = np.full([1, 4, 2], np.nan) - else: - result_vars[vars_to_drop[i]] = np.concatenate( - (result_vars[vars_to_drop[i]], np.full([1, 4, 2], np.nan)), - axis=2, - ) - - ds_result = Dataset( - data_vars={ - # pressure will be first here since it is first in first dataset and - # there isn't a good way to determine that temperature should be first - # this also means temperature will be last as the first data vars will - # determine the order for all that exist in that dataset - "pressure": (["x", "y", "day"], result_vars["pressure"]), - "humidity": (["x", "y", "day"], result_vars["humidity"]), - "precipitation": (["x", "y", "day"], result_vars["precipitation"]), - "cloud cover": (["x", "y", "day"], result_vars["cloud cover"]), - "temperature": (["x", "y", "day"], result_vars["temperature"]), - }, - coords={ - "lat": (["x", "y"], datasets[0].lat.values), - "lon": (["x", "y"], datasets[0].lon.values), - "day": ["day" + str(d + 1) for d in range(2 * len(vars_to_drop))], - }, - ) - result = concat(datasets, dim="day") + expected = concat(datasets, dim="day") + for i, name in enumerate(vars_to_drop): + expected[name][..., i * 2 : (i + 1) * 2] = np.nan - r1 = list(result.data_vars.keys()) - r2 = list(ds_result.data_vars.keys()) - assert r1 == r2 # check the variables orders are the same + # set up the test data + datasets = [datasets[i].drop_vars(vars_to_drop[i]) for i in range(len(datasets))] + actual = concat(datasets, dim="day") - assert_equal(result, ds_result) + assert list(actual.data_vars.keys()) == [ + "pressure", + "humidity", + "precipitation", + "cloud_cover", + "temperature", + ] + assert_identical(actual, expected) def 
test_multiple_datasets_with_multiple_missing_variables() -> None: vars_to_drop_in_first = ["temperature", "pressure"] - vars_to_drop_in_second = ["humidity", "precipitation", "cloud cover"] + vars_to_drop_in_second = ["humidity", "precipitation", "cloud_cover"] datasets = create_concat_datasets(2, 123) + expected = concat(datasets, dim="day") + for name in vars_to_drop_in_first: + expected[name][..., :2] = np.nan + for name in vars_to_drop_in_second: + expected[name][..., 2:] = np.nan + # set up the test data datasets[0] = datasets[0].drop_vars(vars_to_drop_in_first) datasets[1] = datasets[1].drop_vars(vars_to_drop_in_second) - temperature_result = np.concatenate( - (np.full([1, 4, 2], np.nan), datasets[1].temperature.values), axis=2 - ) - pressure_result = np.concatenate( - (np.full([1, 4, 2], np.nan), datasets[1].pressure.values), axis=2 - ) - humidity_result = np.concatenate( - (datasets[0].humidity.values, np.full([1, 4, 2], np.nan)), axis=2 - ) - precipitation_result = np.concatenate( - (datasets[0].precipitation.values, np.full([1, 4, 2], np.nan)), axis=2 - ) - cloudcover_result = np.concatenate( - (datasets[0]["cloud cover"].values, np.full([1, 4, 2], np.nan)), axis=2 - ) - ds_result = Dataset( - data_vars={ - "humidity": (["x", "y", "day"], humidity_result), - "precipitation": (["x", "y", "day"], precipitation_result), - "cloud cover": (["x", "y", "day"], cloudcover_result), - # these two are at the end of the expected as they are missing from the first - # dataset in the concat list - "temperature": (["x", "y", "day"], temperature_result), - "pressure": (["x", "y", "day"], pressure_result), - }, - coords={ - "lat": (["x", "y"], datasets[0].lat.values), - "lon": (["x", "y"], datasets[0].lon.values), - "day": ["day1", "day2", "day3", "day4"], - }, - ) - result = concat(datasets, dim="day") + actual = concat(datasets, dim="day") - r1 = list(result.data_vars.keys()) - r2 = list(ds_result.data_vars.keys()) - # check the variables orders are the same for the first three variables - # TODO: Can all variables become deterministic? - assert r1[:3] == r2[:3] - assert set(r1[3:]) == set(r2[3:]) # just check availability for the remaining vars - assert_equal(result, ds_result) + assert list(actual.data_vars.keys()) == [ + "humidity", + "precipitation", + "cloud_cover", + "temperature", + "pressure", + ] + assert_identical(actual, expected) def test_type_of_missing_fill() -> None: datasets = create_typed_datasets(2, 123) - + expected1 = concat(datasets, dim="day", fill_value=dtypes.NA) + expected2 = concat(datasets[::-1], dim="day", fill_value=dtypes.NA) vars = ["float", "float2", "string", "int", "datetime64", "timedelta64"] + expected = [expected2, expected1] + for i, exp in enumerate(expected): + sl = slice(i * 2, (i + 1) * 2) + exp["float2"][..., sl] = np.nan + exp["datetime64"][..., sl] = np.nan + exp["timedelta64"][..., sl] = np.nan + var = exp["int"] * 1.0 + var[..., sl] = np.nan + exp["int"] = var + var = exp["string"].astype(object) + var[..., sl] = np.nan + exp["string"] = var # set up the test data datasets[1] = datasets[1].drop_vars(vars[1:]) - float_result = np.concatenate( - (datasets[0].float.values, datasets[1].float.values), axis=2 - ) - float2_result = np.concatenate( - (datasets[0].float2.values, np.full([1, 4, 2], np.nan)), axis=2 - ) - # to correctly create the expected dataset we need to ensure we promote the string array to - # object type before filling as it will be promoted to that in the concat case. 
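
The promotion described in the comments being removed here is easy to show in
isolation; a small sketch, assuming standard numpy/xarray promotion rules:

    import numpy as np
    import xarray as xr

    s = xr.DataArray(np.array(["a", "b"]), dims="day", coords={"day": [0, 1]})
    i = xr.DataArray(np.array([1, 2]), dims="day", coords={"day": [0, 1]})

    # Reindexing onto a longer index introduces missing values; fixed-width
    # string and integer arrays cannot hold NaN, so xarray promotes them.
    print(s.reindex(day=[0, 1, 2]).dtype)  # object
    print(i.reindex(day=[0, 1, 2]).dtype)  # float64
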
- # this matches the behavior of pandas - string_values = datasets[0].string.values - string_values = string_values.astype(object) - string_result = np.concatenate((string_values, np.full([1, 4, 2], np.nan)), axis=2) - datetime_result = np.concatenate( - (datasets[0].datetime64.values, np.full([1, 4, 2], np.datetime64("NaT"))), - axis=2, - ) - timedelta_result = np.concatenate( - (datasets[0].timedelta64.values, np.full([1, 4, 2], np.timedelta64("NaT"))), - axis=2, - ) - # string_result = string_result.astype(object) - int_result = np.concatenate( - (datasets[0].int.values, np.full([1, 4, 2], np.nan)), axis=2 - ) - ds_result = Dataset( - data_vars={ - "float": (["x", "y", "day"], float_result), - "float2": (["x", "y", "day"], float2_result), - "string": (["x", "y", "day"], string_result), - "int": (["x", "y", "day"], int_result), - "datetime64": (["x", "y", "day"], datetime_result), - "timedelta64": (["x", "y", "day"], timedelta_result), - }, - coords={ - "lat": (["x", "y"], datasets[0].lat.values), - "lon": (["x", "y"], datasets[0].lon.values), - "day": ["day1", "day2", "day3", "day4"], - }, - ) - result = concat(datasets, dim="day", fill_value=dtypes.NA) + actual = concat(datasets, dim="day", fill_value=dtypes.NA) - assert_equal(result, ds_result) + assert_identical(actual, expected[1]) - # test in the reverse order - float_result_rev = np.concatenate( - (datasets[1].float.values, datasets[0].float.values), axis=2 - ) - float2_result_rev = np.concatenate( - (np.full([1, 4, 2], np.nan), datasets[0].float2.values), axis=2 - ) - string_result_rev = np.concatenate( - (np.full([1, 4, 2], np.nan), string_values), axis=2 - ) - datetime_result_rev = np.concatenate( - (np.full([1, 4, 2], np.datetime64("NaT")), datasets[0].datetime64.values), - axis=2, - ) - timedelta_result_rev = np.concatenate( - (np.full([1, 4, 2], np.timedelta64("NaT")), datasets[0].timedelta64.values), - axis=2, - ) - int_result_rev = np.concatenate( - (np.full([1, 4, 2], np.nan), datasets[0].int.values), axis=2 - ) - ds_result_rev = Dataset( - data_vars={ - "float": (["x", "y", "day"], float_result_rev), - "float2": (["x", "y", "day"], float2_result_rev), - "string": (["x", "y", "day"], string_result_rev), - "int": (["x", "y", "day"], int_result_rev), - "datetime64": (["x", "y", "day"], datetime_result_rev), - "timedelta64": (["x", "y", "day"], timedelta_result_rev), - }, - coords={ - "lat": (["x", "y"], datasets[0].lat.values), - "lon": (["x", "y"], datasets[0].lon.values), - "day": ["day3", "day4", "day1", "day2"], - }, - ) - result_rev = concat(datasets[::-1], dim="day", fill_value=dtypes.NA) + # reversed + actual = concat(datasets[::-1], dim="day", fill_value=dtypes.NA) - assert_equal(result_rev, ds_result_rev) + assert_identical(actual, expected[0]) def test_order_when_filling_missing() -> None: @@ -608,68 +339,38 @@ def test_order_when_filling_missing() -> None: # drop middle vars_to_drop_in_second = ["humidity"] datasets = create_concat_datasets(2, 123) + expected1 = concat(datasets, dim="day") + for name in vars_to_drop_in_second: + expected1[name][..., 2:] = np.nan + expected2 = concat(datasets[::-1], dim="day") + for name in vars_to_drop_in_second: + expected2[name][..., :2] = np.nan + # set up the test data datasets[0] = datasets[0].drop_vars(vars_to_drop_in_first) datasets[1] = datasets[1].drop_vars(vars_to_drop_in_second) - temperature_result = np.concatenate( - (datasets[0].temperature.values, datasets[1].temperature.values), axis=2 - ) - pressure_result = np.concatenate( - (datasets[0].pressure.values, 
datasets[1].pressure.values), axis=2 - ) - humidity_result = np.concatenate( - (datasets[0].humidity.values, np.full([1, 4, 2], np.nan)), axis=2 - ) - precipitation_result = np.concatenate( - (datasets[0].precipitation.values, datasets[1].precipitation.values), axis=2 - ) - cloudcover_result = np.concatenate( - (datasets[0]["cloud cover"].values, datasets[1]["cloud cover"].values), axis=2 - ) - ds_result = Dataset( - data_vars={ - "temperature": (["x", "y", "day"], temperature_result), - "pressure": (["x", "y", "day"], pressure_result), - "precipitation": (["x", "y", "day"], precipitation_result), - "cloud cover": (["x", "y", "day"], cloudcover_result), - "humidity": (["x", "y", "day"], humidity_result), - }, - coords={ - "lat": (["x", "y"], datasets[0].lat.values), - "lon": (["x", "y"], datasets[0].lon.values), - "day": ["day1", "day2", "day3", "day4"], - }, - ) - result = concat(datasets, dim="day") - - assert_equal(result, ds_result) + actual = concat(datasets, dim="day") - result_keys = [ + assert list(actual.data_vars.keys()) == [ "temperature", "pressure", "humidity", "precipitation", - "cloud cover", + "cloud_cover", ] - result_index = 0 - for k in result.data_vars.keys(): - assert k == result_keys[result_index] - result_index += 1 + assert_identical(actual, expected1) - result_keys_rev = [ + actual = concat(datasets[::-1], dim="day") + + assert list(actual.data_vars.keys()) == [ "temperature", "pressure", "precipitation", - "cloud cover", + "cloud_cover", "humidity", ] - # test order when concat in reversed order - rev_result = concat(datasets[::-1], dim="day") - result_index = 0 - for k in rev_result.data_vars.keys(): - assert k == result_keys_rev[result_index] - result_index += 1 + assert_identical(actual, expected2) @pytest.fixture @@ -739,16 +440,19 @@ def test_concat_fill_missing_variables( concat_ds = create_concat_ds(var_names, dim=dim, coord=coord, drop_idx=drop_idx) actual = concat(concat_ds, dim="time", data_vars="all") - r1 = list(actual.data_vars.keys()) - r2 = list(expected.data_vars.keys()) - # move element missing in first dataset at the end - # TODO: Can all variables become deterministic? - r2 = r2 + [r2.pop(drop_idx[0])] - assert r1 == r2 # check the variables orders are the same - - for name in var_names[0]: - assert_equal(expected[name], actual[name]) - assert_equal(expected, actual) + assert list(actual.data_vars.keys()) == [ + "d01", + "d02", + "d03", + "d04", + "d05", + "d06", + "d07", + "d08", + "d09", + "d00", + ] + assert_identical(actual, expected) class TestConcatDataset: @@ -787,6 +491,13 @@ def test_concat_merge_variables_present_in_some_datasets(self, data) -> None: expected = data.copy().assign(foo=data1.foo) assert_identical(expected, actual) + # expand foo + actual = concat([data0, data1], "dim1") + foo = np.ones((8, 10), dtype=data1.foo.dtype) * np.nan + foo[3:] = data1.foo.values[None, ...] 
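+        # data1.foo has shape (10,); [None, ...] views it as (1, 10) so the
+        # assignment broadcasts it across rows 3..7 of the (8, 10) array.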
+ expected = data.copy().assign(foo=(["dim1", "bar"], foo)) + assert_identical(expected, actual) + def test_concat_2(self, data) -> None: dim = "dim2" datasets = [g for _, g in data.groupby(dim, squeeze=True)] From 8610397c647bc01d757bc04354e8d43b1c8b19d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Thu, 19 Jan 2023 09:33:42 +0100 Subject: [PATCH 35/39] adapt tests as per review request --- xarray/tests/test_concat.py | 67 +++++++++++++------------------------ 1 file changed, 23 insertions(+), 44 deletions(-) diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 64e3ca30f36..d3d8eeeea4c 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -136,7 +136,7 @@ def test_concat_compat() -> None: def test_concat_missing_var() -> None: - datasets = create_concat_datasets(2, 123) + datasets = create_concat_datasets(2, seed=123) expected = concat(datasets, dim="day") vars_to_drop = ["humidity", "precipitation", "cloud_cover"] @@ -152,7 +152,7 @@ def test_concat_missing_var() -> None: def test_concat_missing_multiple_consecutive_var() -> None: - datasets = create_concat_datasets(3, 123) + datasets = create_concat_datasets(3, seed=123) expected = concat(datasets, dim="day") vars_to_drop = ["humidity", "pressure"] @@ -191,9 +191,16 @@ def test_concat_second_empty() -> None: assert_identical(actual, expected) + expected = Dataset( + data_vars={"a": ("y", [0.1, np.nan])}, coords={"x": ("y", [0.1, 0.1])} + ) + actual = concat([ds1, ds2], dim="y", coords="all") + + assert_identical(actual, expected) -def test_multiple_missing_variables() -> None: - datasets = create_concat_datasets(2, 123) + +def test_concat_multiple_missing_variables() -> None: + datasets = create_concat_datasets(2, seed=123) expected = concat(datasets, dim="day") vars_to_drop = ["pressure", "cloud_cover"] @@ -216,7 +223,7 @@ def test_multiple_missing_variables() -> None: @pytest.mark.parametrize("include_day", [True, False]) -def test_concat_multiple_datasets_missing_vars_and_new_dim(include_day: bool) -> None: +def test_concat_multiple_datasets_missing_vars(include_day: bool) -> None: vars_to_drop = [ "temperature", "pressure", @@ -225,7 +232,9 @@ def test_concat_multiple_datasets_missing_vars_and_new_dim(include_day: bool) -> "cloud_cover", ] - datasets = create_concat_datasets(len(vars_to_drop), 123, include_day=include_day) + datasets = create_concat_datasets( + len(vars_to_drop), seed=123, include_day=include_day + ) expected = concat(datasets, dim="day") for i, name in enumerate(vars_to_drop): @@ -235,36 +244,8 @@ def test_concat_multiple_datasets_missing_vars_and_new_dim(include_day: bool) -> expected[name][i : i + 1, ...] 
= np.nan # set up the test data - datasets = [datasets[i].drop_vars(vars_to_drop[i]) for i in range(len(datasets))] - - actual = concat(datasets, dim="day") - - assert list(actual.data_vars.keys()) == [ - "pressure", - "humidity", - "precipitation", - "cloud_cover", - "temperature", - ] - assert_identical(actual, expected) + datasets = [ds.drop_vars(varname) for ds, varname in zip(datasets, vars_to_drop)] - -def test_multiple_datasets_with_missing_variables() -> None: - vars_to_drop = [ - "temperature", - "pressure", - "humidity", - "precipitation", - "cloud_cover", - ] - datasets = create_concat_datasets(len(vars_to_drop), 123) - - expected = concat(datasets, dim="day") - for i, name in enumerate(vars_to_drop): - expected[name][..., i * 2 : (i + 1) * 2] = np.nan - - # set up the test data - datasets = [datasets[i].drop_vars(vars_to_drop[i]) for i in range(len(datasets))] actual = concat(datasets, dim="day") assert list(actual.data_vars.keys()) == [ @@ -277,10 +258,10 @@ def test_multiple_datasets_with_missing_variables() -> None: assert_identical(actual, expected) -def test_multiple_datasets_with_multiple_missing_variables() -> None: +def test_concat_multiple_datasets_with_multiple_missing_variables() -> None: vars_to_drop_in_first = ["temperature", "pressure"] vars_to_drop_in_second = ["humidity", "precipitation", "cloud_cover"] - datasets = create_concat_datasets(2, 123) + datasets = create_concat_datasets(2, seed=123) expected = concat(datasets, dim="day") for name in vars_to_drop_in_first: expected[name][..., :2] = np.nan @@ -303,8 +284,8 @@ def test_multiple_datasets_with_multiple_missing_variables() -> None: assert_identical(actual, expected) -def test_type_of_missing_fill() -> None: - datasets = create_typed_datasets(2, 123) +def test_concat_type_of_missing_fill() -> None: + datasets = create_typed_datasets(2, seed=123) expected1 = concat(datasets, dim="day", fill_value=dtypes.NA) expected2 = concat(datasets[::-1], dim="day", fill_value=dtypes.NA) vars = ["float", "float2", "string", "int", "datetime64", "timedelta64"] @@ -334,11 +315,11 @@ def test_type_of_missing_fill() -> None: assert_identical(actual, expected[0]) -def test_order_when_filling_missing() -> None: +def test_concat_order_when_filling_missing() -> None: vars_to_drop_in_first: list[str] = [] # drop middle vars_to_drop_in_second = ["humidity"] - datasets = create_concat_datasets(2, 123) + datasets = create_concat_datasets(2, seed=123) expected1 = concat(datasets, dim="day") for name in vars_to_drop_in_second: expected1[name][..., 2:] = np.nan @@ -423,11 +404,9 @@ def create_ds( def test_concat_fill_missing_variables( concat_var_names, create_concat_ds, dim: bool, coord: bool ) -> None: - # random single variables missing in each dataset var_names = concat_var_names() + drop_idx = [0, 7, 6, 4, 4, 8, 0, 6, 2, 0] - rng = np.random.default_rng(seed=42) - drop_idx = [rng.integers(len(vlist)) for vlist in var_names] expected = concat( create_concat_ds(var_names, dim=dim, coord=coord), dim="time", data_vars="all" ) From 6cb163eda58962b34230a41de1f1a66b82c11de5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Thu, 19 Jan 2023 15:05:52 +0100 Subject: [PATCH 36/39] fix whats-new.rst --- doc/whats-new.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 25d803ec5a1..b0f6a07841b 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -35,6 +35,9 @@ Deprecations Bug fixes ~~~~~~~~~ +- :py:func:`xarray.concat` can now concatenate variables 
present in some datasets but
+  not others (:issue:`508`, :pull:`7400`).
+  By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_ and `Scott Chamberlin <https://github.com/scottcha>`_.
 
 Documentation
 ~~~~~~~~~~~~~
@@ -75,9 +78,6 @@ Bug fixes
   By `Benoît Bovy <https://github.com/benbovy>`_.
 - Preserve original dtype on accessing MultiIndex levels (:issue:`7250`, :pull:`7393`).
   By `Ian Carroll <https://github.com/itcarroll>`_.
-- :py:func:`xarray.concat` can now concatenate variables present in some datasets but
-  not others (:issue:`508`, :pull:`7400`).
-  By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_ and `Scott Chamberlin <https://github.com/scottcha>`_.
 
 Internal Changes
 ~~~~~~~~~~~~~~~~

From 070c0fbca77a30846cd024443e93ffdb1ace6eb9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kai=20M=C3=BChlbauer?=
Date: Mon, 9 Jan 2023 13:14:47 +0100
Subject: [PATCH 37/39] add whats-new.rst entry

---
 doc/whats-new.rst | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index b0f6a07841b..6c949a2367e 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -78,6 +78,9 @@ Bug fixes
   By `Benoît Bovy <https://github.com/benbovy>`_.
 - Preserve original dtype on accessing MultiIndex levels (:issue:`7250`, :pull:`7393`).
   By `Ian Carroll <https://github.com/itcarroll>`_.
+- Make :py:func:`xarray.concat` more robust when concatenating variables present in some datasets but
+  not others (:issue:`508`, :pull:`7400`).
+  By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_ and `Scott Chamberlin <https://github.com/scottcha>`_.
 
 Internal Changes
 ~~~~~~~~~~~~~~~~

From de608904199a800d986d1c9638c2cac86dee1d23 Mon Sep 17 00:00:00 2001
From: dcherian
Date: Thu, 19 Jan 2023 10:00:46 -0700
Subject: [PATCH 38/39] Add additional test with scalar data_var

---
 xarray/tests/test_concat.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py
index d3d8eeeea4c..eb3cd7fccc8 100644
--- a/xarray/tests/test_concat.py
+++ b/xarray/tests/test_concat.py
@@ -188,14 +188,27 @@ def test_concat_second_empty() -> None:
     expected = Dataset(data_vars={"a": ("y", [0.1, np.nan])}, coords={"x": 0.1})
     actual = concat([ds1, ds2], dim="y")
-
     assert_identical(actual, expected)
 
     expected = Dataset(
         data_vars={"a": ("y", [0.1, np.nan])}, coords={"x": ("y", [0.1, 0.1])}
     )
     actual = concat([ds1, ds2], dim="y", coords="all")
+    assert_identical(actual, expected)
+
+    # Check concatenating scalar data_var only present in ds1
+    ds1["b"] = 0.1
+    expected = Dataset(
+        data_vars={"a": ("y", [0.1, np.nan]), "b": ("y", [0.1, np.nan])},
+        coords={"x": ("y", [0.1, 0.1])},
+    )
+    actual = concat([ds1, ds2], dim="y", coords="all", data_vars="all")
+    assert_identical(actual, expected)
 
+    expected = Dataset(
+        data_vars={"a": ("y", [0.1, np.nan]), "b": 0.1}, coords={"x": 0.1}
+    )
+    actual = concat([ds1, ds2], dim="y", coords="different", data_vars="different")
     assert_identical(actual, expected)

From b14c8f68a72d6e4bffef4416d554af569e613275 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kai=20M=C3=BChlbauer?=
Date: Fri, 20 Jan 2023 08:02:41 +0100
Subject: [PATCH 39/39] remove erroneous content from whats-new.rst

---
 doc/whats-new.rst | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 6c949a2367e..b0f6a07841b 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -78,9 +78,6 @@ Bug fixes
   By `Benoît Bovy <https://github.com/benbovy>`_.
 - Preserve original dtype on accessing MultiIndex levels (:issue:`7250`, :pull:`7393`).
   By `Ian Carroll <https://github.com/itcarroll>`_.
-- Make :py:func:`xarray.concat` more robust when concatenating variables present in some datasets but
-  not others (:issue:`508`, :pull:`7400`).
-  By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_ and `Scott Chamberlin <https://github.com/scottcha>`_.
 
 Internal Changes
 ~~~~~~~~~~~~~~~~
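
For orientation, a minimal sketch of the behavior this patch series enables. The data below is hypothetical (it is not taken from the test suite); with the patches applied, ``concat`` reindexes variables that are missing from some of the input datasets instead of raising::

    import numpy as np
    import xarray as xr

    ds1 = xr.Dataset(
        {"temperature": ("day", [10.0, 11.0]), "humidity": ("day", [0.5, 0.6])}
    )
    ds2 = xr.Dataset({"temperature": ("day", [12.0, 13.0])})  # no "humidity"

    # Before this series: ValueError, since "humidity" is not present in all
    # datasets. Now the missing variable is reindexed onto the concatenation
    # dimension and padded with the fill value.
    combined = xr.concat([ds1, ds2], dim="day", data_vars="all", fill_value=np.nan)
    print(combined["humidity"].values)  # [0.5 0.6 nan nan]

As the reordered ``assert list(actual.data_vars.keys())`` checks above verify, the result keeps the variable order of the first dataset, with variables missing from it appended at the end.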