Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Combining attrs of member DataArrays of Datasets #4017

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion xarray/backends/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -951,12 +951,18 @@ def open_mfdataset(
coords=coords,
ids=ids,
join=join,
combine_attrs="drop",
)
elif combine == "by_coords":
# Redo ordering from coordinates, ignoring how they were ordered
# previously
combined = combine_by_coords(
datasets, compat=compat, data_vars=data_vars, coords=coords, join=join
datasets,
compat=compat,
data_vars=data_vars,
coords=coords,
join=join,
combine_attrs="drop",
)
else:
raise ValueError(
Expand Down
30 changes: 30 additions & 0 deletions xarray/tests/test_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -2652,6 +2652,36 @@ def test_open_mfdataset_does_same_as_concat(self, combine, opt, join):
ds_expect = xr.concat([ds1, ds2], data_vars=opt, dim="t", join=join)
assert_identical(ds, ds_expect)

def test_open_mfdataset_dataset_attr_by_coords(self):
    """Combine files by coords when a dataset-level attribute disagrees.

    Every file is rewritten in place with ``test_dataset_attr`` set to
    10, 11, ... in file order; the combined dataset is then expected to
    expose the value 10.
    """
    with self.setup_files_and_datasets() as (paths, [_first, _second]):
        # Rewrite each file so that it carries a different attribute value.
        for attr_value, path in enumerate(paths, start=10):
            loaded = open_dataset(path).load()
            loaded.attrs["test_dataset_attr"] = attr_value
            # Close before writing back to the same path.
            loaded.close()
            loaded.to_netcdf(path)

        with xr.open_mfdataset(
            paths, combine="by_coords", concat_dim="t"
        ) as combined:
            assert combined.test_dataset_attr == 10

def test_open_mfdataset_dataarray_attr_by_coords(self):
    """Combine files by coords when a member DataArray attribute disagrees.

    Every file is rewritten in place with ``test_dataarray_attr`` on the
    ``v1`` variable set to 0, 1, ... in file order; the combined ``v1``
    is then expected to expose the value 0.
    """
    with self.setup_files_and_datasets() as (paths, [_first, _second]):
        # Rewrite each file so that ``v1`` carries a different attribute value.
        for attr_value, path in enumerate(paths):
            loaded = open_dataset(path).load()
            loaded["v1"].attrs["test_dataarray_attr"] = attr_value
            # Close before writing back to the same path.
            loaded.close()
            loaded.to_netcdf(path)

        with xr.open_mfdataset(
            paths, combine="by_coords", concat_dim="t"
        ) as combined:
            assert combined["v1"].test_dataarray_attr == 0
Comment on lines +2670 to +2683
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@TomNicholas do you think we should make this test pass (i.e. support combining attrs on DataArrays contained in the Dataset being loaded)?

Proposal for a fix:

  • Support open_mfdataset-style attribute combining by extending the combine_attrs argument of combine_nested/combine_by_coords (and probably of the other functions that accept combine_attrs) so that it can also be given one of the Datasets or DataArrays being combined/merged, in which case the result takes its attrs from that object.
  • In open_mfdataset use the attrs_file argument to get the Dataset to take the attrs from, and pass that to the combine_attrs argument of combine_nested/combine_by_coords.


@pytest.mark.parametrize("combine", ["nested", "by_coords"])
@pytest.mark.parametrize("opt", ["all", "minimal", "different"])
def test_open_mfdataset_exact_join_raises_error(self, combine, opt):
Expand Down
29 changes: 29 additions & 0 deletions xarray/tests/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,35 @@ def test_concat_combine_attrs_kwarg(self):
actual = concat([ds1, ds2], dim="x", combine_attrs=combine_attrs)
assert_identical(actual, expected[combine_attrs])

def test_concat_combine_attrs_kwarg_member_attrs(self):
    """Check how ``concat`` combines the attrs of a member variable.

    Two single-element datasets are concatenated along ``x``. Their ``a``
    variables carry attrs ``{"b": 42}`` and ``{"b": 42, "c": 43}``. For each
    supported ``combine_attrs`` mode the attrs expected on the combined
    ``a`` are checked, and the modes that must reject the inputs are
    checked to raise ``ValueError``.
    """
    ds1 = Dataset({"a": ("x", [0])}, coords={"x": [0]})
    ds1["a"].attrs = {"b": 42}

    ds2 = Dataset({"a": ("x", [0])}, coords={"x": [1]})
    ds2["a"].attrs = {"b": 42, "c": 43}

    # Same as ds2 but with a value for "b" that conflicts with ds1.
    # Built outside the raising context so that an unexpected ValueError
    # during setup cannot be mistaken for the error under test.
    ds3 = ds2.copy(deep=True)
    ds3["a"].attrs["b"] = 44

    # Only the call expected to raise lives inside each context.
    with raises_regex(ValueError, "combine_attrs='identical'"):
        concat([ds1, ds2], dim="x", combine_attrs="identical")
    with raises_regex(ValueError, "combine_attrs='no_conflicts'"):
        concat([ds1, ds3], dim="x", combine_attrs="no_conflicts")

    # Attrs expected on the combined "a" variable for each mode.
    expected_member_attrs = {
        "drop": {},
        "no_conflicts": {"b": 42, "c": 43},
        "override": {"b": 42},
    }

    for combine_attrs, member_attrs in expected_member_attrs.items():
        expected = Dataset({"a": ("x", [0, 0])}, {"x": [0, 1]})
        expected["a"].attrs = member_attrs

        actual = concat([ds1, ds2], dim="x", combine_attrs=combine_attrs)
        assert_identical(actual, expected)
        assert_identical(actual["a"], expected["a"])

def test_concat_promote_shape(self):
# mixed dims within variables
objs = [Dataset({}, {"x": 0}), Dataset({"x": [1]})]
Expand Down