From 702d57176eab022d98a963b66b4fb47fd99c9b51 Mon Sep 17 00:00:00 2001 From: Julien Seguinot Date: Fri, 8 Nov 2019 16:48:49 +0100 Subject: [PATCH 01/12] Add 'master_file' kwarg in open_mfdataset. --- xarray/backends/api.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index d23594fc675..e2603a68666 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -718,6 +718,7 @@ def open_mfdataset( autoclose=None, parallel=False, join="outer", + master_file=0, **kwargs, ): """Open multiple files as a single dataset. @@ -825,6 +826,8 @@ def open_mfdataset( - 'override': if indexes are of same size, rewrite indexes to be those of the first object with that dimension. Indexes for the same dimension must have the same size in all objects. + master_file : int, optional + Index of the file used to read global attributes from. **kwargs : optional Additional arguments passed on to :py:func:`xarray.open_dataset`. @@ -959,7 +962,7 @@ def open_mfdataset( raise combined._file_obj = _MultiFileCloser(file_objs) - combined.attrs = datasets[0].attrs + combined.attrs = datasets[master_file].attrs # FIXME allow path return combined From f83a634c32567992088d924409f87289b197c6ba Mon Sep 17 00:00:00 2001 From: Julien Seguinot Date: Fri, 8 Nov 2019 17:02:58 +0100 Subject: [PATCH 02/12] Allow master_file to be a path or an index. --- xarray/backends/api.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index e2603a68666..367b766d6ee 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -730,8 +730,8 @@ def open_mfdataset( ``combine_by_coords`` and ``combine_nested``. By default the old (now deprecated) ``auto_combine`` will be used, please specify either ``combine='by_coords'`` or ``combine='nested'`` in future. Requires dask to be installed. See documentation for - details on dask [1]. 
Attributes from the first dataset file are used for the - combined dataset. + details on dask [1]. Global attributes from the ``master_file`` are used + for the combined dataset. Parameters ---------- @@ -826,8 +826,10 @@ def open_mfdataset( - 'override': if indexes are of same size, rewrite indexes to be those of the first object with that dimension. Indexes for the same dimension must have the same size in all objects. - master_file : int, optional - Index of the file used to read global attributes from. + master_file : int or str, optional + Index or path of the file used to read global attributes from. + For instance use -1 to read history from the last file. + Note that wildcard matches are sorted by filename. **kwargs : optional Additional arguments passed on to :py:func:`xarray.open_dataset`. @@ -962,7 +964,12 @@ def open_mfdataset( raise combined._file_obj = _MultiFileCloser(file_objs) - combined.attrs = datasets[master_file].attrs # FIXME allow path + + # read global attributes from the master file path or index + if isinstance(master_file, str): + master_file = paths.index(master_file) + combined.attrs = datasets[master_file].attrs + return combined From 6b7d52b586bea8674c619869a4dfadbabd998957 Mon Sep 17 00:00:00 2001 From: Julien Seguinot Date: Mon, 11 Nov 2019 11:15:39 +0100 Subject: [PATCH 03/12] Add open_mfdataset master_file kwarg tests. 
--- xarray/tests/test_backends.py | 36 +++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index de3a7eadab0..65f02df36e8 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2799,6 +2799,42 @@ def test_attrs_mfdataset(self): with raises_regex(AttributeError, "no attribute"): actual.test2 + def test_open_mfdataset_master_file_index(self): + original = Dataset({"foo": ("x", np.random.randn(10))}) + with create_tmp_files(2) as (tmp1, tmp2): + ds1 = original.isel(x=slice(5)) + ds2 = original.isel(x=slice(5, 10)) + ds1.attrs["test1"] = "foo" + ds2.attrs["test2"] = "bar" + ds1.to_netcdf(tmp1) + ds2.to_netcdf(tmp2) + with open_mfdataset( + [tmp1, tmp2], concat_dim="x", combine="nested", master_file=-1 + ) as actual: + # attributes are inherited from the master file + assert actual.test2 == ds2.test2 + # attributes from ds1 are not retained, e.g., + with raises_regex(AttributeError, "no attribute"): + actual.test1 + + def test_open_mfdataset_master_file_path(self): + original = Dataset({"foo": ("x", np.random.randn(10))}) + with create_tmp_files(2) as (tmp1, tmp2): + ds1 = original.isel(x=slice(5)) + ds2 = original.isel(x=slice(5, 10)) + ds1.attrs["test1"] = "foo" + ds2.attrs["test2"] = "bar" + ds1.to_netcdf(tmp1) + ds2.to_netcdf(tmp2) + with open_mfdataset( + [tmp1, tmp2], concat_dim="x", combine="nested", master_file=tmp2 + ) as actual: + # attributes are inherited from the master file + assert actual.test2 == ds2.test2 + # attributes from ds1 are not retained, e.g., + with raises_regex(AttributeError, "no attribute"): + actual.test1 + def test_open_mfdataset_auto_combine(self): original = Dataset({"foo": ("x", np.random.randn(10)), "x": np.arange(10)}) with create_tmp_file() as tmp1: From a02ff9aba71445ae149f8b8b5c914e921dd5a58e Mon Sep 17 00:00:00 2001 From: Julien Seguinot Date: Mon, 11 Nov 2019 11:17:43 +0100 Subject: [PATCH 04/12] Document 
master_file kwarg in whats-new.rst. --- doc/whats-new.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 04fe88e9993..38b2609c91f 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -69,6 +69,9 @@ New Features invoked. (:issue:`3378`, :pull:`3446`) By `Deepak Cherian `_ and `Guido Imperiale `_. +- Add `master_file` option in :py:func:`~xarray.open_mfdataset` to choose the + source file for global attributes in a multi-file dataset (:issue:`2382`, + :pull:`3498`) by `Julien Seguinot _`. Bug fixes ~~~~~~~~~ From 8cc3b349149a5b54d7b491b1fc0aee5dd6719640 Mon Sep 17 00:00:00 2001 From: Julien Seguinot Date: Wed, 13 Nov 2019 13:49:01 +0100 Subject: [PATCH 05/12] Add dcherian suggestions in master_file tests. Co-Authored-By: Deepak Cherian --- xarray/tests/test_backends.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 65f02df36e8..dac0fa0047c 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2812,10 +2812,9 @@ def test_open_mfdataset_master_file_index(self): [tmp1, tmp2], concat_dim="x", combine="nested", master_file=-1 ) as actual: # attributes are inherited from the master file - assert actual.test2 == ds2.test2 + assert actual.attrs["test2"] == ds2.attrs["test2"] # attributes from ds1 are not retained, e.g., - with raises_regex(AttributeError, "no attribute"): - actual.test1 + assert "test1" not in actual.attrs def test_open_mfdataset_master_file_path(self): original = Dataset({"foo": ("x", np.random.randn(10))}) @@ -2830,10 +2829,9 @@ def test_open_mfdataset_master_file_path(self): [tmp1, tmp2], concat_dim="x", combine="nested", master_file=tmp2 ) as actual: # attributes are inherited from the master file - assert actual.test2 == ds2.test2 + assert actual.attrs["test2"] == ds2.attrs["test2"] # attributes from ds1 are not retained, e.g., - with raises_regex(AttributeError, "no 
attribute"): - actual.test1 + assert "test1" not in actual.attrs def test_open_mfdataset_auto_combine(self): original = Dataset({"foo": ("x", np.random.randn(10)), "x": np.arange(10)}) From 64f9785ef93dc302032e2b3792cc66e5ce015a58 Mon Sep 17 00:00:00 2001 From: Julien Seguinot Date: Thu, 14 Nov 2019 18:38:57 +0100 Subject: [PATCH 06/12] Rename master_file kwarg to attrs_file. Unlike netCDF4's master_file this is only used for attributes. --- doc/whats-new.rst | 2 +- xarray/backends/api.py | 12 ++++++------ xarray/tests/test_backends.py | 8 ++++---- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 38b2609c91f..db886eeac22 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -69,7 +69,7 @@ New Features invoked. (:issue:`3378`, :pull:`3446`) By `Deepak Cherian `_ and `Guido Imperiale `_. -- Add `master_file` option in :py:func:`~xarray.open_mfdataset` to choose the +- Add `attrs_file` option in :py:func:`~xarray.open_mfdataset` to choose the source file for global attributes in a multi-file dataset (:issue:`2382`, :pull:`3498`) by `Julien Seguinot _`. diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 367b766d6ee..10c175d9c63 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -718,7 +718,7 @@ def open_mfdataset( autoclose=None, parallel=False, join="outer", - master_file=0, + attrs_file=0, **kwargs, ): """Open multiple files as a single dataset. @@ -730,7 +730,7 @@ def open_mfdataset( ``combine_by_coords`` and ``combine_nested``. By default the old (now deprecated) ``auto_combine`` will be used, please specify either ``combine='by_coords'`` or ``combine='nested'`` in future. Requires dask to be installed. See documentation for - details on dask [1]. Global attributes from the ``master_file`` are used + details on dask [1]. Global attributes from the ``attrs_file`` are used for the combined dataset. 
Parameters @@ -826,7 +826,7 @@ def open_mfdataset( - 'override': if indexes are of same size, rewrite indexes to be those of the first object with that dimension. Indexes for the same dimension must have the same size in all objects. - master_file : int or str, optional + attrs_file : int or str, optional Index or path of the file used to read global attributes from. For instance use -1 to read history from the last file. Note that wildcard matches are sorted by filename. @@ -966,9 +966,9 @@ def open_mfdataset( combined._file_obj = _MultiFileCloser(file_objs) # read global attributes from the master file path or index - if isinstance(master_file, str): - master_file = paths.index(master_file) - combined.attrs = datasets[master_file].attrs + if isinstance(attrs_file, str): + attrs_file = paths.index(attrs_file) + combined.attrs = datasets[attrs_file].attrs return combined diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index dac0fa0047c..908f71bcfd4 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2799,7 +2799,7 @@ def test_attrs_mfdataset(self): with raises_regex(AttributeError, "no attribute"): actual.test2 - def test_open_mfdataset_master_file_index(self): + def test_open_mfdataset_attrs_file_index(self): original = Dataset({"foo": ("x", np.random.randn(10))}) with create_tmp_files(2) as (tmp1, tmp2): ds1 = original.isel(x=slice(5)) @@ -2809,14 +2809,14 @@ def test_open_mfdataset_master_file_index(self): ds1.to_netcdf(tmp1) ds2.to_netcdf(tmp2) with open_mfdataset( - [tmp1, tmp2], concat_dim="x", combine="nested", master_file=-1 + [tmp1, tmp2], concat_dim="x", combine="nested", attrs_file=-1 ) as actual: # attributes are inherited from the master file assert actual.attrs["test2"] == ds2.attrs["test2"] # attributes from ds1 are not retained, e.g., assert "test1" not in actual.attrs - def test_open_mfdataset_master_file_path(self): + def test_open_mfdataset_attrs_file_path(self): original = Dataset({"foo": 
("x", np.random.randn(10))}) with create_tmp_files(2) as (tmp1, tmp2): ds1 = original.isel(x=slice(5)) @@ -2826,7 +2826,7 @@ def test_open_mfdataset_master_file_path(self): ds1.to_netcdf(tmp1) ds2.to_netcdf(tmp2) with open_mfdataset( - [tmp1, tmp2], concat_dim="x", combine="nested", master_file=tmp2 + [tmp1, tmp2], concat_dim="x", combine="nested", attrs_file=tmp2 ) as actual: # attributes are inherited from the master file assert actual.attrs["test2"] == ds2.attrs["test2"] From fb1d91fdbaf2652eebc3a36dbfd41ca335c276f6 Mon Sep 17 00:00:00 2001 From: Julien Seguinot Date: Sat, 14 Dec 2019 17:08:33 +0100 Subject: [PATCH 07/12] Require master_file as a path, forbid indexes. Index behaviour is ambiguous for nested lists on older Python versions. The default remains index 0, which is backward-compatible but also ambiguous in this case (see docstring and pull request #3498). --- xarray/backends/api.py | 23 ++++++++++++++--------- xarray/tests/test_backends.py | 19 +------------------ 2 files changed, 15 insertions(+), 27 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 10c175d9c63..c14c78a9890 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -718,7 +718,7 @@ def open_mfdataset( autoclose=None, parallel=False, join="outer", - attrs_file=0, + attrs_file=None, **kwargs, ): """Open multiple files as a single dataset. @@ -826,10 +826,14 @@ def open_mfdataset( - 'override': if indexes are of same size, rewrite indexes to be those of the first object with that dimension. Indexes for the same dimension must have the same size in all objects. - attrs_file : int or str, optional - Index or path of the file used to read global attributes from. - For instance use -1 to read history from the last file. - Note that wildcard matches are sorted by filename. + attrs_file : str, optional + Path of the file used to read global attributes from. + By default global attributes are read from the first file provided. 
+ Wildcard matches are sorted by filename. Nested lists are flattened + using a dictionary. For Python 3.6 and higher, global attributes are + read from the first file of the first list (of the first list etc.). + For Python 3.5 or lower, global attributes are read from an + arbitrary file. **kwargs : optional Additional arguments passed on to :py:func:`xarray.open_dataset`. @@ -965,10 +969,11 @@ def open_mfdataset( combined._file_obj = _MultiFileCloser(file_objs) - # read global attributes from the master file path or index - if isinstance(attrs_file, str): - attrs_file = paths.index(attrs_file) - combined.attrs = datasets[attrs_file].attrs + # read global attributes from the attrs_file or from the first dataset + if attrs_file is not None: + combined.attrs = datasets[paths.index(attrs_file)].attrs + else: + combined.attrs = datasets[0].attrs return combined diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 908f71bcfd4..da83069268e 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2799,24 +2799,7 @@ def test_attrs_mfdataset(self): with raises_regex(AttributeError, "no attribute"): actual.test2 - def test_open_mfdataset_attrs_file_index(self): - original = Dataset({"foo": ("x", np.random.randn(10))}) - with create_tmp_files(2) as (tmp1, tmp2): - ds1 = original.isel(x=slice(5)) - ds2 = original.isel(x=slice(5, 10)) - ds1.attrs["test1"] = "foo" - ds2.attrs["test2"] = "bar" - ds1.to_netcdf(tmp1) - ds2.to_netcdf(tmp2) - with open_mfdataset( - [tmp1, tmp2], concat_dim="x", combine="nested", attrs_file=-1 - ) as actual: - # attributes are inherited from the master file - assert actual.attrs["test2"] == ds2.attrs["test2"] - # attributes from ds1 are not retained, e.g., - assert "test1" not in actual.attrs - - def test_open_mfdataset_attrs_file_path(self): + def test_open_mfdataset_attrs_file(self): original = Dataset({"foo": ("x", np.random.randn(10))}) with create_tmp_files(2) as (tmp1, tmp2): ds1 = 
original.isel(x=slice(5)) From 8a44831f54bb07152ac69fe0d124b3888a0753ed Mon Sep 17 00:00:00 2001 From: Julien Seguinot Date: Sat, 14 Dec 2019 17:27:01 +0100 Subject: [PATCH 08/12] Fix wrongly resolved conflict in whats-new.rst. --- doc/whats-new.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 89ebde7a6cd..bd7f1f2581e 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -31,6 +31,9 @@ New Features - Added the :py:meth:`count` reduction method to both :py:class:`DatasetCoarsen` and :py:class:`DataArrayCoarsen` objects. (:pull:`3500`) By `Deepak Cherian `_ +- Add `attrs_file` option in :py:func:`~xarray.open_mfdataset` to choose the + source file for global attributes in a multi-file dataset (:issue:`2382`, + :pull:`3498`) by `Julien Seguinot _`. Bug fixes ~~~~~~~~~ @@ -153,9 +156,6 @@ New Features By `Deepak Cherian `_. - Add the documented-but-missing :py:meth:`DatasetGroupBy.quantile`. (:issue:`3525`, :pull:`3527`). By `Justus Magin `_. -- Add `attrs_file` option in :py:func:`~xarray.open_mfdataset` to choose the - source file for global attributes in a multi-file dataset (:issue:`2382`, - :pull:`3498`) by `Julien Seguinot _`. Bug fixes ~~~~~~~~~ From aa62e18f79ec44ec640ab9f83edb7201fdfea217 Mon Sep 17 00:00:00 2001 From: Julien Seguinot Date: Mon, 30 Dec 2019 16:25:13 +0100 Subject: [PATCH 09/12] Simplify docstring on attrs_file. --- xarray/backends/api.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 3839b50d726..dc7d5dd8efd 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -830,12 +830,8 @@ def open_mfdataset( dimension must have the same size in all objects. attrs_file : str, optional Path of the file used to read global attributes from. - By default global attributes are read from the first file provided. - Wildcard matches are sorted by filename. 
Nested lists are flattened - using a dictionary. For Python 3.6 and higher, global attributes are - read from the first file of the first list (of the first list etc.). - For Python 3.5 or lower, global attributes are read from an - arbitrary file. + By default global attributes are read from the first file provided, + with wildcard matches sorted by filename. **kwargs : optional Additional arguments passed on to :py:func:`xarray.open_dataset`. From 1347ab5826f126a2a1552c133d7237328c2a2d70 Mon Sep 17 00:00:00 2001 From: Julien Seguinot Date: Sat, 11 Jan 2020 15:53:11 +0100 Subject: [PATCH 10/12] Allow attrs_file to be a pathlib.Path. --- xarray/backends/api.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index dc7d5dd8efd..7acb597cf25 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -969,6 +969,8 @@ def open_mfdataset( # read global attributes from the attrs_file or from the first dataset if attrs_file is not None: + if isinstance(attrs_file, Path): + attrs_file = str(attrs_file) combined.attrs = datasets[paths.index(attrs_file)].attrs else: combined.attrs = datasets[0].attrs From aa72572f481c343031275b8677aad4490e5b8a5b Mon Sep 17 00:00:00 2001 From: Julien Seguinot Date: Sat, 11 Jan 2020 15:54:06 +0100 Subject: [PATCH 11/12] Add a test for attrs_file using pathlib.Path. 
--- xarray/tests/test_backends.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index e3919687b3f..4fccdf2dd6c 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2849,6 +2849,25 @@ def test_open_mfdataset_attrs_file(self): # attributes from ds1 are not retained, e.g., assert "test1" not in actual.attrs + def test_open_mfdataset_attrs_file_path(self): + original = Dataset({"foo": ("x", np.random.randn(10))}) + with create_tmp_files(2) as (tmp1, tmp2): + tmp1 = Path(tmp1) + tmp2 = Path(tmp2) + ds1 = original.isel(x=slice(5)) + ds2 = original.isel(x=slice(5, 10)) + ds1.attrs["test1"] = "foo" + ds2.attrs["test2"] = "bar" + ds1.to_netcdf(tmp1) + ds2.to_netcdf(tmp2) + with open_mfdataset( + [tmp1, tmp2], concat_dim="x", combine="nested", attrs_file=tmp2 + ) as actual: + # attributes are inherited from the master file + assert actual.attrs["test2"] == ds2.attrs["test2"] + # attributes from ds1 are not retained, e.g., + assert "test1" not in actual.attrs + def test_open_mfdataset_auto_combine(self): original = Dataset({"foo": ("x", np.random.randn(10)), "x": np.arange(10)}) with create_tmp_file() as tmp1: From 175de6cb191872bb2ec10368275afcf1e5fd99e1 Mon Sep 17 00:00:00 2001 From: Julien Seguinot Date: Sat, 11 Jan 2020 15:52:00 +0100 Subject: [PATCH 12/12] Document that attrs_file can now be a pathlib.Path. Co-Authored-By: keewis --- xarray/backends/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 7acb597cf25..eea1fb9ddce 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -828,7 +828,7 @@ def open_mfdataset( - 'override': if indexes are of same size, rewrite indexes to be those of the first object with that dimension. Indexes for the same dimension must have the same size in all objects. 
- attrs_file : str, optional + attrs_file : str or pathlib.Path, optional Path of the file used to read global attributes from. By default global attributes are read from the first file provided, with wildcard matches sorted by filename.