From 69eb9781c787454bb56c5a813733d6faabcbc803 Mon Sep 17 00:00:00 2001 From: Illviljan <14371165+Illviljan@users.noreply.github.com> Date: Fri, 13 Aug 2021 11:49:40 +0200 Subject: [PATCH 01/15] Allow in memory arrays --- xarray/backends/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 9b4fa8fce5a..607167cd693 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -896,7 +896,7 @@ def open_mfdataset( DeprecationWarning, ) - open_kwargs = dict(engine=engine, chunks=chunks or {}, **kwargs) + open_kwargs = dict(engine=engine, chunks=chunks, **kwargs) if parallel: import dask From d2a474e95eab8a9b661b06edfffc1b7e34b8d7de Mon Sep 17 00:00:00 2001 From: Illviljan <14371165+Illviljan@users.noreply.github.com> Date: Fri, 13 Aug 2021 21:49:46 +0200 Subject: [PATCH 02/15] fix tests --- xarray/tests/test_backends.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 3bbc2c93b31..40e3c7e49df 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -3012,7 +3012,9 @@ def test_open_mfdataset_manyfiles( ) as actual: # check that using open_mfdataset returns dask arrays for variables - assert isinstance(actual["foo"].data, dask_array_type) + # when a chunks parameter has been defined: + if chunks is not None: + assert isinstance(actual["foo"].data, dask_array_type) assert_identical(original, actual) @@ -3321,7 +3323,7 @@ def test_open_mfdataset(self): original.isel(x=slice(5)).to_netcdf(tmp1) original.isel(x=slice(5, 10)).to_netcdf(tmp2) with open_mfdataset( - [tmp1, tmp2], concat_dim="x", combine="nested" + [tmp1, tmp2], concat_dim="x", combine="nested", chunks={}, ) as actual: assert isinstance(actual.foo.variable.data, da.Array) assert actual.foo.variable.data.chunks == ((5, 5),) @@ -3669,9 +3671,9 @@ def test_deterministic_names(self): with create_tmp_file() as tmp: data = create_test_data() data.to_netcdf(tmp) - with open_mfdataset(tmp, combine="by_coords") as ds: + with open_mfdataset(tmp, combine="by_coords", chunks={}) as ds: original_names = {k: v.data.name for k, v in ds.data_vars.items()} - with open_mfdataset(tmp, combine="by_coords") as ds: + with open_mfdataset(tmp, combine="by_coords", chunks={}) as ds: repeat_names = {k: v.data.name for k, v in ds.data_vars.items()} for var_name, dask_name in original_names.items(): assert var_name in dask_name From 9e3acc36630ae857173846a89eae87ab58b08594 Mon Sep 17 00:00:00 2001 From: Illviljan <14371165+Illviljan@users.noreply.github.com> Date: Fri, 13 Aug 2021 22:09:07 +0200 Subject: [PATCH 03/15] Update test_backends.py --- xarray/tests/test_backends.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 40e3c7e49df..dc7ac4fae96 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -3323,7 +3323,7 @@ def test_open_mfdataset(self): original.isel(x=slice(5)).to_netcdf(tmp1) original.isel(x=slice(5, 10)).to_netcdf(tmp2) with open_mfdataset( - [tmp1, tmp2], concat_dim="x", combine="nested", chunks={}, + [tmp1, tmp2], concat_dim="x", combine="nested", chunks={} ) as actual: assert isinstance(actual.foo.variable.data, da.Array) assert actual.foo.variable.data.chunks == ((5, 5),) @@ -3360,6 +3360,7 @@ def test_open_mfdataset_2d(self): [[tmp1, tmp2], [tmp3, tmp4]], combine="nested", concat_dim=["y", "x"], + chunks={}, ) as actual: assert isinstance(actual.foo.variable.data, da.Array) assert actual.foo.variable.data.chunks == ((5, 5), (4, 4)) From 31a6ac0b23c476b510953631dd40cb0321eaa801 Mon Sep 17 00:00:00 2001 From: Illviljan <14371165+Illviljan@users.noreply.github.com> Date: Fri, 13 Aug 2021 22:45:07 +0200 Subject: [PATCH 04/15] make sure ndarrays are returned when None --- xarray/tests/test_backends.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index dc7ac4fae96..e8bc67460b3 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -3013,8 +3013,8 @@ def test_open_mfdataset_manyfiles( # check that using open_mfdataset returns dask arrays for variables # when a chunks parameter has been defined: - if chunks is not None: - assert isinstance(actual["foo"].data, dask_array_type) + array_type = np.ndarray if chunks is None else dask_array_type + assert isinstance(actual["foo"].data, array_type) assert_identical(original, actual) From f3234c4614c3f9b70ae4536f899588e35e1ee61c Mon Sep 17 00:00:00 2001 From: Illviljan <14371165+Illviljan@users.noreply.github.com> Date: Sat, 14 Aug 2021 01:02:14 +0200 Subject: [PATCH 05/15] update chunks with all conditions --- xarray/tests/test_backends.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index e8bc67460b3..f6e8b0a1308 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -3002,13 +3002,14 @@ def test_open_mfdataset_manyfiles( subds.to_zarr(store=tmpfiles[ii]) # check that calculation on opened datasets works properly + chunks = chunks if (not chunks and readengine != "zarr") else "auto" with open_mfdataset( tmpfiles, combine="nested", concat_dim="x", engine=readengine, parallel=parallel, - chunks=chunks if (not chunks and readengine != "zarr") else "auto", + chunks=chunks, ) as actual: # check that using open_mfdataset returns dask arrays for variables From 695477dc3b451a2b214a190d14ab19a07509338e Mon Sep 17 00:00:00 2001 From: Illviljan <14371165+Illviljan@users.noreply.github.com> Date: Sat, 14 Aug 2021 17:46:36 +0200 Subject: [PATCH 06/15] Check the previous chunk default value as well --- xarray/tests/test_backends.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index f6e8b0a1308..28d5b9e3be7 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2961,7 +2961,7 @@ def parallel(request): return request.param -@pytest.fixture(params=[None, 5]) +@pytest.fixture(params=[None, {}, 5]) def chunks(request): return request.param From 13f07c93f222bc3f0fbb62b22e0cb910e4360bda Mon Sep 17 00:00:00 2001 From: Illviljan <14371165+Illviljan@users.noreply.github.com> Date: Sun, 15 Aug 2021 11:10:58 +0200 Subject: [PATCH 07/15] Update whats-new.rst --- doc/whats-new.rst | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 9ac9639b8c1..1b4095593de 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -30,7 +30,11 @@ New Features Breaking changes ~~~~~~~~~~~~~~~~ - +- Allow in-memory arrays with :py:func:`xarray.open_mfdataset` by default. + This was not previously possible and was inconsistent with the behavior of + :py:func:`xarray.open_dataset`. + The previous default behavior is now activated by passing ``chunks={}`` (:pull:`5704`). + By `Jimmy Westling `_. Deprecations ~~~~~~~~~~~~ From 873911ab66bb31634089b33466b322b850d5c67a Mon Sep 17 00:00:00 2001 From: Illviljan <14371165+Illviljan@users.noreply.github.com> Date: Mon, 23 Aug 2021 06:46:48 +0200 Subject: [PATCH 08/15] Use {} as default value --- xarray/backends/api.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index ca530133d26..32efe2f74c1 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -693,7 +693,7 @@ def open_dataarray( def open_mfdataset( paths, - chunks=None, + chunks={}, concat_dim=None, compat="no_conflicts", preprocess=None, @@ -726,12 +726,14 @@ def open_mfdataset( concatenation along more than one dimension is desired, then ``paths`` must be a nested list-of-lists (see ``combine_nested`` for details). (A string glob will be expanded to a 1-dimensional list.) - chunks : int or dict, optional + chunks : int, dict, None, default: {} Dictionary with keys given by dimension names and values given by chunk sizes. In general, these should divide the dimensions of each dataset. If int, chunk - each dimension by ``chunks``. By default, chunks will be chosen to load entire - input files into memory at once. This has a major impact on performance: please - see the full documentation for more details [2]_. + each dimension by ``chunks``. By default, loads the dataset with dask using + engine preferred chunks if exposed by the backend, otherwise with + a single chunk for all arrays. If None, the files are loaded to memory + which has a major impact on performance: please see the full documentation for + more details [2]_. concat_dim : str, or list of str, DataArray, Index or None, optional Dimensions to concatenate files along. You only need to provide this argument if ``combine='nested'``, and if any of the dimensions along which you want to From d9e16619c282a62665ea5a875d204f3efc09578e Mon Sep 17 00:00:00 2001 From: Illviljan <14371165+Illviljan@users.noreply.github.com> Date: Mon, 23 Aug 2021 06:59:09 +0200 Subject: [PATCH 09/15] Update whats-new.rst --- doc/whats-new.rst | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 8f103253b41..f74e75f232e 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -35,12 +35,6 @@ New Features Breaking changes ~~~~~~~~~~~~~~~~ -- Allow in-memory arrays with :py:func:`xarray.open_mfdataset` by default. - This was not previously possible and was inconsistent with the behavior of - :py:func:`xarray.open_dataset`. - The previous default behavior is now activated by passing ``chunks={}`` (:pull:`5704`). - By `Jimmy Westling `_. - - The ``__repr__`` of a :py:class:`xarray.Dataset`'s ``coords`` and ``data_vars`` ignore ``xarray.set_option(display_max_rows=...)`` and show the full output when called directly as, e.g., ``ds.data_vars`` or ``print(ds.data_vars)`` @@ -53,7 +47,8 @@ Deprecations Bug fixes ~~~~~~~~~ - +- Allow in-memory arrays with :py:func:`xarray.open_mfdataset` by passing ``chunks=None`. (:pull:`5704`). + By `Jimmy Westling `_. Documentation ~~~~~~~~~~~~~ From a6af772e8f3edebbcca78f28d5c878377d17db0d Mon Sep 17 00:00:00 2001 From: Illviljan <14371165+Illviljan@users.noreply.github.com> Date: Mon, 23 Aug 2021 07:15:40 +0200 Subject: [PATCH 10/15] Update whats-new.rst --- doc/whats-new.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index f74e75f232e..1ab5227caa8 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -47,7 +47,7 @@ Deprecations Bug fixes ~~~~~~~~~ -- Allow in-memory arrays with :py:func:`xarray.open_mfdataset` by passing ``chunks=None`. (:pull:`5704`). +- Allow in-memory arrays with :py:func:`xarray.open_mfdataset` by passing ``chunks=None``. (:pull:`5704`). By `Jimmy Westling `_. Documentation From 04cf17fed244ec2c7f147722866e0c21e101f67f Mon Sep 17 00:00:00 2001 From: Illviljan <14371165+Illviljan@users.noreply.github.com> Date: Thu, 23 May 2024 23:38:50 +0200 Subject: [PATCH 11/15] Update test_backends.py --- xarray/tests/test_backends.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 37c8a93b4d8..fdbdf513a96 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -4010,8 +4010,10 @@ def test_open_mfdataset_manyfiles( ) as actual: # check that using open_mfdataset returns dask arrays for variables # when a chunks parameter has been defined: - array_type = np.ndarray if chunks is None else dask_array_type - assert isinstance(actual["foo"].data, array_type) + if chunks is None: + assert isinstance(actual["foo"].data, np.ndarray) + else: + assert isinstance(actual["foo"].data, dask_array_type) assert_identical(original, actual) From 12412dbbed96c5c5eff9aef3c6d4f04381384bb9 Mon Sep 17 00:00:00 2001 From: Illviljan <14371165+Illviljan@users.noreply.github.com> Date: Thu, 23 May 2024 23:50:17 +0200 Subject: [PATCH 12/15] Update whats-new.rst --- doc/whats-new.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 4e0d7230d33..31a14bcb5da 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -35,6 +35,8 @@ Deprecations Bug fixes ~~~~~~~~~ +- Allow in-memory arrays with :py:func:`xarray.open_mfdataset` by passing ``chunks=None``. (:pull:`5704`). + By `Jimmy Westling `_. Documentation ~~~~~~~~~~~~~ @@ -2285,8 +2287,6 @@ Deprecations Bug fixes ~~~~~~~~~ -- Allow in-memory arrays with :py:func:`xarray.open_mfdataset` by passing ``chunks=None``. (:pull:`5704`). - By `Jimmy Westling `_. - Fix ZeroDivisionError from saving dask array with empty dimension (:issue: `5741`). By `Joseph K Aicher `_. From 81dbef956f545c2f2982073a350f80023bb61d34 Mon Sep 17 00:00:00 2001 From: Illviljan <14371165+Illviljan@users.noreply.github.com> Date: Fri, 24 May 2024 07:19:49 +0200 Subject: [PATCH 13/15] Update xarray/tests/test_backends.py --- xarray/tests/test_backends.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index fdbdf513a96..e04ec8b9154 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -4332,7 +4332,7 @@ def test_open_mfdataset(self) -> None: original.isel(x=slice(5)).to_netcdf(tmp1) original.isel(x=slice(5, 10)).to_netcdf(tmp2) with open_mfdataset( - [tmp1, tmp2], concat_dim="x", combine="nested", chunks={} + [tmp1, tmp2], concat_dim="x", combine="nested" ) as actual: assert isinstance(actual.foo.variable.data, da.Array) assert actual.foo.variable.data.chunks == ((5, 5),) From 493680f119fa71c09e0826ed789781630321f7ba Mon Sep 17 00:00:00 2001 From: Illviljan <14371165+Illviljan@users.noreply.github.com> Date: Fri, 24 May 2024 07:19:57 +0200 Subject: [PATCH 14/15] Update xarray/tests/test_backends.py --- xarray/tests/test_backends.py | 1 - 1 file changed, 1 deletion(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index e04ec8b9154..31e4edbcd27 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -4369,7 +4369,6 @@ def test_open_mfdataset_2d(self) -> None: [[tmp1, tmp2], [tmp3, tmp4]], combine="nested", concat_dim=["y", "x"], - chunks={}, ) as actual: assert isinstance(actual.foo.variable.data, da.Array) assert actual.foo.variable.data.chunks == ((5, 5), (4, 4)) From fb4138dd757e928b5777c686a283c12a370c86a6 Mon Sep 17 00:00:00 2001 From: Illviljan <14371165+Illviljan@users.noreply.github.com> Date: Fri, 24 May 2024 07:20:35 +0200 Subject: [PATCH 15/15] Apply suggestions from code review --- xarray/tests/test_backends.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 31e4edbcd27..ad9c2f90194 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -4701,9 +4701,9 @@ def test_deterministic_names(self) -> None: with create_tmp_file() as tmp: data = create_test_data() data.to_netcdf(tmp) - with open_mfdataset(tmp, combine="by_coords", chunks={}) as ds: + with open_mfdataset(tmp, combine="by_coords") as ds: original_names = {k: v.data.name for k, v in ds.data_vars.items()} - with open_mfdataset(tmp, combine="by_coords", chunks={}) as ds: + with open_mfdataset(tmp, combine="by_coords") as ds: repeat_names = {k: v.data.name for k, v in ds.data_vars.items()} for var_name, dask_name in original_names.items(): assert var_name in dask_name