From 814c0eb42bb843d4501a2fe0f9afa78d96064581 Mon Sep 17 00:00:00 2001 From: Kai Muehlbauer Date: Thu, 11 Feb 2021 13:08:56 +0100 Subject: [PATCH 1/8] FIX: set `decode_strings=True` for h5netcdf backend, convert object string to byte string if necessary, unpin h5py --- ci/requirements/environment-windows.yml | 2 +- ci/requirements/environment.yml | 2 +- ci/requirements/py38-all-but-dask.yml | 2 +- xarray/backends/h5netcdf_.py | 7 +++++++ xarray/coding/strings.py | 7 +++++-- 5 files changed, 15 insertions(+), 5 deletions(-) diff --git a/ci/requirements/environment-windows.yml b/ci/requirements/environment-windows.yml index 9455ef2f127..1eddfb2a45e 100644 --- a/ci/requirements/environment-windows.yml +++ b/ci/requirements/environment-windows.yml @@ -11,7 +11,7 @@ dependencies: - dask<2021.02.0 - distributed - h5netcdf - - h5py=2 + - h5py - hdf5 - hypothesis - iris diff --git a/ci/requirements/environment.yml b/ci/requirements/environment.yml index 7261b5b6954..e78c2b9181d 100644 --- a/ci/requirements/environment.yml +++ b/ci/requirements/environment.yml @@ -12,7 +12,7 @@ dependencies: - dask<2021.02.0 - distributed - h5netcdf - - h5py=2 + - h5py - hdf5 - hypothesis - iris diff --git a/ci/requirements/py38-all-but-dask.yml b/ci/requirements/py38-all-but-dask.yml index 51ec48cc6b1..f6e3f7c5ff2 100644 --- a/ci/requirements/py38-all-but-dask.yml +++ b/ci/requirements/py38-all-but-dask.yml @@ -12,7 +12,7 @@ dependencies: - cftime - coveralls - h5netcdf - - h5py=2 + - h5py - hdf5 - hypothesis - lxml # Optional dep of pydap diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index aa892c4f89c..400705847b3 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -131,6 +131,7 @@ def open( autoclose=False, invalid_netcdf=None, phony_dims=None, + decode_strings=None, ): if isinstance(filename, bytes): @@ -157,6 +158,10 @@ def open( "h5netcdf backend keyword argument 'phony_dims' needs " "h5netcdf >= 0.8.0." ) + if LooseVersion(h5netcdf.__version__) >= LooseVersion("0.9.0") and LooseVersion( + h5netcdf.core.h5py.__version__ + ) >= LooseVersion("3.0.0"): + kwargs["decode_strings"] = decode_strings if lock is None: if mode == "r": @@ -358,6 +363,7 @@ def open_dataset( lock=None, invalid_netcdf=None, phony_dims=None, + decode_strings=True, ): store = H5NetCDFStore.open( @@ -367,6 +373,7 @@ def open_dataset( lock=lock, invalid_netcdf=invalid_netcdf, phony_dims=phony_dims, + decode_strings=decode_strings, ) store_entrypoint = StoreBackendEntrypoint() diff --git a/xarray/coding/strings.py b/xarray/coding/strings.py index e16e983fd8a..dc50e34ab20 100644 --- a/xarray/coding/strings.py +++ b/xarray/coding/strings.py @@ -191,8 +191,11 @@ def _numpy_char_to_bytes(arr): """Like netCDF4.chartostring, but faster and more flexible.""" # based on: http://stackoverflow.com/a/10984878/809705 arr = np.array(arr, copy=False, order="C") - dtype = "S" + str(arr.shape[-1]) - return arr.view(dtype).reshape(arr.shape[:-1]) + shape = arr.shape + dtype = "S" + str(shape[-1]) + if arr.dtype.kind == "O": + arr = arr.astype("S1") + return arr.view(dtype).reshape(shape[:-1]) class StackedBytesArray(indexing.ExplicitlyIndexedNDArrayMixin): From 257cee99b681d57284091901f8eefda5e79c4da5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Thu, 11 Feb 2021 22:00:01 +0100 Subject: [PATCH 2/8] Update strings.py --- xarray/coding/strings.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/xarray/coding/strings.py b/xarray/coding/strings.py index dc50e34ab20..e16e983fd8a 100644 --- a/xarray/coding/strings.py +++ b/xarray/coding/strings.py @@ -191,11 +191,8 @@ def _numpy_char_to_bytes(arr): """Like netCDF4.chartostring, but faster and more flexible.""" # based on: http://stackoverflow.com/a/10984878/809705 arr = np.array(arr, copy=False, order="C") - shape = arr.shape - dtype = "S" + str(shape[-1]) - if arr.dtype.kind == "O": - arr = arr.astype("S1") - return arr.view(dtype).reshape(shape[:-1]) + dtype = "S" + str(arr.shape[-1]) + return arr.view(dtype).reshape(arr.shape[:-1]) class StackedBytesArray(indexing.ExplicitlyIndexedNDArrayMixin): From 7f65cdd505b0308af32d2ba57e662dea7eb21c62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Thu, 11 Feb 2021 22:05:33 +0100 Subject: [PATCH 3/8] Update h5netcdf_.py --- xarray/backends/h5netcdf_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index 400705847b3..f0310fa5549 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -158,7 +158,7 @@ def open( "h5netcdf backend keyword argument 'phony_dims' needs " "h5netcdf >= 0.8.0." ) - if LooseVersion(h5netcdf.__version__) >= LooseVersion("0.9.0") and LooseVersion( + if LooseVersion(h5netcdf.__version__) >= LooseVersion("0.10.0") and LooseVersion( h5netcdf.core.h5py.__version__ ) >= LooseVersion("3.0.0"): kwargs["decode_strings"] = decode_strings From 7ababa2d4d303d8fc6872788c714d7bd06e0faa5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Thu, 11 Feb 2021 22:11:29 +0100 Subject: [PATCH 4/8] fix style --- xarray/backends/h5netcdf_.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index f0310fa5549..f5cebf52646 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -158,9 +158,9 @@ def open( "h5netcdf backend keyword argument 'phony_dims' needs " "h5netcdf >= 0.8.0." ) - if LooseVersion(h5netcdf.__version__) >= LooseVersion("0.10.0") and LooseVersion( - h5netcdf.core.h5py.__version__ - ) >= LooseVersion("3.0.0"): + if LooseVersion(h5netcdf.__version__) >= LooseVersion( + "0.10.0" + ) and LooseVersion(h5netcdf.core.h5py.__version__) >= LooseVersion("3.0.0"): kwargs["decode_strings"] = decode_strings if lock is None: From 61c5f9aad7198d7ec120f43b32b6110de2cf2ed9 Mon Sep 17 00:00:00 2001 From: Kai Muehlbauer Date: Fri, 12 Feb 2021 07:26:07 +0100 Subject: [PATCH 5/8] FIX:change decode_strings -> decode_vlen_strings, add whats-new.rst entry --- doc/whats-new.rst | 2 ++ xarray/backends/h5netcdf_.py | 8 ++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 4b06003b630..f6b68c10b8f 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -113,6 +113,8 @@ Bug fixes By `Leif Denby `_. - Fix time encoding bug associated with using cftime versions greater than 1.4.0 with xarray (:issue:`4870`, :pull:`4871`). By `Spencer Clark `_. +- Fix decoding of vlen strings using h5py versions greater than 3.0.0 with h5netcdf backend (:issue:`4570`, :pull:`4893`). + By `Kai Mühlbauer `_. Documentation ~~~~~~~~~~~~~ diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index f5cebf52646..dd1dbaf79c0 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -131,7 +131,7 @@ def open( autoclose=False, invalid_netcdf=None, phony_dims=None, - decode_strings=None, + decode_vlen_strings=None, ): if isinstance(filename, bytes): @@ -161,7 +161,7 @@ def open( if LooseVersion(h5netcdf.__version__) >= LooseVersion( "0.10.0" ) and LooseVersion(h5netcdf.core.h5py.__version__) >= LooseVersion("3.0.0"): - kwargs["decode_strings"] = decode_strings + kwargs["decode_vlen_strings"] = decode_vlen_strings if lock is None: if mode == "r": @@ -363,7 +363,7 @@ def open_dataset( lock=None, invalid_netcdf=None, phony_dims=None, - decode_strings=True, + decode_vlen_strings=True, ): store = H5NetCDFStore.open( @@ -373,7 +373,7 @@ def open_dataset( lock=lock, invalid_netcdf=invalid_netcdf, phony_dims=phony_dims, - decode_strings=decode_strings, + decode_strings=decode_vlen_strings, ) store_entrypoint = StoreBackendEntrypoint() From 00d15437d2aaaf919b0203be1bcd02d7e2915d2a Mon Sep 17 00:00:00 2001 From: Kai Muehlbauer Date: Fri, 12 Feb 2021 07:47:00 +0100 Subject: [PATCH 6/8] FIX: change missed decode_strings -> decode_vlen_strings --- xarray/backends/h5netcdf_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index dd1dbaf79c0..b5f3aca6bdf 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -373,7 +373,7 @@ def open_dataset( lock=lock, invalid_netcdf=invalid_netcdf, phony_dims=phony_dims, - decode_strings=decode_vlen_strings, + decode_vlen_strings=decode_vlen_strings, ) store_entrypoint = StoreBackendEntrypoint() From f7a5662d6a09260aa965ff37839e7ab447b3d91a Mon Sep 17 00:00:00 2001 From: Kai Muehlbauer Date: Tue, 16 Feb 2021 19:56:20 +0100 Subject: [PATCH 7/8] FIX: set `decode_vlen_strings=True` in `open` classmethod, call remaining tests with `decode_vlen_strings=True` --- xarray/backends/h5netcdf_.py | 2 +- xarray/tests/test_backends.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index b5f3aca6bdf..5766b34d9bd 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -131,7 +131,7 @@ def open( autoclose=False, invalid_netcdf=None, phony_dims=None, - decode_vlen_strings=None, + decode_vlen_strings=True, ): if isinstance(filename, bytes): diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 3750c0715ae..7b7fe68f0a6 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2578,13 +2578,13 @@ def test_open_dataset_group(self): v = group.createVariable("x", "int") v[...] = 42 - h5 = h5netcdf.File(tmp_file, mode="r") + h5 = h5netcdf.File(tmp_file, mode="r", decode_vlen_strings=True) store = backends.H5NetCDFStore(h5["g"]) with open_dataset(store) as ds: expected = Dataset({"x": ((), 42)}) assert_identical(expected, ds) - h5 = h5netcdf.File(tmp_file, mode="r") + h5 = h5netcdf.File(tmp_file, mode="r", decode_vlen_strings=True) store = backends.H5NetCDFStore(h5, group="g") with open_dataset(store) as ds: expected = Dataset({"x": ((), 42)}) @@ -2599,7 +2599,7 @@ def test_deepcopy(self): v = nc.createVariable("y", np.int32, ("x",)) v[:] = np.arange(10) - h5 = h5netcdf.File(tmp_file, mode="r") + h5 = h5netcdf.File(tmp_file, mode="r", decode_vlen_strings=True) store = backends.H5NetCDFStore(h5) with open_dataset(store) as ds: copied = ds.copy(deep=True) From f93688ca1cb9f944d90935d6b9dad086728efa55 Mon Sep 17 00:00:00 2001 From: Kai Muehlbauer Date: Tue, 16 Feb 2021 20:15:27 +0100 Subject: [PATCH 8/8] FIX: cover tests for h5py=2 --- xarray/tests/test_backends.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 7b7fe68f0a6..03f52c12dec 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2578,13 +2578,19 @@ def test_open_dataset_group(self): v = group.createVariable("x", "int") v[...] = 42 - h5 = h5netcdf.File(tmp_file, mode="r", decode_vlen_strings=True) + kwargs = {} + if LooseVersion(h5netcdf.__version__) >= LooseVersion( + "0.10.0" + ) and LooseVersion(h5netcdf.core.h5py.__version__) >= LooseVersion("3.0.0"): + kwargs = dict(decode_vlen_strings=True) + + h5 = h5netcdf.File(tmp_file, mode="r", **kwargs) store = backends.H5NetCDFStore(h5["g"]) with open_dataset(store) as ds: expected = Dataset({"x": ((), 42)}) assert_identical(expected, ds) - h5 = h5netcdf.File(tmp_file, mode="r", decode_vlen_strings=True) + h5 = h5netcdf.File(tmp_file, mode="r", **kwargs) store = backends.H5NetCDFStore(h5, group="g") with open_dataset(store) as ds: expected = Dataset({"x": ((), 42)}) @@ -2599,7 +2605,13 @@ def test_deepcopy(self): v = nc.createVariable("y", np.int32, ("x",)) v[:] = np.arange(10) - h5 = h5netcdf.File(tmp_file, mode="r", decode_vlen_strings=True) + kwargs = {} + if LooseVersion(h5netcdf.__version__) >= LooseVersion( + "0.10.0" + ) and LooseVersion(h5netcdf.core.h5py.__version__) >= LooseVersion("3.0.0"): + kwargs = dict(decode_vlen_strings=True) + + h5 = h5netcdf.File(tmp_file, mode="r", **kwargs) store = backends.H5NetCDFStore(h5) with open_dataset(store) as ds: copied = ds.copy(deep=True)