From add46d3f10bc98ca49090e094209d2359376e24b Mon Sep 17 00:00:00 2001 From: dcherian Date: Sun, 19 Apr 2020 07:42:57 -0600 Subject: [PATCH] Better chunking error messages for zarr backend --- doc/whats-new.rst | 2 ++ xarray/backends/zarr.py | 50 ++++++++++++++++++++--------------- xarray/tests/test_backends.py | 22 ++++++++++++--- 3 files changed, 50 insertions(+), 24 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index e4c3a4d533f..d615bacb875 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -97,6 +97,8 @@ Documentation Internal Changes ~~~~~~~~~~~~~~~~ +- Raise more informative error messages for chunk size conflicts when writing to zarr files. + By `Deepak Cherian <https://github.com/dcherian>`_. - Run the ``isort`` pre-commit hook only on python source files and update the ``flake8`` version. (:issue:`3750`, :pull:`3711`) By `Justus Magin <https://github.com/keewis>`_. diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index c262dae2811..973c167911e 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -65,7 +65,7 @@ def __getitem__(self, key): # could possibly have a work-around for 0d data here -def _determine_zarr_chunks(enc_chunks, var_chunks, ndim): +def _determine_zarr_chunks(enc_chunks, var_chunks, ndim, name): """ Given encoding chunks (possibly None) and variable chunks (possibly None) """ @@ -88,15 +88,16 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim): if var_chunks and enc_chunks is None: if any(len(set(chunks[:-1])) > 1 for chunks in var_chunks): raise ValueError( - "Zarr requires uniform chunk sizes except for final chunk." - " Variable dask chunks %r are incompatible. Consider " - "rechunking using `chunk()`." % (var_chunks,) + "Zarr requires uniform chunk sizes except for final chunk. " + f"Variable named {name!r} has incompatible dask chunks: {var_chunks!r}. " + "Consider rechunking using `chunk()`."
) if any((chunks[0] < chunks[-1]) for chunks in var_chunks): raise ValueError( "Final chunk of Zarr array must be the same size or smaller " - "than the first. Variable Dask chunks %r are incompatible. " - "Consider rechunking using `chunk()`." % var_chunks + f"than the first. Variable named {name!r} has incompatible Dask chunks {var_chunks!r}. " + "Consider either rechunking using `chunk()` or instead deleting " + "or modifying `encoding['chunks']`." ) # return the first chunk for each dimension return tuple(chunk[0] for chunk in var_chunks) @@ -114,13 +115,15 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim): if len(enc_chunks_tuple) != ndim: # throw away encoding chunks, start over - return _determine_zarr_chunks(None, var_chunks, ndim) + return _determine_zarr_chunks(None, var_chunks, ndim, name) for x in enc_chunks_tuple: if not isinstance(x, int): raise TypeError( - "zarr chunks must be an int or a tuple of ints. " - "Instead found %r" % (enc_chunks_tuple,) + "zarr chunk sizes specified in `encoding['chunks']` " + "must be an int or a tuple of ints. " + f"Instead found encoding['chunks']={enc_chunks_tuple!r} " + f"for variable named {name!r}." ) # if there are chunks in encoding and the variable data is a numpy array, @@ -142,19 +145,22 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim): for dchunk in dchunks[:-1]: if dchunk % zchunk: raise NotImplementedError( - "Specified zarr chunks %r would overlap multiple dask " - "chunks %r. This is not implemented in xarray yet. " - " Consider rechunking the data using " - "`chunk()` or specifying different chunks in encoding." - % (enc_chunks_tuple, var_chunks) + f"Specified zarr chunks encoding['chunks']={enc_chunks_tuple!r} for " + f"variable named {name!r} would overlap multiple dask chunks {var_chunks!r}. " + "This is not implemented in xarray yet. " + "Consider either rechunking using `chunk()` or instead deleting " + "or modifying `encoding['chunks']`."
) if dchunks[-1] > zchunk: raise ValueError( "Final chunk of Zarr array must be the same size or " - "smaller than the first. The specified Zarr chunk " - "encoding is %r, but %r in variable Dask chunks %r is " - "incompatible. Consider rechunking using `chunk()`." - % (enc_chunks_tuple, dchunks, var_chunks) + "smaller than the first. " + f"Specified Zarr chunk encoding['chunks']={enc_chunks_tuple}, " + f"for variable named {name!r} " + f"but {dchunks} in the variable's Dask chunks {var_chunks} is " + "incompatible with this encoding. " + "Consider either rechunking using `chunk()` or instead deleting " + "or modifying `encoding['chunks']`." ) return enc_chunks_tuple @@ -177,7 +183,7 @@ def _get_zarr_dims_and_attrs(zarr_obj, dimension_key): return dimensions, attributes -def extract_zarr_variable_encoding(variable, raise_on_invalid=False): +def extract_zarr_variable_encoding(variable, raise_on_invalid=False, name=None): """ Extract zarr encoding dictionary from xarray Variable @@ -207,7 +213,7 @@ def extract_zarr_variable_encoding(variable, raise_on_invalid=False): del encoding[k] chunks = _determine_zarr_chunks( - encoding.get("chunks"), variable.chunks, variable.ndim + encoding.get("chunks"), variable.chunks, variable.ndim, name ) encoding["chunks"] = chunks return encoding @@ -453,7 +459,9 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No writer.add(v.data, zarr_array, region=tuple(new_region)) else: # new variable - encoding = extract_zarr_variable_encoding(v, raise_on_invalid=check) + encoding = extract_zarr_variable_encoding( + v, raise_on_invalid=check, name=vn + ) encoded_attrs = {} # the magic for storing the hidden dimension data encoded_attrs[DIMENSION_KEY] = dims diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 3fde292c04f..916c29ba7bd 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1685,11 +1685,27 @@ def test_chunk_encoding_with_dask(self): # should 
fail if dask_chunks are irregular... ds_chunk_irreg = ds.chunk({"x": (5, 4, 3)}) - with pytest.raises(ValueError) as e_info: + with raises_regex(ValueError, "uniform chunk sizes."): with self.roundtrip(ds_chunk_irreg) as actual: pass - # make sure this error message is correct and not some other error - assert e_info.match("chunks") + + # should fail if encoding["chunks"] clashes with dask_chunks + badenc = ds.chunk({"x": 4}) + badenc.var1.encoding["chunks"] = (6,) + with raises_regex(NotImplementedError, "named 'var1' would overlap"): + with self.roundtrip(badenc) as actual: + pass + + badenc.var1.encoding["chunks"] = (2,) + with raises_regex(ValueError, "Specified Zarr chunk encoding"): + with self.roundtrip(badenc) as actual: + pass + + badenc = badenc.chunk({"x": (3, 3, 6)}) + badenc.var1.encoding["chunks"] = (3,) + with raises_regex(ValueError, "incompatible with this encoding"): + with self.roundtrip(badenc) as actual: + pass # ... except if the last chunk is smaller than the first ds_chunk_irreg = ds.chunk({"x": (5, 5, 2)})