Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Better chunking error messages for zarr backend #3983

Merged
merged 1 commit into from
Apr 22, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,8 @@ Documentation

Internal Changes
~~~~~~~~~~~~~~~~
- Raise more informative error messages for chunk size conflicts when writing to zarr files.
By `Deepak Cherian <https://github.com/dcherian>`_.
- Run the ``isort`` pre-commit hook only on python source files
and update the ``flake8`` version. (:issue:`3750`, :pull:`3711`)
By `Justus Magin <https://github.com/keewis>`_.
Expand Down
50 changes: 29 additions & 21 deletions xarray/backends/zarr.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def __getitem__(self, key):
# could possibly have a work-around for 0d data here


def _determine_zarr_chunks(enc_chunks, var_chunks, ndim):
def _determine_zarr_chunks(enc_chunks, var_chunks, ndim, name):
"""
Given encoding chunks (possibly None) and variable chunks (possibly None)
"""
Expand All @@ -88,15 +88,16 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim):
if var_chunks and enc_chunks is None:
if any(len(set(chunks[:-1])) > 1 for chunks in var_chunks):
raise ValueError(
"Zarr requires uniform chunk sizes except for final chunk."
" Variable dask chunks %r are incompatible. Consider "
"rechunking using `chunk()`." % (var_chunks,)
"Zarr requires uniform chunk sizes except for final chunk. "
f"Variable named {name!r} has incompatible dask chunks: {var_chunks!r}. "
"Consider rechunking using `chunk()`."
)
if any((chunks[0] < chunks[-1]) for chunks in var_chunks):
raise ValueError(
"Final chunk of Zarr array must be the same size or smaller "
"than the first. Variable Dask chunks %r are incompatible. "
"Consider rechunking using `chunk()`." % var_chunks
f"than the first. Variable named {name!r} has incompatible Dask chunks {var_chunks!r}. "
"Consider either rechunking using `chunk()` or instead deleting "
"or modifying `encoding['chunks']`."
)
# return the first chunk for each dimension
return tuple(chunk[0] for chunk in var_chunks)
Expand All @@ -114,13 +115,15 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim):

if len(enc_chunks_tuple) != ndim:
# throw away encoding chunks, start over
return _determine_zarr_chunks(None, var_chunks, ndim)
return _determine_zarr_chunks(None, var_chunks, ndim, name)

for x in enc_chunks_tuple:
if not isinstance(x, int):
raise TypeError(
"zarr chunks must be an int or a tuple of ints. "
"Instead found %r" % (enc_chunks_tuple,)
"zarr chunk sizes specified in `encoding['chunks']` "
"must be an int or a tuple of ints. "
f"Instead found encoding['chunks']={enc_chunks_tuple!r} "
f"for variable named {name!r}."
)

# if there are chunks in encoding and the variable data is a numpy array,
Expand All @@ -142,19 +145,22 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim):
for dchunk in dchunks[:-1]:
if dchunk % zchunk:
raise NotImplementedError(
"Specified zarr chunks %r would overlap multiple dask "
"chunks %r. This is not implemented in xarray yet. "
" Consider rechunking the data using "
"`chunk()` or specifying different chunks in encoding."
% (enc_chunks_tuple, var_chunks)
f"Specified zarr chunks encoding['chunks']={enc_chunks_tuple!r} for "
f"variable named {name!r} would overlap multiple dask chunks {var_chunks!r}. "
"This is not implemented in xarray yet. "
"Consider either rechunking using `chunk()` or instead deleting "
"or modifying `encoding['chunks']`."
)
if dchunks[-1] > zchunk:
raise ValueError(
"Final chunk of Zarr array must be the same size or "
"smaller than the first. The specified Zarr chunk "
"encoding is %r, but %r in variable Dask chunks %r is "
"incompatible. Consider rechunking using `chunk()`."
% (enc_chunks_tuple, dchunks, var_chunks)
"smaller than the first. "
f"Specified Zarr chunk encoding['chunks']={enc_chunks_tuple}, "
f"for variable named {name!r} "
f"but {dchunks} in the variable's Dask chunks {var_chunks} is "
"incompatible with this encoding. "
"Consider either rechunking using `chunk()` or instead deleting "
"or modifying `encoding['chunks']`."
)
return enc_chunks_tuple

Expand All @@ -177,7 +183,7 @@ def _get_zarr_dims_and_attrs(zarr_obj, dimension_key):
return dimensions, attributes


def extract_zarr_variable_encoding(variable, raise_on_invalid=False):
def extract_zarr_variable_encoding(variable, raise_on_invalid=False, name=None):
"""
Extract zarr encoding dictionary from xarray Variable

Expand Down Expand Up @@ -207,7 +213,7 @@ def extract_zarr_variable_encoding(variable, raise_on_invalid=False):
del encoding[k]

chunks = _determine_zarr_chunks(
encoding.get("chunks"), variable.chunks, variable.ndim
encoding.get("chunks"), variable.chunks, variable.ndim, name
)
encoding["chunks"] = chunks
return encoding
Expand Down Expand Up @@ -453,7 +459,9 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No
writer.add(v.data, zarr_array, region=tuple(new_region))
else:
# new variable
encoding = extract_zarr_variable_encoding(v, raise_on_invalid=check)
encoding = extract_zarr_variable_encoding(
v, raise_on_invalid=check, name=vn
)
encoded_attrs = {}
# the magic for storing the hidden dimension data
encoded_attrs[DIMENSION_KEY] = dims
Expand Down
22 changes: 19 additions & 3 deletions xarray/tests/test_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -1685,11 +1685,27 @@ def test_chunk_encoding_with_dask(self):

# should fail if dask_chunks are irregular...
ds_chunk_irreg = ds.chunk({"x": (5, 4, 3)})
with pytest.raises(ValueError) as e_info:
with raises_regex(ValueError, "uniform chunk sizes."):
with self.roundtrip(ds_chunk_irreg) as actual:
pass
# make sure this error message is correct and not some other error
assert e_info.match("chunks")

# should fail if encoding["chunks"] clashes with dask_chunks
badenc = ds.chunk({"x": 4})
badenc.var1.encoding["chunks"] = (6,)
with raises_regex(NotImplementedError, "named 'var1' would overlap"):
with self.roundtrip(badenc) as actual:
pass

badenc.var1.encoding["chunks"] = (2,)
with raises_regex(ValueError, "Specified Zarr chunk encoding"):
with self.roundtrip(badenc) as actual:
pass

badenc = badenc.chunk({"x": (3, 3, 6)})
badenc.var1.encoding["chunks"] = (3,)
with raises_regex(ValueError, "incompatible with this encoding"):
with self.roundtrip(badenc) as actual:
pass

# ... except if the last chunk is smaller than the first
ds_chunk_irreg = ds.chunk({"x": (5, 5, 2)})
Expand Down