import numpy as np

# Metadata per model variable: display name, description, units and a
# validity range. valid_min/valid_max serve as a fallback estimate of the
# data range when the data itself may not be read (see get_encodings).
dim_attrs = {
    "botm": dict(
        name="Bottom elevation",
        description="Bottom elevation for each model cell",
        units="mNAP",
        valid_min=-2000.0,
        valid_max=500.0,
    ),
    "top": dict(
        name="Top elevation",
        description="Top elevation for each model cell",
        units="mNAP",
        valid_min=-2000.0,
        valid_max=500.0,
    ),
    "kh": dict(
        name="Horizontal hydraulic conductivity",
        description="Horizontal hydraulic conductivity for each model cell",
        units="m/day",
        valid_min=0.0,
        valid_max=1000.0,
    ),
    "kv": dict(
        name="Vertical hydraulic conductivity",
        description="Vertical hydraulic conductivity for each model cell",
        units="m/day",
        valid_min=0.0,
        valid_max=1000.0,
    ),
    "ss": dict(
        name="Specific storage",
        description="Specific storage for each model cell",
        units="1/m",
        valid_min=0.0,
        valid_max=1.0,
    ),
    "sy": dict(
        name="Specific yield",
        description="Specific yield for each model cell",
        units="-",
        valid_min=0.0,
        valid_max=1.0,
    ),
    "porosity": dict(
        name="Porosity",
        description="Porosity for each model cell",
        units="-",
        valid_min=0.0,
        valid_max=1.0,
    ),
    "recharge": dict(
        name="Recharge",
        description="Recharge for each model cell",
        units="m/day",
        valid_min=0.0,
        valid_max=0.5,
    ),
    "heads": dict(
        name="Heads",
        description="Point water heads for each model cell",
        units="mNAP",
        valid_min=-100.0,
        valid_max=500.0,
    ),
    "starting_head": dict(
        name="Starting head",
        description="Starting head for each model cell",
        units="mNAP",
        valid_min=-100.0,
        valid_max=500.0,
    ),
    "freshwater_head": dict(
        name="Freshwater head",
        description="Freshwater head for each model cell",
        units="mNAP",
        valid_min=-100.0,
        valid_max=500.0,
    ),
    "pointwater_head": dict(
        name="Pointwater head",
        description="Pointwater head for each model cell",
        units="mNAP",
        valid_min=-100.0,
        valid_max=500.0,
    ),
    "density": dict(
        name="Density",
        description="Density for each model cell",
        units="kg/m3",
        valid_min=950.0,
        valid_max=1200.0,
    ),
    "area": dict(
        name="Cell area",
        description="Cell area for each model cell",
        units="m2",
        valid_min=0.0,
        valid_max=1e8,
    ),
}


# Maximum acceptable loss of resolution (dval_max) per variable when packing
# floats as int16; compared against the expected loss in is_int16_allowed().
encoding_requirements = {
    "heads": dict(dval_max=0.005),
    "botm": dict(dval_max=0.005),
    "top": dict(dval_max=0.005),
    "kh": dict(dval_max=1e-6),
    "kv": dict(dval_max=1e-6),
    "ss": dict(dval_max=0.005),
    "sy": dict(dval_max=0.005),
    "porosity": dict(dval_max=0.005),
    "recharge": dict(dval_max=0.0005),
    "starting_head": dict(dval_max=0.005),
    "freshwater_head": dict(dval_max=0.005),
    "pointwater_head": dict(dval_max=0.005),
    "density": dict(dval_max=0.005),
    "area": dict(dval_max=0.05),
}


def get_encodings(
    ds, set_encoding_inplace=True, allowed_to_read_data_vars_for_minmax=True
):
    """Get the netCDF encoding for each of the data_vars of `ds`.

    Based on the minimum and maximum values set in `dim_attrs` and the maximum
    allowed difference from `encoding_requirements`. If a loss of data
    resolution is allowed, floats can also be stored as int16, halving the
    space required for storage. The maximum acceptable loss in resolution
    (`dval_max`) is compared with the expected loss in resolution
    (`is_int16_allowed()`).

    If `set_encoding_inplace` is True, the encodings are additionally stored
    in `ds[var].encoding` for each var separately. Note that an encoding
    passed explicitly as argument to `ds.to_netcdf()` takes precedence over
    the encoding stored in `ds[var].encoding`.

    Parameters
    ----------
    ds : xarray.Dataset
        Dataset containing the data_vars.
    set_encoding_inplace : bool
        If True, also store the encoding in `ds[var].encoding` for each var.
        By default True.
    allowed_to_read_data_vars_for_minmax : bool
        If True, the data of the data_vars is read to compute the minimum and
        maximum values used to estimate the effect of precision loss.
        If False, valid_min/valid_max from `dim_attrs` are used.
        By default True.

    Returns
    -------
    encodings : dict
        Dictionary containing the encodings for each data_var. Can be passed
        as the `encoding` argument of `ds.to_netcdf()`.

    TODO: add support for strings
    """
    encodings = {}
    for varname, da in ds.data_vars.items():
        # fletcher32: store checksums so file corruption is detected on read.
        encoding = dict(fletcher32=True)
        encodings[varname] = encoding

        assert (
            "_FillValue" not in da.attrs
        ), f"Custom fillvalues are not supported. {varname} has a fillvalue set."

        isfloat = np.issubdtype(da.dtype, np.floating)
        isint = np.issubdtype(da.dtype, np.integer)

        # Set the dtype and, for packed floats, scale_factor/add_offset.
        if isfloat and varname in encoding_requirements and varname in dim_attrs:
            dval_max = encoding_requirements[varname]["dval_max"]

            if allowed_to_read_data_vars_for_minmax:
                vmin = float(da.min())
                vmax = float(da.max())
            else:
                vmin = dim_attrs[varname]["valid_min"]
                vmax = dim_attrs[varname]["valid_max"]

            if is_int16_allowed(vmin, vmax, dval_max):
                scale_factor, add_offset = compute_scale_and_offset(vmin, vmax)
                encoding["dtype"] = "int16"
                encoding["scale_factor"] = scale_factor
                encoding["add_offset"] = add_offset
                encoding["_FillValue"] = -32767  # default for NC_SHORT
            else:
                encoding["dtype"] = "float32"

        elif isint and allowed_to_read_data_vars_for_minmax:
            vmin = int(da.min())
            vmax = int(da.max())

            # Pick the smallest int dtype that holds the data while avoiding
            # the netCDF default fill values (-32767 for int16,
            # -2147483647 for int32).
            if vmin >= -32766 and vmax <= 32767:
                encoding["dtype"] = "int16"
            elif vmin >= -2147483646 and vmax <= 2147483647:
                encoding["dtype"] = "int32"
            else:
                encoding["dtype"] = "int64"

        # Set the compression. Strings don't support compression;
        # only floats and ints for now.
        if isfloat or isint:
            encoding["zlib"] = True
            encoding["complevel"] = 5

        if set_encoding_inplace:
            da.encoding = encoding

    # BUG FIX: the original `return encodings` sat inside the loop (in the
    # else-branch of `set_encoding_inplace`), so with
    # set_encoding_inplace=False only the FIRST data_var received an
    # encoding. Returning the complete dict after the loop is backward
    # compatible for both call styles.
    return encodings


def compute_scale_and_offset(minValue, maxValue):
    """
    Compute the scale_factor and add_offset to pack floats in
    [minValue, maxValue] as int16. Useful for maximizing the compression of
    a dataset.

    Data is stretched/compressed to the packed range [-32766, 32767];
    -32767 stays free because it is the default int16 fillvalue.
    Unpacking is `value = packed * scale_factor + add_offset`.

    Parameters
    ----------
    minValue : float
        Minimum value of the dataset
    maxValue : float
        Maximum value of the dataset

    Returns
    -------
    scale_factor : float
        Scale factor for the dataset
    add_offset : float
        Add offset for the dataset
    """
    width = 32766 + 32767
    scale_factor = (maxValue - minValue) / width
    # BUG FIX: the original used the literal 1.0 instead of minValue, which
    # only packed correctly when the data minimum happened to be 1.0 (as in
    # the accompanying tests). The offset below maps
    # minValue -> -32766 and maxValue -> 32767.
    add_offset = minValue + scale_factor * (width - 1) / 2
    return scale_factor, add_offset


def is_int16_allowed(vmin, vmax, dval_max):
    """Compute the loss of resolution from storing a float as int16 (`dval`)
    and compare it with the maximum allowed loss of resolution (`dval_max`).

    Parameters
    ----------
    vmin : float
        Minimum value of the dataset
    vmax : float
        Maximum value of the dataset
    dval_max : float
        Maximum allowed loss of resolution

    Returns
    -------
    bool
        True if the loss of resolution is acceptable, False otherwise
    """
    # number of distinct packed steps available in [-32766, 32767]
    nsteps = 32766 + 32767
    dval = (vmax - vmin) / nsteps
    return dval <= dval_max
import os
from tempfile import TemporaryDirectory

import numpy as np
import xarray as xr

from nlmod.dims.attributes_encodings import get_encodings


def test_encodings_float_as_int16():
    """Floats are packed as int16 when the resolution loss is acceptable."""
    # degradation to int16 is allowed for this small value range
    data = np.arange(1.0, 4.0)
    data[1] = np.nan

    ds = xr.Dataset(data_vars=dict(heads=xr.DataArray(data=data)))
    encodings = get_encodings(
        ds, set_encoding_inplace=False, allowed_to_read_data_vars_for_minmax=True
    )

    # maximum error introduced by int16 packing of this value range
    dval_max = float(ds["heads"].max() - ds["heads"].min()) / (32766 + 32767)

    # test writing to a temporary netcdf file and reading it back
    with TemporaryDirectory() as tmpdir:
        fp = os.path.join(tmpdir, "test2.nc")
        ds.to_netcdf(fp, encoding=encodings)

        # FIX: open, assert and close inside the context so the lazily-read
        # file is still present and is released before the dir is removed.
        with xr.open_dataset(fp, mask_and_scale=True) as ds2:
            assert np.allclose(
                ds["heads"].values,
                ds2["heads"].values,
                atol=dval_max,
                rtol=0.0,
                equal_nan=True,
            )
            # NaN must round-trip via the _FillValue
            assert np.all(np.isnan(ds["heads"]) == np.isnan(ds2["heads"]))

    # a value range too wide for int16 resolution must stay float32
    data = np.arange(1.0, 1e6)
    data[1] = np.nan

    ds = xr.Dataset(data_vars=dict(heads=xr.DataArray(data=data)))
    encodings = get_encodings(
        ds, set_encoding_inplace=False, allowed_to_read_data_vars_for_minmax=True
    )
    assert encodings["heads"]["dtype"] == "float32", "dtype should be float32"


def test_encodings_inplace():
    """Inplace-stored encodings produce the same file as explicit ones."""
    data = np.arange(1.0, 5.0)
    data[1] = np.nan

    ds = xr.Dataset(data_vars=dict(heads=xr.DataArray(data=data)))
    ds_inplace = ds.copy(deep=True)

    encodings = get_encodings(
        ds, set_encoding_inplace=False, allowed_to_read_data_vars_for_minmax=True
    )
    # BUG FIX: the original passed set_encoding_inplace=False here as well,
    # so the inplace code path was never exercised by this test.
    get_encodings(
        ds_inplace,
        set_encoding_inplace=True,
        allowed_to_read_data_vars_for_minmax=True,
    )

    # test writing to temporary netcdf files
    with TemporaryDirectory() as tmpdir:
        ds.to_netcdf(os.path.join(tmpdir, "test2.nc"), encoding=encodings)
        # no explicit encoding argument: relies on ds_inplace[var].encoding
        ds_inplace.to_netcdf(os.path.join(tmpdir, "test_inplace.nc"))

        with xr.open_dataset(
            os.path.join(tmpdir, "test2.nc"), mask_and_scale=True
        ) as ds2, xr.open_dataset(
            os.path.join(tmpdir, "test_inplace.nc"), mask_and_scale=True
        ) as ds_inplace2:
            assert np.allclose(
                ds2["heads"].values, ds_inplace2["heads"].values, equal_nan=True
            )