Attributes encoding #245

Merged
7 commits merged on Aug 25, 2023
267 changes: 267 additions & 0 deletions nlmod/dims/attributes_encodings.py
@@ -0,0 +1,267 @@
import numpy as np

dim_attrs = {
"botm": dict(
name="Bottom elevation",
description="Bottom elevation for each model cell",
units="mNAP",
valid_min=-2000.0,
valid_max=500.0,
),
"top": dict(
name="Top elevation",
description="Top elevation for each model cell",
units="mNAP",
valid_min=-2000.0,
valid_max=500.0,
),
"kh": dict(
name="Horizontal hydraulic conductivity",
description="Horizontal hydraulic conductivity for each model cell",
units="m/day",
valid_min=0.0,
valid_max=1000.0,
),
"kv": dict(
name="Vertical hydraulic conductivity",
description="Vertical hydraulic conductivity for each model cell",
units="m/day",
valid_min=0.0,
valid_max=1000.0,
),
"ss": dict(
name="Specific storage",
description="Specific storage for each model cell",
units="1/m",
valid_min=0.0,
valid_max=1.0,
),
"sy": dict(
name="Specific yield",
description="Specific yield for each model cell",
units="-",
valid_min=0.0,
valid_max=1.0,
),
"porosity": dict(
name="Porosity",
description="Porosity for each model cell",
units="-",
valid_min=0.0,
valid_max=1.0,
),
"recharge": dict(
name="Recharge",
description="Recharge for each model cell",
units="m/day",
valid_min=0.0,
valid_max=0.5,
),
"heads": dict(
name="Heads",
description="Point water heads for each model cell",
units="mNAP",
valid_min=-100.0,
valid_max=500.0,
),
"starting_head": dict(
name="Starting head",
description="Starting head for each model cell",
units="mNAP",
valid_min=-100.0,
valid_max=500.0,
),
"freshwater_head": dict(
name="Freshwater head",
description="Freshwater head for each model cell",
units="mNAP",
valid_min=-100.0,
valid_max=500.0,
),
"pointwater_head": dict(
name="Pointwater head",
description="Pointwater head for each model cell",
units="mNAP",
valid_min=-100.0,
valid_max=500.0,
),
"density": dict(
name="Density",
description="Density for each model cell",
units="kg/m3",
valid_min=950.0,
valid_max=1200.0,
),
"area": dict(
name="Cell area",
description="Cell area for each model cell",
units="m2",
valid_min=0.0,
valid_max=1e8,
),
}


encoding_requirements = {
"heads": dict(dval_max=0.005),
"botm": dict(dval_max=0.005),
"top": dict(dval_max=0.005),
"kh": dict(dval_max=1e-6),
"kv": dict(dval_max=1e-6),
"ss": dict(dval_max=0.005),
"sy": dict(dval_max=0.005),
"porosity": dict(dval_max=0.005),
"recharge": dict(dval_max=0.0005),
"starting_head": dict(dval_max=0.005),
"freshwater_head": dict(dval_max=0.005),
"pointwater_head": dict(dval_max=0.005),
"density": dict(dval_max=0.005),
"area": dict(dval_max=0.05),
}


def get_encodings(
ds, set_encoding_inplace=True, allowed_to_read_data_vars_for_minmax=True
):
"""Get the encoding for the data_vars. Based on the minimum values and maximum
values set in `dim_attrs` and the maximum allowed difference from
`encoding_requirements`.

If a loss of data resolution is allowed floats can also be stored at int16, halfing
the space required for storage. The maximum acceptabel loss in resolution
(`dval_max`) is compared with the expected loss in resolution
(`is_int16_allowed()`).

If `set_encoding_inplace` is False, a dictionary with encodings is returned that
can be passed as argument to `ds.to_netcdf()`. If True, the encodings are set
inplace; they are stored in the `ds["var"].encoding` for each var seperate.

If encoding is specified as argument in `ds.to_netcdf()` the encoding stored in the
`ds["var"].encoding` for each var is ignored.

Parameters
----------
ds : xarray.Dataset
Dataset containing the data_vars
set_encoding_inplace : bool
Set the encoding inplace, by default True
allowed_to_data_vars : bool
If True, only data_vars that are allowed to be read are used to calculate the
minimum and maximum values to estimate the effect of precision loss.
If False, min max from dim_attrs are used. By default True.

Returns
-------
encodings : dict
Dictionary containing the encodings for each data_var

TODO: add support for strings
"""
encodings = {}
for varname, da in ds.data_vars.items():
encodings[varname] = dict(
fletcher32=True, # Store checksums to detect corruption
)
encoding = encodings[varname]

assert "_FillValue" not in da.attrs, (
f"Custom fillvalues are not supported. {varname} has a fillvalue set.")

isfloat = np.issubdtype(da.dtype, np.floating)
isint = np.issubdtype(da.dtype, np.integer)

# set the dtype, scale_factor and add_offset
if isfloat and varname in encoding_requirements and varname in dim_attrs:
dval_max = encoding_requirements[varname]["dval_max"]

if allowed_to_read_data_vars_for_minmax:
vmin = float(da.min())
vmax = float(da.max())
else:
vmin = dim_attrs[varname]["valid_min"]
vmax = dim_attrs[varname]["valid_max"]

float_as_int16 = is_int16_allowed(vmin, vmax, dval_max)

if float_as_int16:
scale_factor, add_offset = compute_scale_and_offset(vmin, vmax)
encoding["dtype"] = "int16"
encoding["scale_factor"] = scale_factor
encoding["add_offset"] = add_offset
encoding["_FillValue"] = -32767 # default for NC_SHORT
# result = (np.array([vmin, vmax]) - add_offset) / scale_factor
else:
encoding["dtype"] = "float32"

elif isint and allowed_to_read_data_vars_for_minmax:
vmin = int(da.min())
vmax = int(da.max())

if vmin >= -32766 and vmax <= 32767:
encoding["dtype"] = "int16"
elif vmin >= -2147483646 and vmax <= 2147483647:
encoding["dtype"] = "int32"
else:
encoding["dtype"] = "int64"
else:
pass

# set the compression
if isfloat or isint:
            # Strings don't support compression; only floats and ints for now.
encoding["zlib"] = True
encoding["complevel"] = 5

        if set_encoding_inplace:
            da.encoding = encoding

    if not set_encoding_inplace:
        return encodings


def compute_scale_and_offset(minValue, maxValue):
"""
    Compute the scale_factor and add_offset needed to pack a dataset with the given
    minimum and maximum values into int16. Useful for maximizing the compression of
    a dataset.

Parameters
----------
minValue : float
Minimum value of the dataset
maxValue : float
Maximum value of the dataset

Returns
-------
scale_factor : float
Scale factor for the dataset
add_offset : float
Add offset for the dataset
"""
# stretch/compress data to the available packed range
# from -32766 to 32767, because -32767 is the default fillvalue.
width = 32766 + 32767
scale_factor = (maxValue - minValue) / width
    # with this offset, minValue maps to -32766 and maxValue to 32767
    add_offset = minValue + scale_factor * (width - 1) / 2
return scale_factor, add_offset


def is_int16_allowed(vmin, vmax, dval_max):
"""compute the loss of resolution by storing a float as int16 (`dval`). And
compare it with the maximum allowed loss of resolution (`dval_max`).

Parameters
----------
vmin : float
Minimum value of the dataset
vmax : float
Maximum value of the dataset
dval_max : float
Maximum allowed loss of resolution

Returns
-------
bool
True if the loss of resolution is allowed, False otherwise"""
nsteps = 32766 + 32767
dval = (vmax - vmin) / nsteps
return dval <= dval_max
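
For context, a minimal usage sketch of the module above (not part of the PR diff; the dataset, the file name model_ds.nc and the numbers are illustrative assumptions) shows how the returned encodings are passed to ds.to_netcdf() and how compute_scale_and_offset maps the value range onto the packed int16 range from -32766 to 32767:

import numpy as np
import xarray as xr

from nlmod.dims.attributes_encodings import (
    compute_scale_and_offset,
    get_encodings,
    is_int16_allowed,
)

# Illustrative dataset; a real nlmod model dataset would be used instead.
ds = xr.Dataset({"heads": xr.DataArray(np.linspace(-5.0, 20.0, 100))})

# Request the encodings as a dictionary and hand them to xarray when writing.
encodings = get_encodings(
    ds, set_encoding_inplace=False, allowed_to_read_data_vars_for_minmax=True
)
ds.to_netcdf("model_ds.nc", encoding=encodings)  # hypothetical file name

# Packing arithmetic: vmin maps to -32766 and vmax to 32767, so the
# resolution is (vmax - vmin) / (32766 + 32767).
vmin, vmax = -5.0, 20.0
scale_factor, add_offset = compute_scale_and_offset(vmin, vmax)
assert np.isclose(-32766 * scale_factor + add_offset, vmin)
assert np.isclose(32767 * scale_factor + add_offset, vmax)

# The resolution of roughly 3.8e-4 m is well within the 0.005 m allowed
# for heads, so int16 packing is acceptable here.
assert is_int16_allowed(vmin, vmax, dval_max=0.005)

Reserving -32767 as the fill value is why the packed range spans 32766 + 32767 steps rather than the full int16 range.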
2 changes: 2 additions & 0 deletions nlmod/gwt/output.py
@@ -140,6 +140,7 @@ def get_concentration_at_gw_surface(conc, layer="layer"):

def freshwater_head(ds, hp, conc, denseref=None, drhodc=None):
"""Calculate equivalent freshwater head from point water heads.
Heads file produced by mf6 contains point water heads.

Parameters
----------
@@ -182,6 +183,7 @@ def pointwater_head(ds, hf, conc, denseref=None, drhodc=None):

def pointwater_head(ds, hf, conc, denseref=None, drhodc=None):
"""Calculate point water head from freshwater heads.
Heads file produced by mf6 contains point water heads.

Parameters
----------
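
For context, the point water and freshwater heads mentioned in these docstrings are related through the standard variable-density conversion. The sketch below is not nlmod's implementation; the names (rhoref, drhodc, z) and the numbers are illustrative assumptions for a linear density-concentration relation:

import numpy as np

rhoref = 1000.0   # freshwater reference density [kg/m3]
drhodc = 0.7143   # slope of the linear density-concentration relation
conc = 10.0       # concentration in a cell
z = -50.0         # cell elevation [mNAP]
hp = 1.2          # point water head, as stored in the mf6 heads file [mNAP]

# local density from the linear density-concentration relation
rho = rhoref + drhodc * conc

# equivalent freshwater head, and the inverse conversion back
hf = (rho / rhoref) * hp - (rho - rhoref) / rhoref * z
hp_back = (rhoref / rho) * hf + (rho - rhoref) / rho * z
assert np.isclose(hp, hp_back)

The round trip confirms that the two conversions are exact inverses of each other for a given density and elevation.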
71 changes: 71 additions & 0 deletions tests/test_019_attributes_encodings.py
@@ -0,0 +1,71 @@
import numpy as np
import os
import xarray as xr
from tempfile import TemporaryDirectory

from nlmod.dims.attributes_encodings import get_encodings


def test_encodings_float_as_int16():
"""Test if the encodings are correct."""
# Test is the encodings work for floats where degradation to int16 is allowed
data = np.arange(1.0, 4.0)
data[1] = np.nan

ds = xr.Dataset(data_vars=dict(heads=xr.DataArray(data=data)))
encodings = get_encodings(
ds, set_encoding_inplace=False, allowed_to_read_data_vars_for_minmax=True
)

# test writing to temporary netcdf file
with TemporaryDirectory() as tmpdir:
ds.to_netcdf(os.path.join(tmpdir, "test2.nc"), encoding=encodings)
ds2 = xr.open_dataset(os.path.join(tmpdir, "test2.nc"), mask_and_scale=True)

dval_max = float(ds["heads"].max() - ds["heads"].min()) / (32766 + 32767)

assert np.allclose(
ds["heads"].values, ds2["heads"].values, atol=dval_max, rtol=0.0, equal_nan=True
)
assert np.all(np.isnan(ds["heads"]) == np.isnan(ds2["heads"]))

    # Test whether the encodings work for floats where degradation to int16 is not allowed
data = np.arange(1.0, 1e6)
data[1] = np.nan

ds = xr.Dataset(data_vars=dict(heads=xr.DataArray(data=data)))
encodings = get_encodings(
ds, set_encoding_inplace=False, allowed_to_read_data_vars_for_minmax=True
)
assert encodings["heads"]["dtype"] == "float32", "dtype should be float32"


def test_encodings_inplace():
    """Test whether setting the encodings in place gives the same result."""
    # Test whether the encodings work for floats where degradation to int16 is allowed
data = np.arange(1.0, 5.0)
data[1] = np.nan

ds = xr.Dataset(data_vars=dict(heads=xr.DataArray(data=data)))
ds_inplace = ds.copy(deep=True)

encodings = get_encodings(
ds, set_encoding_inplace=False, allowed_to_read_data_vars_for_minmax=True
)
get_encodings(
ds_inplace,
        set_encoding_inplace=True,
allowed_to_read_data_vars_for_minmax=True,
)

# test writing to temporary netcdf file
with TemporaryDirectory() as tmpdir:
ds.to_netcdf(os.path.join(tmpdir, "test2.nc"), encoding=encodings)
ds2 = xr.open_dataset(os.path.join(tmpdir, "test2.nc"), mask_and_scale=True)

ds_inplace.to_netcdf(os.path.join(tmpdir, "test_inplace.nc"))
ds_inplace2 = xr.open_dataset(
os.path.join(tmpdir, "test_inplace.nc"), mask_and_scale=True
)

assert np.allclose(ds2["heads"].values, ds_inplace2["heads"].values, equal_nan=True)
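
The tests above verify the decoded values after a round trip. A complementary check, sketched here as an assumption rather than part of the PR, is that the variable really ends up packed as int16 with scale/offset attributes on disk, which can be seen by reopening the file with mask_and_scale=False:

import os
from tempfile import TemporaryDirectory

import numpy as np
import xarray as xr

from nlmod.dims.attributes_encodings import get_encodings

ds = xr.Dataset(data_vars=dict(heads=xr.DataArray(np.arange(1.0, 4.0))))
encodings = get_encodings(
    ds, set_encoding_inplace=False, allowed_to_read_data_vars_for_minmax=True
)

with TemporaryDirectory() as tmpdir:
    fname = os.path.join(tmpdir, "packed.nc")
    ds.to_netcdf(fname, encoding=encodings)
    # Without mask_and_scale the raw packed values and the packing
    # attributes written by the encoding are exposed directly.
    with xr.open_dataset(fname, mask_and_scale=False) as raw:
        assert raw["heads"].dtype == np.int16
        assert "scale_factor" in raw["heads"].attrs
        assert "add_offset" in raw["heads"].attrs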