diff --git a/tools/landusedata/README.md b/tools/landusedata/README.md
new file mode 100644
index 0000000000..60305c7ad1
--- /dev/null
+++ b/tools/landusedata/README.md
@@ -0,0 +1,55 @@
+# FATES LUH2 data tool README
+
+## Purpose
+
+This tool takes the raw Land Use Harmonization (https://luh.umd.edu/), or LUH2, data files as
+input and prepares them for use with FATES. The tool concatenates the various raw data sets into
+a single file and provides the ability to regrid the source data resolution to a target
+resolution that the user designates. The output data is then usable by FATES, mediated through
+a host land model (currently either CTSM or E3SM).
+
+For more information on how FATES utilizes this information see https://github.com/NGEET/fates/pull/1040.
+
+## Installation
+
+This tool requires conda with python3. See https://docs.conda.io/en/latest/miniconda.html#installing
+for information on installing conda on your system. To install the conda environment necessary to run the tool,
+execute the following command:
+
+conda env create -f conda-luh2.yml
+
+This will create a conda environment named "luh2". To activate this environment run:
+
+conda activate luh2
+
+For more information on creating conda environments see
+https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html#creating-an-environment-from-an-environment-yml-file
+
+Note that a subset of host land models (HLMs) and HLM-supported machines are planned to incorporate this tool into the surface dataset workflow.
+As such, if you are working on one of these machines, the output from this tool may be precomputed and available for the grid resolution of interest.
+
+## Usage
+
+After activating the "luh2" environment the tool can be run from the command line with the following minimum required inputs:
+
+python luh2.py -l <raw-luh2-datafile> -s <luh2-static-datafile> -r <regrid-targetfile> -w <regridder-output> -o <outputfile>
+
+The description of the minimum required input arguments is as follows:
+- raw-luh2-datafile: one of the three raw LUH2 datafiles (states, transitions, or management). This is the data to be regridded and used by FATES.
+- luh2-static-datafile: supplementary 0.25 deg resolution static data used in the construction of the raw LUH2 datafiles. This is used to help set the gridcell mask for the output file.
+- regrid-targetfile: host land model surface data file intended to be used in conjunction with the FATES run at a specific grid resolution. This is used as the regridder target resolution.
+- regridder-output: the path and filename at which to write the regridding weights file, or of an existing regridding weights file to reuse.
+- outputfile: the path and filename to which the output is written.
+
+The tool is intended to be run three times, sequentially, to concatenate the raw states, transitions, and management data into a single file. After the first run of
+the tool, a merge option should also be included in the argument list pointing to the most recent output file. This ensures that the output of the previous regridding run
+is merged into the current run and that the previously written regridding weights file is reused (to avoid duplicate computation).
+The luh2.sh file in this directory provides an example shell script that uses the python tool in this sequential manner. The python tool itself prints additional
+usage help when passed the `--help` option on the command line.
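+
+The packaged form of this tool under src/landusedata exposes the same workflow as a
+single `luh2` subcommand that ingests all three raw files in one call; a minimal
+sketch (the raw LUH2 filenames below are placeholders, and the positional order
+follows the subparser defined in src/landusedata/_main.py):
+
+```python
+from landusedata._main import main
+
+main(['luh2',
+      'surfdata_4x5.nc',           # regrid target surface dataset
+      'staticData_quarterdeg.nc',  # LUH2 static data file
+      'states.nc',                 # placeholder raw states file
+      'transitions.nc',            # placeholder raw transitions file
+      'management.nc',             # placeholder raw management file
+      '-w', 'regridder.nc',        # regridder weights output
+      '-o', 'LUH2_timeseries.nc']) # concatenated timeseries output
+```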
+
+## Description of directory contents
+
+- luh2.py: main luh2 python script
+- luh2mod.py: python module source file for the functions called in luh2.py
+- luh2.sh: example bash shell script file demonstrating how to call luh2.py
+- conda-luh2.yml: conda environment yaml file which defines the minimum set of package dependencies for luh2.py
diff --git a/tools/landusedata/pyproject.toml b/tools/landusedata/pyproject.toml
new file mode 100644
index 0000000000..70ca046c63
--- /dev/null
+++ b/tools/landusedata/pyproject.toml
@@ -0,0 +1,20 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project]
+name = "landusedata"
+version = "0.0.0"
+
+# Package discovery for the hatchling backend declared above
+[tool.hatch.build.targets.wheel]
+packages = ["src/landusedata"]
+
+# [tool.pytest.ini_options]
+# addopts = "-ra -q"
+# testpaths = [
+#     "tests",
+#     "integration",
+# ]
diff --git a/tools/landusedata/pytest.ini b/tools/landusedata/pytest.ini
new file mode 100644
index 0000000000..7e0cc474e3
--- /dev/null
+++ b/tools/landusedata/pytest.ini
@@ -0,0 +1,4 @@
+[pytest]
+testpaths = tests
+addopts = --verbose --durations=10 --color=yes
+# addopts = --pep8 --flakes --verbose --durations=10 --color=yes
diff --git a/tools/landusedata/src/landusedata/__init__.py b/tools/landusedata/src/landusedata/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tools/landusedata/src/landusedata/__main__.py b/tools/landusedata/src/landusedata/__main__.py
new file mode 100644
index 0000000000..a1d5e07628
--- /dev/null
+++ b/tools/landusedata/src/landusedata/__main__.py
@@ -0,0 +1,5 @@
+from landusedata._main import main
+
+# Guard against import time side effects
+if __name__ == '__main__':
+    raise SystemExit(main())
diff --git a/tools/landusedata/src/landusedata/_main.py b/tools/landusedata/src/landusedata/_main.py
new file mode 100644
index 0000000000..552e38e3a3
--- /dev/null
+++ b/tools/landusedata/src/landusedata/_main.py
@@ -0,0 +1,80 @@
+import argparse
+
+from landusedata.luh2 import main as luh2main
+from landusedata.landusepft import main as lupftmain
+
+def main(argv=None):
+
+    # Define top level parser
+    parser = argparse.ArgumentParser(description="FATES landuse data tool")
+
+    # Target regrid file - is there a nice way to share this between subparsers?
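+    # (One possibility, not wired up here: declare the shared positional once
+    #  on a parent parser and hand it to each subparser via argparse's
+    #  parents mechanism, e.g.
+    #      common = argparse.ArgumentParser(add_help=False)
+    #      common.add_argument('regrid_target_file')
+    #      subparsers.add_parser('luh2', parents=[common], ...)
+    #  )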
+    # parser.add_argument('regrid_target_file', help='target surface data file with desired grid resolution')
+
+    # Define subparser option for luh2 or landuse x pft data tool subcommands
+    subparsers = parser.add_subparsers(required=True, title='subcommands',
+                                       help='landuse data tool subcommand options')
+    luh2_parser = subparsers.add_parser('luh2', prog='luh2',
+                                        help='generate landuse harmonization timeseries data output')
+    lupft_parser = subparsers.add_parser('lupft', prog='lupft',
+                                         help='generate landuse x pft static data map output')
+
+    # Set the default called function for the subparser command
+    luh2_parser.set_defaults(func=luh2main)
+    lupft_parser.set_defaults(func=lupftmain)
+
+    # LUH2 subparser arguments
+    luh2_parser.add_argument('regrid_target_file',
+                             help='target surface data file with desired grid resolution')
+    luh2_parser.add_argument("luh2_static_file",
+                             help = "luh2 static data file")
+    luh2_parser.add_argument('luh2_states_file',
+                             help = "full path of luh2 raw states file")
+    luh2_parser.add_argument('luh2_transitions_file',
+                             help = "full path of luh2 raw transitions file")
+    luh2_parser.add_argument('luh2_management_file',
+                             help = "full path of luh2 raw management file")
+    luh2_parser.add_argument("-w", "--regridder_weights",
+                             default = 'regridder.nc',
+                             help = "filename of regridder weights to save")
+    luh2_parser.add_argument("-b","--begin",
+                             type = int,
+                             default = None,
+                             help = "beginning of the date range of interest")
+    luh2_parser.add_argument("-e","--end",
+                             type = int,
+                             default = None,
+                             help = "end of the date range of interest")
+    luh2_parser.add_argument("-o","--output",
+                             default = 'LUH2_timeseries.nc',
+                             help = "output filename")
+
+    # Landuse x pft subparser arguments
+    lupft_parser.add_argument('regrid_target_file',
+                              help='target surface data file with desired grid resolution')
+    lupft_parser.add_argument('luh2_static_file',
+                              help = "luh2 static data file")
+    lupft_parser.add_argument('clm_luhforest_file',
+                              help = "CLM5 landuse x pft forest data file (CLM5_current_luhforest_deg025.nc)")
+    lupft_parser.add_argument('clm_luhpasture_file',
+                              help = "CLM5 landuse x pft pasture data file (CLM5_current_luhpasture_deg025.nc)")
+    lupft_parser.add_argument('clm_luhother_file',
+                              help = "CLM5 landuse x pft other data file (CLM5_current_luhother_deg025.nc)")
+    lupft_parser.add_argument('clm_surface_file',
+                              help = "CLM5 surface data file (CLM5_current_surf_deg025.nc)")
+    lupft_parser.add_argument("-o","--output",
+                              default = 'fates_landuse_pft_map.nc',
+                              help = "output filename")
+
+    # Parse the arguments
+    args = parser.parse_args(argv)
+
+    # Call the default function for the given subcommand
+    args.func(args)
+
+    # Return successful completion
+    return 0
+
+# Guard against import time side effects
+if __name__ == '__main__':
+    raise SystemExit(main())
diff --git a/tools/landusedata/src/landusedata/conda-luh2.yml b/tools/landusedata/src/landusedata/conda-luh2.yml
new file mode 100644
index 0000000000..12d4a35c65
--- /dev/null
+++ b/tools/landusedata/src/landusedata/conda-luh2.yml
@@ -0,0 +1,11 @@
+# This yaml file is intended for users who wish to run the luh2.py tool on their own machines.
+# It is not yet tested regularly against the latest versions of the dependencies;
+# such regular testing is expected to be implemented in the future.
+name: luh2
+channels:
+  - conda-forge
+  - defaults
+dependencies:
+  - xesmf
+  # xarray, installed automatically as an xesmf dependency, uses scipy, which needs netcdf4 to open datasets
+  - netcdf4
diff --git a/tools/landusedata/src/landusedata/landusepft.py b/tools/landusedata/src/landusedata/landusepft.py
new file mode 100644
index 0000000000..6a396163e7
--- /dev/null
+++ b/tools/landusedata/src/landusedata/landusepft.py
@@ -0,0 +1,96 @@
+import os
+import xarray as xr
+import xesmf as xe
+
+from landusedata.landusepftmod import ImportLandusePFTFile, AddLatLonCoordinates, RenormalizePFTs
+from landusedata.utils import ImportLUH2StaticFile, ImportRegridTarget
+from landusedata.utils import SetMaskRegridTarget, DefineStaticMask
+
+def main(args):
+
+    # Open the files
+    ds_static = ImportLUH2StaticFile(args.luh2_static_file)
+    filelist = [args.clm_surface_file,
+                args.clm_luhforest_file,
+                args.clm_luhpasture_file,
+                args.clm_luhother_file]
+    ds_landusepfts = []
+    for filename in filelist:
+        ds_landusepfts.append(ImportLandusePFTFile(filename))
+
+    # Add lat/lon coordinates to the CLM5 landuse data
+    for dataset in ds_landusepfts:
+        AddLatLonCoordinates(dataset)
+
+    # Define static luh2 ice/water mask
+    mask_icwtr = DefineStaticMask(ds_static)
+
+    # Initialize the data array list and calculate the normalized,
+    # masked bareground percentage
+    percent = []
+    percent_bareground = ds_landusepfts[0].PCT_NAT_PFT.isel(natpft=0)
+    percent_bareground = (percent_bareground / 100.0) * mask_icwtr
+    percent.append(percent_bareground)
+
+    # Renormalize the PCT_NAT_PFT for each dataset using the mask
+    for dataset in ds_landusepfts:
+        percent.append(RenormalizePFTs(dataset))
+
+    # Calculate the primary and secondary PFT fractions as the forest
+    # and nonforest-weighted averages of the forest and other PFT datasets.
+    percent[2] = ds_static.fstnf * percent[2] + (1. - ds_static.fstnf) * percent[-1]
+
+    # Note that the list order is:
+    # bareground, surface data, primary, pasture, rangeland (other)
+    ds_var_names = ['frac_brgnd','frac_csurf','frac_primr','frac_pastr','frac_range']
+
+    # Open and prepare the target dataset
+    ds_target = ImportRegridTarget(args.regrid_target_file)
+
+    # Set the mask for the regrid target
+    ds_target = SetMaskRegridTarget(ds_target)
+
+    # Create an output dataset to contain individually regridded landuse percent datasets
+    ds_output = xr.Dataset()
+
+    # Loop through percentage list and regrid each entry
+    for index,data_array in enumerate(percent):
+
+        # Get the name for the new variable
+        varname = ds_var_names[index]
+
+        # Convert current percent data array into temporary dataset
+        ds_percent = data_array.to_dataset(name=varname)
+
+        # Apply mask for the current dataset
+        ds_percent['mask'] = mask_icwtr
+        if (varname != 'frac_brgnd'):
+            mask_zeropercent = xr.where(ds_percent[varname].sum(dim='natpft') == 0.,0,1)
+            ds_percent['mask'] = ds_percent.mask * mask_zeropercent
+
+        # Regrid current dataset
+        print('Regridding {}'.format(varname))
+        regridder = xe.Regridder(ds_percent, ds_target, "conservative_normed")
+        ds_regrid = regridder(ds_percent)
+
+        # Drop mask to avoid conflicts when merging
+        if (varname != 'frac_brgnd'):
+            # There is no mask currently on the bareground
+            ds_regrid = ds_regrid.drop_vars(['mask'])
+
+        # Append the new dataset to the output dataset
+        ds_output = ds_output.merge(ds_regrid)
+
+    # Duplicate the 'primary' data array into a 'secondary' data array. Eventually
+    # this will contain different data from a future CLM landuse x pft update
+    ds_output['frac_secnd'] = ds_output.frac_primr.copy(deep=True)
+
+    # ds_regrid = ds_regrid.rename_dims(dims_dict={'lat':'lsmlat','lon':'lsmlon'})
+
+    # Output dataset to netcdf file
+    print('Writing fates landuse x pft dataset to file')
+    output_file = os.path.join(os.getcwd(),args.output)
+    ds_output.to_netcdf(output_file)
+
+# Note: no standalone entry point; main(args) is invoked through the
+# top-level parser in landusedata._main, which supplies the args namespace.
diff --git a/tools/landusedata/src/landusedata/landusepftmod.py b/tools/landusedata/src/landusedata/landusepftmod.py
new file mode 100644
index 0000000000..5eea90e9f7
--- /dev/null
+++ b/tools/landusedata/src/landusedata/landusepftmod.py
@@ -0,0 +1,31 @@
+import xarray as xr
+
+# Open the CLM5 landuse x pft data file
+def ImportLandusePFTFile(filename):
+    dataset = xr.open_dataset(filename)
+
+    # Check to see if the imported dataset has the percent
+    # natural pft variable present.
+    if 'PCT_NAT_PFT' not in list(dataset.var()):
+        raise TypeError("incorrect file, must be CLM5 landuse file")
+
+    # Change the percent natural pft from single to double precision
+    # for downstream calculations
+    dataset['PCT_NAT_PFT'] = dataset.PCT_NAT_PFT.astype('float64')
+    return dataset
+
+# Add lat/lon coordinates to the CLM5 landuse dataset.
+# While the lat and lon are available as variables, they are
+# not defined as 'coords' in the imported dataset.
+def AddLatLonCoordinates(dataset):
+    dataset['lon'] = dataset.LON
+    dataset['lat'] = dataset.LAT
+    return dataset
+
+# Renormalize the pft percentages without the bareground
+def RenormalizePFTs(dataset):
+    # Remove the bareground pft index from the dataset
+    percent = dataset.PCT_NAT_PFT.isel(natpft=slice(1,None))
+    # Normalize
+    percent = percent / percent.sum(dim='natpft')
+    return percent
diff --git a/tools/landusedata/src/landusedata/luh2.py b/tools/landusedata/src/landusedata/luh2.py
new file mode 100644
index 0000000000..0ed8edf47c
--- /dev/null
+++ b/tools/landusedata/src/landusedata/luh2.py
@@ -0,0 +1,105 @@
+import os
+
+import xarray as xr
+
+from landusedata.luh2mod import ImportLUH2TimeSeries, CorrectStateSum
+from landusedata.regrid import RegridConservative
+from landusedata.utils import ImportLUH2StaticFile, ImportRegridTarget
+from landusedata.utils import SetMaskRegridTarget, DefineStaticMask
+
+
+# Add version checking here in case environment.yml not used
+def main(args):
+
+    # Import and prep the LUH2 datasets and regrid target
+    filelist = [args.luh2_states_file,
+                args.luh2_transitions_file,
+                args.luh2_management_file]
+    ds_luh2 = []
+    for filename in filelist:
+        ds_luh2.append(ImportLUH2TimeSeries(filename,start=args.begin,stop=args.end))
+
+    # Import the LUH2 static data to use for masking
+    ds_luh2_static = ImportLUH2StaticFile(args.luh2_static_file)
+
+    # Create a new variable in which the ice/water fraction is inverted,
+    # i.e. the land fraction
+    ds_luh2_static["landfrac"] = 1 - ds_luh2_static.icwtr
+
+    # Define static luh2 ice/water mask
+    mask_icwtr = DefineStaticMask(ds_luh2_static)
+
+    # Mask all LUH2 input data using the ice/water fraction for the LUH2 static data
+    ds_luh2_static['mask'] = mask_icwtr
+    for dataset in ds_luh2:
+        dataset['mask'] = mask_icwtr
+
+    # Import and prep the regrid target surface dataset
+    ds_regrid_target = ImportRegridTarget(args.regrid_target_file)
+
+    # Mask the regrid target
+    ds_regrid_target = SetMaskRegridTarget(ds_regrid_target)
+
+    # Create an output dataset to contain individually regridded landuse percent datasets
+    ds_output = xr.Dataset()
+
+    # Loop through the data set list, regrid and merge the sets.
+    # At the beginning of the loop, there is no regrid weights file to reuse
+    regrid_reuse = False
+    for dataset in ds_luh2:
+
+        # Regrid the luh2 data to the target grid
+        # TO DO: provide a check for the save argument based on the input arguments
+        regrid_luh2,regridder_luh2 = RegridConservative(dataset, ds_regrid_target,
+                                                        args.regridder_weights, regrid_reuse)
+
+        # Regrid the inverted ice/water fraction data to the target grid
+        regrid_land_fraction = regridder_luh2(ds_luh2_static)
+
+        # Adjust the luh2 data by the land fraction
+        # TO DO: determine if this is necessary for the transitions and management data
+        regrid_luh2 = regrid_luh2 / regrid_land_fraction.landfrac
+
+        # Correct the state sum (checks if argument passed is state file in the function)
+        regrid_luh2 = CorrectStateSum(regrid_luh2)
+
+        # Add additional required variables for the host land model.
+        # Add 'YEAR' as a variable.
+        # If we are merging, we might not need to do this, so check to see if it is there already.
+        # This is a requirement of the HLM dyn_subgrid module and should be the actual year.
+        # Note that the time variable from the LUH2 data is 'years since ...' so we need to
+        # add the input data year
+        if (not "YEAR" in list(regrid_luh2.variables)):
+            regrid_luh2["YEAR"] = regrid_luh2.time + dataset.timesince
+            regrid_luh2["LONGXY"] = ds_regrid_target["LONGXY"] # TO DO: double check if this is strictly necessary
+            regrid_luh2["LATIXY"] = ds_regrid_target["LATIXY"] # TO DO: double check if this is strictly necessary
+
+        # Rename the dimensions for the output. This needs to happen after the "LONGXY/LATIXY" assignment
+        if (not 'lsmlat' in list(regrid_luh2.dims)):
+            regrid_luh2 = regrid_luh2.rename_dims({'lat':'lsmlat','lon':'lsmlon'})
+
+        # Reapply the coordinate attributes. This is a workaround for an xarray bug (#8047)
+        # Currently only need time
+        regrid_luh2.time.attrs = dataset.time.attrs
+        regrid_luh2.lat.attrs = dataset.lat.attrs
+        regrid_luh2.lon.attrs = dataset.lon.attrs
+
+        # Merge the current regridded luh2 data into the running output dataset
+        # TO DO: check that the grid resolution matches
+        # We could do this with an append during the write phase instead of the merge.
+        # What is the performance impact of writing to disk between dataset regrids
+        # versus holding the output in memory?
+        ds_output = ds_output.merge(regrid_luh2)
+
+        # Reuse the weights file generated on the first pass of the loop
+        # for all subsequent regrids
+        regrid_reuse = True
+
+    # Write the files
+    # TO DO: add check to handle if the user enters the full path
+    output_file = os.path.join(os.getcwd(),args.output)
+    print("generating output: {}".format(output_file))
+    ds_output.to_netcdf(output_file)
+
+# Note: no standalone entry point; main(args) is invoked through the
+# top-level parser in landusedata._main, which supplies the args namespace.
diff --git a/tools/landusedata/src/landusedata/luh2mod.py b/tools/landusedata/src/landusedata/luh2mod.py
new file mode 100644
index 0000000000..c4c3743a33
--- /dev/null
+++ b/tools/landusedata/src/landusedata/luh2mod.py
@@ -0,0 +1,110 @@
+import sys
+import numpy as np
+import xarray as xr
+
+# Import luh2 or surface data sets
+def ImportLUH2TimeSeries(input_file,start=None,stop=None):
+
+    # Open files
+    # Set decode_times to false as the luh2 raw data is outside the range
+    # of the standard NetCDF datetime format.
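+    # With decode_times=False the time coordinate is read as raw numeric
+    # offsets; PrepDataset below recovers the 'years since' reference year
+    # and slices the requested date range against it.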
+    datasetout = xr.open_dataset(input_file, cache=False, decode_times=False)
+    print("Input file dataset opened: {}".format(input_file))
+
+    # Prep the input data for use
+    datasetout = PrepDataset(datasetout,start,stop)
+
+    return(datasetout)
+
+# Prepare the input_file to be used for regridding
+def PrepDataset(input_dataset,start=None,stop=None):
+
+    # Use the maximum span if start and stop are not present.
+    # This assumes that the luh2 raw data will always use a
+    # 'years since' style format.
+
+    # Get the units to determine the file time.
+    # It is expected that the time units are 'years since ...'
+    time_since_array = input_dataset.time.units.split()
+    if (time_since_array[0] != 'years'):
+        sys.exit("FileTimeUnitsError: input file units of time is not 'years since ...'")
+
+    # Note that the datetime package is not used as the date range might
+    # be beyond the package's applicable bounds
+    time_since = int(time_since_array[2].split('-')[0])
+
+    # Get the time bounds of the input file
+    start_bound = input_dataset.time.values[0]
+    stop_bound = input_dataset.time.values[-1]
+
+    # If no input provided, simply get the bounds of the time
+    if start is None:
+        start = start_bound + time_since
+
+    if stop is None:
+        stop = stop_bound + time_since
+
+    # Convert the input dates to years since the file's reference year
+    years_since_start = start - time_since
+    years_since_stop = stop - time_since
+
+    # Abort if the times provided are outside the applicable range
+    if (years_since_start < start_bound or years_since_stop < start_bound or
+        years_since_start > stop_bound or years_since_stop > stop_bound):
+        sys.exit("StartStopBoundError: the input start or stop date is outside the applicable range of {} to {}".format(time_since+start_bound,time_since+stop_bound))
+
+    # Truncate the data to the user defined range.
+    # This might need some more error handling for when
+    # the start/stop is out of range
+    input_dataset = input_dataset.sel(time=slice(years_since_start,years_since_stop))
+
+    # Save the timesince as a variable for future use
+    input_dataset["timesince"] = time_since
+
+    # Correct the boundary variables needed for regridding
+    input_dataset = _BoundsVariableFixLUH2(input_dataset)
+
+    return(input_dataset)
+
+# Create the necessary variables "lat_b" and "lon_b" for xESMF conservative regridding.
+# Each lat/lon boundary array is a 2D array corresponding to the bounds of each
+# coordinate position (e.g. lat_boundary would be 90.0 and 89.75 for lat coordinate
+# of 89.875).
+def _BoundsVariableFixLUH2(input_dataset):
+
+    # Create lat and lon bounds as a single dimension array out of the LUH2 two-dimensional bounds array.
+    # Future todo: is it possible to have xESMF recognize and use the original 2D array?
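+    # Each cell contributes its upper bound; prepending the first cell's
+    # lower bound yields the N+1 edge values that xESMF expects for
+    # conservative regridding.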
+ input_dataset["lat_b"] = np.insert(input_dataset.lat_bounds[:,1].data,0,input_dataset.lat_bounds[0,0].data) + input_dataset["lon_b"] = np.insert(input_dataset.lon_bounds[:,1].data,0,input_dataset.lon_bounds[0,0].data) + + # Drop the old boundary names to avoid confusion + input_dataset = input_dataset.drop(labels=['lat_bounds','lon_bounds']) + + print("LUH2 dataset lat/lon boundary variables formatted and added as new variable for xESMF") + + return(input_dataset) + +# Temporary: Add minor correction factor to assure states sum to one +def CorrectStateSum(input_dataset): + + # Only calculate the state sum to unity correction for the appropiate dataset + # TO DO: Update this to use the check function + if (not(any('irrig' in var for var in input_dataset) or + any('_to_' in var for var in input_dataset))): + + # Drop the secma and secmb variables temporarily + temp_dataset = input_dataset.drop({'secma','secmb'}) + + # Sum the remaining state variables and normalize + state_sum = temp_dataset.to_array().sum(dim='variable') + state_sum = state_sum.where(state_sum != 0) + temp_dataset = temp_dataset / state_sum + + # Update dataset with new scaled values + input_dataset.update(temp_dataset) + + # Save the correction value + input_dataset["stscf"] = 1.0 / state_sum + + return(input_dataset) diff --git a/tools/landusedata/src/landusedata/regrid.py b/tools/landusedata/src/landusedata/regrid.py new file mode 100644 index 0000000000..7074a05ed9 --- /dev/null +++ b/tools/landusedata/src/landusedata/regrid.py @@ -0,0 +1,64 @@ +import xesmf as xe + +def RegridConservative(ds_to_regrid, ds_regrid_target, regridder_weights, regrid_reuse): + + # define the regridder transformation + regridder = GenerateRegridder(ds_to_regrid, ds_regrid_target, regridder_weights, regrid_reuse) + + # Loop through the variables to regrid + ds_regrid = RegridLoop(ds_to_regrid, regridder) + + return (ds_regrid, regridder) + +def GenerateRegridder(ds_to_regrid, ds_regrid_target, regridder_weights_file, regrid_reuse): + + regrid_method = "conservative" + print("\nDefining regridder, method: ", regrid_method) + + if (regrid_reuse): + regridder = xe.Regridder(ds_to_regrid, ds_regrid_target, + regrid_method, weights=regridder_weights_file) + else: + regridder = xe.Regridder(ds_to_regrid, ds_regrid_target, regrid_method) + + # If we are not reusing the regridder weights file, then save the regridder + filename = regridder.to_netcdf(regridder_weights_file) + print("regridder saved to file: ", filename) + + return(regridder) + +def RegridLoop(ds_to_regrid, regridder): + + # To Do: implement this with dask + print("\nRegridding") + + # Loop through the variables one at a time to conserve memory + ds_varnames = list(ds_to_regrid.variables.keys()) + varlen = len(ds_to_regrid.variables) + first_var = False + for i in range(varlen-1): + + # Skip time variable + if (not "time" in ds_varnames[i]): + + # Only regrid variables that match the lat/lon shape. + if (ds_to_regrid[ds_varnames[i]][0].shape == (ds_to_regrid.lat.shape[0], ds_to_regrid.lon.shape[0])): + print("regridding variable {}/{}: {}".format(i+1, varlen, ds_varnames[i])) + + # For the first non-coordinate variable, copy and regrid the dataset as a whole. + # This makes sure to correctly include the lat/lon in the regridding. 
+                if (not(first_var)):
+                    ds_regrid = ds_to_regrid[ds_varnames[i]].to_dataset() # convert data array to dataset
+                    ds_regrid = regridder(ds_regrid)
+                    first_var = True
+
+                # Once the first variable has been included, then we can regrid by variable
+                else:
+                    ds_regrid[ds_varnames[i]] = regridder(ds_to_regrid[ds_varnames[i]])
+            else:
+                print("skipping variable {}/{}: {}".format(i+1, varlen, ds_varnames[i]))
+        else:
+            print("skipping variable {}/{}: {}".format(i+1, varlen, ds_varnames[i]))
+
+    print("\n")
+    return(ds_regrid)
diff --git a/tools/landusedata/src/landusedata/utils.py b/tools/landusedata/src/landusedata/utils.py
new file mode 100644
index 0000000000..addb5076b1
--- /dev/null
+++ b/tools/landusedata/src/landusedata/utils.py
@@ -0,0 +1,65 @@
+import xarray as xr
+
+def ImportRegridTarget(filename):
+    dataset = xr.open_dataset(filename)
+
+    # Check the file type
+    if 'lsmlat' not in dataset.dims:
+        raise TypeError("incorrect file, must be surface dataset")
+
+    # Prepare the regrid dataset
+    dataset = _RegridTargetPrep(dataset)
+
+    return dataset
+
+def _RegridTargetPrep(regrid_target):
+    """
+    Rename the CLM surface data file regrid target lat/lon dimensions
+
+    The CLM surface datasets use dimension names that are not recognizable
+    by xesmf. As such, this function renames the dimensions. It also adds
+    lat/lon coordinates based on the LONGXY and LATIXY variables.
+    """
+    regrid_target = regrid_target.rename_dims(dims_dict={'lsmlat':'lat','lsmlon':'lon'})
+    regrid_target['lon'] = regrid_target.LONGXY.isel(lat=0)
+    regrid_target['lat'] = regrid_target.LATIXY.isel(lon=0)
+
+    return regrid_target
+
+# Open the LUH2 static data file
+def ImportLUH2StaticFile(filename):
+    dataset = xr.open_dataset(filename)
+
+    # Check to see if the imported dataset has the correct variables
+    listcheck = ['ptbio', 'fstnf', 'carea', 'icwtr', 'ccode', 'lat_bounds', 'lon_bounds']
+    if list(dataset.var()) != listcheck:
+        raise TypeError("incorrect file, must be LUH2 static file")
+
+    # Convert all data from single to double precision
+    dataset = dataset.astype('float64')
+    return dataset
+
+# Define the land/ocean mask based on the ice/water data
+# from the LUH2 static data set
+def DefineStaticMask(dataset):
+    try:
+        mask = xr.where(dataset.icwtr != 1, 1, 0)
+        # mask = (1.-dataset.icwtr) / (1.-dataset.icwtr)
+    except AttributeError:
+        raise AttributeError("incorrect dataset, must be static luh2 dataset")
+    else:
+        return mask
+
+# Surface dataset specific masking sub-function
+def SetMaskRegridTarget(dataset):
+    try:
+        dataset["mask"] = xr.where(dataset.PCT_NATVEG > 0, 1, 0)
+    except AttributeError:
+        raise AttributeError("incorrect dataset, must be CLM surface dataset")
+    else:
+        return(dataset)
+
+# TODO: add the following common functions:
+# - write to files
+# - read dataset into list of datasets
diff --git a/tools/landusedata/tests/conftest.py b/tools/landusedata/tests/conftest.py
new file mode 100644
index 0000000000..78d647af13
--- /dev/null
+++ b/tools/landusedata/tests/conftest.py
@@ -0,0 +1,44 @@
+import pytest
+import xarray as xr
+
+@pytest.fixture(scope="module")
+def target_file_location():
+    return 'tests/resources/surfdata_4x5.nc'
+
+@pytest.fixture(scope="module")
+def static_file_location():
+    return 'tests/resources/staticData_quarterdeg.nc'
+
+@pytest.fixture(scope="module")
+def landusepft_file_location():
+    return 'tests/resources/CLM5_current_luhforest_deg025.nc'
+
+@pytest.fixture(scope="module")
+def lupftsurf_file_location():
+    return 'tests/resources/CLM5_current_surf_deg025.nc'
+
+@pytest.fixture(scope="function")
+def static_dataset(static_file_location):
+    dataset = xr.open_dataset(static_file_location)
+    dataset = dataset.astype('float64')
+    return dataset
+
+@pytest.fixture(scope="function")
+def target_dataset(target_file_location):
+    dataset = xr.open_dataset(target_file_location)
+    dataset = dataset.astype('float64')
+    return dataset
+
+@pytest.fixture(scope="function")
+def landusepft_dataset(landusepft_file_location):
+    dataset = xr.open_dataset(landusepft_file_location)
+    dataset = dataset.astype('float64',casting='safe')
+    return dataset
+
+@pytest.fixture(scope="function")
+def mock_mask(static_file_location):
+    dataset = xr.open_dataset(static_file_location)
+    # This duplicates the NaN-based variant of utils.DefineStaticMask. Should that simply be called?
+    mask = (1.-dataset.icwtr) / (1.-dataset.icwtr)
+    return mask
diff --git a/tools/landusedata/tests/landusepft_test.py b/tools/landusedata/tests/landusepft_test.py
new file mode 100644
index 0000000000..ee08c4f47b
--- /dev/null
+++ b/tools/landusedata/tests/landusepft_test.py
@@ -0,0 +1,91 @@
+import pytest
+import numpy as np
+
+from landusedata import landusepftmod
+from landusedata import utils
+
+# Positive test case for importing landuse x pft data file
+# TODO: parameterize this call to run all the landusepft files
+def test_posImportLandusePFTFile(landusepft_file_location):
+    data = landusepftmod.ImportLandusePFTFile(landusepft_file_location)
+    landusepft_variables = list(data.var())
+    assert landusepft_variables == ['EDGEN',
+                                    'EDGEE',
+                                    'EDGES',
+                                    'EDGEW',
+                                    'LAT',
+                                    'LATIXY',
+                                    'LON',
+                                    'LONGXY',
+                                    'LANDMASK',
+                                    'LANDFRAC',
+                                    'AREA',
+                                    'PCT_NAT_PFT']
+
+# Negative test case for importing incorrect file via CLM5 landuse file open function
+def test_negImportLandusePFTFile(static_file_location):
+    with pytest.raises(TypeError) as exp:
+        landusepftmod.ImportLandusePFTFile(static_file_location)
+    assert str(exp.value) == "incorrect file, must be CLM5 landuse file"
+
+# Confirm that the data type has been converted upon import from single to double precision
+def test_pos_ImportLandusePFTFileDtypeConversion(landusepft_file_location):
+    dataset = landusepftmod.ImportLandusePFTFile(landusepft_file_location)
+    assert np.dtype('float64') == dataset['PCT_NAT_PFT'].dtype
+
+# Unit tests to update CLM5 landuse datasets with lat/lon coordinates
+def test_latlon_posAddLatLonCoordinates(landusepft_dataset):
+    dataset_modified = landusepftmod.AddLatLonCoordinates(landusepft_dataset)
+    assert 'lat' in list(dataset_modified.coords) and 'lon' in list(dataset_modified.coords)
+
+# Test that the math of the function makes sense. We can sum the lat/lon variables
+# to make sure that the update is correct
+def test_latlonsum_posAddLatLonCoordinates(landusepft_dataset):
+    dataset_modified = landusepftmod.AddLatLonCoordinates(landusepft_dataset)
+    assert (sum(abs(dataset_modified.lon)).values.item() == 129600.
+            and sum(abs(dataset_modified.lat)).values.item() == 32400.)
+
+# Static mask function (utils.DefineStaticMask) unit tests
+# - Is there a better way to simply test that the mask matches all gridcells?
+# - Should we test what happens when the wrong dataset is used (i.e. landuse x pft)?
+# - Should this test that the mask is truly one or zero everywhere?
+
+# Make sure that the generated mask makes sense in that a known ice/water gridcell is 0
+def test_posDefineStaticMask(static_dataset):
+    maskoutput = utils.DefineStaticMask(static_dataset)
+    assert maskoutput[0][0] == 0
+
+# Make sure that the generated mask is 1 everywhere it isn't 0
+def test_pos_ones_DefineStaticMask(static_dataset):
+    maskoutput = utils.DefineStaticMask(static_dataset)
+    assert ((maskoutput == 0) | (maskoutput == 1)).all().values.item()
+
+# Make sure a known gridcell that should be land is not masked out
+# TODO: change this to use sel indexing
+def test_neg_output_DefineStaticMask(static_dataset):
+    maskoutput = utils.DefineStaticMask(static_dataset)
+    assert maskoutput[360][850] == 1
+
+def test_neg_input_DefineStaticMask(landusepft_dataset):
+    with pytest.raises(AttributeError) as exp:
+        maskoutput = utils.DefineStaticMask(landusepft_dataset)
+    assert str(exp.value) == "incorrect dataset, must be static luh2 dataset"
+
+# Bare ground fraction removal unit test
+# Make sure that the pft fractions sum to 100% on a known land gridcell
+def test_posRenormalizePFTs(landusepft_dataset):
+    percent = landusepftmod.RenormalizePFTs(landusepft_dataset)
+
+    # Sum along the natpft dimension only. Use min_count to avoid
+    # NaNs from masking turning into zeros
+    percent = percent.sum(dim='natpft',min_count=1)
+
+    # Convert to a dataframe to stack lat and lon in one dimension
+    # and drop the NaNs. Note that since percent is unassociated
+    # with a dataset we need to pass the name argument when converting
+    # to a dataframe.
+    percent = percent.to_dataframe(name="testpercent").dropna(how='all')
+
+    # Check that all the summations are unity within a specific tolerance
+    tolerance = 3.0e-16
+    assert (abs(percent - 1.0) < tolerance).all().values.item()
diff --git a/tools/landusedata/tests/luh2_test.py b/tools/landusedata/tests/luh2_test.py
new file mode 100644
index 0000000000..a5da26ae81
--- /dev/null
+++ b/tools/landusedata/tests/luh2_test.py
@@ -0,0 +1,3 @@
+import pytest
+
+from landusedata import luh2mod
diff --git a/tools/landusedata/tests/main_test.py b/tools/landusedata/tests/main_test.py
new file mode 100644
index 0000000000..c43be66323
--- /dev/null
+++ b/tools/landusedata/tests/main_test.py
@@ -0,0 +1,30 @@
+import pytest
+
+from landusedata._main import main
+
+# TODO: parameterize this
+def test_main(capsys):
+    """
+    Positive case testing for choice input
+    Note that the regridfile is a global argument to all landusedata subcommands
+    and thus must come last.
+    """
+    statefile = 'statefile'
+    regridfile = 'regridfile'
+    main(['luh2',statefile,regridfile])
+    out, err = capsys.readouterr()
+    assert out == 'calling luh2 code: {},{}\n'.format(regridfile,statefile)
+    assert err == ''
+
+def test_main_neg(capsys):
+    """
+    Negative case testing for choice input
+
+    This only tests the system exit. Testing the ArgumentError
+    is a little tricky to catch as it is upstream of SystemExit.
+ """ + with pytest.raises(SystemExit) as exp: + main(['notallowed']) + assert str(exp.value) == "2" diff --git a/tools/landusedata/tests/resources/README.md b/tools/landusedata/tests/resources/README.md new file mode 100644 index 0000000000..6bb3681a1e --- /dev/null +++ b/tools/landusedata/tests/resources/README.md @@ -0,0 +1,2 @@ +This resources directory is used to temporarily store +mock data for unit testing during test development diff --git a/tools/landusedata/tests/utils_test.py b/tools/landusedata/tests/utils_test.py new file mode 100644 index 0000000000..1e6130a222 --- /dev/null +++ b/tools/landusedata/tests/utils_test.py @@ -0,0 +1,47 @@ +import pytest + +from landusedata import utils + +def test_RegridTargetPrep_dims(target_dataset): + target_dataset = utils._RegridTargetPrep(target_dataset) + dim_list = list(target_dataset.dims) + latlon_list = ['lat','lon'] + test_list = [dim for latlon in latlon_list for dim in dim_list + if latlon in dim] + assert latlon_list == test_list + +def test_RegridTargetPrep_coords(target_dataset): + target_dataset = utils._RegridTargetPrep(target_dataset) + coords_list = list(target_dataset.coords) + latlon_list = ['lat','lon'] + test_list = [coord for latlon in latlon_list for coord in coords_list + if latlon in coord] + assert latlon_list == test_list + +def test_negImportRegridTarget(landusepft_file_location): + with pytest.raises(TypeError) as exp: + utils.ImportRegridTarget(landusepft_file_location) + assert str(exp.value) == "incorrect file, must be surface dataset" + +def test_negSetMaskRegridTarget(landusepft_dataset): + with pytest.raises(AttributeError) as exp: + maskoutput = utils.SetMaskRegridTarget(landusepft_dataset) + assert str(exp.value) == "incorrect dataset, must be CLM surface dataset" + +# Postive test case for importing LUH2 static data file +def test_posImportLUH2StaticFile(static_file_location): + data = utils.ImportLUH2StaticFile(static_file_location) + static_variables = list(data.var()) + assert static_variables == ['ptbio', + 'fstnf', + 'carea', + 'icwtr', + 'ccode', + 'lat_bounds', + 'lon_bounds'] + +# Negative test case for importing incorrect file via static luh2 file open function +def test_negImportLUH2StaticFile(landusepft_file_location): + with pytest.raises(TypeError) as exp: + utils.ImportLUH2StaticFile(landusepft_file_location) + assert str(exp.value) == "incorrect file, must be LUH2 static file"