PR: Make reading water level input files more robust #405

Merged · 12 commits · Mar 3, 2022
2 changes: 0 additions & 2 deletions .gitignore
@@ -53,8 +53,6 @@ docs/_build/
target/

# ---- Tests output files

water_level_datafile.*
waterlvl_manual_measurements.*
glue_rawdata.npy

4 changes: 2 additions & 2 deletions gwhat/projet/manager_data.py
@@ -267,11 +267,11 @@ def new_wldset_imported(self, name, dataset):
Receives the new water level dataset, saves it in the project and
updates the GUI.
"""
print("Saving the new water level dataset in the project...", end=" ")
print("Saving the new water level dataset in the project...")
self.projet.add_wldset(name, dataset)
self.update_wldsets(name)
self.wldset_changed()
print("done")
print("New water level dataset saved in the project successfully.")

def update_wldsets(self, name=None):
self.wldsets_cbox.blockSignals(True)
10 changes: 6 additions & 4 deletions gwhat/projet/reader_projet.py
@@ -275,12 +275,14 @@ def add_wldset(self, name, df):
mmeas.create_dataset('Time', data=np.array([]), maxshape=(None,))
mmeas.create_dataset('WL', data=np.array([]), maxshape=(None,))

self.db.flush()

print('New dataset created successfully')
except Exception:
print('Unable to save dataset to project db')
except Exception as e:
print('Unable to save dataset to project db because of the '
'following error:')
print(e)
del self.db['wldsets'][name]
finally:
self.db.flush()

return WLDatasetHDF5(grp)
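
The revised `add_wldset` now reports the underlying exception, removes the partially created group on failure, and flushes the HDF5 store in all cases via `finally`. A minimal sketch of that rollback pattern with h5py, using a hypothetical `db` handle and group layout rather than the actual GWHAT project file:

```python
import h5py
import numpy as np

def add_dataset(db: h5py.File, name: str, times: np.ndarray):
    """Add a dataset under 'wldsets/<name>' and roll back on failure."""
    grp = db['wldsets'].create_group(name)
    try:
        grp.create_dataset('Time', data=times, maxshape=(None,))
    except Exception as e:
        # Report the error and remove the partially written group.
        print('Unable to save dataset to project db because of the '
              'following error:')
        print(e)
        del db['wldsets'][name]
    finally:
        # Flush in every case so the file on disk stays consistent.
        db.flush()
```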

209 changes: 116 additions & 93 deletions gwhat/projet/reader_waterlvl.py
@@ -22,8 +22,6 @@
import xlrd
import openpyxl

# ---- Local library imports
from gwhat.common.utils import save_content_to_csv

FILE_EXTS = ['.csv', '.xls', '.xlsx']

@@ -47,7 +45,7 @@
'Longitude': 0,
'Elevation': 0}
HEADER_REGEX = {
'Well': r'(?<!\S)(wellname|name)(:|=)?(?!\S)',
'Well': r'(?<!\S)(well|wellname|name)(:|=)?(?!\S)',
'Well ID': r'(?<!\S)(wellid|id)(:|=)?(?!\S)',
'Province': r'(?<!\S)(province|prov)(:|=)?(?!\S)',
'Municipality': r'(?<!\S)municipality(:|=)?(?!\S)',
@@ -57,66 +55,99 @@
}
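
The broadened pattern above means a header row labelled simply "Well" is now recognised, in addition to "Well name" or "Name". A quick check of the new pattern on its own (labels are normalised by stripping spaces and underscores, as done later in `read_water_level_datafile`); this snippet is illustrative only and not part of the PR:

```python
import re

WELL_REGEX = r'(?<!\S)(well|wellname|name)(:|=)?(?!\S)'

for raw in ('Well name = ', 'Name:', 'Well'):
    label = raw.replace(' ', '').replace('_', '')
    # A bare 'Well' only matches now that the 'well' alternative was added.
    print(repr(raw), bool(re.search(WELL_REGEX, label, re.IGNORECASE)))
```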


class WLDataFrame(pd.DataFrame):
def __init__(self, data: list = None, columns: list = None):
super().__init__(data=[], columns=COLUMNS)
self.set_index([INDEX], drop=True, inplace=True)
def _format_column_names(df):
"""
Rename valid columns, drop invalid columns, and add missing columns.
"""
# Rename valid columns and drop invalid columns.
drop = []
rename = {}
for column in df.columns:
for colname, regex in COL_REGEX.items():
str_ = column.replace(" ", "").replace("_", "")
if re.search(regex, str_, re.IGNORECASE):
rename[column] = colname
break
else:
drop.append(column)
df = df.rename(columns=rename)
df = df.drop(columns=drop)

if data is not None and columns is not None:
df = pd.DataFrame(data, columns=columns)
for column in columns:
for colname, regex in COL_REGEX.items():
str_ = column.replace(" ", "").replace("_", "")
if re.search(regex, str_, re.IGNORECASE):
self[colname] = df[column].copy()
break
del df
self.format_numeric_data()
self.format_datetime_data()
# Add missing columns.
for column in COLUMNS:
if column not in df.columns:
df[column] = np.nan

@property
def _constructor(self):
return WLDataFrame

def format_numeric_data(self):
"""Format the data to floats type."""
for colname in COLUMNS:
if colname == INDEX:
pass
elif colname in self.columns:
self[colname] = pd.to_numeric(self[colname], errors='coerce')
else:
print('WARNING: no "%s" data found in the datafile.' % colname)

def format_datetime_data(self):
"""Format the dates to datetimes and set it as index."""
if INDEX in self.columns:
return df[COLUMNS].copy()
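
`_format_column_names` maps whatever column headers appear in the input file onto the canonical names, drops unrecognised columns, and adds any missing ones as NaN. A rough illustration; the `COL_REGEX` patterns below are simplified stand-ins (the real ones are defined earlier in `reader_waterlvl.py` and not shown in this diff):

```python
import re
import numpy as np
import pandas as pd

COLUMNS = ['Time', 'WL', 'BP', 'ET']          # assumed canonical columns
COL_REGEX = {'Time': r'(date|time)', 'WL': r'wl',
             'BP': r'bp', 'ET': r'et'}        # simplified patterns

raw = pd.DataFrame([['2012-11-28 16:45', 3.667, 10.333, 'extra']],
                   columns=['Date', 'WL (mbgs)', 'BP (m)', 'Comment'])

rename, drop = {}, []
for column in raw.columns:
    for colname, regex in COL_REGEX.items():
        if re.search(regex, column.replace(' ', ''), re.IGNORECASE):
            rename[column] = colname
            break
    else:
        drop.append(column)                   # e.g. 'Comment' is dropped

df = raw.rename(columns=rename).drop(columns=drop)
for column in COLUMNS:
    if column not in df.columns:
        df[column] = np.nan                   # e.g. 'ET' is added as NaN
print(df[COLUMNS])
```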


def _format_numeric_data(df):
"""Format the data to floats type."""
for colname in COLUMNS:
if colname != INDEX and colname in df.columns:
df[colname] = pd.to_numeric(df[colname], errors='coerce')
return df
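
`_format_numeric_data` coerces every non-index column to floats, so unparseable cells become NaN instead of aborting the import. For example:

```python
import pandas as pd

wl = pd.Series(['3.667', ' 3.665 ', 'n/a'])
print(pd.to_numeric(wl, errors='coerce'))  # 3.667, 3.665, NaN
```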


def _format_datetime_data(df):
"""Format the dates to datetimes and set it as index."""
if INDEX in df.columns:
if df['Time'].dtypes != 'datetime64[ns]':
try:
# We assume first that the dates are stored in the
# We check first if the dates are stored in the
# Excel numeric format.
datetimes = self['Time'].astype('float64', errors='raise')
datetimes = pd.to_datetime(datetimes.apply(
lambda date: xlrd.xldate.xldate_as_datetime(date, 0)))
except ValueError:
datetimes = df['Time'].astype('float64', errors='raise')
datetimes = pd.to_datetime(
datetimes.apply(
lambda date: xlrd.xldate.xldate_as_datetime(date, 0)))

# Get rid of milliseconds to avoid introducing
# round-off errors.
datetimes = datetimes.dt.round('S')

df['Time'] = datetimes
except (ValueError, TypeError) as e:
try:
# Try converting the strings to datetime objects.
# The format of the datetime strings must be
# "%Y-%m-%d %H:%M:%S"
datetimes = pd.to_datetime(
self['Time'], infer_datetime_format=True)
# We assume that dates are stored as strings.
df['Time'] = pd.to_datetime(
df['Time'], infer_datetime_format=True)
except ValueError:
print('WARNING: the dates are not formatted correctly.')
finally:
self['Time'] = datetimes
self.set_index(['Time'], drop=True, inplace=True)

df.set_index(['Time'], drop=True, inplace=True)
else:
print('WARNING: no "Time" data found in the datafile.')
return df
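
`_format_datetime_data` first tries to read the `Time` column as Excel serial date numbers and falls back to parsing date strings when that raises. A small sketch of both paths outside the class (datemode 0 in `xldate_as_datetime` corresponds to the 1900 date system):

```python
import pandas as pd
import xlrd

def to_datetimes(time_col: pd.Series) -> pd.Series:
    try:
        # Excel serial dates, e.g. 41241.69792 -> 2012-11-28 16:45.
        serial = time_col.astype('float64', errors='raise')
        datetimes = pd.to_datetime(
            serial.apply(lambda d: xlrd.xldate.xldate_as_datetime(d, 0)))
        # Drop sub-second residue introduced by the float representation.
        return datetimes.dt.round('S')
    except (ValueError, TypeError):
        # Fall back to date strings such as "2012-11-28 16:45".
        return pd.to_datetime(time_col, infer_datetime_format=True)

print(to_datetimes(pd.Series([41241.69792, 41241.70833])))
print(to_datetimes(pd.Series(['2012-11-28 16:45', '2012-11-28 17:00'])))
```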


def _drop_duplicates(df):
"""
Drop duplicated indexes from the dataframe.
"""
if df.index.duplicated(keep='first').any():
print("WARNING: Duplicated values were found in the datafile. "
"Only the first entries for each date were kept.")
index = df.index.drop_duplicates(keep='first')
df = df.loc[index]
return df
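
`_drop_duplicates` keeps only the first entry for each repeated date. The same behaviour, shown with the common `index.duplicated` idiom rather than the exact code from this PR:

```python
import pandas as pd

df = pd.DataFrame(
    {'WL': [3.667, 3.665, 3.665]},
    index=pd.to_datetime(
        ['2012-11-28 16:45', '2012-11-28 16:45', '2012-11-28 17:00']))

# Keep the first row for each duplicated timestamp, drop the rest.
print(df[~df.index.duplicated(keep='first')])
```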


class WLDataFrame(pd.DataFrame):
def __init__(self, data=None, columns=None, metadata=None):
if data is None:
super().__init__(data=[], columns=COLUMNS)
self.set_index(INDEX, drop=True, inplace=True)
else:
print('WARNING: no "Time" data found in the datafile.')
df = pd.DataFrame(data, columns=columns)
df = _format_column_names(df)
df = _format_numeric_data(df)
df = _format_datetime_data(df)
df = _drop_duplicates(df)
super().__init__(df)

# Check and remove duplicate data.
if any(self.index.duplicated(keep='first')):
print("WARNING: Duplicated values were found in the datafile. "
"Only the first entries for each date were kept.")
self.drop_duplicates(keep='first', inplace=True)
metadata = {} if metadata is None else metadata
for key, val in HEADER.items():
self.attrs[key] = metadata.get(key, val)
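
With this refactoring, each formatting step is a plain module-level function applied in sequence, and the file header metadata lands in the pandas `DataFrame.attrs` dictionary instead of being set as instance attributes. Roughly, a caller can expect something like the following (column labels and metadata values made up for illustration):

```python
data = [['2012-11-28 16:45', '3.667', '10.333'],
        ['2012-11-28 17:00', '3.665', '10.331']]
wldf = WLDataFrame(
    data,
    columns=['Date', 'WL (mbgs)', 'BP (m)'],
    metadata={'Well': 'Test well', 'Elevation': 123})

print(wldf.index)               # DatetimeIndex parsed from the 'Date' column
print(wldf.attrs['Well'])       # 'Test well'
print(wldf.attrs['Longitude'])  # default taken from the module-level HEADER
```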


def open_water_level_datafile(filename):
@@ -152,63 +183,53 @@ def read_water_level_datafile(filename):
Load a water level dataset from a csv or an Excel file and format the
data in a Pandas dataframe with the dates used as index.
"""
if filename is None or not osp.exists(filename):
return None
reader = open_water_level_datafile(filename)

# Fetch the metadata from the header.
header = deepcopy(HEADER)
for i, row in enumerate(reader):
if not len(row):
continue

label = str(row[0]).replace(" ", "").replace("_", "")
if re.search(COL_REGEX[INDEX], label, re.IGNORECASE):
break

for key in HEADER.keys():
if re.search(HEADER_REGEX[key], label, re.IGNORECASE):
if isinstance(header[key], (float, int)):
try:
header[key] = float(row[1])
except ValueError:
print('Wrong format for entry "{}".'.format(key))
print('Wrong format for metadata "{}".'.format(key))
else:
header[key] = str(row[1])
break
else:
if re.search(COL_REGEX[INDEX], label, re.IGNORECASE):
break
else:
print("ERROR: the water level datafile is not formatted correctly.")
return None
print("ERROR: no data found in input water level file.")
return WLDataFrame(metadata=header)

root, ext = osp.splitext(filename)
if ext.lower() in ['.xls', '.xlsx']:
data = pd.read_excel(
filename,
header=i,
parse_dates=[row[0]]
)
else:
data = pd.read_csv(
filename,
skip_blank_lines=False,
header=i)

# Cast the data into a Pandas dataframe.
dataf = WLDataFrame(reader[i+1:], columns=row)

# Add the metadata to the dataframe.
for key in header.keys():
setattr(dataf, key, header[key])
dataf = WLDataFrame(data, columns=None, metadata=header)
dataf.filename = filename

return dataf
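
The rewritten reader scans only the header rows itself and delegates parsing of the data block to pandas (`read_excel` or `read_csv`), then wraps the result in a `WLDataFrame`. A hypothetical call against the CSV test file added by this PR:

```python
import os.path as osp
from gwhat.projet.reader_waterlvl import read_water_level_datafile

fname = osp.join('gwhat', 'projet', 'tests', 'data',
                 'water_level_datafile.csv')  # path relative to the repo root
dataf = read_water_level_datafile(fname)
print(dataf.attrs['Well'])  # metadata collected from the header rows
print(dataf.head())         # water level data indexed by datetime
```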


# ---- Water Level Manual Measurements
def init_waterlvl_measures(dirname):
"""
Create an empty waterlvl_manual_measurements.csv file with headers
if it does not already exist.
"""
for ext in FILE_EXTS:
fname = osp.join(dirname, "waterlvl_manual_measurements" + ext)
if osp.exists(fname):
return
else:
fname = os.path.join(dirname, 'waterlvl_manual_measurements.csv')
fcontent = [['Well_ID', 'Time (days)', 'Obs. (mbgs)']]

if not osp.exists(dirname):
os.makedirs(dirname)
save_content_to_csv(fname, fcontent)


def load_waterlvl_measures(filename, well):
"""
Load and read the water level manual measurements from the specified
@@ -378,8 +399,8 @@ def __getitem__(self, key):
return self.strftime
elif key in COLUMNS:
return self.data[key].values
elif key in list(HEADER.keys()):
return getattr(self._dataf, key, HEADER[key])
elif key in HEADER.keys():
return self._dataf.attrs[key]
elif key == 'filename':
return self._dataf.filename

@@ -392,9 +413,11 @@ def __load_dataset__(self, filename):

if __name__ == "__main__":
from gwhat import __rootdir__
df = WLDataset(
osp.join(__rootdir__, 'tests', "water_level_datafile.csv"))
df2 = WLDataset(
osp.join(__rootdir__, 'tests', "water_level_datafile.xls"))
df3 = WLDataset(
osp.join(__rootdir__, 'tests', "water_level_datafile.xlsx"))
dirname = osp.join(__rootdir__, 'projet', 'tests', 'data')
df1 = WLDataset(osp.join(dirname, "water_level_datafile.csv"))
df2 = WLDataset(osp.join(dirname, "water_level_datafile.xls"))
df3 = WLDataset(osp.join(dirname, "water_level_datafile.xlsx"))
df4 = WLDataset(osp.join(dirname, "water_level_datafile_xldates.csv"))
df5 = WLDataset(osp.join(dirname, "water_level_datafile_strfmt.xls"))
df6 = WLDataset(osp.join(dirname, "water_level_datafile_strfmt.xlsx"))
df7 = WLDataFrame()
12 changes: 12 additions & 0 deletions gwhat/projet/tests/data/water_level_datafile.csv
@@ -0,0 +1,12 @@
Well name = ,êi!@':i*,,
well id : ,1234ABC,,
Province,Qc,,
latitude ,45.36,,
Longitude=,-72.4234665345,,
Elevation:,123,,
,,,
,,,
Date,WL(mbgs),BP(m),ET
2012-11-28 16:45,3.667377006,10.33327435,383.9680352
2012-11-28 17:00,3.665777025,10.33127437,387.7404819
2012-11-28 17:15,3.665277031,10.33097437,396.9950643
Binary file added gwhat/projet/tests/data/water_level_datafile.xls
Binary file not shown.
Binary file added gwhat/projet/tests/data/water_level_datafile.xlsx
Binary file not shown.
12 changes: 12 additions & 0 deletions gwhat/projet/tests/data/water_level_datafile_xldates.csv
@@ -0,0 +1,12 @@
Well name = ,êi!@':i*
well id : ,1234ABC
Province,Qc
latitude ,45.36
Longitude=,-72.4234665345
Elevation:,123


Date,WL(mbgs),BP(m),ET
41241.69792,3.667377006,10.33327435,383.9680352
41241.70833,3.665777025,10.33127437,387.7404819
41241.71875,3.665277031,10.33097437,396.9950643