PR: Make reading water level input files more robust #405

Merged · 12 commits · Mar 3, 2022
2 changes: 0 additions & 2 deletions .gitignore
@@ -53,8 +53,6 @@ docs/_build/
target/

# ---- Tests output files

water_level_datafile.*
waterlvl_manual_measurements.*
glue_rawdata.npy

4 changes: 2 additions & 2 deletions gwhat/projet/manager_data.py
@@ -267,11 +267,11 @@ def new_wldset_imported(self, name, dataset):
Receives the new water level dataset, saves it in the project and
updates the GUI.
"""
print("Saving the new water level dataset in the project...", end=" ")
print("Saving the new water level dataset in the project...")
self.projet.add_wldset(name, dataset)
self.update_wldsets(name)
self.wldset_changed()
print("done")
print("New water level dataset saved in the project successfully.")

def update_wldsets(self, name=None):
self.wldsets_cbox.blockSignals(True)
10 changes: 6 additions & 4 deletions gwhat/projet/reader_projet.py
@@ -275,12 +275,14 @@ def add_wldset(self, name, df):
mmeas.create_dataset('Time', data=np.array([]), maxshape=(None,))
mmeas.create_dataset('WL', data=np.array([]), maxshape=(None,))

self.db.flush()

print('New dataset created successfully')
except Exception:
print('Unable to save dataset to project db')
except Exception as e:
print('Unable to save dataset to project db because of the '
'following error:')
print(e)
del self.db['wldsets'][name]
finally:
self.db.flush()

return WLDatasetHDF5(grp)
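
The revised `add_wldset` now reports the underlying exception, removes the partially created group on failure, and flushes the HDF5 store in all cases via `finally`. A minimal sketch of that rollback pattern with h5py, using a hypothetical `db` handle and group layout rather than the actual GWHAT project file:

```python
import h5py
import numpy as np

def add_dataset(db: h5py.File, name: str, times: np.ndarray):
    """Add a dataset under 'wldsets/<name>' and roll back on failure."""
    grp = db['wldsets'].create_group(name)
    try:
        grp.create_dataset('Time', data=times, maxshape=(None,))
    except Exception as e:
        # Report the error and remove the partially written group.
        print('Unable to save dataset to project db because of the '
              'following error:')
        print(e)
        del db['wldsets'][name]
    finally:
        # Flush in every case so the file on disk stays consistent.
        db.flush()
```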

209 changes: 116 additions & 93 deletions gwhat/projet/reader_waterlvl.py
@@ -22,8 +22,6 @@
import xlrd
import openpyxl

# ---- Local library imports
from gwhat.common.utils import save_content_to_csv

FILE_EXTS = ['.csv', '.xls', '.xlsx']

@@ -47,7 +45,7 @@
'Longitude': 0,
'Elevation': 0}
HEADER_REGEX = {
'Well': r'(?<!\S)(wellname|name)(:|=)?(?!\S)',
'Well': r'(?<!\S)(well|wellname|name)(:|=)?(?!\S)',
'Well ID': r'(?<!\S)(wellid|id)(:|=)?(?!\S)',
'Province': r'(?<!\S)(province|prov)(:|=)?(?!\S)',
'Municipality': r'(?<!\S)municipality(:|=)?(?!\S)',
@@ -57,66 +55,99 @@
}
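
The broadened pattern above means a header row labelled simply "Well" is now recognised, in addition to "Well name" or "Name". A quick check of the new pattern on its own (labels are normalised by stripping spaces and underscores, as done later in `read_water_level_datafile`); this snippet is illustrative only and not part of the PR:

```python
import re

WELL_REGEX = r'(?<!\S)(well|wellname|name)(:|=)?(?!\S)'

for raw in ('Well name = ', 'Name:', 'Well'):
    label = raw.replace(' ', '').replace('_', '')
    # A bare 'Well' only matches now that the 'well' alternative was added.
    print(repr(raw), bool(re.search(WELL_REGEX, label, re.IGNORECASE)))
```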


class WLDataFrame(pd.DataFrame):
def __init__(self, data: list = None, columns: list = None):
super().__init__(data=[], columns=COLUMNS)
self.set_index([INDEX], drop=True, inplace=True)
def _format_column_names(df):
"""
Rename valid columns, drop invalid columns, and add missing columns.
"""
# Rename valid columns and drop invalid columns.
drop = []
rename = {}
for column in df.columns:
for colname, regex in COL_REGEX.items():
str_ = column.replace(" ", "").replace("_", "")
if re.search(regex, str_, re.IGNORECASE):
rename[column] = colname
break
else:
drop.append(column)
df = df.rename(columns=rename)
df = df.drop(columns=drop)

if data is not None and columns is not None:
df = pd.DataFrame(data, columns=columns)
for column in columns:
for colname, regex in COL_REGEX.items():
str_ = column.replace(" ", "").replace("_", "")
if re.search(regex, str_, re.IGNORECASE):
self[colname] = df[column].copy()
break
del df
self.format_numeric_data()
self.format_datetime_data()
# Add missing columns.
for column in COLUMNS:
if column not in df.columns:
df[column] = np.nan

@property
def _constructor(self):
return WLDataFrame

def format_numeric_data(self):
"""Format the data to floats type."""
for colname in COLUMNS:
if colname == INDEX:
pass
elif colname in self.columns:
self[colname] = pd.to_numeric(self[colname], errors='coerce')
else:
print('WARNING: no "%s" data found in the datafile.' % colname)

def format_datetime_data(self):
"""Format the dates to datetimes and set it as index."""
if INDEX in self.columns:
return df[COLUMNS].copy()
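
`_format_column_names` maps whatever column headers appear in the input file onto the canonical names, drops unrecognised columns, and adds any missing ones as NaN. A rough illustration; the `COL_REGEX` patterns below are simplified stand-ins (the real ones are defined earlier in `reader_waterlvl.py` and not shown in this diff):

```python
import re
import numpy as np
import pandas as pd

COLUMNS = ['Time', 'WL', 'BP', 'ET']          # assumed canonical columns
COL_REGEX = {'Time': r'(date|time)', 'WL': r'wl',
             'BP': r'bp', 'ET': r'et'}        # simplified patterns

raw = pd.DataFrame([['2012-11-28 16:45', 3.667, 10.333, 'extra']],
                   columns=['Date', 'WL (mbgs)', 'BP (m)', 'Comment'])

rename, drop = {}, []
for column in raw.columns:
    for colname, regex in COL_REGEX.items():
        if re.search(regex, column.replace(' ', ''), re.IGNORECASE):
            rename[column] = colname
            break
    else:
        drop.append(column)                   # e.g. 'Comment' is dropped

df = raw.rename(columns=rename).drop(columns=drop)
for column in COLUMNS:
    if column not in df.columns:
        df[column] = np.nan                   # e.g. 'ET' is added as NaN
print(df[COLUMNS])
```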


def _format_numeric_data(df):
"""Format the data to floats type."""
for colname in COLUMNS:
if colname != INDEX and colname in df.columns:
df[colname] = pd.to_numeric(df[colname], errors='coerce')
return df
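
`_format_numeric_data` coerces every non-index column to floats, so unparseable cells become NaN instead of aborting the import. For example:

```python
import pandas as pd

wl = pd.Series(['3.667', ' 3.665 ', 'n/a'])
print(pd.to_numeric(wl, errors='coerce'))  # 3.667, 3.665, NaN
```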


def _format_datetime_data(df):
"""Format the dates to datetimes and set it as index."""
if INDEX in df.columns:
if df['Time'].dtypes != 'datetime64[ns]':
try:
# We assume first that the dates are stored in the
# We check first if the dates are stored in the
# Excel numeric format.
datetimes = self['Time'].astype('float64', errors='raise')
datetimes = pd.to_datetime(datetimes.apply(
lambda date: xlrd.xldate.xldate_as_datetime(date, 0)))
except ValueError:
datetimes = df['Time'].astype('float64', errors='raise')
datetimes = pd.to_datetime(
datetimes.apply(
lambda date: xlrd.xldate.xldate_as_datetime(date, 0)))

# Get rid of milliseconds to avoid introducing
# round-off errors.
datetimes = datetimes.dt.round('S')

df['Time'] = datetimes
except (ValueError, TypeError) as e:
try:
# Try converting the strings to datetime objects.
# The format of the datetime strings must be
# "%Y-%m-%d %H:%M:%S"
datetimes = pd.to_datetime(
self['Time'], infer_datetime_format=True)
# We assume that dates are stored as strings.
df['Time'] = pd.to_datetime(
df['Time'], infer_datetime_format=True)
except ValueError:
print('WARNING: the dates are not formatted correctly.')
finally:
self['Time'] = datetimes
self.set_index(['Time'], drop=True, inplace=True)

df.set_index(['Time'], drop=True, inplace=True)
else:
print('WARNING: no "Time" data found in the datafile.')
return df
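
`_format_datetime_data` first tries to read the `Time` column as Excel serial date numbers and falls back to parsing date strings when that raises. A small sketch of both paths outside the class (datemode 0 in `xldate_as_datetime` corresponds to the 1900 date system):

```python
import pandas as pd
import xlrd

def to_datetimes(time_col: pd.Series) -> pd.Series:
    try:
        # Excel serial dates, e.g. 41241.69792 -> 2012-11-28 16:45.
        serial = time_col.astype('float64', errors='raise')
        datetimes = pd.to_datetime(
            serial.apply(lambda d: xlrd.xldate.xldate_as_datetime(d, 0)))
        # Drop sub-second residue introduced by the float representation.
        return datetimes.dt.round('S')
    except (ValueError, TypeError):
        # Fall back to date strings such as "2012-11-28 16:45".
        return pd.to_datetime(time_col, infer_datetime_format=True)

print(to_datetimes(pd.Series([41241.69792, 41241.70833])))
print(to_datetimes(pd.Series(['2012-11-28 16:45', '2012-11-28 17:00'])))
```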


def _drop_duplicates(df):
"""
Drop duplicated indexes from the dataframe.
"""
if df.index.duplicated(keep='first').any():
print("WARNING: Duplicated values were found in the datafile. "
"Only the first entries for each date were kept.")
index = df.index.drop_duplicates(keep='first')
df = df.loc[index]
return df
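
`_drop_duplicates` keeps only the first entry for each repeated date. The same behaviour, shown with the common `index.duplicated` idiom rather than the exact code from this PR:

```python
import pandas as pd

df = pd.DataFrame(
    {'WL': [3.667, 3.665, 3.665]},
    index=pd.to_datetime(
        ['2012-11-28 16:45', '2012-11-28 16:45', '2012-11-28 17:00']))

# Keep the first row for each duplicated timestamp, drop the rest.
print(df[~df.index.duplicated(keep='first')])
```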


class WLDataFrame(pd.DataFrame):
def __init__(self, data=None, columns=None, metadata=None):
if data is None:
super().__init__(data=[], columns=COLUMNS)
self.set_index(INDEX, drop=True, inplace=True)
else:
print('WARNING: no "Time" data found in the datafile.')
df = pd.DataFrame(data, columns=columns)
df = _format_column_names(df)
df = _format_numeric_data(df)
df = _format_datetime_data(df)
df = _drop_duplicates(df)
super().__init__(df)

# Check and remove duplicate data.
if any(self.index.duplicated(keep='first')):
print("WARNING: Duplicated values were found in the datafile. "
"Only the first entries for each date were kept.")
self.drop_duplicates(keep='first', inplace=True)
metadata = {} if metadata is None else metadata
for key, val in HEADER.items():
self.attrs[key] = metadata.get(key, val)
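
With this refactoring, each formatting step is a plain module-level function applied in sequence, and the file header metadata lands in the pandas `DataFrame.attrs` dictionary instead of being set as instance attributes. Roughly, a caller can expect something like the following (column labels and metadata values made up for illustration):

```python
data = [['2012-11-28 16:45', '3.667', '10.333'],
        ['2012-11-28 17:00', '3.665', '10.331']]
wldf = WLDataFrame(
    data,
    columns=['Date', 'WL (mbgs)', 'BP (m)'],
    metadata={'Well': 'Test well', 'Elevation': 123})

print(wldf.index)               # DatetimeIndex parsed from the 'Date' column
print(wldf.attrs['Well'])       # 'Test well'
print(wldf.attrs['Longitude'])  # default taken from the module-level HEADER
```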


def open_water_level_datafile(filename):
@@ -152,63 +183,53 @@ def read_water_level_datafile(filename):
Load a water level dataset from a csv or an Excel file and format the
data in a Pandas dataframe with the dates used as index.
"""
if filename is None or not osp.exists(filename):
return None
reader = open_water_level_datafile(filename)

# Fetch the metadata from the header.
header = deepcopy(HEADER)
for i, row in enumerate(reader):
if not len(row):
continue

label = str(row[0]).replace(" ", "").replace("_", "")
if re.search(COL_REGEX[INDEX], label, re.IGNORECASE):
break

for key in HEADER.keys():
if re.search(HEADER_REGEX[key], label, re.IGNORECASE):
if isinstance(header[key], (float, int)):
try:
header[key] = float(row[1])
except ValueError:
print('Wrong format for entry "{}".'.format(key))
print('Wrong format for metadata "{}".'.format(key))
else:
header[key] = str(row[1])
break
else:
if re.search(COL_REGEX[INDEX], label, re.IGNORECASE):
break
else:
print("ERROR: the water level datafile is not formatted correctly.")
return None
print("ERROR: no data found in input water level file.")
return WLDataFrame(metadata=header)

root, ext = osp.splitext(filename)
if ext.lower() in ['.xls', '.xlsx']:
data = pd.read_excel(
filename,
header=i,
parse_dates=[row[0]]
)
else:
data = pd.read_csv(
filename,
skip_blank_lines=False,
header=i)

# Cast the data into a Pandas dataframe.
dataf = WLDataFrame(reader[i+1:], columns=row)

# Add the metadata to the dataframe.
for key in header.keys():
setattr(dataf, key, header[key])
dataf = WLDataFrame(data, columns=None, metadata=header)
dataf.filename = filename

return dataf
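
The rewritten reader scans only the header rows itself and delegates parsing of the data block to pandas (`read_excel` or `read_csv`), then wraps the result in a `WLDataFrame`. A hypothetical call against the CSV test file added by this PR:

```python
import os.path as osp
from gwhat.projet.reader_waterlvl import read_water_level_datafile

fname = osp.join('gwhat', 'projet', 'tests', 'data',
                 'water_level_datafile.csv')  # path relative to the repo root
dataf = read_water_level_datafile(fname)
print(dataf.attrs['Well'])  # metadata collected from the header rows
print(dataf.head())         # water level data indexed by datetime
```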


# ---- Water Level Manual Measurements
def init_waterlvl_measures(dirname):
"""
Create an empty waterlvl_manual_measurements.csv file with headers
if it does not already exist.
"""
for ext in FILE_EXTS:
fname = osp.join(dirname, "waterlvl_manual_measurements" + ext)
if osp.exists(fname):
return
else:
fname = os.path.join(dirname, 'waterlvl_manual_measurements.csv')
fcontent = [['Well_ID', 'Time (days)', 'Obs. (mbgs)']]

if not osp.exists(dirname):
os.makedirs(dirname)
save_content_to_csv(fname, fcontent)


def load_waterlvl_measures(filename, well):
"""
Load and read the water level manual measurements from the specified
@@ -378,8 +399,8 @@ def __getitem__(self, key):
return self.strftime
elif key in COLUMNS:
return self.data[key].values
elif key in list(HEADER.keys()):
return getattr(self._dataf, key, HEADER[key])
elif key in HEADER.keys():
return self._dataf.attrs[key]
elif key == 'filename':
return self._dataf.filename

@@ -392,9 +413,11 @@ def __load_dataset__(self, filename):

if __name__ == "__main__":
from gwhat import __rootdir__
df = WLDataset(
osp.join(__rootdir__, 'tests', "water_level_datafile.csv"))
df2 = WLDataset(
osp.join(__rootdir__, 'tests', "water_level_datafile.xls"))
df3 = WLDataset(
osp.join(__rootdir__, 'tests', "water_level_datafile.xlsx"))
dirname = osp.join(__rootdir__, 'projet', 'tests', 'data')
df1 = WLDataset(osp.join(dirname, "water_level_datafile.csv"))
df2 = WLDataset(osp.join(dirname, "water_level_datafile.xls"))
df3 = WLDataset(osp.join(dirname, "water_level_datafile.xlsx"))
df4 = WLDataset(osp.join(dirname, "water_level_datafile_xldates.csv"))
df5 = WLDataset(osp.join(dirname, "water_level_datafile_strfmt.xls"))
df6 = WLDataset(osp.join(dirname, "water_level_datafile_strfmt.xlsx"))
df7 = WLDataFrame()
12 changes: 12 additions & 0 deletions gwhat/projet/tests/data/water_level_datafile.csv
@@ -0,0 +1,12 @@
Well name = ,êi!@':i*,,
well id : ,1234ABC,,
Province,Qc,,
latitude ,45.36,,
Longitude=,-72.4234665345,,
Elevation:,123,,
,,,
,,,
Date,WL(mbgs),BP(m),ET
2012-11-28 16:45,3.667377006,10.33327435,383.9680352
2012-11-28 17:00,3.665777025,10.33127437,387.7404819
2012-11-28 17:15,3.665277031,10.33097437,396.9950643
Binary file added gwhat/projet/tests/data/water_level_datafile.xls
Binary file not shown.
Binary file added gwhat/projet/tests/data/water_level_datafile.xlsx
Binary file not shown.
12 changes: 12 additions & 0 deletions gwhat/projet/tests/data/water_level_datafile_xldates.csv
@@ -0,0 +1,12 @@
Well name = ,êi!@':i*
well id : ,1234ABC
Province,Qc
latitude ,45.36
Longitude=,-72.4234665345
Elevation:,123


Date,WL(mbgs),BP(m),ET
41241.69792,3.667377006,10.33327435,383.9680352
41241.70833,3.665777025,10.33127437,387.7404819
41241.71875,3.665277031,10.33097437,396.9950643