diff --git a/METdbLoad/sql/mv_mysql.sql b/METdbLoad/sql/mv_mysql.sql
index 066948dd..e8078f28 100644
--- a/METdbLoad/sql/mv_mysql.sql
+++ b/METdbLoad/sql/mv_mysql.sql
@@ -2290,12 +2290,12 @@ CREATE TABLE line_data_seeps
 obs_valid_beg DATETIME,
 obs_valid_end DATETIME,
 total INT UNSIGNED,
- s12 DOUBLE,
- s13 DOUBLE,
- s21 DOUBLE,
- s23 DOUBLE,
- s31 DOUBLE,
- s32 DOUBLE,
+ odfl DOUBLE,
+ odfh DOUBLE,
+ olfd DOUBLE,
+ olfh DOUBLE,
+ ohfd DOUBLE,
+ ohfl DOUBLE,
 pf1 DOUBLE,
 pf2 DOUBLE,
 pf3 DOUBLE,
diff --git a/METdbLoad/sql/scripts/db_cmds.bash b/METdbLoad/sql/scripts/db_cmds.bash
new file mode 100755
index 00000000..62118187
--- /dev/null
+++ b/METdbLoad/sql/scripts/db_cmds.bash
@@ -0,0 +1,51 @@
+#!/usr/bin/bash
+
+
+echo ""
+echo ""
+echo ""
+echo "*****************"
+echo " USAGE"
+echo "*****************"
+echo ""
+echo " ***Run under BASH shell***"
+echo " SET the following ENV variables: "
+echo " DB_PWD - the password for the mvadmin user"
+echo " DBNAME - the name of the database you wish to create/drop/load schema"
+echo " MVSCHEMA - the full path and name of the mv_mysql.sql script to load the schema"
+echo " Then copy and paste the appropriate command for dropping, creating, or loading the schema after running the following on the command line:"
+echo ""
+echo " bash db_cmds.bash "
+echo ""
+echo "*****************"
+echo ""
+echo ""
+echo ""
+
+echo ""
+echo "********"
+echo "Command to drop database ${DBNAME} if it exists (or error message if it doesn't exist)..."
+echo "********"
+echo mysql -u mvadmin -p${DB_PWD} -e "'drop database" ${DBNAME}"';"
+
+
+echo ""
+echo "********"
+echo "Command to create database ${DBNAME}..."
+echo "********"
+echo mysql -u mvadmin -p${DB_PWD} -e "'"create database ${DBNAME}"'";
+echo ""
+echo "********"
+echo "Command to grant privileges to ${DBNAME}..."
+echo "********"
+echo mysql -u mvadmin -p${DB_PWD} -e "\""GRANT INSERT, DELETE, UPDATE, INDEX, DROP ON ${DBNAME}.* to "'"mvuser"'"@"'"%"'" "\""
+echo ""
+
+
+echo ""
+echo "********"
+echo "Command to load the mv_mysql.sql schema into ${DBNAME}..."
+echo "********" +echo mysql -u mvadmin -p${DB_PWD} ${DBNAME} "<" ${MVSCHEMA} +echo "" diff --git a/METdbLoad/sql/updates/update_for_6_0_beta6.sql b/METdbLoad/sql/updates/update_for_6_0_beta6.sql index 00466bfc..c0663056 100644 --- a/METdbLoad/sql/updates/update_for_6_0_beta6.sql +++ b/METdbLoad/sql/updates/update_for_6_0_beta6.sql @@ -13,6 +13,15 @@ ALTER TABLE line_data_orank RENAME COLUMN climo_stdev TO obs_climo_stdev | ADD COLUMN fcst_climo_mean DOUBLE | ADD COLUMN fcst_climo_stdev DOUBLE | + +ALTER TABLE line_data_seeps + RENAME COLUMN s12 TO odfl | + RENAME COLUMN s13 TO odfh | + RENAME COLUMN s21 TO olfd | + RENAME COLUMN s23 TO olfh | + RENAME COLUMN s31 TO ohfd | + RENAME COLUMN s32 TO ohfl | + | DELIMITER ; diff --git a/METdbLoad/tests/update_schema_6.0_beta6/Data/total_data.tar b/METdbLoad/tests/update_schema_6.0_beta6/Data/total_data.tar index a330007b..92766f16 100644 Binary files a/METdbLoad/tests/update_schema_6.0_beta6/Data/total_data.tar and b/METdbLoad/tests/update_schema_6.0_beta6/Data/total_data.tar differ diff --git a/METdbLoad/tests/update_schema_6.0_beta6/test_schema.py b/METdbLoad/tests/update_schema_6.0_beta6/test_schema.py index d6281042..cc061220 100644 --- a/METdbLoad/tests/update_schema_6.0_beta6/test_schema.py +++ b/METdbLoad/tests/update_schema_6.0_beta6/test_schema.py @@ -14,8 +14,8 @@ # Make sure the database name matches with the one you created on the database host -CONST_LOAD_DB_CMD = "use mv_mpr_orank" -TEST_DB = "mv_mpr_orank" +CONST_LOAD_DB_CMD = "use mv_mpr_orank_seeps" +TEST_DB = "mv_mpr_orank_seeps" @pytest.fixture def setup_db(): @@ -76,6 +76,7 @@ def test_db_created(setup_db): finally: setup_db.close() + def test_tables_created(setup_db): # connect to the database and verify the MPR and ORANK tables exist @@ -98,6 +99,7 @@ def test_tables_created(setup_db): finally: setup_db.close() + def test_mpr_columns(setup_db): # log into the database and verify the renamed columns are in the # list_data_mpr database table, the previous/replaced columns do NOT @@ -123,6 +125,7 @@ def test_mpr_columns(setup_db): finally: setup_db.close() + def test_orank_columns(setup_db): # log into the database and verify the renamed and new columns are in the # list_data_orank database table, and the previous/replaced columns no longer @@ -145,3 +148,37 @@ def test_orank_columns(setup_db): finally: setup_db.close() +def test_seeps_columns(setup_db): + # log into the database and verify the renamed SEEPS columns are in the + # list_data_seeps database table, and the previous/replaced columns no longer + # exist. 
+ + try: + with setup_db.cursor() as cursor: + cursor.execute(CONST_LOAD_DB_CMD) + check_columns_exist = "desc line_data_seeps;" + cursor.execute(check_columns_exist) + + # Get all rows + rows = cursor.fetchall() + list_of_rows = [r[0] for r in rows] + + # Verify newly renamed columns exist in the updated data + assert 'odfl' in list_of_rows + assert 'odfh' in list_of_rows + assert 'olfd' in list_of_rows + assert 'olfh' in list_of_rows + assert 'ohfd' in list_of_rows + assert 'ohfl' in list_of_rows + + # Verify that remaining columns are unchanged in the updated data + assert 'pf1' in list_of_rows + assert 'pf2' in list_of_rows + assert 'pf3' in list_of_rows + assert 'pv1' in list_of_rows + assert 'pv2' in list_of_rows + assert 'pv3' in list_of_rows + + finally: + setup_db.close() + diff --git a/METdbLoad/ush/met_db_load.py b/METdbLoad/ush/met_db_load.py index 7e13b36c..52113913 100644 --- a/METdbLoad/ush/met_db_load.py +++ b/METdbLoad/ush/met_db_load.py @@ -40,305 +40,329 @@ from METreformat.util import get_common_logger + def main(args): """ Main program to load files into the METdataio/METviewer database Returns: N/A """ - # use the current date/time for logger - begin_time = str(datetime.now()) - # setup a logger for this module - cli_loglevel = False - if args.loglevel: - loglevel = args.loglevel - cli_loglevel = True - else: - loglevel = DEFAULT_LOGLEVEL - - # Get the common logger - logger = get_common_logger(loglevel, 'stdout') - - if cli_loglevel: - logger.info(f"Loglevel set to {loglevel} from command line.") - else: - logger.info(f"Loglevel not supplied. Setting to default: {loglevel}. This may be overwritten by XML loadfile.") - - # Print the METdbload version from the docs folder - print_version(logger) - - logger.info("--- *** --- Start METdbLoad --- *** ---") - logger.info("Begin time: %s", begin_time) try: - logger.info("User name is: %s", getpass.getuser()) - except: - logger.info("User name is not available") - # time execution - load_time_start = time.perf_counter() + # use the current date/time for logger + begin_time = str(datetime.now()) + # setup a logger for this module + cli_loglevel = False + if args.loglevel: + loglevel = args.loglevel + cli_loglevel = True + else: + loglevel = DEFAULT_LOGLEVEL + # Get the common logger + logger = get_common_logger(loglevel, 'stdout') - # - # Read the XML file - # - try: - logger.debug("XML filename is %s", args.xmlfile) - - # instantiate a load_spec XML file - xml_loadfile = XmlLoadFile(args.xmlfile, logger=logger) + if cli_loglevel: + logger.info(f"Loglevel set to {loglevel} from command line.") + else: + logger.info( + f"Loglevel not supplied. Setting to default: {loglevel}. 
This may be overwritten by XML loadfile.") - # read in the XML file and get the information out of its tags - xml_loadfile.read_xml() - - except (RuntimeError, TypeError, NameError, KeyError): - logger.error("*** %s occurred in Main reading XML ***", sys.exc_info()[0]) - sys.exit("*** Error reading XML") + # Print the METdbload version from the docs folder + print_version(logger) - # - # Verify the tmp file - # - try: - tmp_dir = args.tmpdir[0] - if not os.path.isdir(tmp_dir): - logger.error("*** Error occurred in Main accessing tmp dir %s ***", tmp_dir) - sys.exit("*** Error accessing tmp dir") + logger.info("--- *** --- Start METdbLoad --- *** ---") + logger.info("Begin time: %s", begin_time) - except (RuntimeError, TypeError, NameError, KeyError): - logger.error("*** %s occurred in Main accessing tmp dir ***", sys.exc_info()[0]) - sys.exit("*** Error accessing tmp dir") - - # If XML tag verbose is set to True, change logger to debug level unless already - # supplied via the cli. - if xml_loadfile.flags["verbose"] and not cli_loglevel: - logger.setLevel("DEBUG") - - # - # If argument -index is used, only process the index - # - if args.index and xml_loadfile.flags["apply_indexes"]: try: - if xml_loadfile.connection['db_management_system'] in CN.RELATIONAL: - sql_run = RunSql() - sql_run.sql_on(xml_loadfile.connection) - sql_run.apply_indexes(False, sql_run.cur, logger) - logger.debug("-index is true - only process index") - if sql_run.conn.open: - sql_run.sql_off(sql_run.conn, sql_run.cur) - sys.exit("*** Only processing index with -index as argument") - except (RuntimeError, TypeError, NameError, KeyError, AttributeError): - if sql_run.conn.open: - sql_run.sql_off(sql_run.conn, sql_run.cur) - logger.error("*** %s occurred in Main processing index ***", sys.exc_info()[0]) - sys.exit("*** Error processing index") - - # - # Purge files if flags set to not load certain types of files - # - try: - # If user set flags to not read files, remove those files from load_files list - xml_loadfile.load_files = purge_files(xml_loadfile.load_files, xml_loadfile.flags, logger) + logger.info("User name is: %s", getpass.getuser()) + except UserWarning: + logger.info("User Warning: User name is not available") - if not xml_loadfile.load_files: - logger.warning("!!! 
No files to load") - sys.exit("*** No files to load") + # time execution + load_time_start = time.perf_counter() - except (RuntimeError, TypeError, NameError, KeyError): - logger.error("*** %s occurred in Main purging files not selected ***", sys.exc_info()[0]) - sys.exit("*** Error when removing files from load list per XML") + # + # Read the XML file + # + try: + logger.debug("XML filename is %s", args.xmlfile) - # Set up indices to process some maximum number of files at a time - set_count = 0 - first_file = 0 - last_file = len(xml_loadfile.load_files) - 1 + # instantiate a load_spec XML file + xml_loadfile = XmlLoadFile(args.xmlfile, logger=logger) - if last_file > CN.MAX_FILES: - mid_file = first_file + CN.MAX_FILES - else: - mid_file = last_file + # read in the XML file and get the information out of its tags + xml_loadfile.read_xml() - line_counts = {"Stat": 0, "Mode CTS": 0, "Mode Obj": 0, "Tcst": 0, - "MTD 2D": 0, "MTD 3D Single": 0, "MTD 3D Pair": 0} + except (RuntimeError, TypeError, NameError, KeyError): + logger.error("*** %s occurred in Main reading XML ***", + sys.exc_info()[0]) + sys.exit("*** Error reading XML") - while mid_file <= last_file: + # + # Verify the tmp file + # try: - # Keep track of which set of files is being processed - set_count = set_count + 1 - # Handle only 1 file, or more files - if first_file == last_file: - current_files = [xml_loadfile.load_files[first_file]] - else: - current_files = xml_loadfile.load_files[first_file:mid_file + 1] + tmp_dir = args.tmpdir[0] + if not os.path.isdir(tmp_dir): + logger.error( + "*** Error occurred in Main accessing tmp dir %s ***", tmp_dir) + sys.exit("*** Error accessing tmp dir") except (RuntimeError, TypeError, NameError, KeyError): - logger.error("*** %s occurred in Main setting up loop ***", sys.exc_info()[0]) - sys.exit("*** Error when setting up loop") + logger.error( + "*** %s occurred in Main accessing tmp dir ***", sys.exc_info()[0]) + sys.exit("*** Error accessing tmp dir") + + # If XML tag verbose is set to True, change logger to debug level unless already + # supplied via the cli. + if xml_loadfile.flags["verbose"] and not cli_loglevel: + logger.setLevel("DEBUG") # - # Read the data files + # If argument -index is used, only process the index + # + if args.index and xml_loadfile.flags["apply_indexes"]: + try: + if xml_loadfile.connection['db_management_system'] in CN.RELATIONAL: + sql_run = RunSql() + sql_run.sql_on(xml_loadfile.connection) + sql_run.apply_indexes(False, sql_run.cur, logger) + logger.debug("-index is true - only process index") + if sql_run.conn.open: + sql_run.sql_off(sql_run.conn, sql_run.cur) + sys.exit("*** Only processing index with -index as argument") + except (RuntimeError, TypeError, NameError, KeyError, AttributeError): + if sql_run.conn.open: + sql_run.sql_off(sql_run.conn, sql_run.cur) + logger.error( + "*** %s occurred in Main processing index ***", sys.exc_info()[0]) + sys.exit("*** Error processing index") + + # + # Purge files if flags set to not load certain types of files # try: + # If user set flags to not read files, remove those files from load_files list + xml_loadfile.load_files = purge_files( + xml_loadfile.load_files, xml_loadfile.flags, logger) - # instantiate a read data files object - file_data = ReadDataFiles(logger) + if not xml_loadfile.load_files: + logger.warning("!!! 
No files to load") + sys.exit("*** No files to load") - # read in the data files, with options specified by XML flags - file_data.read_data(xml_loadfile.flags, - current_files, - xml_loadfile.line_types) + except (RuntimeError, TypeError, NameError, KeyError): + logger.error( + "*** %s occurred in Main purging files not selected ***", sys.exc_info()[0]) + sys.exit("*** Error when removing files from load list per XML") + + # Set up indices to process some maximum number of files at a time + set_count = 0 + first_file = 0 + last_file = len(xml_loadfile.load_files) - 1 + + if last_file > CN.MAX_FILES: + mid_file = first_file + CN.MAX_FILES + else: + mid_file = last_file - current_files = [] + line_counts = {"Stat": 0, "Mode CTS": 0, "Mode Obj": 0, "Tcst": 0, + "MTD 2D": 0, "MTD 3D Single": 0, "MTD 3D Pair": 0} - if file_data.data_files.empty: - logger.warning("!!! No files to load in current set %s", str(set_count)) - # move indices to the next set of files - first_file, mid_file, last_file = next_set(mid_file, last_file) - continue + while mid_file <= last_file: + try: + # Keep track of which set of files is being processed + set_count = set_count + 1 + # Handle only 1 file, or more files + if first_file == last_file: + current_files = [xml_loadfile.load_files[first_file]] + else: + current_files = xml_loadfile.load_files[first_file:mid_file + 1] - except (RuntimeError, TypeError, NameError, KeyError): - logger.error("*** %s occurred in Main reading data ***", sys.exc_info()[0]) - sys.exit("*** Error when reading data files") + except (RuntimeError, TypeError, NameError, KeyError): + logger.error( + "*** %s occurred in Main setting up loop ***", sys.exc_info()[0]) + sys.exit("*** Error when setting up loop") - # - # Write the data to a database - # - try: + # + # Read the data files + # + try: - if xml_loadfile.connection['db_management_system'] in CN.RELATIONAL: - # for the first set of files, connect to the database - if set_count == 1: - sql_run = RunSql() - sql_run.sql_on(xml_loadfile.connection) + # instantiate a read data files object + file_data = ReadDataFiles(logger) + + # read in the data files, with options specified by XML flags + file_data.read_data(xml_loadfile.flags, + current_files, + xml_loadfile.line_types) - # if drop_indexes is set to true, drop the indexes - if xml_loadfile.flags["drop_indexes"]: - sql_run.apply_indexes(True, sql_run.cur, logger) - - # write the data file records out. 
put data file ids into other dataframes - write_file = WriteFileSql() - updated_data = write_file.write_file_sql(xml_loadfile.flags, - file_data.data_files, - file_data.stat_data, - file_data.mode_cts_data, - file_data.mode_obj_data, - file_data.tcst_data, - file_data.mtd_2d_data, - file_data.mtd_3d_single_data, - file_data.mtd_3d_pair_data, - tmp_dir, - sql_run.cur, - sql_run.local_infile) - - file_data.data_files = updated_data[0] - file_data.stat_data = updated_data[1] - line_counts["Stat"] += len(file_data.stat_data) - file_data.mode_cts_data = updated_data[2] - line_counts["Mode CTS"] += len(file_data.mode_cts_data) - file_data.mode_obj_data = updated_data[3] - line_counts["Mode Obj"] += len(file_data.mode_obj_data) - file_data.tcst_data = updated_data[4] - line_counts["Tcst"] += len(file_data.tcst_data) - file_data.mtd_2d_data = updated_data[5] - line_counts["MTD 2D"] += len(file_data.mtd_2d_data) - file_data.mtd_3d_single_data = updated_data[6] - line_counts["MTD 3D Single"] += len(file_data.mtd_3d_single_data) - file_data.mtd_3d_pair_data = updated_data[7] - line_counts["MTD 3D Pair"] += len(file_data.mtd_3d_pair_data) + current_files = [] if file_data.data_files.empty: - logger.warning("!!! No data to load in current set %s", str(set_count)) + logger.warning( + "!!! No files to load in current set %s", str(set_count)) # move indices to the next set of files - first_file, mid_file, last_file = next_set(mid_file, last_file) - - if not file_data.stat_data.empty: - stat_lines = WriteStatSql() - - stat_lines.write_stat_data(xml_loadfile.flags, - file_data.stat_data, - tmp_dir, - sql_run.cur, - sql_run.local_infile, - logger) - - if (not file_data.mode_cts_data.empty) or (not file_data.mode_obj_data.empty): - cts_lines = WriteModeSql() - - cts_lines.write_mode_data(xml_loadfile.flags, - file_data.mode_cts_data, - file_data.mode_obj_data, - tmp_dir, - sql_run.cur, - sql_run.local_infile, - logger) - - if not file_data.tcst_data.empty: - tcst_lines = WriteTcstSql() - - tcst_lines.write_tcst_data(xml_loadfile.flags, - file_data.tcst_data, - tmp_dir, - sql_run.cur, - sql_run.local_infile, - logger) - - if (not file_data.mtd_2d_data.empty) or (not file_data.mtd_3d_single_data.empty) \ - or (not file_data.mtd_3d_pair_data.empty): - mtd_lines = WriteMtdSql() - - mtd_lines.write_mtd_data(xml_loadfile.flags, - file_data.mtd_2d_data, - file_data.mtd_3d_single_data, - file_data.mtd_3d_pair_data, - tmp_dir, - sql_run.cur, - sql_run.local_infile, - logger) - - # Processing for the last set of data - if mid_file >= last_file: - # If any data was written, write to the metadata and instance_info tables - if not file_data.data_files.empty: - write_file.write_metadata_sql(xml_loadfile.flags, - file_data.data_files, - xml_loadfile.group, - xml_loadfile.description, - xml_loadfile.load_note, - xml_loadfile.xml_str, - tmp_dir, - sql_run.cur, - sql_run.local_infile) - - # if apply_indexes is set to true, load the indexes - if xml_loadfile.flags["apply_indexes"]: - sql_run.apply_indexes(False, sql_run.cur, logger) + first_file, mid_file, last_file = next_set( + mid_file, last_file) + continue + + except (RuntimeError, TypeError, NameError, KeyError): + logger.error( + "*** %s occurred in Main reading data ***", sys.exc_info()[0]) + sys.exit("*** Error when reading data files") + + # + # Write the data to a database + # + try: + + if xml_loadfile.connection['db_management_system'] in CN.RELATIONAL: + # for the first set of files, connect to the database + if set_count == 1: + sql_run = RunSql() + 
sql_run.sql_on(xml_loadfile.connection) + + # if drop_indexes is set to true, drop the indexes + if xml_loadfile.flags["drop_indexes"]: + sql_run.apply_indexes(True, sql_run.cur, logger) + + # write the data file records out. put data file ids into other dataframes + write_file = WriteFileSql() + updated_data = write_file.write_file_sql(xml_loadfile.flags, + file_data.data_files, + file_data.stat_data, + file_data.mode_cts_data, + file_data.mode_obj_data, + file_data.tcst_data, + file_data.mtd_2d_data, + file_data.mtd_3d_single_data, + file_data.mtd_3d_pair_data, + tmp_dir, + sql_run.cur, + sql_run.local_infile) + + file_data.data_files = updated_data[0] + file_data.stat_data = updated_data[1] + line_counts["Stat"] += len(file_data.stat_data) + file_data.mode_cts_data = updated_data[2] + line_counts["Mode CTS"] += len(file_data.mode_cts_data) + file_data.mode_obj_data = updated_data[3] + line_counts["Mode Obj"] += len(file_data.mode_obj_data) + file_data.tcst_data = updated_data[4] + line_counts["Tcst"] += len(file_data.tcst_data) + file_data.mtd_2d_data = updated_data[5] + line_counts["MTD 2D"] += len(file_data.mtd_2d_data) + file_data.mtd_3d_single_data = updated_data[6] + line_counts["MTD 3D Single"] += len( + file_data.mtd_3d_single_data) + file_data.mtd_3d_pair_data = updated_data[7] + line_counts["MTD 3D Pair"] += len( + file_data.mtd_3d_pair_data) + + if file_data.data_files.empty: + logger.warning( + "!!! No data to load in current set %s", str(set_count)) + # move indices to the next set of files + first_file, mid_file, last_file = next_set( + mid_file, last_file) + + if not file_data.stat_data.empty: + stat_lines = WriteStatSql() + + stat_lines.write_stat_data(xml_loadfile.flags, + file_data.stat_data, + tmp_dir, + sql_run.cur, + sql_run.local_infile, + logger) + + if (not file_data.mode_cts_data.empty) or (not file_data.mode_obj_data.empty): + cts_lines = WriteModeSql() + + cts_lines.write_mode_data(xml_loadfile.flags, + file_data.mode_cts_data, + file_data.mode_obj_data, + tmp_dir, + sql_run.cur, + sql_run.local_infile, + logger) + + if not file_data.tcst_data.empty: + tcst_lines = WriteTcstSql() + + tcst_lines.write_tcst_data(xml_loadfile.flags, + file_data.tcst_data, + tmp_dir, + sql_run.cur, + sql_run.local_infile, + logger) + + if (not file_data.mtd_2d_data.empty) or (not file_data.mtd_3d_single_data.empty) \ + or (not file_data.mtd_3d_pair_data.empty): + mtd_lines = WriteMtdSql() + + mtd_lines.write_mtd_data(xml_loadfile.flags, + file_data.mtd_2d_data, + file_data.mtd_3d_single_data, + file_data.mtd_3d_pair_data, + tmp_dir, + sql_run.cur, + sql_run.local_infile, + logger) + + # Processing for the last set of data + if mid_file >= last_file: + # If any data was written, write to the metadata and instance_info tables + if not file_data.data_files.empty: + write_file.write_metadata_sql(xml_loadfile.flags, + file_data.data_files, + xml_loadfile.group, + xml_loadfile.description, + xml_loadfile.load_note, + xml_loadfile.xml_str, + tmp_dir, + sql_run.cur, + sql_run.local_infile) + + # if apply_indexes is set to true, load the indexes + if xml_loadfile.flags["apply_indexes"]: + sql_run.apply_indexes(False, sql_run.cur, logger) + + if sql_run.conn.open: + sql_run.sql_off(sql_run.conn, sql_run.cur) - if sql_run.conn.open: - sql_run.sql_off(sql_run.conn, sql_run.cur) + # move indices to the next set of files + first_file, mid_file, last_file = next_set(mid_file, last_file) - # move indices to the next set of files - first_file, mid_file, last_file = next_set(mid_file, last_file) + 
except (RuntimeError, TypeError, NameError, KeyError): + logger.error( + "*** %s occurred in Main writing data ***", sys.exc_info()[0]) + sys.exit("*** Error when writing data to database") - except (RuntimeError, TypeError, NameError, KeyError): - logger.error("*** %s occurred in Main writing data ***", sys.exc_info()[0]) - sys.exit("*** Error when writing data to database") + if not file_data.data_files.empty: + if sql_run.conn.open: + sql_run.sql_off(sql_run.conn, sql_run.cur) - if not file_data.data_files.empty: - if sql_run.conn.open: - sql_run.sql_off(sql_run.conn, sql_run.cur) + load_time_end = time.perf_counter() + load_time = timedelta(seconds=load_time_end - load_time_start) - load_time_end = time.perf_counter() - load_time = timedelta(seconds=load_time_end - load_time_start) + logger.info(" >>> Total load time: %s", str(load_time)) + for k in line_counts: + logger.info("For %s Count %s", k, line_counts[k]) - logger.info(" >>> Total load time: %s", str(load_time)) - for k in line_counts: - logger.info("For %s Count %s", k, line_counts[k]) + try: + logger.info("User name is: %s", getpass.getuser()) + except: + logger.info("User name is not available") - try: - logger.info("User name is: %s", getpass.getuser()) - except: - logger.info("User name is not available") + logger.info("End time: %s\n", str(datetime.now())) + logger.info("--- *** --- End METdbLoad --- *** ---") - logger.info("End time: %s\n", str(datetime.now())) - logger.info("--- *** --- End METdbLoad --- *** ---") + except (RuntimeError, TypeError, NameError, KeyError, AttributeError): + self.logger.error( + "*** %s occurred in main function of met_db_load ***", sys.exc_info()[0]) + sys.exit("*** Error loading data") def print_version(logger): @@ -355,7 +379,8 @@ def print_version(logger): except (RuntimeError, TypeError, NameError, KeyError): logger.error("*** %s occurred in print_version ***", sys.exc_info()[0]) - logger.error("*** %s occurred in Main printing version ***", sys.exc_info()[0]) + logger.error( + "*** %s occurred in Main printing version ***", sys.exc_info()[0]) sys.exit("*** Error in print version") @@ -404,23 +429,31 @@ def purge_files(load_files, xml_flags, logger): except (RuntimeError, TypeError, NameError, KeyError): logger.error("*** %s occurred in purge_files ***", sys.exc_info()[0]) - logger.error("*** %s occurred in Main purging files not selected ***", sys.exc_info()[0]) + logger.error( + "*** %s occurred in Main purging files not selected ***", sys.exc_info()[0]) sys.exit("*** Error in purge files") return updated_list if __name__ == '__main__': - parser = argparse.ArgumentParser() - # Allow user to choose dir for tmp files - default to user home - tmp_dir = [os.getenv('HOME')] - parser.add_argument("xmlfile", help="Please provide required xml load_spec filename") - parser.add_argument("-index", action="store_true", help="Only process index, do not load data") - parser.add_argument("tmpdir", nargs='*', default=tmp_dir, - help="Optional - when different directory wanted for tmp file") - parser.add_argument("--loglevel", default=None, type=str, choices={"DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"}, - help="Optional - specify log level. 
One of: DEBUG, INFO, WARNING, ERROR, CRITICAL.") - # get the command line arguments - args = parser.parse_args() + try: + parser = argparse.ArgumentParser() + # Allow user to choose dir for tmp files - default to user home + tmp_dir = [os.getenv('HOME')] + parser.add_argument( + "xmlfile", help="Please provide required xml load_spec filename") + parser.add_argument("-index", action="store_true", + help="Only process index, do not load data") + parser.add_argument("tmpdir", nargs='*', default=tmp_dir, + help="Optional - when different directory wanted for tmp file") + parser.add_argument("--loglevel", default=None, type=str, choices={"DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"}, + help="Optional - specify log level. One of: DEBUG, INFO, WARNING, ERROR, CRITICAL.") + # get the command line arguments + args = parser.parse_args() + except: + print( + "*** %s occurred setting up met_db_load ***", sys.exc_info()[0]) + sys.exit("*** Error setting up met_db_load") main(args) diff --git a/METdbLoad/ush/read_data_files.py b/METdbLoad/ush/read_data_files.py index 96996361..5197e724 100644 --- a/METdbLoad/ush/read_data_files.py +++ b/METdbLoad/ush/read_data_files.py @@ -28,7 +28,6 @@ from METdbLoad.ush import constants as CN - class ReadDataFiles: """! Class to read in data files given in load_spec file Returns: @@ -36,20 +35,30 @@ class ReadDataFiles: """ def __init__(self, logger=None): - self.cache = {} - self.stat_data = pd.DataFrame() - self.mode_cts_data = pd.DataFrame() - self.mode_obj_data = pd.DataFrame() - self.tcst_data = pd.DataFrame() - self.data_files = pd.DataFrame() - self.mtd_2d_data = pd.DataFrame() - self.mtd_3d_single_data = pd.DataFrame() - self.mtd_3d_pair_data = pd.DataFrame() - if logger is None: - full_logfile = os.path.join(os.getcwd(), __name__ + "_log.txt") - self.logger = util.get_common_logger('DEBUG', full_logfile) - else: - self.logger = logger + try: + self.cache = {} + self.stat_data = pd.DataFrame() + self.mode_cts_data = pd.DataFrame() + self.mode_obj_data = pd.DataFrame() + self.tcst_data = pd.DataFrame() + self.data_files = pd.DataFrame() + self.mtd_2d_data = pd.DataFrame() + self.mtd_3d_single_data = pd.DataFrame() + self.mtd_3d_pair_data = pd.DataFrame() + if logger is None: + full_logfile = os.path.join(os.getcwd(), __name__ + "_log.txt") + self.logger = util.get_common_logger('DEBUG', full_logfile) + else: + self.logger = logger + except RuntimeError: + if logger is None: + print( + "*** %s occurred while initializing class ReadDataFiles ***", sys.exc_info()[0]) + else: + self.logger = logger + self.logger.error( + "*** %s occurred while initializing class ReadDataFiles ***", sys.exc_info()[0]) + sys.exit("*** Error initializing class ReadDataFiles") def read_data(self, load_flags, load_files, line_types): """ Read in data files as given in load_spec file. @@ -57,1263 +66,1404 @@ def read_data(self, load_flags, load_files, line_types): N/A """ - self.logger.debug("[--- Start read_data ---]") - - read_time_start = time.perf_counter() - - # handle MET files, VSDB files, MODE files, MTD files, TCST files - - # speed up with dask delayed? 
- - one_file = pd.DataFrame() - vsdb_file = pd.DataFrame() - mode_file = pd.DataFrame() - tcst_file = pd.DataFrame() - mtd_file = pd.DataFrame() - file_hdr = pd.DataFrame() - all_stat = pd.DataFrame() - all_vsdb = pd.DataFrame() - all_cts = pd.DataFrame() - all_obj = pd.DataFrame() - all_tcst = pd.DataFrame() - all_2d = pd.DataFrame() - all_single = pd.DataFrame() - all_pair = pd.DataFrame() - list_frames = [] - list_vsdb = [] - list_cts = [] - list_obj = [] - list_tcst = [] - list_2d = [] - list_single = [] - list_pair = [] - - # keep track of each set of revisions - rev_ctr = 0 - try: + self.logger.debug("[--- Start read_data ---]") + + read_time_start = time.perf_counter() + + # handle MET files, VSDB files, MODE files, MTD files, TCST files + + # speed up with dask delayed? + + one_file = pd.DataFrame() + vsdb_file = pd.DataFrame() + mode_file = pd.DataFrame() + tcst_file = pd.DataFrame() + mtd_file = pd.DataFrame() + file_hdr = pd.DataFrame() + all_stat = pd.DataFrame() + all_vsdb = pd.DataFrame() + all_cts = pd.DataFrame() + all_obj = pd.DataFrame() + all_tcst = pd.DataFrame() + all_2d = pd.DataFrame() + all_single = pd.DataFrame() + all_pair = pd.DataFrame() + list_frames = [] + list_vsdb = [] + list_cts = [] + list_obj = [] + list_tcst = [] + list_2d = [] + list_single = [] + list_pair = [] + + # keep track of each set of revisions + rev_ctr = 0 + + try: + + # Put the list of files into a dataframe to collect info to write to database + self.data_files[CN.FULL_FILE] = load_files + # Add the code that describes what kind of file this is - stat, vsdb, etc + self.data_files[CN.DATA_FILE_LU_ID] = \ + np.vectorize(self.get_lookup)( + self.data_files[CN.FULL_FILE]) + + # Drop files that are not of a valid type + self.data_files.drop(self.data_files[self.data_files[CN.DATA_FILE_LU_ID] == + CN.NO_KEY].index, inplace=True) + self.data_files.reset_index(drop=True, inplace=True) - # Put the list of files into a dataframe to collect info to write to database - self.data_files[CN.FULL_FILE] = load_files - # Add the code that describes what kind of file this is - stat, vsdb, etc - self.data_files[CN.DATA_FILE_LU_ID] = \ - np.vectorize(self.get_lookup)(self.data_files[CN.FULL_FILE]) - - # Drop files that are not of a valid type - self.data_files.drop(self.data_files[self.data_files[CN.DATA_FILE_LU_ID] == - CN.NO_KEY].index, inplace=True) - self.data_files.reset_index(drop=True, inplace=True) - - # If no valid files to load, exit - if not len(self.data_files): - self.logger.warning("!!! No valid files to load") - sys.exit("*** No valid files to load") - - # Won't know database key until we interact with the database, so no keys yet - self.data_files[CN.DATA_FILE_ID] = CN.NO_KEY - # Store the index in a column to make later merging with stat data easier - self.data_files[CN.FILE_ROW] = self.data_files.index - # Break the full file name into path and filename - self.data_files[CN.FILEPATH] = \ - self.data_files[CN.FULL_FILE].str.rpartition(CN.FWD_SLASH)[0] - self.data_files[CN.FILENAME] = \ - self.data_files[CN.FULL_FILE].str.rpartition(CN.FWD_SLASH)[2] - # current date and time for load date - self.data_files[CN.LOAD_DATE] = datetime.now().strftime( - "%Y-%m-%d %H:%M:%S") - self.data_files[CN.MOD_DATE] = None - - # Check to make sure files exist - for row in self.data_files.itertuples(name=None): - - try: - row_num = row[0] - filename = row[1] - lu_id = row[2] - filepath = row[5] - - # Read in each file. Add columns if needed. Append to all_stat dataframe. 
- file_and_path = Path(filename) - - if file_and_path.is_file(): - # check for blank files or, for MET, no data after header line files - # handle variable number of fields - # get file info like size of file and last modified date of file - stat_info = os.stat(file_and_path) - # get last modified date of file in standard time format - mod_date = time.strftime('%Y-%m-%d %H:%M:%S', - time.localtime(stat_info.st_mtime)) - self.data_files.at[row_num, CN.MOD_DATE] = mod_date - - # - # Process stat files - # - if lu_id == CN.STAT: - - # Get the first line of the .stat file that has the headers - try: - file_hdr = pd.read_csv(filename, sep=r'\s+', - header=None, nrows=1) - except (pd.errors.EmptyDataError): - self.logger.warning(f"!!! Stat file {filename} has no columns") - continue - - # MET file has no headers or no text - it's empty - if file_hdr.empty or stat_info.st_size == 0: - self.logger.warning(f"!!! Stat file {filename} is empty") - continue - - # Add a DESC column if the data file does not have one - if not file_hdr.iloc[0].str.contains(CN.UC_DESC).any(): - hdr_names = CN.SHORT_HEADER + CN.COL_NUMS - one_file = self.read_stat(filename, hdr_names) - - # File has headers but not data - if not len(one_file): + # If no valid files to load, exit + if not len(self.data_files): + self.logger.warning("!!! No valid files to load") + sys.exit("*** No valid files to load") + + # Won't know database key until we interact with the database, so no keys yet + self.data_files[CN.DATA_FILE_ID] = CN.NO_KEY + # Store the index in a column to make later merging with stat data easier + self.data_files[CN.FILE_ROW] = self.data_files.index + # Break the full file name into path and filename + self.data_files[CN.FILEPATH] = \ + self.data_files[CN.FULL_FILE].str.rpartition(CN.FWD_SLASH)[ + 0] + self.data_files[CN.FILENAME] = \ + self.data_files[CN.FULL_FILE].str.rpartition(CN.FWD_SLASH)[ + 2] + # current date and time for load date + self.data_files[CN.LOAD_DATE] = datetime.now().strftime( + "%Y-%m-%d %H:%M:%S") + self.data_files[CN.MOD_DATE] = None + + # Check to make sure files exist + for row in self.data_files.itertuples(name=None): + + try: + row_num = row[0] + filename = row[1] + lu_id = row[2] + filepath = row[5] + + # Read in each file. Add columns if needed. Append to all_stat dataframe. + file_and_path = Path(filename) + + if file_and_path.is_file(): + # check for blank files or, for MET, no data after header line files + # handle variable number of fields + # get file info like size of file and last modified date of file + stat_info = os.stat(file_and_path) + # get last modified date of file in standard time format + mod_date = time.strftime('%Y-%m-%d %H:%M:%S', + time.localtime(stat_info.st_mtime)) + self.data_files.at[row_num, CN.MOD_DATE] = mod_date + + # + # Process stat files + # + if lu_id == CN.STAT: + + # Get the first line of the .stat file that has the headers + try: + file_hdr = pd.read_csv(filename, sep=r'\s+', + header=None, nrows=1) + except (pd.errors.EmptyDataError): + """ + We do NOT want to exit here. One empty file does not mean that we should not load + any good files in this batch. + """ + self.logger.warning( + f"!!! 
Stat file {filename} has no columns") continue - # If the file has no DESC column, add UNITS as well - one_file.insert(2, CN.DESCR, CN.NOTAV) - one_file.insert(10, CN.FCST_UNITS, CN.NOTAV) - one_file.insert(13, CN.OBS_UNITS, CN.NOTAV) - - # If the file has a DESC column, but no UNITS columns - elif not file_hdr.iloc[0].str.contains(CN.UC_FCST_UNITS).any(): - hdr_names = CN.MID_HEADER + CN.COL_NUMS - one_file = self.read_stat(filename, hdr_names) - - # File has headers but not data - if not len(one_file): + # MET file has no headers or no text - it's empty + if file_hdr.empty or stat_info.st_size == 0: + """ + We do NOT want to exit here. One empty file does not mean that we should not load + any good files in this batch. + """ + self.logger.warning( + f"!!! Stat file {filename} is empty") continue - one_file.insert(10, CN.FCST_UNITS, CN.NOTAV) - one_file.insert(13, CN.OBS_UNITS, CN.NOTAV) - - else: - hdr_names = CN.LONG_HEADER + CN.COL_NUMS - one_file = self.read_stat(filename, hdr_names) + # Add a DESC column if the data file does not have one + if not file_hdr.iloc[0].str.contains(CN.UC_DESC).any(): + hdr_names = CN.SHORT_HEADER + CN.COL_NUMS + one_file = self.read_stat( + filename, hdr_names) - # File has headers but not data - if not len(one_file): - continue + # File has headers but not data + if not len(one_file): + continue - # Defragmenting - one_file = one_file.copy() + # If the file has no DESC column, add UNITS as well + one_file.insert(2, CN.DESCR, CN.NOTAV) + one_file.insert( + 10, CN.FCST_UNITS, CN.NOTAV) + one_file.insert(13, CN.OBS_UNITS, CN.NOTAV) - # add line numbers and count the header line, for stat files - one_file[CN.LINE_NUM] = one_file.index + 2 - one_file = one_file.copy() + # If the file has a DESC column, but no UNITS columns + elif not file_hdr.iloc[0].str.contains(CN.UC_FCST_UNITS).any(): + hdr_names = CN.MID_HEADER + CN.COL_NUMS + one_file = self.read_stat( + filename, hdr_names) - # add columns for fcst_perc and obs_perc - # these can be in parens in fcst_thresh and obs_thresh in stat files - one_file[[CN.FCST_PERC, CN.OBS_PERC]] = \ - (CN.MV_NOTAV, CN.MV_NOTAV) + # File has headers but not data + if not len(one_file): + continue - # - # Process vsdb files - # - elif lu_id == CN.VSDB_POINT_STAT: + one_file.insert( + 10, CN.FCST_UNITS, CN.NOTAV) + one_file.insert(13, CN.OBS_UNITS, CN.NOTAV) - # check whether vsdb file is empty - if stat_info.st_size == 0: - self.logger.warning(f"!!! Vsdb fil {filename} is empty") - continue + else: + hdr_names = CN.LONG_HEADER + CN.COL_NUMS + one_file = self.read_stat( + filename, hdr_names) + + # File has headers but not data + if not len(one_file): + continue + + # Defragmenting + one_file = one_file.copy() + + # add line numbers and count the header line, for stat files + one_file[CN.LINE_NUM] = one_file.index + 2 + one_file = one_file.copy() + + # add columns for fcst_perc and obs_perc + # these can be in parens in fcst_thresh and obs_thresh in stat files + one_file[[CN.FCST_PERC, CN.OBS_PERC]] = \ + (CN.MV_NOTAV, CN.MV_NOTAV) + + # + # Process vsdb files + # + elif lu_id == CN.VSDB_POINT_STAT: + + # check whether vsdb file is empty + if stat_info.st_size == 0: + """ + We do NOT want to exit here. One empty file does not mean that we should not load + any good files in this batch. + """ + self.logger.warning( + f"!!! 
Vsdb fil {filename} is empty") + continue - # read each line in as 1 column so some fixes can be made - vsdb_file = pd.read_csv( - filename, sep=CN.SEP, header=None) + # read each line in as 1 column so some fixes can be made + vsdb_file = pd.read_csv( + filename, sep=CN.SEP, header=None) - if vsdb_file.iloc[:, 0].str.contains('=').any(): + if vsdb_file.iloc[:, 0].str.contains('=').any(): - # split vsdb data into 2 columns - before the =, and after - # this protects from changing weird variable names, and removes = - split_file = vsdb_file.iloc[:, 0].str.split( - '=', expand=True) + # split vsdb data into 2 columns - before the =, and after + # this protects from changing weird variable names, and removes = + split_file = vsdb_file.iloc[:, 0].str.split( + '=', expand=True) - # put space in front of hyphen between numbers in case space is missing - # FHO can have negative thresh - fix with regex, only between numbers - split_file[split_file.columns[1]] = \ - split_file[split_file.columns[1]].str.replace(r'(\d)-(\d)', r'\1 -\2', - regex=True) + # put space in front of hyphen between numbers in case space is missing + # FHO can have negative thresh - fix with regex, only between numbers + split_file[split_file.columns[1]] = \ + split_file[split_file.columns[1]].str.replace(r'(\d)-(\d)', r'\1 -\2', + regex=True) - # merge the two halves together again - vsdb_file = split_file.iloc[:, - 0] + ' ' + split_file.iloc[:, 1] + # merge the two halves together again + vsdb_file = split_file.iloc[:, + 0] + ' ' + split_file.iloc[:, 1] - else: - vsdb_file = vsdb_file.iloc[:, 0] - - # break fields out, separated by 1 or more spaces - vsdb_file = vsdb_file.str.split(' +', expand=True) - - # add column names - hdr_names = CN.VSDB_HEADER + CN.COL_NUMS - vsdb_file.columns = hdr_names[:len( - vsdb_file.columns)] - - # add line numbers, starting at 1 - vsdb_file.insert( - 9, CN.LINE_NUM, vsdb_file.index + 1) - - # some line types need a piece of the path added to the model name - # if last part of path contains an underscore, save string after it. - # then add it to model name - last_slash = filepath.rfind(CN.FWD_SLASH) - last_und = filepath.rfind('_') - ens_value = '' - if last_und > last_slash: - ens_value = filepath[last_und:] - if not vsdb_file.loc[vsdb_file.line_type.isin(CN.ENS_VSDB_LINE_TYPES), - CN.MODEL].empty: - vsdb_file.loc[vsdb_file.line_type.isin(CN.ENS_VSDB_LINE_TYPES), - CN.MODEL] = \ - vsdb_file.loc[vsdb_file.line_type.isin(CN.ENS_VSDB_LINE_TYPES), - CN.MODEL].str.split(CN.FWD_SLASH).str[0] + \ - ens_value + CN.FWD_SLASH + \ + else: + vsdb_file = vsdb_file.iloc[:, 0] + + # break fields out, separated by 1 or more spaces + vsdb_file = vsdb_file.str.split( + ' +', expand=True) + + # add column names + hdr_names = CN.VSDB_HEADER + CN.COL_NUMS + vsdb_file.columns = hdr_names[:len( + vsdb_file.columns)] + + # add line numbers, starting at 1 + vsdb_file.insert( + 9, CN.LINE_NUM, vsdb_file.index + 1) + + # some line types need a piece of the path added to the model name + # if last part of path contains an underscore, save string after it. 
+ # then add it to model name + last_slash = filepath.rfind(CN.FWD_SLASH) + last_und = filepath.rfind('_') + ens_value = '' + if last_und > last_slash: + ens_value = filepath[last_und:] + if not vsdb_file.loc[vsdb_file.line_type.isin(CN.ENS_VSDB_LINE_TYPES), + CN.MODEL].empty: vsdb_file.loc[vsdb_file.line_type.isin(CN.ENS_VSDB_LINE_TYPES), - CN.MODEL].str.split(CN.FWD_SLASH).str[1] - - # - # Process mode files - # - elif lu_id in (CN.MODE_CTS, CN.MODE_OBJ): - - # Get the first line of the mode cts or obj file that has the headers - try: - file_hdr = pd.read_csv(filename, sep=r'\s+', - nrows=1) - except (pd.errors.EmptyDataError): - self.logger.warning(f"!!! Mode file {filename} has no columns") - continue - - # MODE file has no headers or no text - it's empty - if file_hdr.empty or stat_info.st_size == 0: - self.logger.warning(f"!!! Mode file {filename} is empty") - continue - - # use lower case of headers in file as column names - hdr_names = file_hdr.columns.tolist() - hdr_names = [hdr.lower() for hdr in hdr_names] - - # change field name after intensity_90 to be intensity_nn - if CN.INTENSITY_90 in hdr_names: - hdr_names[hdr_names.index( - CN.INTENSITY_90) + 1] = CN.INTENSITY_NN + CN.MODEL] = \ + vsdb_file.loc[vsdb_file.line_type.isin(CN.ENS_VSDB_LINE_TYPES), + CN.MODEL].str.split(CN.FWD_SLASH).str[0] + \ + ens_value + CN.FWD_SLASH + \ + vsdb_file.loc[vsdb_file.line_type.isin(CN.ENS_VSDB_LINE_TYPES), + CN.MODEL].str.split(CN.FWD_SLASH).str[1] + + # + # Process mode files + # + elif lu_id in (CN.MODE_CTS, CN.MODE_OBJ): + + # Get the first line of the mode cts or obj file that has the headers + try: + file_hdr = pd.read_csv(filename, sep=r'\s+', + nrows=1) + except (pd.errors.EmptyDataError): + """ + We do NOT want to exit here. One empty file does not mean that we should not load + any good files in this batch. + """ + self.logger.warning( + f"!!! Mode file {filename} has no columns") + continue - # read the file - mode_file = self.read_mode(filename, hdr_names) + # MODE file has no headers or no text - it's empty + if file_hdr.empty or stat_info.st_size == 0: + """ + We do NOT want to exit here. One empty file does not mean that we should not load + any good files in this batch. + """ + self.logger.warning( + f"!!! 
Mode file {filename} is empty") + continue - # File has headers but not data - if not len(mode_file): - continue + # use lower case of headers in file as column names + hdr_names = file_hdr.columns.tolist() + hdr_names = [hdr.lower() for hdr in hdr_names] - # add line numbers and count the header line, for mode files - mode_file[CN.LINENUMBER] = mode_file.index + 2 - - # add other fields if not present in file - if CN.N_VALID not in hdr_names: - mode_file.insert(2, CN.N_VALID, CN.MV_NULL) - if CN.GRID_RES not in hdr_names: - mode_file.insert(3, CN.GRID_RES, CN.MV_NULL) - if CN.DESCR not in hdr_names: - mode_file.insert(4, CN.DESCR, CN.NOTAV) - - if CN.ASPECT_DIFF not in hdr_names: - mode_file[CN.ASPECT_DIFF] = CN.MV_NOTAV - - if CN.CURV_RATIO not in hdr_names: - mode_file[CN.CURV_RATIO] = CN.MV_NOTAV - - # add units if input file does not have them - if CN.FCST_UNITS not in hdr_names: - mode_file.insert(16, CN.FCST_UNITS, CN.NOTAV) - mode_file.insert(19, CN.OBS_UNITS, CN.NOTAV) - - # if FCST_LEAD is NA, set it to 0 - if not mode_file.fcst_lead.dtypes == 'int': - mode_file.loc[mode_file.fcst_lead == - CN.NOTAV, CN.FCST_LEAD] = 0 - mode_file[CN.FCST_LEAD] = mode_file[CN.FCST_LEAD].astype( - int) - - # initially, match line data to the index of the file names - mode_file[CN.FILE_ROW] = row_num - - # determine which types of records are in the file - if lu_id == CN.MODE_CTS: - # mode_cts - list_cts.append(mode_file) - # both single and pair data can be in the same files - else: - list_obj.append(mode_file) - # - # Process TCST files - # - elif lu_id == CN.TCST: - - # Get the first line of the .tcst file that has the headers - try: - file_hdr = pd.read_csv(filename, sep=r'\s+', - header=None, nrows=1) - except (pd.errors.EmptyDataError): - self.logger.warning(f"!!! TCST file {filename} has no columns") - continue + # change field name after intensity_90 to be intensity_nn + if CN.INTENSITY_90 in hdr_names: + hdr_names[hdr_names.index( + CN.INTENSITY_90) + 1] = CN.INTENSITY_NN - # TCST file has no headers or no text - it's empty - if file_hdr.empty or stat_info.st_size == 0: - self.logger.warning(f"!!! 
TCST file {filename} is empty") - continue + # read the file + mode_file = self.read_mode(filename, hdr_names) - # Add a DESC column if the data file does not have one - if not file_hdr.iloc[0].str.contains(CN.UC_DESC).any(): - hdr_names = CN.SHORT_HEADER_TCST + CN.COL_NUMS - tcst_file = self.read_tcst(filename, hdr_names) # File has headers but not data - if not len(tcst_file): + if not len(mode_file): continue - tcst_file.insert(3, CN.DESCR, CN.NOTAV) - else: - hdr_names = CN.LONG_HEADER_TCST + CN.COL_NUMS - tcst_file = self.read_tcst(filename, hdr_names) - # File has headers but not data - if not len(tcst_file): - continue + # add line numbers and count the header line, for mode files + mode_file[CN.LINENUMBER] = mode_file.index + 2 + + # add other fields if not present in file + if CN.N_VALID not in hdr_names: + mode_file.insert(2, CN.N_VALID, CN.MV_NULL) + if CN.GRID_RES not in hdr_names: + mode_file.insert( + 3, CN.GRID_RES, CN.MV_NULL) + if CN.DESCR not in hdr_names: + mode_file.insert(4, CN.DESCR, CN.NOTAV) + + if CN.ASPECT_DIFF not in hdr_names: + mode_file[CN.ASPECT_DIFF] = CN.MV_NOTAV + + if CN.CURV_RATIO not in hdr_names: + mode_file[CN.CURV_RATIO] = CN.MV_NOTAV + + # add units if input file does not have them + if CN.FCST_UNITS not in hdr_names: + mode_file.insert( + 16, CN.FCST_UNITS, CN.NOTAV) + mode_file.insert( + 19, CN.OBS_UNITS, CN.NOTAV) + + # if FCST_LEAD is NA, set it to 0 + if not mode_file.fcst_lead.dtypes == 'int': + mode_file.loc[mode_file.fcst_lead == + CN.NOTAV, CN.FCST_LEAD] = 0 + mode_file[CN.FCST_LEAD] = mode_file[CN.FCST_LEAD].astype( + int) + + # initially, match line data to the index of the file names + mode_file[CN.FILE_ROW] = row_num + + # determine which types of records are in the file + if lu_id == CN.MODE_CTS: + # mode_cts + list_cts.append(mode_file) + # both single and pair data can be in the same files + else: + list_obj.append(mode_file) + # + # Process TCST files + # + elif lu_id == CN.TCST: + + # Get the first line of the .tcst file that has the headers + try: + file_hdr = pd.read_csv(filename, sep=r'\s+', + header=None, nrows=1) + except (pd.errors.EmptyDataError): + """ + We do NOT want to exit here. One empty file does not mean that we should not load + any good files in this batch. + """ + self.logger.warning( + f"!!! TCST file {filename} has no columns") + continue - # add line numbers and count the header line, for tcst files - tcst_file[CN.LINE_NUM] = tcst_file.index + 2 + # TCST file has no headers or no text - it's empty + if file_hdr.empty or stat_info.st_size == 0: + """ + We do NOT want to exit here. One empty file does not mean that we should not load + any good files in this batch. + """ + self.logger.warning( + f"!!! 
TCST file {filename} is empty") + continue - tcst_file = tcst_file.rename(columns={"init": "fcst_init", - "lead": "fcst_lead", - "valid": "fcst_valid"}) + # Add a DESC column if the data file does not have one + if not file_hdr.iloc[0].str.contains(CN.UC_DESC).any(): + hdr_names = CN.SHORT_HEADER_TCST + CN.COL_NUMS + tcst_file = self.read_tcst( + filename, hdr_names) + # File has headers but not data + if not len(tcst_file): + continue + tcst_file.insert(3, CN.DESCR, CN.NOTAV) + else: + hdr_names = CN.LONG_HEADER_TCST + CN.COL_NUMS + tcst_file = self.read_tcst( + filename, hdr_names) - # - # Process MTD files - # - elif lu_id in CN.MTD_FILES: + # File has headers but not data + if not len(tcst_file): + continue - # Get the first line of the MTD file that has the headers - try: - file_hdr = pd.read_csv(filename, sep=r'\s+', - nrows=1) - except (pd.errors.EmptyDataError): - self.logger.warning(f"!!! MTD file {filename} has no columns") - continue + # add line numbers and count the header line, for tcst files + tcst_file[CN.LINE_NUM] = tcst_file.index + 2 + + tcst_file = tcst_file.rename(columns={"init": "fcst_init", + "lead": "fcst_lead", + "valid": "fcst_valid"}) + + # + # Process MTD files + # + elif lu_id in CN.MTD_FILES: + + # Get the first line of the MTD file that has the headers + try: + file_hdr = pd.read_csv(filename, sep=r'\s+', + nrows=1) + except (pd.errors.EmptyDataError): + """ + We do NOT want to exit here. One empty file does not mean that we should not load + any good files in this batch. + """ + self.logger.warning( + f"!!! MTD file {filename} has no columns") + continue - # MTD file has no headers or no text - it's empty - if file_hdr.empty or stat_info.st_size == 0: - self.logger.warning(f"!!! MTD file {filename} is empty") - continue + # MTD file has no headers or no text - it's empty + if file_hdr.empty or stat_info.st_size == 0: + """ + We do NOT want to exit here. One empty file does not mean that we should not load + any good files in this batch. + """ + self.logger.warning( + f"!!! 
MTD file {filename} is empty") + continue - # use lower case of headers in file as column names - hdr_names = file_hdr.columns.tolist() - hdr_names = [hdr.lower() for hdr in hdr_names] + # use lower case of headers in file as column names + hdr_names = file_hdr.columns.tolist() + hdr_names = [hdr.lower() for hdr in hdr_names] - # MET output uses desc, mysql uses descr - hdr_names[2] = CN.DESCR + # MET output uses desc, mysql uses descr + hdr_names[2] = CN.DESCR - # read the MTD file the same way as a mode file - mtd_file = self.read_mode(filename, hdr_names) + # read the MTD file the same way as a mode file + mtd_file = self.read_mode(filename, hdr_names) - # File has headers but not data - if not len(mtd_file): - continue + # File has headers but not data + if not len(mtd_file): + continue - # change field name after intensity_90 to be intensity_nn - if CN.INTENSITY_90 in mtd_file: - inten_col = mtd_file.columns.get_loc( - CN.INTENSITY_90) - # if intensity_90 is the last column, add a column - if inten_col == len(mtd_file.columns) - 1: - mtd_file[CN.INTENSITY_NN] = CN.MV_NOTAV + # change field name after intensity_90 to be intensity_nn + if CN.INTENSITY_90 in mtd_file: + inten_col = mtd_file.columns.get_loc( + CN.INTENSITY_90) + # if intensity_90 is the last column, add a column + if inten_col == len(mtd_file.columns) - 1: + mtd_file[CN.INTENSITY_NN] = CN.MV_NOTAV + else: + mtd_file = mtd_file.rename(columns={mtd_file.columns[inten_col + 1]: + CN.INTENSITY_NN}) + + # add a column for the revision_id + mtd_file[CN.REVISION_ID] = CN.MV_NULL + + # add line numbers and count the header line, for MTD files + mtd_file[CN.LINENUMBER] = mtd_file.index + 2 + + # add other fields if not present in file + if CN.FCST_T_BEG not in hdr_names: + mtd_file.insert( + 8, CN.FCST_T_BEG, CN.MV_NULL) + if CN.FCST_T_END not in hdr_names: + mtd_file.insert( + 9, CN.FCST_T_END, CN.MV_NULL) + if CN.OBS_T_BEG not in hdr_names: + mtd_file.insert( + 12, CN.OBS_T_BEG, CN.MV_NULL) + if CN.OBS_T_END not in hdr_names: + mtd_file.insert( + 13, CN.OBS_T_END, CN.MV_NULL) + + # add units if input file does not have them + if CN.FCST_UNITS not in hdr_names: + mtd_file.insert( + 17, CN.FCST_UNITS, CN.NOTAV) + mtd_file.insert(20, CN.OBS_UNITS, CN.NOTAV) + + # if FCST_LEAD is NA, set it to 0 to do math + if not mtd_file.fcst_lead.dtypes == 'int': + mtd_file.loc[mtd_file.fcst_lead == + CN.NOTAV, CN.FCST_LEAD] = 0 + mtd_file[CN.FCST_LEAD] = mtd_file[CN.FCST_LEAD].astype( + int) + + # Copy forecast lead times, without trailing 0000 if they have them + mtd_file[CN.FCST_LEAD_HR] = \ + np.where(mtd_file[CN.FCST_LEAD] < 25, + mtd_file[CN.FCST_LEAD] * 10000, + mtd_file[CN.FCST_LEAD]) + + mtd_file[CN.FCST_LEAD_HR] = (((mtd_file[CN.FCST_LEAD_HR] // 10000) * 3600) + + ((mtd_file[CN.FCST_LEAD_HR] // 100 % 100) * 60) + + (mtd_file[CN.FCST_LEAD_HR] % 100)) + + # Calculate fcst_init = fcst_valid - fcst_lead hours (in seconds) + mtd_file.insert(5, CN.FCST_INIT, 0) + mtd_file[CN.FCST_INIT] = mtd_file[CN.FCST_VALID] - \ + pd.to_timedelta( + mtd_file[CN.FCST_LEAD_HR], unit='sec') + + # Where fcst_lead was set to zero for math, set it to -9999 + if mtd_file[CN.FCST_LEAD].eq(0).any(): + mtd_file.loc[mtd_file.fcst_lead == + 0, CN.FCST_LEAD] = CN.MV_NOTAV + + # if OBS_LEAD is NA, set it to -9999 + if not mtd_file.obs_lead.dtypes == 'int': + mtd_file.loc[mtd_file.obs_lead == + CN.NOTAV, CN.OBS_LEAD] = CN.MV_NOTAV + mtd_file[CN.OBS_LEAD] = mtd_file[CN.OBS_LEAD].astype( + int) + + # initially, match line data to the index of the file names + 
mtd_file[CN.FILE_ROW] = row_num + + # determine which types of records are in the file + if lu_id in (CN.MTD_3D_SS, CN.MTD_3D_SC): + # MTD single + list_single.append(mtd_file) + elif lu_id in (CN.MTD_3D_PS, CN.MTD_3D_PC): + # MTD pair + list_pair.append(mtd_file) + # MTD 2D else: - mtd_file = mtd_file.rename(columns={mtd_file.columns[inten_col + 1]: - CN.INTENSITY_NN}) - - # add a column for the revision_id - mtd_file[CN.REVISION_ID] = CN.MV_NULL - - # add line numbers and count the header line, for MTD files - mtd_file[CN.LINENUMBER] = mtd_file.index + 2 - - # add other fields if not present in file - if CN.FCST_T_BEG not in hdr_names: - mtd_file.insert(8, CN.FCST_T_BEG, CN.MV_NULL) - if CN.FCST_T_END not in hdr_names: - mtd_file.insert(9, CN.FCST_T_END, CN.MV_NULL) - if CN.OBS_T_BEG not in hdr_names: - mtd_file.insert(12, CN.OBS_T_BEG, CN.MV_NULL) - if CN.OBS_T_END not in hdr_names: - mtd_file.insert(13, CN.OBS_T_END, CN.MV_NULL) - - # add units if input file does not have them - if CN.FCST_UNITS not in hdr_names: - mtd_file.insert(17, CN.FCST_UNITS, CN.NOTAV) - mtd_file.insert(20, CN.OBS_UNITS, CN.NOTAV) - - # if FCST_LEAD is NA, set it to 0 to do math - if not mtd_file.fcst_lead.dtypes == 'int': - mtd_file.loc[mtd_file.fcst_lead == - CN.NOTAV, CN.FCST_LEAD] = 0 - mtd_file[CN.FCST_LEAD] = mtd_file[CN.FCST_LEAD].astype( - int) - - # Copy forecast lead times, without trailing 0000 if they have them - mtd_file[CN.FCST_LEAD_HR] = \ - np.where(mtd_file[CN.FCST_LEAD] < 25, - mtd_file[CN.FCST_LEAD] * 10000, - mtd_file[CN.FCST_LEAD]) - - mtd_file[CN.FCST_LEAD_HR] = (((mtd_file[CN.FCST_LEAD_HR] // 10000) * 3600) + - ((mtd_file[CN.FCST_LEAD_HR] // 100 % 100) * 60) + - (mtd_file[CN.FCST_LEAD_HR] % 100)) - - # Calculate fcst_init = fcst_valid - fcst_lead hours (in seconds) - mtd_file.insert(5, CN.FCST_INIT, 0) - mtd_file[CN.FCST_INIT] = mtd_file[CN.FCST_VALID] - \ - pd.to_timedelta( - mtd_file[CN.FCST_LEAD_HR], unit='sec') - - # Where fcst_lead was set to zero for math, set it to -9999 - if mtd_file[CN.FCST_LEAD].eq(0).any(): - mtd_file.loc[mtd_file.fcst_lead == - 0, CN.FCST_LEAD] = CN.MV_NOTAV - - # if OBS_LEAD is NA, set it to -9999 - if not mtd_file.obs_lead.dtypes == 'int': - mtd_file.loc[mtd_file.obs_lead == - CN.NOTAV, CN.OBS_LEAD] = CN.MV_NOTAV - mtd_file[CN.OBS_LEAD] = mtd_file[CN.OBS_LEAD].astype( - int) - - # initially, match line data to the index of the file names - mtd_file[CN.FILE_ROW] = row_num - - # determine which types of records are in the file - if lu_id in (CN.MTD_3D_SS, CN.MTD_3D_SC): - # MTD single - list_single.append(mtd_file) - elif lu_id in (CN.MTD_3D_PS, CN.MTD_3D_PC): - # MTD pair - list_pair.append(mtd_file) - # MTD 2D - else: - # This is an MTD 2D Revision file if 10 columns each have a single value - mtd_rev = True - for mtd_col in CN.MTD_2D_REV_FIELDS: - if not (mtd_file[mtd_col] == mtd_file[mtd_col][0]).all(): - mtd_rev = False - if mtd_rev: - rev_lines = [] - obj_id = 'new' - obj_ct = 1 - last_line = len(mtd_file.index) - create_new = False - - # Make all the fields float that are needed to do math - mtd_file[mtd_file.columns[26:38]] = \ - mtd_file[mtd_file.columns[26:38]].astype( - float) - # Create new rows by subtracting a previous row from a row by object - # Unique sequential id is assigned to items with the same object id - # Only object ids with more than 2 lines count and create lines - for row_num, mtd_row in mtd_file.iterrows(): - - if mtd_row[CN.OBJECT_ID] == obj_id: - obj_ct += 1 - if obj_ct == 2 and (row_num + 1) < last_line and \ - 
mtd_file[CN.OBJECT_ID][row_num + 1] == obj_id: - rev_ctr += 1 - create_new = True - if obj_ct > 1 and create_new: - new_line = mtd_file.iloc[row_num].to_dict( - ) - new_line[CN.FCST_VAR] = 'REV_' + \ - new_line[CN.FCST_VAR] - new_line[CN.OBS_VAR] = 'REV_' + \ - new_line[CN.OBS_VAR] - new_line[CN.AREA] -= mtd_file[CN.AREA][row_num - 1] - new_line[CN.CENTROID_X] -= \ - mtd_file[CN.CENTROID_X][row_num - 1] - new_line[CN.CENTROID_Y] -= \ - mtd_file[CN.CENTROID_Y][row_num - 1] - new_line[CN.CENTROID_LAT] -= \ - mtd_file[CN.CENTROID_LAT][row_num - 1] - new_line[CN.CENTROID_LON] -= \ - mtd_file[CN.CENTROID_LON][row_num - 1] - new_line[CN.AXIS_ANG] = CN.MV_NOTAV - new_line[CN.INTENSITY_10] -= \ - mtd_file[CN.INTENSITY_10][row_num - 1] - new_line[CN.INTENSITY_25] -= \ - mtd_file[CN.INTENSITY_25][row_num - 1] - new_line[CN.INTENSITY_50] -= \ - mtd_file[CN.INTENSITY_50][row_num - 1] - new_line[CN.INTENSITY_75] -= \ - mtd_file[CN.INTENSITY_75][row_num - 1] - new_line[CN.INTENSITY_90] -= \ - mtd_file[CN.INTENSITY_90][row_num - 1] - new_line[CN.REVISION_ID] = rev_ctr - new_line[CN.LINENUMBER] = 0 - rev_lines.append(new_line) - else: - obj_id = mtd_row[CN.OBJECT_ID] - obj_ct = 1 - create_new = False - rev_df = pd.DataFrame(rev_lines) - mtd_file = pd.concat([mtd_file, rev_df], ignore_index=True, - sort=False) - rev_df = rev_df.iloc[0:0] - # concat new rows with mtd_file - list_2d.append(mtd_file) + # This is an MTD 2D Revision file if 10 columns each have a single value + mtd_rev = True + for mtd_col in CN.MTD_2D_REV_FIELDS: + if not (mtd_file[mtd_col] == mtd_file[mtd_col][0]).all(): + mtd_rev = False + if mtd_rev: + rev_lines = [] + obj_id = 'new' + obj_ct = 1 + last_line = len(mtd_file.index) + create_new = False + + # Make all the fields float that are needed to do math + mtd_file[mtd_file.columns[26:38]] = \ + mtd_file[mtd_file.columns[26:38]].astype( + float) + # Create new rows by subtracting a previous row from a row by object + # Unique sequential id is assigned to items with the same object id + # Only object ids with more than 2 lines count and create lines + for row_num, mtd_row in mtd_file.iterrows(): + + if mtd_row[CN.OBJECT_ID] == obj_id: + obj_ct += 1 + if obj_ct == 2 and (row_num + 1) < last_line and \ + mtd_file[CN.OBJECT_ID][row_num + 1] == obj_id: + rev_ctr += 1 + create_new = True + if obj_ct > 1 and create_new: + new_line = mtd_file.iloc[row_num].to_dict( + ) + new_line[CN.FCST_VAR] = 'REV_' + \ + new_line[CN.FCST_VAR] + new_line[CN.OBS_VAR] = 'REV_' + \ + new_line[CN.OBS_VAR] + new_line[CN.AREA] -= mtd_file[CN.AREA][row_num - 1] + new_line[CN.CENTROID_X] -= \ + mtd_file[CN.CENTROID_X][row_num - 1] + new_line[CN.CENTROID_Y] -= \ + mtd_file[CN.CENTROID_Y][row_num - 1] + new_line[CN.CENTROID_LAT] -= \ + mtd_file[CN.CENTROID_LAT][row_num - 1] + new_line[CN.CENTROID_LON] -= \ + mtd_file[CN.CENTROID_LON][row_num - 1] + new_line[CN.AXIS_ANG] = CN.MV_NOTAV + new_line[CN.INTENSITY_10] -= \ + mtd_file[CN.INTENSITY_10][row_num - 1] + new_line[CN.INTENSITY_25] -= \ + mtd_file[CN.INTENSITY_25][row_num - 1] + new_line[CN.INTENSITY_50] -= \ + mtd_file[CN.INTENSITY_50][row_num - 1] + new_line[CN.INTENSITY_75] -= \ + mtd_file[CN.INTENSITY_75][row_num - 1] + new_line[CN.INTENSITY_90] -= \ + mtd_file[CN.INTENSITY_90][row_num - 1] + new_line[CN.REVISION_ID] = rev_ctr + new_line[CN.LINENUMBER] = 0 + rev_lines.append(new_line) + else: + obj_id = mtd_row[CN.OBJECT_ID] + obj_ct = 1 + create_new = False + rev_df = pd.DataFrame(rev_lines) + mtd_file = pd.concat([mtd_file, rev_df], ignore_index=True, + 
sort=False) + rev_df = rev_df.iloc[0:0] + # concat new rows with mtd_file + list_2d.append(mtd_file) + else: + """ + We do NOT want to exit here. One bad file does not mean that we should not load + any good files in this batch. + """ + self.logger.warning( + f"!!! File type of {filename} not valid") + + # re-initialize pandas dataframes before reading next file + if not one_file.empty: + # initially, match line data to the index of the file names + one_file[CN.FILE_ROW] = row_num + # keep the dataframes from each file in a list + list_frames.append(one_file) + self.logger.debug( + f"Lines in {filename}: {str(len(one_file.index))}") + one_file = one_file.iloc[0:0] + if not file_hdr.empty: + file_hdr = file_hdr.iloc[0:0] + elif not vsdb_file.empty: + vsdb_file.insert(10, CN.FILE_ROW, row_num) + list_vsdb.append(vsdb_file) + self.logger.debug( + f"Lines in {filename}, {str(len(vsdb_file.index))}") + vsdb_file = vsdb_file.iloc[0:0] + elif not mode_file.empty: + self.logger.debug( + f"Lines in {filename},{str(len(mode_file.index))}") + mode_file = mode_file.iloc[0:0] + if not file_hdr.empty: + file_hdr = file_hdr.iloc[0:0] + elif not tcst_file.empty: + # initially, match line data to the index of the file names + tcst_file[CN.FILE_ROW] = row_num + # keep the dataframes from each file in a list + list_tcst.append(tcst_file) + self.logger.debug( + f"Lines in {filename},{str(len(one_file.index))}") + tcst_file = tcst_file.iloc[0:0] + if not file_hdr.empty: + file_hdr = file_hdr.iloc[0:0] + elif not mtd_file.empty: + self.logger.debug( + f"Lines in {filename}, {str(len(mtd_file.index))}") + mtd_file = mtd_file.iloc[0:0] + if not file_hdr.empty: + file_hdr = file_hdr.iloc[0:0] + else: + """ + We do NOT want to exit here. One empty file does not mean that we should not load + any good files in this batch. + """ + self.logger.warning( + f"!!! Empty file {filename}") + continue else: - self.logger.warning(f"!!! File type of {filename} not valid") - - # re-initialize pandas dataframes before reading next file - if not one_file.empty: - # initially, match line data to the index of the file names - one_file[CN.FILE_ROW] = row_num - # keep the dataframes from each file in a list - list_frames.append(one_file) - self.logger.debug(f"Lines in {filename}: {str(len(one_file.index))}") - one_file = one_file.iloc[0:0] - if not file_hdr.empty: - file_hdr = file_hdr.iloc[0:0] - elif not vsdb_file.empty: - vsdb_file.insert(10, CN.FILE_ROW, row_num) - list_vsdb.append(vsdb_file) - self.logger.debug(f"Lines in {filename}, {str(len(vsdb_file.index))}") - vsdb_file = vsdb_file.iloc[0:0] - elif not mode_file.empty: - self.logger.debug(f"Lines in {filename},{str(len(mode_file.index))}") - mode_file = mode_file.iloc[0:0] - if not file_hdr.empty: - file_hdr = file_hdr.iloc[0:0] - elif not tcst_file.empty: - # initially, match line data to the index of the file names - tcst_file[CN.FILE_ROW] = row_num - # keep the dataframes from each file in a list - list_tcst.append(tcst_file) - self.logger.debug(f"Lines in {filename},{str(len(one_file.index))}") - tcst_file = tcst_file.iloc[0:0] - if not file_hdr.empty: - file_hdr = file_hdr.iloc[0:0] - elif not mtd_file.empty: - self.logger.debug(f"Lines in {filename}, {str(len(mtd_file.index))}") - mtd_file = mtd_file.iloc[0:0] - if not file_hdr.empty: - file_hdr = file_hdr.iloc[0:0] - else: - self.logger.warning(f"!!! Empty file {filename}") - continue - else: - self.logger.warning(f"!!! 
No file {filename}") - sys.exit(1) - - except (RuntimeError, TypeError, NameError, KeyError) as e: - self.logger.error(f"*** {str(e)}: in read_data upper, mid-loop ***") - # end for row - - except (RuntimeError, TypeError, NameError, KeyError): - self.logger.error(f"*** {sys.exc_info()[0]}: in read_data upper ***", ) - - try: - - # concatenate all the dataframes - much faster than doing an append each time - # added sort=False on 10/21/19 because that will be new default behavior - if list_frames: - all_stat = pd.concat( - list_frames, ignore_index=True, sort=False) - list_frames = [] - - all_stat.fcst_thresh = all_stat.fcst_thresh.astype(str) - all_stat.obs_thresh = all_stat.obs_thresh.astype(str) - if not all_stat['1'].dtypes == 'float': - all_stat.loc[all_stat['1'] == CN.NOTAV, '1'] = CN.MV_NOTAV - all_stat['1'] = all_stat['1'].astype(float) - - # if a fcst percentage thresh is used, it is in parens in fcst_thresh - if all_stat.fcst_thresh.str.contains(CN.L_PAREN, regex=False).any(): - # save the value in parens - all_stat.loc[all_stat.fcst_thresh.str.contains(CN.L_PAREN, regex=False) & - all_stat.fcst_thresh.str.contains( - CN.R_PAREN, regex=False), - CN.FCST_PERC] = \ + """ + We do NOT want to exit here. Even if a process deleted one of the files in the batch, we + should still check to see if the others are there. + """ + self.logger.warning(f"!!! No file {filename}") + + except (RuntimeError, TypeError, NameError, KeyError) as e: + """ + We do NOT want to exit here. One bad file does not mean that we should not load + any good files in this batch. + """ + self.logger.warning( + f"*** Error produced by one file: {str(e)}. Ignoring this bad file and continuing on ***") + + # end for row + + except (RuntimeError, TypeError, NameError, KeyError): + self.logger.error( + f"*** Error assembling files for load: {sys.exc_info()[0]} ***", ) + sys.exit("*** Error assembling files for load") + + try: + + # concatenate all the dataframes - much faster than doing an append each time + # added sort=False on 10/21/19 because that will be new default behavior + if list_frames: + all_stat = pd.concat( + list_frames, ignore_index=True, sort=False) + list_frames = [] + + all_stat.fcst_thresh = all_stat.fcst_thresh.astype(str) + all_stat.obs_thresh = all_stat.obs_thresh.astype(str) + if not all_stat['1'].dtypes == 'float': + all_stat.loc[all_stat['1'] == + CN.NOTAV, '1'] = CN.MV_NOTAV + all_stat['1'] = all_stat['1'].astype(float) + + # if a fcst percentage thresh is used, it is in parens in fcst_thresh + if all_stat.fcst_thresh.str.contains(CN.L_PAREN, regex=False).any(): + # save the value in parens all_stat.loc[all_stat.fcst_thresh.str.contains(CN.L_PAREN, regex=False) & all_stat.fcst_thresh.str.contains( - CN.R_PAREN, regex=False), - CN.FCST_THRESH].str.split(CN.L_PAREN).str[1]. \ - str.split(CN.R_PAREN).str[0].astype(float) - # remove the percentage from fcst_thresh - all_stat.loc[all_stat.fcst_thresh.str.contains(CN.L_PAREN, regex=False) & - all_stat.fcst_thresh.str.contains( - CN.R_PAREN, regex=False), - CN.FCST_THRESH] = \ + CN.R_PAREN, regex=False), + CN.FCST_PERC] = \ + all_stat.loc[all_stat.fcst_thresh.str.contains(CN.L_PAREN, regex=False) & + all_stat.fcst_thresh.str.contains( + CN.R_PAREN, regex=False), + CN.FCST_THRESH].str.split(CN.L_PAREN).str[1]. 
\ + str.split(CN.R_PAREN).str[0].astype(float) + # remove the percentage from fcst_thresh all_stat.loc[all_stat.fcst_thresh.str.contains(CN.L_PAREN, regex=False) & all_stat.fcst_thresh.str.contains( - CN.R_PAREN, regex=False), - CN.FCST_THRESH].str.split(CN.L_PAREN).str[0] - - # if an obs percentage thresh is used, it is in parens in obs_thresh - if all_stat.obs_thresh.str.contains(CN.L_PAREN, regex=False).any(): - # save the value in parens - all_stat.loc[all_stat.obs_thresh.str.contains(CN.L_PAREN, regex=False) & - all_stat.obs_thresh.str.contains( - CN.R_PAREN, regex=False), - CN.OBS_PERC] = \ + CN.R_PAREN, regex=False), + CN.FCST_THRESH] = \ + all_stat.loc[all_stat.fcst_thresh.str.contains(CN.L_PAREN, regex=False) & + all_stat.fcst_thresh.str.contains( + CN.R_PAREN, regex=False), + CN.FCST_THRESH].str.split(CN.L_PAREN).str[0] + + # if an obs percentage thresh is used, it is in parens in obs_thresh + if all_stat.obs_thresh.str.contains(CN.L_PAREN, regex=False).any(): + # save the value in parens all_stat.loc[all_stat.obs_thresh.str.contains(CN.L_PAREN, regex=False) & all_stat.obs_thresh.str.contains( - CN.R_PAREN, regex=False), - CN.OBS_THRESH].str.split(CN.L_PAREN).str[1]. \ - str.split(CN.R_PAREN).str[0].astype(float) - all_stat.loc[all_stat.obs_thresh.str.contains(CN.L_PAREN, regex=False) & - all_stat.obs_thresh.str.contains( - CN.R_PAREN, regex=False), - CN.OBS_THRESH] = \ + CN.R_PAREN, regex=False), + CN.OBS_PERC] = \ + all_stat.loc[all_stat.obs_thresh.str.contains(CN.L_PAREN, regex=False) & + all_stat.obs_thresh.str.contains( + CN.R_PAREN, regex=False), + CN.OBS_THRESH].str.split(CN.L_PAREN).str[1]. \ + str.split(CN.R_PAREN).str[0].astype(float) all_stat.loc[all_stat.obs_thresh.str.contains(CN.L_PAREN, regex=False) & all_stat.obs_thresh.str.contains( - CN.R_PAREN, regex=False), - CN.OBS_THRESH].str.split(CN.L_PAREN).str[0] - - # These warnings and transforms only apply to stat files - # Give a warning message with data if value of alpha for an alpha line type is NA - # Do not check CNT and PSTD, even though they are alpha line types - alpha_lines = all_stat[(all_stat.line_type.isin(CN.ALPHA_LINE_TYPES[:-2])) & - (all_stat.alpha == CN.NOTAV)].line_type - if not alpha_lines.empty: - self.logger.warning(f"!!! ALPHA line_type has ALPHA value of NA:\r\n {str(alpha_lines)}") - - # give a warning message with data if non-alpha line type has float value - non_alpha_lines = all_stat[(~all_stat.line_type.isin(CN.ALPHA_LINE_TYPES)) & - (all_stat.alpha != CN.NOTAV)].line_type - if not non_alpha_lines.empty: - self.logger.warning(f"!!! 
non-ALPHA line_type has ALPHA float value:\r\n {str(non_alpha_lines)}") - - # Change ALL items in column ALPHA to '-9999' if they are 'NA' - all_stat.loc[all_stat.alpha == - CN.NOTAV, CN.ALPHA] = CN.MV_NOTAV - - # Make ALPHA column into a decimal with no trailing zeroes after the decimal - all_stat.alpha = all_stat.alpha.astype( - float).map('{0:g}'.format) - - # Change ALL items in column COV_THRESH to '-9999' if they are 'NA' - all_stat.loc[all_stat.cov_thresh == - CN.NOTAV, CN.COV_THRESH] = CN.MV_NOTAV - - # Change ALL items in column FCST_LEAD to 0 if they are 'NA' - # Added for tc_gen files - if not all_stat.fcst_lead.dtypes == 'int': - all_stat.loc[all_stat.fcst_lead == - CN.NOTAV, CN.FCST_LEAD] = 0 - all_stat[CN.FCST_LEAD] = all_stat[CN.FCST_LEAD].astype(int) - - # Change ALL items in column OBS_LEAD to 0 if they are 'NA' - if not all_stat.obs_lead.dtypes == 'int': - all_stat.loc[all_stat.obs_lead == - CN.NOTAV, CN.OBS_LEAD] = 0 - all_stat[CN.OBS_LEAD] = all_stat[CN.OBS_LEAD].astype(int) - - # Change 'NA' values in column INTERP_PNTS to 0 if present - if not all_stat.interp_pnts.dtypes == 'int': - all_stat.loc[all_stat.interp_pnts == - CN.NOTAV, CN.INTERP_PNTS] = 0 - all_stat.loc[all_stat.interp_pnts.isnull(), - CN.INTERP_PNTS] = 0 - all_stat.interp_pnts = all_stat.interp_pnts.astype(int) - - # PCT lines in stat files are short one row, subtract 1 from n_thresh - if all_stat[CN.LINE_TYPE].eq(CN.PCT).any(): - all_stat.loc[all_stat.line_type == CN.PCT, '1'] = \ - all_stat.loc[all_stat.line_type == CN.PCT, '1'] - 1 - - # RPS lines in stat files may be missing rps_comp - # if rps_comp IS null and rps is NOT null, - # set rps_comp to 1 minus rps - if all_stat[CN.LINE_TYPE].eq(CN.RPS).any(): - all_stat.loc[(all_stat.line_type == CN.RPS) & - (all_stat['8'].isnull()) & - (~all_stat['5'].isnull()), '8'] = \ - 1 - all_stat.loc[(all_stat.line_type == CN.RPS) & - (all_stat['8'].isnull()) & - (~all_stat['5'].isnull()), '5'].astype(float) - - # Some lines in stat files may be missing ec_value - # CTC and CTS, set to .5 - # MCTS, set to 1/n_cat. 
MCTC is variable length - if all_stat[CN.LINE_TYPE].eq(CN.CTC).any(): - all_stat.loc[(all_stat.line_type == CN.CTC) & - ((all_stat['5'].isnull()) | - (all_stat['5'] == CN.NOTAV)), '5'] = .5 - - if all_stat[CN.LINE_TYPE].eq(CN.CTS).any(): - all_stat.loc[(all_stat.line_type == CN.CTS) & - ((all_stat['96'].isnull()) | - (all_stat['96'] == CN.NOTAV)), '96'] = .5 - - if all_stat[CN.LINE_TYPE].eq(CN.MCTS).any(): - all_stat.loc[(all_stat.line_type == CN.MCTS) & - ((all_stat['19'].isnull()) | - (all_stat['19'] == CN.NOTAV)), '19'] = \ - 1/all_stat.loc[(all_stat.line_type == CN.MCTS) & - ((all_stat['19'].isnull()) | - (all_stat['19'] == CN.NOTAV)), '1'] - - except (RuntimeError, TypeError, NameError, KeyError): - self.logger.error(f"*** {sys.exc_info()[0]} in read_data if list_frames *** ") - - try: - - # concatenate all the dataframes - much faster than doing an append each time - if list_tcst: - all_tcst = pd.concat(list_tcst, ignore_index=True, sort=False) - list_tcst = [] - - except (RuntimeError, TypeError, NameError, KeyError): - self.logger.error(f"*** {sys.exc_info()[0]} in read_data if list_tcst ***") - - try: - - # collect vsdb files separately so additional transforms can be done - if list_vsdb: - all_vsdb = pd.concat(list_vsdb, ignore_index=True, sort=False) - list_vsdb = [] - - # Make VSDB files look like stat files - # get thresh starting with > in line_type - # FHO and FSS in the 6 column have thresh - all_vsdb.insert(9, CN.FCST_THRESH, CN.NOTAV) - - if all_vsdb.line_type.str.startswith('F').any(): - all_vsdb.loc[all_vsdb.line_type.str.startswith('F'), - CN.FCST_THRESH] = \ + CN.R_PAREN, regex=False), + CN.OBS_THRESH] = \ + all_stat.loc[all_stat.obs_thresh.str.contains(CN.L_PAREN, regex=False) & + all_stat.obs_thresh.str.contains( + CN.R_PAREN, regex=False), + CN.OBS_THRESH].str.split(CN.L_PAREN).str[0] + + # These warnings and transforms only apply to stat files + # Give a warning message with data if value of alpha for an alpha line type is NA + # Do not check CNT and PSTD, even though they are alpha line types + alpha_lines = all_stat[(all_stat.line_type.isin(CN.ALPHA_LINE_TYPES[:-2])) & + (all_stat.alpha == CN.NOTAV)].line_type + if not alpha_lines.empty: + self.logger.warning( + f"!!! ALPHA line_type has ALPHA value of NA:\r\n {str(alpha_lines)}") + + # give a warning message with data if non-alpha line type has float value + non_alpha_lines = all_stat[(~all_stat.line_type.isin(CN.ALPHA_LINE_TYPES)) & + (all_stat.alpha != CN.NOTAV)].line_type + if not non_alpha_lines.empty: + self.logger.warning( + f"!!! 
non-ALPHA line_type has ALPHA float value:\r\n {str(non_alpha_lines)}") + + # Change ALL items in column ALPHA to '-9999' if they are 'NA' + all_stat.loc[all_stat.alpha == + CN.NOTAV, CN.ALPHA] = CN.MV_NOTAV + + # Make ALPHA column into a decimal with no trailing zeroes after the decimal + all_stat.alpha = all_stat.alpha.astype( + float).map('{0:g}'.format) + + # Change ALL items in column COV_THRESH to '-9999' if they are 'NA' + all_stat.loc[all_stat.cov_thresh == + CN.NOTAV, CN.COV_THRESH] = CN.MV_NOTAV + + # Change ALL items in column FCST_LEAD to 0 if they are 'NA' + # Added for tc_gen files + if not all_stat.fcst_lead.dtypes == 'int': + all_stat.loc[all_stat.fcst_lead == + CN.NOTAV, CN.FCST_LEAD] = 0 + all_stat[CN.FCST_LEAD] = all_stat[CN.FCST_LEAD].astype( + int) + + # Change ALL items in column OBS_LEAD to 0 if they are 'NA' + if not all_stat.obs_lead.dtypes == 'int': + all_stat.loc[all_stat.obs_lead == + CN.NOTAV, CN.OBS_LEAD] = 0 + all_stat[CN.OBS_LEAD] = all_stat[CN.OBS_LEAD].astype( + int) + + # Change 'NA' values in column INTERP_PNTS to 0 if present + if not all_stat.interp_pnts.dtypes == 'int': + all_stat.loc[all_stat.interp_pnts == + CN.NOTAV, CN.INTERP_PNTS] = 0 + all_stat.loc[all_stat.interp_pnts.isnull(), + CN.INTERP_PNTS] = 0 + all_stat.interp_pnts = all_stat.interp_pnts.astype(int) + + # PCT lines in stat files are short one row, subtract 1 from n_thresh + if all_stat[CN.LINE_TYPE].eq(CN.PCT).any(): + all_stat.loc[all_stat.line_type == CN.PCT, '1'] = \ + all_stat.loc[all_stat.line_type == CN.PCT, '1'] - 1 + + # RPS lines in stat files may be missing rps_comp + # if rps_comp IS null and rps is NOT null, + # set rps_comp to 1 minus rps + if all_stat[CN.LINE_TYPE].eq(CN.RPS).any(): + all_stat.loc[(all_stat.line_type == CN.RPS) & + (all_stat['8'].isnull()) & + (~all_stat['5'].isnull()), '8'] = \ + 1 - all_stat.loc[(all_stat.line_type == CN.RPS) & + (all_stat['8'].isnull()) & + (~all_stat['5'].isnull()), '5'].astype(float) + + # Some lines in stat files may be missing ec_value + # CTC and CTS, set to .5 + # MCTS, set to 1/n_cat. 
MCTC is variable length + if all_stat[CN.LINE_TYPE].eq(CN.CTC).any(): + all_stat.loc[(all_stat.line_type == CN.CTC) & + ((all_stat['5'].isnull()) | + (all_stat['5'] == CN.NOTAV)), '5'] = .5 + + if all_stat[CN.LINE_TYPE].eq(CN.CTS).any(): + all_stat.loc[(all_stat.line_type == CN.CTS) & + ((all_stat['96'].isnull()) | + (all_stat['96'] == CN.NOTAV)), '96'] = .5 + + if all_stat[CN.LINE_TYPE].eq(CN.MCTS).any(): + all_stat.loc[(all_stat.line_type == CN.MCTS) & + ((all_stat['19'].isnull()) | + (all_stat['19'] == CN.NOTAV)), '19'] = \ + 1/all_stat.loc[(all_stat.line_type == CN.MCTS) & + ((all_stat['19'].isnull()) | + (all_stat['19'] == CN.NOTAV)), '1'] + + except (RuntimeError, TypeError, NameError, KeyError): + self.logger.error( + f"*** Error concatenating data frames prior to load: {sys.exc_info()[0]}*** ") + sys.exit("*** Error concatenating data frames prior to load") + + try: + + # concatenate all the dataframes - much faster than doing an append each time + if list_tcst: + all_tcst = pd.concat( + list_tcst, ignore_index=True, sort=False) + list_tcst = [] + + except (RuntimeError, TypeError, NameError, KeyError): + self.logger.error( + f"*** Error concatenating tcst data frames prior to load: {sys.exc_info()[0]}*** ") + sys.exit("*** Error concatenating tcst data frames prior to load") + + try: + + # collect vsdb files separately so additional transforms can be done + if list_vsdb: + all_vsdb = pd.concat( + list_vsdb, ignore_index=True, sort=False) + list_vsdb = [] + + # Make VSDB files look like stat files + # get thresh starting with > in line_type + # FHO and FSS in the 6 column have thresh + all_vsdb.insert(9, CN.FCST_THRESH, CN.NOTAV) + + if all_vsdb.line_type.str.startswith('F').any(): all_vsdb.loc[all_vsdb.line_type.str.startswith('F'), - CN.LINE_TYPE].str[3:] - # remove the thresh value from the line type - all_vsdb.loc[all_vsdb.line_type.str.startswith('F'), - CN.LINE_TYPE] = \ + CN.FCST_THRESH] = \ + all_vsdb.loc[all_vsdb.line_type.str.startswith('F'), + CN.LINE_TYPE].str[3:] + # remove the thresh value from the line type all_vsdb.loc[all_vsdb.line_type.str.startswith('F'), - CN.LINE_TYPE].str[0:3] - - # handle model names that contain a forward slash followed by a number - if all_vsdb.model.str.contains(CN.FWD_SLASH).any(): - all_vsdb[CN.N_VAR] = 0 - # save the value after the slash in model - all_vsdb.loc[all_vsdb.model.str.contains(CN.FWD_SLASH), - CN.N_VAR] = \ + CN.LINE_TYPE] = \ + all_vsdb.loc[all_vsdb.line_type.str.startswith('F'), + CN.LINE_TYPE].str[0:3] + + # handle model names that contain a forward slash followed by a number + if all_vsdb.model.str.contains(CN.FWD_SLASH).any(): + all_vsdb[CN.N_VAR] = 0 + # save the value after the slash in model all_vsdb.loc[all_vsdb.model.str.contains(CN.FWD_SLASH), - CN.MODEL].str.split(CN.FWD_SLASH).str[1].astype(int) - - # remove the slash and value from model - if all_vsdb.line_type.isin(CN.ENS_VSDB_LINE_TYPES).any(): - all_vsdb.loc[all_vsdb.model.str.contains(CN.FWD_SLASH) & - all_vsdb.line_type.isin( - CN.ENS_VSDB_LINE_TYPES), - CN.MODEL] = \ - all_vsdb.loc[all_vsdb.model.str.contains(CN.FWD_SLASH), - CN.MODEL].str.split(CN.FWD_SLASH).str[0] - - # for RELI/PCT, get number after slash in model, add one, - # prefix with string and put in thresh - if CN.RELI in all_vsdb.line_type.values: - all_vsdb.loc[all_vsdb.line_type == - CN.RELI, CN.N_VAR] = \ + all_vsdb.loc[all_vsdb.model.str.contains(CN.FWD_SLASH), + CN.MODEL].str.split(CN.FWD_SLASH).str[1].astype(int) + + # remove the slash and value from model + if 
all_vsdb.line_type.isin(CN.ENS_VSDB_LINE_TYPES).any(): + all_vsdb.loc[all_vsdb.model.str.contains(CN.FWD_SLASH) & + all_vsdb.line_type.isin( + CN.ENS_VSDB_LINE_TYPES), + CN.MODEL] = \ + all_vsdb.loc[all_vsdb.model.str.contains(CN.FWD_SLASH), + CN.MODEL].str.split(CN.FWD_SLASH).str[0] + + # for RELI/PCT, get number after slash in model, add one, + # prefix with string and put in thresh + if CN.RELI in all_vsdb.line_type.values: all_vsdb.loc[all_vsdb.line_type == CN.RELI, - CN.N_VAR] + 1 - # RELI/PCT also uses this number in the threshold - all_vsdb.loc[all_vsdb.line_type == - CN.RELI, - CN.FCST_THRESH] = \ - '==1/' + \ + CN.N_VAR] = \ + all_vsdb.loc[all_vsdb.line_type == + CN.RELI, + CN.N_VAR] + 1 + # RELI/PCT also uses this number in the threshold all_vsdb.loc[all_vsdb.line_type == CN.RELI, - CN.N_VAR].astype(str) - - # HIST/RHIST also adds one - if CN.HIST in all_vsdb.line_type.values: - all_vsdb.loc[all_vsdb.line_type == - CN.HIST, - CN.N_VAR] = \ + CN.FCST_THRESH] = \ + '==1/' + \ + all_vsdb.loc[all_vsdb.line_type == + CN.RELI, + CN.N_VAR].astype(str) + + # HIST/RHIST also adds one + if CN.HIST in all_vsdb.line_type.values: all_vsdb.loc[all_vsdb.line_type == CN.HIST, - CN.N_VAR] + 1 - - # ECON/ECLV use a default of 18 - if CN.ECON in all_vsdb.line_type.values: - all_vsdb.loc[all_vsdb.line_type == - CN.ECON, - CN.N_VAR] = 18 - - # change from VSDB line types to STAT line types - all_vsdb.line_type = \ - all_vsdb.line_type.replace(to_replace=CN.OLD_VSDB_LINE_TYPES, - value=CN.VSDB_TO_STAT_TYPES) - - # add columns to make these VSDB files look more like Met stat files - - # add description - all_vsdb.insert(2, CN.DESCR, CN.NOTAV) - # reformat fcst_valid_beg - all_vsdb.fcst_valid_beg = pd.to_datetime(all_vsdb.fcst_valid_beg, - format='%Y%m%d%H') - # fcst_valid_end is the same as fcst_valid_beg - all_vsdb[CN.FCST_VALID_END] = all_vsdb.fcst_valid_beg - # fcst_lead must be numeric for later calculations - all_vsdb.fcst_lead = all_vsdb.fcst_lead.astype(int) - all_vsdb.insert(11, CN.OBS_LEAD, 0) - # copy obs values from fcst values - all_vsdb[CN.OBS_VALID_BEG] = all_vsdb.fcst_valid_beg - all_vsdb[CN.OBS_VALID_END] = all_vsdb.fcst_valid_beg - all_vsdb[CN.OBS_VAR] = all_vsdb.fcst_var - all_vsdb[CN.OBS_LEV] = all_vsdb.fcst_lev - all_vsdb[CN.OBS_THRESH] = all_vsdb.fcst_thresh - # add units - all_vsdb.insert(12, CN.FCST_UNITS, CN.NOTAV) - all_vsdb.insert(13, CN.OBS_UNITS, CN.NOTAV) - # add interp method and interp points with default values - all_vsdb.insert(14, CN.INTERP_MTHD, CN.NOTAV) - all_vsdb.insert(15, CN.INTERP_PNTS, 0) - # add alpha and cov_thresh - all_vsdb.insert(16, CN.ALPHA, CN.MV_NOTAV) - all_vsdb.insert(17, CN.COV_THRESH, CN.MV_NOTAV) - # add total column with default of zero - all_vsdb.insert(18, CN.TOTAL_LC, "0") - - all_vsdb[CN.COL_NA] = CN.MV_NOTAV - all_vsdb[CN.COL_ZERO] = "0" - - # find all the line types in the data - vsdb_types = all_vsdb.line_type.unique() - - for vsdb_type in vsdb_types: - # get the line data of just this VSDB type and re-index - vsdb_data = all_vsdb[all_vsdb[CN.LINE_TYPE] - == vsdb_type].copy() - vsdb_data.reset_index(drop=True, inplace=True) - - if vsdb_type in (CN.SL1L2, CN.SAL1L2): - # some SL1L2 files do not have MAE - if '6' not in vsdb_data: - vsdb_data.insert(25, '6', CN.MV_NOTAV) - one_file = vsdb_data[CN.LONG_HEADER + CN.COL_NUMS[:7] + - CN.COL_NAS[:94] + [CN.LINE_NUM, CN.FILE_ROW]] - - elif vsdb_type in (CN.VL1L2, CN.GRAD): - # some VL1L2 files do not have f_speed_bar and o_speed_bar - if '8' not in vsdb_data: - vsdb_data.insert(25, '8', 
CN.MV_NOTAV) - if '9' not in vsdb_data: - vsdb_data.insert(25, '9', CN.MV_NOTAV) - one_file = vsdb_data[CN.LONG_HEADER + CN.COL_NUMS[:10] + - CN.COL_NAS[:91] + [CN.LINE_NUM, CN.FILE_ROW]] - - elif vsdb_type == CN.VAL1L2: - one_file = vsdb_data[CN.LONG_HEADER + CN.COL_NUMS[:8] + - CN.COL_NAS[:93] + [CN.LINE_NUM, CN.FILE_ROW]] - - elif vsdb_type == CN.RHIST: - # rhist ranks need to be multiplied by 100. First convert to float. - vsdb_data[CN.COL_NUMS[0:vsdb_data[CN.N_VAR][1]]] = \ - vsdb_data[CN.COL_NUMS[0:vsdb_data[CN.N_VAR][1]]].astype( + CN.N_VAR] = \ + all_vsdb.loc[all_vsdb.line_type == + CN.HIST, + CN.N_VAR] + 1 + + # ECON/ECLV use a default of 18 + if CN.ECON in all_vsdb.line_type.values: + all_vsdb.loc[all_vsdb.line_type == + CN.ECON, + CN.N_VAR] = 18 + + # change from VSDB line types to STAT line types + all_vsdb.line_type = \ + all_vsdb.line_type.replace(to_replace=CN.OLD_VSDB_LINE_TYPES, + value=CN.VSDB_TO_STAT_TYPES) + + # add columns to make these VSDB files look more like Met stat files + + # add description + all_vsdb.insert(2, CN.DESCR, CN.NOTAV) + # reformat fcst_valid_beg + all_vsdb.fcst_valid_beg = pd.to_datetime(all_vsdb.fcst_valid_beg, + format='%Y%m%d%H') + # fcst_valid_end is the same as fcst_valid_beg + all_vsdb[CN.FCST_VALID_END] = all_vsdb.fcst_valid_beg + # fcst_lead must be numeric for later calculations + all_vsdb.fcst_lead = all_vsdb.fcst_lead.astype(int) + all_vsdb.insert(11, CN.OBS_LEAD, 0) + # copy obs values from fcst values + all_vsdb[CN.OBS_VALID_BEG] = all_vsdb.fcst_valid_beg + all_vsdb[CN.OBS_VALID_END] = all_vsdb.fcst_valid_beg + all_vsdb[CN.OBS_VAR] = all_vsdb.fcst_var + all_vsdb[CN.OBS_LEV] = all_vsdb.fcst_lev + all_vsdb[CN.OBS_THRESH] = all_vsdb.fcst_thresh + # add units + all_vsdb.insert(12, CN.FCST_UNITS, CN.NOTAV) + all_vsdb.insert(13, CN.OBS_UNITS, CN.NOTAV) + # add interp method and interp points with default values + all_vsdb.insert(14, CN.INTERP_MTHD, CN.NOTAV) + all_vsdb.insert(15, CN.INTERP_PNTS, 0) + # add alpha and cov_thresh + all_vsdb.insert(16, CN.ALPHA, CN.MV_NOTAV) + all_vsdb.insert(17, CN.COV_THRESH, CN.MV_NOTAV) + # add total column with default of zero + all_vsdb.insert(18, CN.TOTAL_LC, "0") + + all_vsdb[CN.COL_NA] = CN.MV_NOTAV + all_vsdb[CN.COL_ZERO] = "0" + + # find all the line types in the data + vsdb_types = all_vsdb.line_type.unique() + + for vsdb_type in vsdb_types: + # get the line data of just this VSDB type and re-index + vsdb_data = all_vsdb[all_vsdb[CN.LINE_TYPE] + == vsdb_type].copy() + vsdb_data.reset_index(drop=True, inplace=True) + + if vsdb_type in (CN.SL1L2, CN.SAL1L2): + # some SL1L2 files do not have MAE + if '6' not in vsdb_data: + vsdb_data.insert(25, '6', CN.MV_NOTAV) + one_file = vsdb_data[CN.LONG_HEADER + CN.COL_NUMS[:7] + + CN.COL_NAS[:94] + [CN.LINE_NUM, CN.FILE_ROW]] + + elif vsdb_type in (CN.VL1L2, CN.GRAD): + # some VL1L2 files do not have f_speed_bar and o_speed_bar + if '8' not in vsdb_data: + vsdb_data.insert(25, '8', CN.MV_NOTAV) + if '9' not in vsdb_data: + vsdb_data.insert(25, '9', CN.MV_NOTAV) + one_file = vsdb_data[CN.LONG_HEADER + CN.COL_NUMS[:10] + + CN.COL_NAS[:91] + [CN.LINE_NUM, CN.FILE_ROW]] + + elif vsdb_type == CN.VAL1L2: + one_file = vsdb_data[CN.LONG_HEADER + CN.COL_NUMS[:8] + + CN.COL_NAS[:93] + [CN.LINE_NUM, CN.FILE_ROW]] + + elif vsdb_type == CN.RHIST: + # rhist ranks need to be multiplied by 100. First convert to float. 
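The FHO/FSS handling a few lines above pulls the threshold out of the VSDB line-type string and keeps only the bare three-character line type. A minimal sketch of that split, with made-up rows rather than real VSDB output:

import pandas as pd

vsdb = pd.DataFrame({"line_type": ["FHO>0.5", "FSS>10.0", "SL1L2"]})
has_thresh = vsdb["line_type"].str.startswith("F")

# the threshold is everything after the 3-character line type, e.g. ">0.5"
vsdb.loc[has_thresh, "fcst_thresh"] = vsdb.loc[has_thresh, "line_type"].str[3:]
# keep only the 3-character line type itself, e.g. "FHO"
vsdb.loc[has_thresh, "line_type"] = vsdb.loc[has_thresh, "line_type"].str[0:3]
print(vsdb)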
+ vsdb_data[CN.COL_NUMS[0:vsdb_data[CN.N_VAR][1]]] = \ + vsdb_data[CN.COL_NUMS[0:vsdb_data[CN.N_VAR][1]]].astype( + float) + vsdb_data[CN.COL_NUMS[0:vsdb_data[CN.N_VAR][1]]] *= 100 + one_file = vsdb_data[CN.LONG_HEADER + [CN.TOTAL_LC, CN.N_VAR] + + CN.COL_NUMS[0:vsdb_data[CN.N_VAR][1]] + + CN.COL_NAS[:(99 - vsdb_data[CN.N_VAR][1])] + + [CN.LINE_NUM, CN.FILE_ROW]] + + elif vsdb_type == CN.PCT: + # the total needs to be float + vsdb_data[CN.TOTAL_LC] = vsdb_data[CN.TOTAL_LC].astype( float) - vsdb_data[CN.COL_NUMS[0:vsdb_data[CN.N_VAR][1]]] *= 100 - one_file = vsdb_data[CN.LONG_HEADER + [CN.TOTAL_LC, CN.N_VAR] + - CN.COL_NUMS[0:vsdb_data[CN.N_VAR][1]] + - CN.COL_NAS[:(99 - vsdb_data[CN.N_VAR][1])] + - [CN.LINE_NUM, CN.FILE_ROW]] - - elif vsdb_type == CN.PCT: - # the total needs to be float - vsdb_data[CN.TOTAL_LC] = vsdb_data[CN.TOTAL_LC].astype( - float) - # the first set of n_var columns are oy_i, the second set are subtotals - # add n_var new columns after the first two sets, for calculated thresh_i - zero_col = vsdb_data.columns.get_loc('0') - mid_col = vsdb_data.columns.get_loc(CN.N_VAR) - vsdb_data = \ - vsdb_data.reindex(columns=[*vsdb_data.columns.tolist()[0:mid_col], - *CN.COL_NUMS[mid_col - zero_col:-2], - *vsdb_data.columns.tolist()[mid_col:]], - fill_value=0) - # all 3 sets of columns need to be float - vsdb_data[CN.COL_NUMS[0:-2]] = \ - vsdb_data[CN.COL_NUMS[0:-2]].astype(float) - # the total in line_data_pct is the total of all of the subtotals - # calculated per row as there may be rows with different values of n_var - col_total = vsdb_data.columns.get_loc(CN.TOTAL_LC) - # calculate thresh and re-order values to be - # in sets of thresh_i, oy_i, and on_i (which is subtotal - oy_i) - for index, row in vsdb_data.iterrows(): - var_values = [] - n_var = row[CN.N_VAR] - col_start = zero_col + n_var - col_end = col_start + n_var - vsdb_data.iloc[index, - col_total] = row[col_start:col_end].sum() - for i in range(n_var): - var_values = var_values + [i/(n_var - 1)] - var_values = var_values + [row[str(i)]] - var_values = var_values + \ - [row[str(i+n_var)] - row[str(i)]] - df_values = pd.DataFrame([var_values]) - # put calculated and re-ordered values back into vsdb_data - vsdb_data.iloc[index, zero_col:zero_col + (n_var * 3)] = \ - df_values.iloc[0, 0:n_var * 3].values - one_file = vsdb_data[CN.LONG_HEADER + [CN.TOTAL_LC, CN.N_VAR] + - CN.COL_NUMS[0:-2] + - [CN.LINE_NUM, CN.FILE_ROW]] - - elif vsdb_type == CN.RELP: - one_file = vsdb_data[CN.LONG_HEADER + [CN.TOTAL_LC, CN.N_VAR] + - CN.COL_NUMS[0:vsdb_data[CN.N_VAR][1]] + - CN.COL_NAS[:(99 - vsdb_data[CN.N_VAR][1])] + - [CN.LINE_NUM, CN.FILE_ROW]] - - elif vsdb_type == CN.ECLV: - n_var = vsdb_data.loc[0, CN.N_VAR] - # first data column - zero_col = vsdb_data.columns.get_loc('0') - # new last data column after data is doubled - last_col = zero_col + (n_var * 2) - 1 - # current last data column - mid_col = zero_col + n_var - 1 - # counter for constants that will be added to double the data - last_point = n_var - 1 - n_var_col = vsdb_data.columns.get_loc(CN.N_VAR) - top_col = int(vsdb_data.columns[n_var_col - 1]) - # add extra columns to vsdb_data if needed for doubling of columns - if top_col < last_col: + # the first set of n_var columns are oy_i, the second set are subtotals + # add n_var new columns after the first two sets, for calculated thresh_i + zero_col = vsdb_data.columns.get_loc('0') + mid_col = vsdb_data.columns.get_loc(CN.N_VAR) vsdb_data = \ - vsdb_data.reindex(columns=[*vsdb_data.columns.tolist()[0:n_var_col], - 
*CN.COL_NUMS[top_col + 1: last_col + 1], - *vsdb_data.columns.tolist()[n_var_col:]], - fill_value=CN.MV_NOTAV) - for i in range(n_var): - vsdb_data[vsdb_data.columns[last_col] - ] = vsdb_data.iloc[:, mid_col] - vsdb_data[vsdb_data.columns[last_col - 1] - ] = CN.X_POINTS_ECON[last_point] - last_col = last_col - 2 - mid_col = mid_col - 1 - last_point = last_point - 1 - one_file = vsdb_data[CN.LONG_HEADER + [CN.TOTAL_LC] + - CN.COL_NAS[:2] + [CN.N_VAR] + - CN.COL_NUMS[0:36] + - CN.COL_NAS[:61] + [CN.LINE_NUM, CN.FILE_ROW]] - - elif vsdb_type == CN.PSTD: - one_file = vsdb_data[CN.LONG_HEADER + [CN.TOTAL_LC] + - [CN.COL_ZERO] + CN.COL_NAS[:3] + - ['3', '4', '5'] + CN.COL_NAS[:1] + - ['0'] + CN.COL_NAS[:2] + - ['1'] + CN.COL_NAS[:2] + - ['2'] + CN.COL_NAS[:1] + - CN.COL_NAS[:84] + [CN.LINE_NUM, CN.FILE_ROW]] - - elif vsdb_type == CN.CNT: - one_file = vsdb_data[CN.LONG_HEADER + [CN.TOTAL_LC] + - CN.COL_NAS[:27] + - [CN.COL_ZERO, CN.COL_ZERO, CN.COL_ZERO] + - ['2'] + CN.COL_NAS[:4] + - ['0'] + CN.COL_NAS[:7] + - ['3'] + CN.COL_NAS[:8] + - ['1'] + CN.COL_NAS[:23] + - ['4'] + CN.COL_NAS[:23] + [CN.LINE_NUM, CN.FILE_ROW]] - - elif vsdb_type == CN.ENSCNT: - one_file = vsdb_data[CN.LONG_HEADER + ['0'] + - CN.COL_NAS[:4] + - ['1'] + CN.COL_NAS[:4] + - ['2'] + CN.COL_NAS[:4] + - ['3'] + CN.COL_NAS[:4] + - ['4'] + CN.COL_NAS[:4] + - ['5'] + CN.COL_NAS[:75] + [CN.LINE_NUM, CN.FILE_ROW]] - - elif vsdb_type == CN.CTC: - # column 0 is Total, 1 is F, 2 is H - # column 3 is O (Oh) - if None, set to 0 (Zero) - vsdb_data.loc[vsdb_data['3'].isnull(), '3'] = 0 - # fy = Total * F - vsdb_data['4'] = vsdb_data['0'].astype( - float) * vsdb_data['1'].astype(float) - # oy = Total * O - vsdb_data['5'] = vsdb_data['0'].astype( - float) * vsdb_data['3'].astype(float) - # fy_oy = Total * H - vsdb_data['6'] = vsdb_data['0'].astype( - float) * vsdb_data['2'].astype(float) - # fy_on = fy - fy_oy - vsdb_data['7'] = vsdb_data['4'].astype( - float) - vsdb_data['6'].astype(float) - # fn_oy = oy - fy_oy - vsdb_data['8'] = vsdb_data['5'].astype( - float) - vsdb_data['6'].astype(float) - # fn_on = Total - fy - oy + fy_oy - vsdb_data['9'] = (vsdb_data['0'].astype(float) - - vsdb_data['4'].astype(float) - - vsdb_data['5'].astype(float) + - vsdb_data['6'].astype(float)) - # Needs a default ec_value of 0.5 - vsdb_data['10'] = 0.5 - one_file = vsdb_data[CN.LONG_HEADER + - ['0'] + ['6', '7', '8', '9', '10'] + - CN.COL_NAS[:95] + [CN.LINE_NUM, CN.FILE_ROW]] - - elif vsdb_type == CN.NBRCNT: - # fss is calculated from the other columns - vsdb_data['4'] = (1 - vsdb_data['1'].astype(float) / - vsdb_data['2'].astype(float) + - vsdb_data['3'].astype(float)) - one_file = vsdb_data[CN.LONG_HEADER + - ['0', '1'] + CN.COL_NAS[:2] + ['4'] + - CN.COL_NAS[:96] + [CN.LINE_NUM, CN.FILE_ROW]] - - # rename columns - if not one_file.empty: - one_file.columns = CN.LONG_HEADER + \ - CN.COL_NUMS[:101] + [CN.LINE_NUM, CN.FILE_ROW] - list_vsdb.append(one_file) - one_file = one_file.iloc[0:0] - vsdb_data = vsdb_data.iloc[0:0] - - # end for vsdb_type - - # Clear out all_vsdb, which we copied from above, line_type by line_type - all_vsdb = all_vsdb.iloc[0:0] - # combine stat and vsdb - all_vsdb = pd.concat(list_vsdb, ignore_index=True, sort=False) - all_stat = pd.concat([all_stat, all_vsdb], - ignore_index=True, sort=False) - all_vsdb = all_vsdb.iloc[0:0] - - except (RuntimeError, TypeError, NameError, KeyError): - self.logger.error(f"*** {sys.exc_info()[0]} in read_data if list_vsdb ***") - - try: - if list_cts: - all_cts = pd.concat(list_cts, ignore_index=True, 
sort=False) - list_cts = [] - - if not all_cts.fcst_lead.dtypes == 'int': - all_cts.loc[all_cts.fcst_lead == - CN.NOTAV, CN.FCST_LEAD] = 0 - all_cts[CN.FCST_LEAD] = all_cts[CN.FCST_LEAD].astype(int) - - # Copy forecast lead times, without trailing 0000 if they have them - all_cts[CN.FCST_LEAD_HR] = \ - np.where(all_cts[CN.FCST_LEAD] < 25, - all_cts[CN.FCST_LEAD] * 10000, - all_cts[CN.FCST_LEAD]) - - all_cts[CN.FCST_LEAD_HR] = (((all_cts[CN.FCST_LEAD_HR] // 10000) * 3600) + - ((all_cts[CN.FCST_LEAD_HR] // 100 % 100) * 60) + - (all_cts[CN.FCST_LEAD_HR] % 100)) - - # Calculate fcst_init = fcst_valid - fcst_lead hours - all_cts.insert(8, CN.FCST_INIT, CN.NOTAV) - all_cts[CN.FCST_INIT] = all_cts[CN.FCST_VALID] - \ - pd.to_timedelta(all_cts[CN.FCST_LEAD_HR], unit='sec') - - # line type of mode contingency table - all_cts[CN.LINE_TYPE_LU_ID] = 19 - - self.mode_cts_data = all_cts - all_cts = all_cts.iloc[0:0] - - if list_obj: - # gather all mode lines - all_obj = pd.concat(list_obj, ignore_index=True, sort=False) - list_obj = [] - - if not all_obj.fcst_lead.dtypes == 'int': - all_obj.loc[all_obj.fcst_lead == - CN.NOTAV, CN.FCST_LEAD] = 0 - all_obj[CN.FCST_LEAD] = all_obj[CN.FCST_LEAD].astype(int) - - # Copy forecast lead times, without trailing 0000 if they have them - all_obj[CN.FCST_LEAD_HR] = \ - np.where(all_obj[CN.FCST_LEAD] < 25, - all_obj[CN.FCST_LEAD] * 10000, - all_obj[CN.FCST_LEAD]) - - all_obj[CN.FCST_LEAD_HR] = (((all_obj[CN.FCST_LEAD_HR] // 10000) * 3600) + - ((all_obj[CN.FCST_LEAD_HR] // 100 % 100) * 60) + - (all_obj[CN.FCST_LEAD_HR] % 100)) - - # Calculate fcst_init = fcst_valid - fcst_lead hours - all_obj.insert(8, CN.FCST_INIT, CN.NOTAV) - all_obj[CN.FCST_INIT] = all_obj[CN.FCST_VALID] - \ - pd.to_timedelta(all_obj[CN.FCST_LEAD_HR], unit='sec') - - # default to mode single - all_obj[CN.LINE_TYPE_LU_ID] = 17 - - # mark if it's a mode pair - all_obj.loc[all_obj.object_id.str.contains('_'), - CN.LINE_TYPE_LU_ID] = 18 - - self.mode_obj_data = all_obj - all_obj = all_obj.iloc[0:0] - - except (RuntimeError, TypeError, NameError, KeyError): - self.logger.error(f"*** {sys.exc_info()[0]} in read_data if list_cts or list_obj ***") - - try: - if not all_stat.empty: - self.logger.debug(f"Shape of all_stat before transforms: {str(all_stat.shape)}") - - # delete any lines that have invalid line_types - invalid_line_indexes = all_stat[~all_stat.line_type.isin( - CN.UC_LINE_TYPES)].index - - if not invalid_line_indexes.empty: - - self.logger.warning("!!! 
Warning, invalid line_types:") - self.logger.warning(f"line types: {str(all_stat.iloc[invalid_line_indexes].line_type)}") - - all_stat.drop(invalid_line_indexes, axis=0, inplace=True) - - # if user specified line types to load, delete the rest - if load_flags["line_type_load"]: - all_stat.drop(all_stat[~all_stat.line_type.isin(line_types)].index, - inplace=True) - - # if load_spec has flag to not load MPR records, delete them - if not load_flags["load_mpr"]: - all_stat.drop( - all_stat[all_stat.line_type == CN.MPR].index, inplace=True) - - # if load_spec has flag to not load ORANK records, delete them - if not load_flags["load_orank"]: - all_stat.drop( - all_stat[all_stat.line_type == CN.ORANK].index, inplace=True) - - # reset the index, in case any lines have been deleted - all_stat.reset_index(drop=True, inplace=True) - - # if all lines from a stat or vsdb file were deleted, remove filename - files_to_drop = ~self.data_files.index.isin( - all_stat[CN.FILE_ROW]) - files_stat = self.data_files[CN.DATA_FILE_LU_ID].isin([CN.VSDB_POINT_STAT, - CN.STAT]) - self.data_files.drop(self.data_files[files_to_drop & files_stat].index, - inplace=True) - self.data_files.reset_index(drop=True, inplace=True) - - if not all_stat.fcst_lead.dtypes == 'int': - all_stat.loc[all_stat.fcst_lead == - CN.NOTAV, CN.FCST_LEAD] = 0 - all_stat[CN.FCST_LEAD] = all_stat[CN.FCST_LEAD].astype(int) - - # Copy forecast lead times, without trailing 0000 if they have them - all_stat[CN.FCST_LEAD_HR] = \ - np.where(all_stat[CN.FCST_LEAD] < 25, - all_stat[CN.FCST_LEAD] * 10000, - all_stat[CN.FCST_LEAD]) - - all_stat[CN.FCST_LEAD_HR] = (((all_stat[CN.FCST_LEAD_HR] // 10000) * 3600) + - ((all_stat[CN.FCST_LEAD_HR] // 100 % 100) * 60) + - (all_stat[CN.FCST_LEAD_HR] % 100)) - - # Calculate fcst_init_beg = fcst_valid_beg - fcst_lead hours (in seconds) - all_stat.insert(6, CN.FCST_INIT_BEG, CN.NOTAV) - all_stat[CN.FCST_INIT_BEG] = all_stat[CN.FCST_VALID_BEG] - \ - pd.to_timedelta(all_stat[CN.FCST_LEAD_HR], unit='sec') - - self.logger.debug(f"Shape of all_stat after transforms: {str(all_stat.shape)}") - - self.stat_data = all_stat - all_stat = all_stat.iloc[0:0] - - except (RuntimeError, TypeError, NameError, KeyError): - self.logger.error("*** {sys.exc_info()[0]} in read_data near end ***") - - try: - if not all_tcst.empty: - - self.logger.debug(f"Shape of all_tcst before transforms: {str(all_tcst.shape)}") - - # delete any lines that have invalid line_types - invalid_line_indexes = \ - all_tcst[~all_tcst.line_type.isin( - CN.UC_LINE_TYPES_TCST)].index - - if not invalid_line_indexes.empty: - - self.logger.warning("!!! 
Warning, invalid line_types:") - self.logger.warning(f"line types: {str(all_tcst.iloc[invalid_line_indexes].line_types)}") - - all_tcst.drop(invalid_line_indexes, axis=0, inplace=True) - - # reset the index, in case any lines have been deleted - all_tcst.reset_index(drop=True, inplace=True) - - # if all lines from a tcst file were deleted, remove filename - files_to_drop = ~self.data_files.index.isin( - all_tcst[CN.FILE_ROW]) - files_tcsp = self.data_files[CN.DATA_FILE_LU_ID] == CN.TCST - self.data_files.drop(self.data_files[files_to_drop & files_tcsp].index, - inplace=True) - - self.data_files.reset_index(drop=True, inplace=True) - - self.tcst_data = all_tcst - all_tcst = all_tcst.iloc[0:0] - - except (RuntimeError, TypeError, NameError, KeyError): - self.logger.error(f"***{sys.exc_info()[0]} in read_data near end ***") - - try: - if list_2d: - all_2d = pd.concat(list_2d, ignore_index=True, sort=False) - list_2d = [] - - # line type of mtd 2d table - all_2d[CN.LINE_TYPE_LU_ID] = 19 - - self.mtd_2d_data = all_2d - all_2d = all_2d.iloc[0:0] - - except (RuntimeError, TypeError, NameError, KeyError): - self.logger.error(f"*** {sys.exc_info()[0]} in read_data if list_2d ***") + vsdb_data.reindex(columns=[*vsdb_data.columns.tolist()[0:mid_col], + *CN.COL_NUMS[mid_col - zero_col:-2], + *vsdb_data.columns.tolist()[mid_col:]], + fill_value=0) + # all 3 sets of columns need to be float + vsdb_data[CN.COL_NUMS[0:-2]] = \ + vsdb_data[CN.COL_NUMS[0:-2]].astype(float) + # the total in line_data_pct is the total of all of the subtotals + # calculated per row as there may be rows with different values of n_var + col_total = vsdb_data.columns.get_loc(CN.TOTAL_LC) + # calculate thresh and re-order values to be + # in sets of thresh_i, oy_i, and on_i (which is subtotal - oy_i) + for index, row in vsdb_data.iterrows(): + var_values = [] + n_var = row[CN.N_VAR] + col_start = zero_col + n_var + col_end = col_start + n_var + vsdb_data.iloc[index, + col_total] = row[col_start:col_end].sum() + for i in range(n_var): + var_values = var_values + [i/(n_var - 1)] + var_values = var_values + [row[str(i)]] + var_values = var_values + \ + [row[str(i+n_var)] - row[str(i)]] + df_values = pd.DataFrame([var_values]) + # put calculated and re-ordered values back into vsdb_data + vsdb_data.iloc[index, zero_col:zero_col + (n_var * 3)] = \ + df_values.iloc[0, 0:n_var * 3].values + one_file = vsdb_data[CN.LONG_HEADER + [CN.TOTAL_LC, CN.N_VAR] + + CN.COL_NUMS[0:-2] + + [CN.LINE_NUM, CN.FILE_ROW]] + + elif vsdb_type == CN.RELP: + one_file = vsdb_data[CN.LONG_HEADER + [CN.TOTAL_LC, CN.N_VAR] + + CN.COL_NUMS[0:vsdb_data[CN.N_VAR][1]] + + CN.COL_NAS[:(99 - vsdb_data[CN.N_VAR][1])] + + [CN.LINE_NUM, CN.FILE_ROW]] + + elif vsdb_type == CN.ECLV: + n_var = vsdb_data.loc[0, CN.N_VAR] + # first data column + zero_col = vsdb_data.columns.get_loc('0') + # new last data column after data is doubled + last_col = zero_col + (n_var * 2) - 1 + # current last data column + mid_col = zero_col + n_var - 1 + # counter for constants that will be added to double the data + last_point = n_var - 1 + n_var_col = vsdb_data.columns.get_loc(CN.N_VAR) + top_col = int(vsdb_data.columns[n_var_col - 1]) + # add extra columns to vsdb_data if needed for doubling of columns + if top_col < last_col: + vsdb_data = \ + vsdb_data.reindex(columns=[*vsdb_data.columns.tolist()[0:n_var_col], + *CN.COL_NUMS[top_col + 1: last_col + 1], + *vsdb_data.columns.tolist()[n_var_col:]], + fill_value=CN.MV_NOTAV) + for i in range(n_var): + vsdb_data[vsdb_data.columns[last_col] + ] = 
vsdb_data.iloc[:, mid_col] + vsdb_data[vsdb_data.columns[last_col - 1] + ] = CN.X_POINTS_ECON[last_point] + last_col = last_col - 2 + mid_col = mid_col - 1 + last_point = last_point - 1 + one_file = vsdb_data[CN.LONG_HEADER + [CN.TOTAL_LC] + + CN.COL_NAS[:2] + [CN.N_VAR] + + CN.COL_NUMS[0:36] + + CN.COL_NAS[:61] + [CN.LINE_NUM, CN.FILE_ROW]] + + elif vsdb_type == CN.PSTD: + one_file = vsdb_data[CN.LONG_HEADER + [CN.TOTAL_LC] + + [CN.COL_ZERO] + CN.COL_NAS[:3] + + ['3', '4', '5'] + CN.COL_NAS[:1] + + ['0'] + CN.COL_NAS[:2] + + ['1'] + CN.COL_NAS[:2] + + ['2'] + CN.COL_NAS[:1] + + CN.COL_NAS[:84] + [CN.LINE_NUM, CN.FILE_ROW]] + + elif vsdb_type == CN.CNT: + one_file = vsdb_data[CN.LONG_HEADER + [CN.TOTAL_LC] + + CN.COL_NAS[:27] + + [CN.COL_ZERO, CN.COL_ZERO, CN.COL_ZERO] + + ['2'] + CN.COL_NAS[:4] + + ['0'] + CN.COL_NAS[:7] + + ['3'] + CN.COL_NAS[:8] + + ['1'] + CN.COL_NAS[:23] + + ['4'] + CN.COL_NAS[:23] + [CN.LINE_NUM, CN.FILE_ROW]] + + elif vsdb_type == CN.ENSCNT: + one_file = vsdb_data[CN.LONG_HEADER + ['0'] + + CN.COL_NAS[:4] + + ['1'] + CN.COL_NAS[:4] + + ['2'] + CN.COL_NAS[:4] + + ['3'] + CN.COL_NAS[:4] + + ['4'] + CN.COL_NAS[:4] + + ['5'] + CN.COL_NAS[:75] + [CN.LINE_NUM, CN.FILE_ROW]] + + elif vsdb_type == CN.CTC: + # column 0 is Total, 1 is F, 2 is H + # column 3 is O (Oh) - if None, set to 0 (Zero) + vsdb_data.loc[vsdb_data['3'].isnull(), '3'] = 0 + # fy = Total * F + vsdb_data['4'] = vsdb_data['0'].astype( + float) * vsdb_data['1'].astype(float) + # oy = Total * O + vsdb_data['5'] = vsdb_data['0'].astype( + float) * vsdb_data['3'].astype(float) + # fy_oy = Total * H + vsdb_data['6'] = vsdb_data['0'].astype( + float) * vsdb_data['2'].astype(float) + # fy_on = fy - fy_oy + vsdb_data['7'] = vsdb_data['4'].astype( + float) - vsdb_data['6'].astype(float) + # fn_oy = oy - fy_oy + vsdb_data['8'] = vsdb_data['5'].astype( + float) - vsdb_data['6'].astype(float) + # fn_on = Total - fy - oy + fy_oy + vsdb_data['9'] = (vsdb_data['0'].astype(float) - + vsdb_data['4'].astype(float) - + vsdb_data['5'].astype(float) + + vsdb_data['6'].astype(float)) + # Needs a default ec_value of 0.5 + vsdb_data['10'] = 0.5 + one_file = vsdb_data[CN.LONG_HEADER + + ['0'] + ['6', '7', '8', '9', '10'] + + CN.COL_NAS[:95] + [CN.LINE_NUM, CN.FILE_ROW]] + + elif vsdb_type == CN.NBRCNT: + # fss is calculated from the other columns + vsdb_data['4'] = (1 - vsdb_data['1'].astype(float) / + vsdb_data['2'].astype(float) + + vsdb_data['3'].astype(float)) + one_file = vsdb_data[CN.LONG_HEADER + + ['0', '1'] + CN.COL_NAS[:2] + ['4'] + + CN.COL_NAS[:96] + [CN.LINE_NUM, CN.FILE_ROW]] + + # rename columns + if not one_file.empty: + one_file.columns = CN.LONG_HEADER + \ + CN.COL_NUMS[:101] + [CN.LINE_NUM, CN.FILE_ROW] + list_vsdb.append(one_file) + one_file = one_file.iloc[0:0] + vsdb_data = vsdb_data.iloc[0:0] + + # end for vsdb_type + + # Clear out all_vsdb, which we copied from above, line_type by line_type + all_vsdb = all_vsdb.iloc[0:0] + # combine stat and vsdb + all_vsdb = pd.concat( + list_vsdb, ignore_index=True, sort=False) + all_stat = pd.concat([all_stat, all_vsdb], + ignore_index=True, sort=False) + all_vsdb = all_vsdb.iloc[0:0] + + except (RuntimeError, TypeError, NameError, KeyError): + self.logger.error( + f"*** Error concatenating vsdb data frames prior to load: {sys.exc_info()[0]}*** ") + sys.exit("*** Error concatenating vsdb data frames prior to load") + + try: + if list_cts: + all_cts = pd.concat( + list_cts, ignore_index=True, sort=False) + list_cts = [] + + if not all_cts.fcst_lead.dtypes == 'int': + 
all_cts.loc[all_cts.fcst_lead == + CN.NOTAV, CN.FCST_LEAD] = 0 + all_cts[CN.FCST_LEAD] = all_cts[CN.FCST_LEAD].astype( + int) + + # Copy forecast lead times, without trailing 0000 if they have them + all_cts[CN.FCST_LEAD_HR] = \ + np.where(all_cts[CN.FCST_LEAD] < 25, + all_cts[CN.FCST_LEAD] * 10000, + all_cts[CN.FCST_LEAD]) + + all_cts[CN.FCST_LEAD_HR] = (((all_cts[CN.FCST_LEAD_HR] // 10000) * 3600) + + ((all_cts[CN.FCST_LEAD_HR] // 100 % 100) * 60) + + (all_cts[CN.FCST_LEAD_HR] % 100)) + + # Calculate fcst_init = fcst_valid - fcst_lead hours + all_cts.insert(8, CN.FCST_INIT, CN.NOTAV) + all_cts[CN.FCST_INIT] = all_cts[CN.FCST_VALID] - \ + pd.to_timedelta(all_cts[CN.FCST_LEAD_HR], unit='sec') + + # line type of mode contingency table + all_cts[CN.LINE_TYPE_LU_ID] = 19 + + self.mode_cts_data = all_cts + all_cts = all_cts.iloc[0:0] + + if list_obj: + # gather all mode lines + all_obj = pd.concat( + list_obj, ignore_index=True, sort=False) + list_obj = [] + + if not all_obj.fcst_lead.dtypes == 'int': + all_obj.loc[all_obj.fcst_lead == + CN.NOTAV, CN.FCST_LEAD] = 0 + all_obj[CN.FCST_LEAD] = all_obj[CN.FCST_LEAD].astype( + int) + + # Copy forecast lead times, without trailing 0000 if they have them + all_obj[CN.FCST_LEAD_HR] = \ + np.where(all_obj[CN.FCST_LEAD] < 25, + all_obj[CN.FCST_LEAD] * 10000, + all_obj[CN.FCST_LEAD]) + + all_obj[CN.FCST_LEAD_HR] = (((all_obj[CN.FCST_LEAD_HR] // 10000) * 3600) + + ((all_obj[CN.FCST_LEAD_HR] // 100 % 100) * 60) + + (all_obj[CN.FCST_LEAD_HR] % 100)) + + # Calculate fcst_init = fcst_valid - fcst_lead hours + all_obj.insert(8, CN.FCST_INIT, CN.NOTAV) + all_obj[CN.FCST_INIT] = all_obj[CN.FCST_VALID] - \ + pd.to_timedelta(all_obj[CN.FCST_LEAD_HR], unit='sec') + + # default to mode single + all_obj[CN.LINE_TYPE_LU_ID] = 17 + + # mark if it's a mode pair + all_obj.loc[all_obj.object_id.str.contains('_'), + CN.LINE_TYPE_LU_ID] = 18 + + self.mode_obj_data = all_obj + all_obj = all_obj.iloc[0:0] + + except (RuntimeError, TypeError, NameError, KeyError): + self.logger.error( + f"*** Error transforming cts or obj data prior to load: {sys.exc_info()[0]}*** ") + sys.exit( + "*** Error transforming cts or obj data prior to load") + + try: + if not all_stat.empty: + self.logger.debug( + f"Shape of all_stat before transforms: {str(all_stat.shape)}") + + # delete any lines that have invalid line_types + invalid_line_indexes = all_stat[~all_stat.line_type.isin( + CN.UC_LINE_TYPES)].index + + if not invalid_line_indexes.empty: + + self.logger.warning("!!! 
Warning, invalid line_types:") + self.logger.warning( + f"line types: {str(all_stat.iloc[invalid_line_indexes].line_type)}") + + all_stat.drop(invalid_line_indexes, + axis=0, inplace=True) + + # if user specified line types to load, delete the rest + if load_flags["line_type_load"]: + all_stat.drop(all_stat[~all_stat.line_type.isin(line_types)].index, + inplace=True) + + # if load_spec has flag to not load MPR records, delete them + if not load_flags["load_mpr"]: + all_stat.drop( + all_stat[all_stat.line_type == CN.MPR].index, inplace=True) + + # if load_spec has flag to not load ORANK records, delete them + if not load_flags["load_orank"]: + all_stat.drop( + all_stat[all_stat.line_type == CN.ORANK].index, inplace=True) + + # reset the index, in case any lines have been deleted + all_stat.reset_index(drop=True, inplace=True) + + # if all lines from a stat or vsdb file were deleted, remove filename + files_to_drop = ~self.data_files.index.isin( + all_stat[CN.FILE_ROW]) + files_stat = self.data_files[CN.DATA_FILE_LU_ID].isin([CN.VSDB_POINT_STAT, + CN.STAT]) + self.data_files.drop(self.data_files[files_to_drop & files_stat].index, + inplace=True) + self.data_files.reset_index(drop=True, inplace=True) + + if not all_stat.fcst_lead.dtypes == 'int': + all_stat.loc[all_stat.fcst_lead == + CN.NOTAV, CN.FCST_LEAD] = 0 + all_stat[CN.FCST_LEAD] = all_stat[CN.FCST_LEAD].astype( + int) + + # Copy forecast lead times, without trailing 0000 if they have them + all_stat[CN.FCST_LEAD_HR] = \ + np.where(all_stat[CN.FCST_LEAD] < 25, + all_stat[CN.FCST_LEAD] * 10000, + all_stat[CN.FCST_LEAD]) + + all_stat[CN.FCST_LEAD_HR] = (((all_stat[CN.FCST_LEAD_HR] // 10000) * 3600) + + ((all_stat[CN.FCST_LEAD_HR] // 100 % 100) * 60) + + (all_stat[CN.FCST_LEAD_HR] % 100)) + + # Calculate fcst_init_beg = fcst_valid_beg - fcst_lead hours (in seconds) + all_stat.insert(6, CN.FCST_INIT_BEG, CN.NOTAV) + all_stat[CN.FCST_INIT_BEG] = all_stat[CN.FCST_VALID_BEG] - \ + pd.to_timedelta(all_stat[CN.FCST_LEAD_HR], unit='sec') + + self.logger.debug( + f"Shape of all_stat after transforms: {str(all_stat.shape)}") + + self.stat_data = all_stat + all_stat = all_stat.iloc[0:0] + + except (RuntimeError, TypeError, NameError, KeyError): + self.logger.error( + f"*** Error transforming data prior to load: {sys.exc_info()[0]}*** ") + sys.exit( + "*** Error transforming data prior to load") + + try: + if not all_tcst.empty: + + self.logger.debug( + f"Shape of all_tcst before transforms: {str(all_tcst.shape)}") + + # delete any lines that have invalid line_types + invalid_line_indexes = \ + all_tcst[~all_tcst.line_type.isin( + CN.UC_LINE_TYPES_TCST)].index + + if not invalid_line_indexes.empty: + + self.logger.warning("!!! 
Warning, invalid line_types:") + self.logger.warning( + f"line types: {str(all_tcst.iloc[invalid_line_indexes].line_types)}") + + all_tcst.drop(invalid_line_indexes, + axis=0, inplace=True) + + # reset the index, in case any lines have been deleted + all_tcst.reset_index(drop=True, inplace=True) + + # if all lines from a tcst file were deleted, remove filename + files_to_drop = ~self.data_files.index.isin( + all_tcst[CN.FILE_ROW]) + files_tcsp = self.data_files[CN.DATA_FILE_LU_ID] == CN.TCST + self.data_files.drop(self.data_files[files_to_drop & files_tcsp].index, + inplace=True) + + self.data_files.reset_index(drop=True, inplace=True) + + self.tcst_data = all_tcst + all_tcst = all_tcst.iloc[0:0] + + except (RuntimeError, TypeError, NameError, KeyError): + self.logger.error( + f"*** Error transforming tcst data prior to load: {sys.exc_info()[0]}*** ") + sys.exit( + "*** Error transforming tcst data prior to load") + + try: + if list_2d: + all_2d = pd.concat(list_2d, ignore_index=True, sort=False) + list_2d = [] + + # line type of mtd 2d table + all_2d[CN.LINE_TYPE_LU_ID] = 19 + + self.mtd_2d_data = all_2d + all_2d = all_2d.iloc[0:0] + + except (RuntimeError, TypeError, NameError, KeyError): + self.logger.error( + f"*** Error transforming 2d obj data prior to load: {sys.exc_info()[0]}*** ") + sys.exit( + "*** Error transforming 2d obj data prior to load") + + try: + if list_single: + all_single = pd.concat( + list_single, ignore_index=True, sort=False) + list_single = [] - try: - if list_single: - all_single = pd.concat( - list_single, ignore_index=True, sort=False) - list_single = [] + # line type of mtd single table + all_single[CN.LINE_TYPE_LU_ID] = 17 - # line type of mtd single table - all_single[CN.LINE_TYPE_LU_ID] = 17 + self.mtd_3d_single_data = all_single + all_single = all_single.iloc[0:0] - self.mtd_3d_single_data = all_single - all_single = all_single.iloc[0:0] + except (RuntimeError, TypeError, NameError, KeyError): + self.logger.error( + f"*** Error transforming single obj data prior to load: {sys.exc_info()[0]}*** ") + sys.exit( + "*** Error transforming single obj data prior to load") - except (RuntimeError, TypeError, NameError, KeyError): - self.logger.error(f"*** {sys.exc_info()[0]} in read_data if list_single ***") + try: + if list_pair: + all_pair = pd.concat( + list_pair, ignore_index=True, sort=False) + list_pair = [] - try: - if list_pair: - all_pair = pd.concat(list_pair, ignore_index=True, sort=False) - list_pair = [] + # line type of mtd pair table + all_pair[CN.LINE_TYPE_LU_ID] = 18 - # line type of mtd pair table - all_pair[CN.LINE_TYPE_LU_ID] = 18 + self.mtd_3d_pair_data = all_pair + all_pair = all_pair.iloc[0:0] - self.mtd_3d_pair_data = all_pair - all_pair = all_pair.iloc[0:0] + except (RuntimeError, TypeError, NameError, KeyError): + self.logger.error( + f"*** Error transforming single obj data prior to load: {sys.exc_info()[0]}*** ") + sys.exit( + "*** Error transforming single obj data prior to load") - except (RuntimeError, TypeError, NameError, KeyError): - self.logger.error(f"*** {sys.exc_info()[0]} in read_data if list_pair ***") + read_time_end = time.perf_counter() + read_time = timedelta(seconds=read_time_end - read_time_start) - read_time_end = time.perf_counter() - read_time = timedelta(seconds=read_time_end - read_time_start) + self.logger.info(f" >>> Read time: {str(read_time)}") - self.logger.info(f" >>> Read time: {str(read_time)}") + self.logger.debug("[--- End read_data ---]") - self.logger.debug("[--- End read_data ---]") + except 
(RuntimeError, TypeError, NameError, KeyError, AttributeError): + self.logger.error( + "*** %s occurred in read_data function ***", sys.exc_info()[0]) + sys.exit("*** Error reading data") @staticmethod def get_lookup(filename): @@ -1361,7 +1511,12 @@ def read_stat(self, filename, hdr_names): stat_file = pd.read_csv(filename, sep=CN.SEP, skiprows=1, header=None, skipinitialspace=True) except (pd.errors.EmptyDataError): - self.logger.warning(f"!!! Stat file {filename} has no data after headers") + """ + We do NOT want to exit here. One empty file does not mean that we should not load + any good files in this batch. + """ + self.logger.warning( + f"!!! Stat file {filename} has no data after headers") return stat_file stat_file = stat_file.iloc[:, 0] @@ -1403,7 +1558,12 @@ def read_tcst(self, filename, hdr_names): stat_file = pd.read_csv(filename, sep=CN.SEP, skiprows=1, header=None, skipinitialspace=True) except (pd.errors.EmptyDataError): - self.logger.warning(f"!!! Tcst file {filename} has no data after headers") + """ + We do NOT want to exit here. One empty file does not mean that we should not load + any good files in this batch. + """ + self.logger.warning( + f"!!! Tcst file {filename} has no data after headers") return stat_file stat_file = stat_file.iloc[:, 0] @@ -1440,7 +1600,12 @@ def read_mode(self, filename, hdr_names): stat_file = pd.read_csv(filename, sep=CN.SEP, skiprows=1, header=None, skipinitialspace=True) except (pd.errors.EmptyDataError): - self.logger.warning(f"!!! Mode or MTD file {filename} has no data after headers") + """ + We do NOT want to exit here. One empty file does not mean that we should not load + any good files in this batch. + """ + self.logger.warning( + f"!!! Mode or MTD file {filename} has no data after headers") return stat_file stat_file = stat_file.iloc[:, 0] diff --git a/METdbLoad/ush/read_load_xml.py b/METdbLoad/ush/read_load_xml.py index fb8cd6d3..8ca9c2f4 100644 --- a/METdbLoad/ush/read_load_xml.py +++ b/METdbLoad/ush/read_load_xml.py @@ -17,14 +17,11 @@ import sys import os -import datetime -import io from pathlib import Path import pandas as pd from lxml import etree import METreformat.util as util from METdbLoad.ush import constants as CN -from METdbLoad.test import utils as dbload_util class XmlLoadFile: @@ -34,149 +31,163 @@ class XmlLoadFile: """ def __init__(self, xmlfile, logger=None): - # set the defaults - self.xmlfilename = xmlfile - - self.connection = { - 'db_port': CN.SQL_PORT, - 'db_management_system': "mysql", - } - - self.insert_size = 1 - self.load_note = None - self.group = CN.DEFAULT_DATABASE_GROUP - self.description = "" - self.xml_str = None - - self.flags = { - 'line_type_load': False, - 'load_stat': True, - 'load_mode': True, - 'load_mtd': True, - 'load_mpr': False, - 'load_orank': False, - 'force_dup_file': False, - 'verbose': False, - 'stat_header_db_check': True, - 'tcst_header_db_check': True, - 'mode_header_db_check': True, - 'mtd_header_db_check': True, - 'drop_indexes': False, - 'apply_indexes': False, - 'load_xml': True, - } - - self.load_files = [] - self.line_types = [] - - if logger is None: - log_filename = os.path.join(os.getcwd(), __name__ + "_log.txt") - self.logger = util.get_common_logger('INFO', log_filename) - else: - self.logger = logger + try: + # set the defaults + self.xmlfilename = xmlfile + + self.connection = { + 'db_port': CN.SQL_PORT, + 'db_management_system': "mysql", + } + + self.insert_size = 1 + self.load_note = None + self.group = CN.DEFAULT_DATABASE_GROUP + self.description = "" + 
self.xml_str = None + + self.flags = { + 'line_type_load': False, + 'load_stat': True, + 'load_mode': True, + 'load_mtd': True, + 'load_mpr': False, + 'load_orank': False, + 'force_dup_file': False, + 'verbose': False, + 'stat_header_db_check': True, + 'tcst_header_db_check': True, + 'mode_header_db_check': True, + 'mtd_header_db_check': True, + 'drop_indexes': False, + 'apply_indexes': False, + 'load_xml': True, + } + + self.load_files = [] + self.line_types = [] + + if logger is None: + log_filename = os.path.join(os.getcwd(), __name__ + "_log.txt") + self.logger = util.get_common_logger('DEBUG', log_filename) + else: + self.logger = logger + except RuntimeError: + if logger is None: + print( + "*** %s occurred while initializing class XmlLoadFile ***", sys.exc_info()[0]) + else: + self.logger = logger + self.logger.error( + "*** %s occurred while initializing class XmlLoadFile ***", sys.exc_info()[0]) + sys.exit("*** Error initializing class XmlLoadFile") def read_xml(self): """! Read in load_spec xml file, store values as class attributes Returns: N/A """ - self.logger.debug("[--- Start read_xml ---]") try: - # check for existence of XML file - if not Path(self.xmlfilename).is_file(): - sys.exit("*** XML file " + self.xmlfilename + " can not be found!") + self.logger.debug("[--- Start read_xml ---]") - # Validate the XML file - self.logger.info(f"Validating the {self.xmlfilename} against the {dbload_util.LOAD_SPECIFICATION_SCHEMA}") - if self.validate_xml() is False: - msg = ( - f"{self.xmlfilename} is not valid and may contain a recursive payload or an excessively large payload") - self.logger.error(msg) - print(f"{msg}") - raise ValueError(msg) - else: - msg = (f"{self.xmlfilename} is valid ") - self.logger.info(msg) - print(f"{msg}") + try: - self.logger.info('Reading XML Load file') - parser = etree.XMLParser(remove_comments=True, resolve_entities=False) - tree = etree.parse(self.xmlfilename, parser=parser) - root = tree.getroot() + # check for existence of XML file + if not Path(self.xmlfilename).is_file(): + sys.exit("*** XML file " + self.xmlfilename + + " can not be found!") - except (RuntimeError, TypeError, NameError, KeyError): - self.logger.error("*** %s in read_xml ***", sys.exc_info()[0]) - sys.exit("*** Parsing error(s) in XML file!") + # parse the XML file + self.logger.info('Reading XML Load file') + parser = etree.XMLParser( + remove_comments=True, resolve_entities=False) + tree = etree.parse(self.xmlfilename, parser=parser) + root = tree.getroot() - # Extract values from load_spec XML tags, store in attributes of class XmlLoadFile - try: + except (RuntimeError, TypeError, NameError, KeyError): + self.logger.error("*** %s in read_xml ***", sys.exc_info()[0]) + sys.exit("*** Parsing error(s) in XML file!") - # Extract values for connecting to database - if root.xpath("connection"): - self.read_db_connect(root) - self.logger.info("Database name is: %s", self.connection['db_database']) + # Extract values from load_spec XML tags, store in attributes of class XmlLoadFile + try: - # group and description for putting databases into groups/categories - if root.xpath("group"): - self.group = root.xpath("group")[0].text + # Extract values for connecting to database + if root.xpath("connection"): + self.read_db_connect(root) + self.logger.info("Database name is: %s", + self.connection['db_database']) - if root.xpath("description"): - self.description = root.xpath("description")[0].text + # group and description for putting databases into groups/categories + if 
root.xpath("group"): + self.group = root.xpath("group")[0].text - # load_note and load_xml are used to put a note in the database - if root.xpath('load_note'): - self.load_note = root.xpath("load_note")[0].text + if root.xpath("description"): + self.description = root.xpath("description")[0].text - # MET line types to load. If omitted, all line types are loaded - if root.xpath('line_type'): - self.flags['line_type_load'] = True - self.line_types = [x.text.upper() for x in root.xpath('line_type')[0]] + # load_note and load_xml are used to put a note in the database + if root.xpath('load_note'): + self.load_note = root.xpath("load_note")[0].text - # insert_size value is an integer - if root.xpath('insert_size') and root.xpath('insert_size')[0].text.isdigit(): - self.insert_size = int(root.xpath('insert_size')[0].text) + # MET line types to load. If omitted, all line types are loaded + if root.xpath('line_type'): + self.flags['line_type_load'] = True + self.line_types = [x.text.upper() + for x in root.xpath('line_type')[0]] - # Handle flags with a default of True - default_true = ["stat_header_db_check", "mode_header_db_check", - "mtd_header_db_check", "tcst_header_db_check", - "load_stat", "load_mode", "load_mtd", "load_xml"] + # insert_size value is an integer + if root.xpath('insert_size') and root.xpath('insert_size')[0].text.isdigit(): + self.insert_size = int(root.xpath('insert_size')[0].text) - self.flag_default_true(root, default_true) + # Handle flags with a default of True + default_true = ["stat_header_db_check", "mode_header_db_check", + "mtd_header_db_check", "tcst_header_db_check", + "load_stat", "load_mode", "load_mtd", "load_xml"] - # Handle flags with a default of False - default_false = ["verbose", "drop_indexes", "apply_indexes", - "load_mpr", "load_orank", "force_dup_file"] + self.flag_default_true(root, default_true) - self.flag_default_false(root, default_false) + # Handle flags with a default of False + default_false = ["verbose", "drop_indexes", "apply_indexes", + "load_mpr", "load_orank", "force_dup_file"] - # if requested, get a string of the XML to put in the database - if self.flags['load_xml']: - self.xml_str = etree.tostring(tree).decode().replace('\n', '').replace(' ', '') + self.flag_default_false(root, default_false) - # Get a list of all of the file names to load - if root.xpath('load_files'): - self.load_files = [x.text for x in root.xpath('load_files')[0]] - else: - # Or get info on file template, fill-in values, and dates, if needed - self.read_file_info(root) + # if requested, get a string of the XML to put in the database + if self.flags['load_xml']: + self.xml_str = etree.tostring( + tree).decode().replace('\n', '').replace(' ', '') - except (RuntimeError, TypeError, NameError, KeyError): - self.logger.error("*** %s in read_xml ***", sys.exc_info()[0]) - sys.exit("*** Error(s) found while reading XML file!") + # Get a list of all of the file names to load + if root.xpath('load_files'): + self.load_files = [ + x.text for x in root.xpath('load_files')[0]] + else: + # Or get info on file template, fill-in values, and dates, if needed + self.read_file_info(root) + + except (RuntimeError, TypeError, NameError, KeyError): + self.logger.error("*** %s in read_xml ***", sys.exc_info()[0]) + sys.exit("*** Error(s) found while reading XML file!") + + # This removes duplicate file names. do we want that? + if self.load_files: + self.load_files = list(dict.fromkeys(self.load_files)) - # This removes duplicate file names. do we want that? 
- if self.load_files: - self.load_files = list(dict.fromkeys(self.load_files)) + # Remove directory names + self.load_files = [ + lf for lf in self.load_files if '.' in lf.split('/')[-1]] - # Remove directory names - self.load_files = [lf for lf in self.load_files if '.' in lf.split('/')[-1]] + self.logger.info("Initial number of files: %s", + str(len(self.load_files))) - self.logger.info("Initial number of files: %s", str(len(self.load_files))) + self.logger.debug("[--- End read_xml ---]") - self.logger.debug("[--- End read_xml ---]") + except (RuntimeError, TypeError, NameError, KeyError, AttributeError): + self.logger.error( + "*** %s occurred in read_xml function ***", sys.exc_info()[0]) + sys.exit("*** Error reading XML") def validate_xml(self): """ @@ -225,7 +236,8 @@ def read_file_info(self, root): # Handle the date_list tag and its child tags if root.xpath('date_list'): - date_list = {x.tag.lower(): x.text for x in root.xpath('date_list')[0]} + date_list = { + x.tag.lower(): x.text for x in root.xpath('date_list')[0]} date_list['name'] = root.xpath('date_list')[0].attrib['name'] # if the date_list tag is included, generate a list of dates @@ -236,7 +248,8 @@ def read_file_info(self, root): folder_template = root.xpath("folder_tmpl")[0].text # get the values to fill in to the folder template - field_names = [x.attrib['name'] for x in root.xpath('load_val')[0].xpath('field')] + field_names = [x.attrib['name'] + for x in root.xpath('load_val')[0].xpath('field')] for field_name in field_names: # Process zero or more val tags @@ -258,10 +271,12 @@ def read_file_info(self, root): # Generate all possible path/filenames from folder template if folder_template and template_fills: - self.load_files = self.filenames_from_template(folder_template, template_fills) + self.load_files = self.filenames_from_template( + folder_template, template_fills) except (RuntimeError, TypeError, NameError, KeyError): - self.logger.error("*** %s in read_xml read_file_info ***", sys.exc_info()[0]) + self.logger.error( + "*** %s in read_xml read_file_info ***", sys.exc_info()[0]) sys.exit("*** Error(s) found while reading XML file info!") def read_db_connect(self, root): @@ -272,7 +287,8 @@ def read_db_connect(self, root): try: host_and_port = None if root.xpath('connection')[0].xpath('host'): - host_and_port = root.xpath('connection')[0].xpath('host')[0].text + host_and_port = root.xpath('connection')[ + 0].xpath('host')[0].text if host_and_port: host_and_port = host_and_port.split(":") self.connection['db_host'] = host_and_port[0] @@ -292,7 +308,8 @@ def read_db_connect(self, root): raise NameError("Missing required database tag") if not self.connection['db_database'].startswith("mv_"): - self.logger.warning("!!! Database not visible unless name starts with mv_") + self.logger.warning( + "!!! 
Database not visible unless name starts with mv_") self.connection['db_user'] = \ root.xpath('connection')[0].xpath('user')[0].text @@ -305,14 +322,16 @@ def read_db_connect(self, root): if root.xpath('connection')[0].xpath('management_system'): self.connection['db_management_system'] = \ - root.xpath('connection')[0].xpath('management_system')[0].text + root.xpath('connection')[0].xpath( + 'management_system')[0].text if root.xpath('connection')[0].xpath('local_infile'): self.connection['db_local_infile'] = \ root.xpath('connection')[0].xpath('local_infile')[0].text except (RuntimeError, TypeError, NameError, KeyError): - self.logger.error("*** %s in read_xml read_db_connect ***", sys.exc_info()[0]) + self.logger.error( + "*** %s in read_xml read_db_connect ***", sys.exc_info()[0]) sys.exit("*** Error(s) found while reading XML file connection tag!") def flag_default_true(self, root, default_true): @@ -348,7 +367,8 @@ def filenames_from_date(self, date_list): for java_date, python_date in CN.DATE_SUBS.items(): date_format = date_format.replace(java_date, python_date) # format the start and end dates - date_start = pd.to_datetime(date_list["start"], format=date_format) + date_start = pd.to_datetime( + date_list["start"], format=date_format) date_end = pd.to_datetime(date_list["end"], format=date_format) date_inc = int(date_list["inc"]) while date_start < date_end: @@ -356,14 +376,17 @@ def filenames_from_date(self, date_list): date_start = date_start + pd.Timedelta(seconds=date_inc) all_dates.append(date_end.strftime(date_format)) else: - self.logger.error("*** date_list tag has unknown characters ***") + self.logger.error( + "*** date_list tag has unknown characters ***") except ValueError as value_error: - self.logger.error("*** %s in filenames_from_date ***", sys.exc_info()[0]) + self.logger.error( + "*** %s in filenames_from_date ***", sys.exc_info()[0]) self.logger.error(value_error) sys.exit("*** Value Error found while expanding XML date format!") except (RuntimeError, TypeError, NameError, KeyError): - self.logger.error("*** %s in filenames_from_date ***", sys.exc_info()[0]) + self.logger.error( + "*** %s in filenames_from_date ***", sys.exc_info()[0]) sys.exit("*** Error found while expanding XML date format!") return all_dates @@ -383,7 +406,8 @@ def filenames_from_template(self, folder_template, template_fills): # remove any fill values that are not in the template not_in = [] if template_fills: - not_in = [tf for tf in template_fills.keys() if tf not in folder_template] + not_in = [tf for tf in template_fills.keys() + if tf not in folder_template] for wrong_key in not_in: del template_fills[wrong_key] @@ -397,7 +421,8 @@ def filenames_from_template(self, folder_template, template_fills): alist = [] for tvalue in load_dirs: alist = alist + \ - [tvalue.replace("{" + key + "}", x) for x in template_fills[key]] + [tvalue.replace("{" + key + "}", x) + for x in template_fills[key]] load_dirs = alist # find all files in directories, append path to them, and put on load_files list @@ -408,11 +433,13 @@ def filenames_from_template(self, folder_template, template_fills): for x in os.listdir(file_dir)] except ValueError as value_error: - self.logger.error("f*** {sys.exc_info()[0]} in filenames_from_template ***") + self.logger.error( + "f*** {sys.exc_info()[0]} in filenames_from_template ***") self.logger.error(value_error) sys.exit("*** Value Error found while expanding XML folder templates!") except (RuntimeError, TypeError, NameError, KeyError): - self.logger.error("*** %s in 
filenames_from_template ***", sys.exc_info()[0]) + self.logger.error( + "*** %s in filenames_from_template ***", sys.exc_info()[0]) sys.exit("*** Error found while expanding XML folder templates!") return file_list diff --git a/METdbLoad/ush/run_sql.py b/METdbLoad/ush/run_sql.py index 9c31f884..3b05afb1 100644 --- a/METdbLoad/ush/run_sql.py +++ b/METdbLoad/ush/run_sql.py @@ -25,6 +25,7 @@ from METdbLoad.ush import DEFAULT_LOGLEVEL from METreformat.util import get_common_logger + class RunSql: """ Class to connect and disconnect to/from a SQL database Returns: @@ -32,14 +33,24 @@ class RunSql: """ def __init__(self, logger=None): - # Default to False since it requires extra permission - self.local_infile = False - self.conn = None - self.cur = None - if logger is None: - self.logger = get_common_logger(DEFAULT_LOGLEVEL, 'stdout') - else: - self.logger = logger + try: + # Default to False since it requires extra permission + self.local_infile = False + self.conn = None + self.cur = None + if logger is None: + self.logger = get_common_logger(DEFAULT_LOGLEVEL, 'stdout') + else: + self.logger = logger + except RuntimeError: + if logger is None: + print( + "*** %s occurred while initializing class RunSql ***", sys.exc_info()[0]) + else: + self.logger = logger + self.logger.error( + "*** %s occurred while initializing class RunSql ***", sys.exc_info()[0]) + sys.exit("*** Error initializing class RunSql") def sql_on(self, connection): """ method to connect to a SQL database @@ -50,12 +61,13 @@ def sql_on(self, connection): if 'db_local_infile' in connection.keys() and connection['db_local_infile'].lower() == 'false': local_infile = False else: - # Default behaviour + # Default behaviour local_infile = True try: if (not 'db_host' in connection) or (not 'db_user' in connection): - self.logger.error("XML Load file does not have enough connection tags") + self.logger.error( + "XML Load file does not have enough connection tags") sys.exit("*** Error when connecting to database") # Connect to the database using connection info from XML file @@ -90,7 +102,6 @@ def sql_on(self, connection): self.local_infile = 'OFF' self.logger.debug("local_infile is %s", self.local_infile) - @staticmethod def sql_off(conn, cur): """ method to commit data and disconnect from a SQL database @@ -120,8 +131,8 @@ def get_next_id(table, field, sql_cur, logger): return next_id except (RuntimeError, TypeError, NameError, KeyError, AttributeError): - logger.error("*** %s in write_sql_data get_next_id ***", sys.exc_info()[0]) - + logger.error( + "*** %s in write_sql_data get_next_id ***", sys.exc_info()[0]) @staticmethod def get_file_name(data_file_id, sql_cur, logger): @@ -132,7 +143,8 @@ def get_file_name(data_file_id, sql_cur, logger): # get the filename try: file_name = None - query_for_name = "SELECT filename from data_file where data_file_id = " + str(data_file_id) + query_for_name = "SELECT filename from data_file where data_file_id = " + \ + str(data_file_id) sql_cur.execute(query_for_name) result = sql_cur.fetchone() if result[0] is not None: @@ -140,8 +152,9 @@ def get_file_name(data_file_id, sql_cur, logger): return file_name except (RuntimeError, TypeError, NameError, KeyError, AttributeError): - logger.error("*** %s in write_sql_data get_file_name ***", sys.exc_info()[0]) - + logger.error( + "*** %s in write_sql_data get_file_name ***", sys.exc_info()[0]) + @staticmethod def write_to_sql(raw_data, col_list, sql_table, sql_query, tmp_dir, sql_cur, local_infile, logger): """ given a dataframe of raw_data with specific 
columns to write to a sql_table, @@ -166,11 +179,16 @@ def write_to_sql(raw_data, col_list, sql_table, sql_query, tmp_dir, sql_cur, loc # only line_data has timestamps in dataframe - change to strings if 'line_data' in sql_table: - raw_data['fcst_valid_beg'] = raw_data['fcst_valid_beg'].astype(str) - raw_data['fcst_valid_end'] = raw_data['fcst_valid_end'].astype(str) - raw_data['fcst_init_beg'] = raw_data['fcst_init_beg'].astype(str) - raw_data['obs_valid_beg'] = raw_data['obs_valid_beg'].astype(str) - raw_data['obs_valid_end'] = raw_data['obs_valid_end'].astype(str) + raw_data['fcst_valid_beg'] = raw_data['fcst_valid_beg'].astype( + str) + raw_data['fcst_valid_end'] = raw_data['fcst_valid_end'].astype( + str) + raw_data['fcst_init_beg'] = raw_data['fcst_init_beg'].astype( + str) + raw_data['obs_valid_beg'] = raw_data['obs_valid_beg'].astype( + str) + raw_data['obs_valid_end'] = raw_data['obs_valid_end'].astype( + str) elif sql_table in (CN.MODE_HEADER, CN.MTD_HEADER): raw_data['fcst_valid'] = raw_data['fcst_valid'].astype(str) raw_data['fcst_init'] = raw_data['fcst_valid'].astype(str) @@ -180,7 +198,8 @@ def write_to_sql(raw_data, col_list, sql_table, sql_query, tmp_dir, sql_cur, loc sql_cur.executemany(sql_query, dfile) except (RuntimeError, TypeError, NameError, KeyError, AttributeError): - logger.error("*** %s in run_sql write_to_sql ***", sys.exc_info()[0]) + logger.error("*** %s in run_sql write_to_sql ***", + sys.exc_info()[0]) @staticmethod def apply_indexes(drop, sql_cur, logger): @@ -205,9 +224,11 @@ def apply_indexes(drop, sql_cur, logger): except (pymysql.OperationalError, pymysql.InternalError): if drop: - logger.error("*** Index to drop does not exist in run_sql apply_indexes ***") + logger.error( + "*** Index to drop does not exist in run_sql apply_indexes ***") else: - logger.error("*** Index to add already exists in run_sql apply_indexes ***") + logger.error( + "*** Index to add already exists in run_sql apply_indexes ***") apply_time_end = time.perf_counter() apply_time = timedelta(seconds=apply_time_end - apply_time_start) diff --git a/METdbLoad/ush/write_file_sql.py b/METdbLoad/ush/write_file_sql.py index 64bf5dee..783b55e5 100644 --- a/METdbLoad/ush/write_file_sql.py +++ b/METdbLoad/ush/write_file_sql.py @@ -26,6 +26,7 @@ from METdbLoad.ush import DEFAULT_LOGLEVEL from METreformat.util import get_common_logger + class WriteFileSql: """ Class to write data_file records to a SQL database Returns: @@ -33,11 +34,21 @@ class WriteFileSql: """ def __init__(self, logger=None): - self.sql_met = RunSql() - if logger is None: - self.logger = get_common_logger(DEFAULT_LOGLEVEL, 'stdout') - else: - self.logger = logger + try: + self.sql_met = RunSql() + if logger is None: + self.logger = get_common_logger(DEFAULT_LOGLEVEL, 'stdout') + else: + self.logger = logger + except RuntimeError: + if logger is None: + print( + "*** %s occurred while initializing class WriteFileSql ***", sys.exc_info()[0]) + else: + self.logger = logger + self.logger.error( + "*** %s occurred while initializing class WriteFileSql ***", sys.exc_info()[0]) + sys.exit("*** Error initializing class WriteFileSql") def write_file_sql(self, load_flags, data_files, stat_data, mode_cts_data, mode_obj_data, tcst_data, mtd_2d_data, mtd_3d_single_data, @@ -47,146 +58,160 @@ def write_file_sql(self, load_flags, data_files, stat_data, mode_cts_data, N/A """ - self.logger.debug("[--- Start write_file_sql ---]") - - write_time_start = time.perf_counter() - try: - # -------------------- - # Write Data Files - # 
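# A standalone sketch (illustration only, not part of the patch) of the duplicate-file
# bookkeeping used below, with a plain dict standing in for the data_file table lookup
# that the real code performs through CN.Q_FILE.  The paths, names, ids and the -9999
# sentinel (standing in for CN.NO_KEY) are made up for the example.
import pandas as pd

existing = {("/data/met", "point_stat_120000L.stat"): 7}   # (path, name) -> data_file_id
data_files = pd.DataFrame({
    "filepath": ["/data/met", "/data/met"],
    "filename": ["point_stat_120000L.stat", "grid_stat_240000L.stat"],
    "data_file_id": [-9999, -9999],
})
next_file_id = 12       # next free id in the data_file table
id_ctr = 0
force_dup_file = True

for row_num, file_line in data_files.iterrows():
    key = (file_line["filepath"], file_line["filename"])
    if key in existing:
        # duplicate: keep the existing id only when duplicate files are allowed
        if force_dup_file:
            data_files.loc[data_files.index[row_num], "data_file_id"] = existing[key]
    else:
        # new file: hand out the next sequential id
        data_files.loc[data_files.index[row_num], "data_file_id"] = id_ctr + next_file_id
        id_ctr += 1

print(data_files)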
-------------------- - - # get next valid data file id. data files start counting from 1 - next_file_id = self.sql_met.get_next_id(CN.DATA_FILE, CN.DATA_FILE_ID, sql_cur, self.logger) - if next_file_id == 0: - next_file_id = 1 - - id_ctr = 0 - list_dupes = [] - - # write out records for data files, but first: - # check for duplicates if flag on - delete if found - for row_num, file_line in data_files.iterrows(): - # look for existing data file record - sql_cur.execute(CN.Q_FILE, [file_line[CN.FILEPATH], file_line[CN.FILENAME]]) - result = sql_cur.fetchone() - - # If you find a match, check the force_dup_file tag/flag - if sql_cur.rowcount > 0: - list_dupes = list_dupes + [file_line[CN.FILE_ROW]] - if not load_flags['force_dup_file']: - self.logger.warning("!!! Duplicate file %s without FORCE_DUP_FILE tag", - file_line[CN.FULL_FILE]) + self.logger.debug("[--- Start write_file_sql ---]") + + write_time_start = time.perf_counter() + + try: + + # -------------------- + # Write Data Files + # -------------------- + + # get next valid data file id. data files start counting from 1 + next_file_id = self.sql_met.get_next_id( + CN.DATA_FILE, CN.DATA_FILE_ID, sql_cur, self.logger) + if next_file_id == 0: + next_file_id = 1 + + id_ctr = 0 + list_dupes = [] + + # write out records for data files, but first: + # check for duplicates if flag on - delete if found + for row_num, file_line in data_files.iterrows(): + # look for existing data file record + sql_cur.execute( + CN.Q_FILE, [file_line[CN.FILEPATH], file_line[CN.FILENAME]]) + result = sql_cur.fetchone() + + # If you find a match, check the force_dup_file tag/flag + if sql_cur.rowcount > 0: + list_dupes = list_dupes + [file_line[CN.FILE_ROW]] + if not load_flags['force_dup_file']: + self.logger.warning("!!! 
Duplicate file %s without FORCE_DUP_FILE tag", + file_line[CN.FULL_FILE]) + else: + # With duplicate files allowed, save the existing id for the file + data_files.loc[data_files.index[row_num], + CN.DATA_FILE_ID] = result[0] + self.logger.warning("Duplicate file %s already in data_file", + file_line[CN.FULL_FILE]) + # Not a duplicate - give it a new id else: - # With duplicate files allowed, save the existing id for the file - data_files.loc[data_files.index[row_num], CN.DATA_FILE_ID] = result[0] - self.logger.warning("Duplicate file %s already in data_file", - file_line[CN.FULL_FILE]) - # Not a duplicate - give it a new id - else: - data_files.loc[data_files.index[row_num], CN.DATA_FILE_ID] = \ - id_ctr + next_file_id - id_ctr = id_ctr + 1 - - # end for row_num, file_line - - if not load_flags['force_dup_file']: - - # delete line data rows that match index of duplicated file - if not stat_data.empty and list_dupes: - if stat_data.file_row.isin(list_dupes).any(): - stat_data.drop(stat_data[stat_data.file_row - .isin(list_dupes)].index, - inplace=True) - - if not mode_cts_data.empty and list_dupes: - if mode_cts_data.file_row.isin(list_dupes).any(): - mode_cts_data.drop(mode_cts_data[mode_cts_data.file_row - .isin(list_dupes)].index, - inplace=True) + data_files.loc[data_files.index[row_num], CN.DATA_FILE_ID] = \ + id_ctr + next_file_id + id_ctr = id_ctr + 1 - if not mode_obj_data.empty and list_dupes: - if mode_obj_data.file_row.isin(list_dupes).any(): - mode_obj_data.drop(mode_obj_data[mode_obj_data.file_row - .isin(list_dupes)].index, - inplace=True) + # end for row_num, file_line - if not mtd_2d_data.empty and list_dupes: - if mtd_2d_data.file_row.isin(list_dupes).any(): - mtd_2d_data.drop(mtd_2d_data[mtd_2d_data.file_row - .isin(list_dupes)].index, - inplace=True) + if not load_flags['force_dup_file']: - if not mtd_3d_single_data.empty and list_dupes: - if mtd_3d_single_data.file_row.isin(list_dupes).any(): - mtd_3d_single_data.drop(mtd_3d_single_data[mtd_3d_single_data.file_row - .isin(list_dupes)].index, - inplace=True) - - if not mtd_3d_pair_data.empty and list_dupes: - if mtd_3d_pair_data.file_row.isin(list_dupes).any(): - mtd_3d_pair_data.drop(mtd_3d_pair_data[mtd_3d_pair_data.file_row - .isin(list_dupes)].index, - inplace=True) - - # delete duplicate file entries - index_names = data_files[data_files.data_file_id == CN.NO_KEY].index - data_files.drop(index_names, inplace=True) - - if not data_files.empty: - - # reset indexes in case any records were dropped - stat_data.reset_index(drop=True, inplace=True) - mode_cts_data.reset_index(drop=True, inplace=True) - mode_obj_data.reset_index(drop=True, inplace=True) - tcst_data.reset_index(drop=True, inplace=True) - - # Replace the temporary id value with the actual index in the line data - for row_num, row in data_files.iterrows(): - if not stat_data.empty: - stat_data.loc[stat_data[CN.FILE_ROW] == row[CN.FILE_ROW], - CN.DATA_FILE_ID] = row[CN.DATA_FILE_ID] - if not mode_cts_data.empty: - mode_cts_data.loc[mode_cts_data[CN.FILE_ROW] == row[CN.FILE_ROW], - CN.DATA_FILE_ID] = row[CN.DATA_FILE_ID] - if not mode_obj_data.empty: - mode_obj_data.loc[mode_obj_data[CN.FILE_ROW] == row[CN.FILE_ROW], - CN.DATA_FILE_ID] = row[CN.DATA_FILE_ID] - if not tcst_data.empty: - tcst_data.loc[tcst_data[CN.FILE_ROW] == row[CN.FILE_ROW], - CN.DATA_FILE_ID] = row[CN.DATA_FILE_ID] - if not mtd_2d_data.empty: - mtd_2d_data.loc[mtd_2d_data[CN.FILE_ROW] == row[CN.FILE_ROW], - CN.DATA_FILE_ID] = row[CN.DATA_FILE_ID] - if not mtd_3d_single_data.empty: - 
mtd_3d_single_data.loc[mtd_3d_single_data[CN.FILE_ROW] == row[CN.FILE_ROW], - CN.DATA_FILE_ID] = row[CN.DATA_FILE_ID] - if not mtd_3d_pair_data.empty: - mtd_3d_pair_data.loc[mtd_3d_pair_data[CN.FILE_ROW] == row[CN.FILE_ROW], - CN.DATA_FILE_ID] = row[CN.DATA_FILE_ID] + # delete line data rows that match index of duplicated file + if not stat_data.empty and list_dupes: + if stat_data.file_row.isin(list_dupes).any(): + stat_data.drop(stat_data[stat_data.file_row + .isin(list_dupes)].index, + inplace=True) - # get just the new data files - new_files = data_files[data_files[CN.DATA_FILE_ID] >= next_file_id] + if not mode_cts_data.empty and list_dupes: + if mode_cts_data.file_row.isin(list_dupes).any(): + mode_cts_data.drop(mode_cts_data[mode_cts_data.file_row + .isin(list_dupes)].index, + inplace=True) + + if not mode_obj_data.empty and list_dupes: + if mode_obj_data.file_row.isin(list_dupes).any(): + mode_obj_data.drop(mode_obj_data[mode_obj_data.file_row + .isin(list_dupes)].index, + inplace=True) + + if not mtd_2d_data.empty and list_dupes: + if mtd_2d_data.file_row.isin(list_dupes).any(): + mtd_2d_data.drop(mtd_2d_data[mtd_2d_data.file_row + .isin(list_dupes)].index, + inplace=True) - # write the new data files out to the sql database - if not new_files.empty: - self.sql_met.write_to_sql(new_files, CN.DATA_FILE_FIELDS, CN.DATA_FILE, - CN.INS_DATA_FILES, tmp_dir, sql_cur, local_infile, - self.logger) + if not mtd_3d_single_data.empty and list_dupes: + if mtd_3d_single_data.file_row.isin(list_dupes).any(): + mtd_3d_single_data.drop(mtd_3d_single_data[mtd_3d_single_data.file_row + .isin(list_dupes)].index, + inplace=True) - except (RuntimeError, TypeError, NameError, KeyError): - self.logger.error("*** %s in write_file_sql ***", sys.exc_info()[0]) + if not mtd_3d_pair_data.empty and list_dupes: + if mtd_3d_pair_data.file_row.isin(list_dupes).any(): + mtd_3d_pair_data.drop(mtd_3d_pair_data[mtd_3d_pair_data.file_row + .isin(list_dupes)].index, + inplace=True) - write_time_end = time.perf_counter() - write_time = timedelta(seconds=write_time_end - write_time_start) + # delete duplicate file entries + index_names = data_files[data_files.data_file_id == + CN.NO_KEY].index + data_files.drop(index_names, inplace=True) - self.logger.info(" >>> Write time File: %s", str(write_time)) + if not data_files.empty: - self.logger.debug("[--- End write_file_sql ---]") + # reset indexes in case any records were dropped + stat_data.reset_index(drop=True, inplace=True) + mode_cts_data.reset_index(drop=True, inplace=True) + mode_obj_data.reset_index(drop=True, inplace=True) + tcst_data.reset_index(drop=True, inplace=True) - return data_files, stat_data, mode_cts_data, mode_obj_data, tcst_data, \ - mtd_2d_data, mtd_3d_single_data, mtd_3d_pair_data + # Replace the temporary id value with the actual index in the line data + for row_num, row in data_files.iterrows(): + if not stat_data.empty: + stat_data.loc[stat_data[CN.FILE_ROW] == row[CN.FILE_ROW], + CN.DATA_FILE_ID] = row[CN.DATA_FILE_ID] + if not mode_cts_data.empty: + mode_cts_data.loc[mode_cts_data[CN.FILE_ROW] == row[CN.FILE_ROW], + CN.DATA_FILE_ID] = row[CN.DATA_FILE_ID] + if not mode_obj_data.empty: + mode_obj_data.loc[mode_obj_data[CN.FILE_ROW] == row[CN.FILE_ROW], + CN.DATA_FILE_ID] = row[CN.DATA_FILE_ID] + if not tcst_data.empty: + tcst_data.loc[tcst_data[CN.FILE_ROW] == row[CN.FILE_ROW], + CN.DATA_FILE_ID] = row[CN.DATA_FILE_ID] + if not mtd_2d_data.empty: + mtd_2d_data.loc[mtd_2d_data[CN.FILE_ROW] == row[CN.FILE_ROW], + CN.DATA_FILE_ID] = 
row[CN.DATA_FILE_ID] + if not mtd_3d_single_data.empty: + mtd_3d_single_data.loc[mtd_3d_single_data[CN.FILE_ROW] == row[CN.FILE_ROW], + CN.DATA_FILE_ID] = row[CN.DATA_FILE_ID] + if not mtd_3d_pair_data.empty: + mtd_3d_pair_data.loc[mtd_3d_pair_data[CN.FILE_ROW] == row[CN.FILE_ROW], + CN.DATA_FILE_ID] = row[CN.DATA_FILE_ID] + + # get just the new data files + new_files = data_files[data_files[CN.DATA_FILE_ID] + >= next_file_id] + + # write the new data files out to the sql database + if not new_files.empty: + self.sql_met.write_to_sql(new_files, CN.DATA_FILE_FIELDS, CN.DATA_FILE, + CN.INS_DATA_FILES, tmp_dir, sql_cur, local_infile, + self.logger) + + except (RuntimeError, TypeError, NameError, KeyError): + self.logger.error( + "*** %s in write_file_sql ***", sys.exc_info()[0]) + sys.exit("*** Error writing SQL") + + write_time_end = time.perf_counter() + write_time = timedelta(seconds=write_time_end - write_time_start) + + self.logger.info(" >>> Write time File: %s", str(write_time)) + + self.logger.debug("[--- End write_file_sql ---]") + + return data_files, stat_data, mode_cts_data, mode_obj_data, tcst_data, \ + mtd_2d_data, mtd_3d_single_data, mtd_3d_pair_data + + except (RuntimeError, TypeError, NameError, KeyError, AttributeError): + self.logger.error( + "*** %s occurred in write_file_sql function ***", sys.exc_info()[0]) + sys.exit("*** Error in write_file_sql function") def write_metadata_sql(self, load_flags, data_files, group, description, load_note, xml_str, tmp_dir, sql_cur, local_infile): @@ -195,49 +220,58 @@ def write_metadata_sql(self, load_flags, data_files, group, description, N/A """ - self.logger.debug("[--- Start write_metadata_sql ---]") + try: - write_time_start = time.perf_counter() + self.logger.debug("[--- Start write_metadata_sql ---]") - try: + write_time_start = time.perf_counter() + + try: - # -------------------- - # Write Metadata - group and description - # -------------------- - - # insert or update the group and description fields in the metadata table - if group != CN.DEFAULT_DATABASE_GROUP: - sql_cur.execute(CN.Q_METADATA) - result = sql_cur.fetchone() - - # If you find a match, update the category and description - if sql_cur.rowcount > 0: - if group != result[0] or description != result[1]: - sql_cur.execute(CN.UPD_METADATA, [group, description]) - # otherwise, insert the category and description - else: - new_metadata = pd.DataFrame([[group, description]], - columns=['category', 'description']) - self.sql_met.write_to_sql(new_metadata, ['category', 'description'], 'metadata', - CN.INS_METADATA, tmp_dir, sql_cur, local_infile, self.logger) - - # -------------------- - # Write Instance Info - # -------------------- - - if load_flags['load_xml'] and not data_files.empty: - update_date = data_files[CN.LOAD_DATE].iloc[0] - next_instance_id = self.sql_met.get_next_id(CN.INSTANCE_INFO, CN.INSTANCE_INFO_ID, - sql_cur, self.logger) - sql_cur.execute(CN.INS_INSTANCE, [next_instance_id, getpass.getuser(), update_date, - load_note, xml_str]) - - except (RuntimeError, TypeError, NameError, KeyError): - self.logger.error("*** %s in write_metadata_sql ***", sys.exc_info()[0]) - - write_time_end = time.perf_counter() - write_time = timedelta(seconds=write_time_end - write_time_start) - - self.logger.info(" >>> Write time Metadata: %s", str(write_time)) - - self.logger.debug("[--- End write_metadata_sql ---]") + # -------------------- + # Write Metadata - group and description + # -------------------- + + # insert or update the group and description fields in the 
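# A standalone sketch (illustration only, not part of the patch) of the
# insert-or-update pattern used for the metadata group/description below, shown
# against an in-memory SQLite table; the real code targets the MySQL metadata
# table through CN.Q_METADATA / CN.UPD_METADATA / CN.INS_METADATA, and the table
# layout assumed here is only a guess for demonstration.
import sqlite3

conn = sqlite3.connect(":memory:")
cur = conn.cursor()
cur.execute("CREATE TABLE metadata (category TEXT, description TEXT)")

def upsert_metadata(cur, group, description):
    # look for an existing metadata row
    cur.execute("SELECT category, description FROM metadata")
    row = cur.fetchone()
    if row is not None:
        # update only if the category or description actually changed
        if (group, description) != row:
            cur.execute("UPDATE metadata SET category = ?, description = ?",
                        (group, description))
    else:
        cur.execute("INSERT INTO metadata (category, description) VALUES (?, ?)",
                    (group, description))

upsert_metadata(cur, "vxt", "Verification test databases")
upsert_metadata(cur, "vxt", "Verification test databases (updated)")
print(cur.execute("SELECT * FROM metadata").fetchall())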
metadata table + if group != CN.DEFAULT_DATABASE_GROUP: + sql_cur.execute(CN.Q_METADATA) + result = sql_cur.fetchone() + + # If you find a match, update the category and description + if sql_cur.rowcount > 0: + if group != result[0] or description != result[1]: + sql_cur.execute(CN.UPD_METADATA, [group, description]) + # otherwise, insert the category and description + else: + new_metadata = pd.DataFrame([[group, description]], + columns=['category', 'description']) + self.sql_met.write_to_sql(new_metadata, ['category', 'description'], 'metadata', + CN.INS_METADATA, tmp_dir, sql_cur, local_infile, self.logger) + + # -------------------- + # Write Instance Info + # -------------------- + + if load_flags['load_xml'] and not data_files.empty: + update_date = data_files[CN.LOAD_DATE].iloc[0] + next_instance_id = self.sql_met.get_next_id(CN.INSTANCE_INFO, CN.INSTANCE_INFO_ID, + sql_cur, self.logger) + sql_cur.execute(CN.INS_INSTANCE, [next_instance_id, getpass.getuser(), update_date, + load_note, xml_str]) + + except (RuntimeError, TypeError, NameError, KeyError): + self.logger.error( + "*** %s in write_metadata_sql ***", sys.exc_info()[0]) + sys.exit("*** Error writing metadata_SQL") + + write_time_end = time.perf_counter() + write_time = timedelta(seconds=write_time_end - write_time_start) + + self.logger.info(" >>> Write time Metadata: %s", str(write_time)) + + self.logger.debug("[--- End write_metadata_sql ---]") + + except (RuntimeError, TypeError, NameError, KeyError, AttributeError): + self.logger.error( + "*** %s occurred in write_file_sql function ***", sys.exc_info()[0]) + sys.exit("*** Error in write_file_sql function") diff --git a/METdbLoad/ush/write_mode_sql.py b/METdbLoad/ush/write_mode_sql.py index 994911bc..f4aab30d 100644 --- a/METdbLoad/ush/write_mode_sql.py +++ b/METdbLoad/ush/write_mode_sql.py @@ -37,205 +37,236 @@ def write_mode_data(load_flags, cts_data, obj_data, tmp_dir, sql_cur, local_infi N/A """ - logger.debug("[--- Start write_mode_sql ---]") - - write_time_start = time.perf_counter() - try: - all_pair = pd.DataFrame() - - sql_met = RunSql() - - # -------------------- - # Write Mode Headers - # -------------------- - - # get the unique mode headers from cts_data and obj_data - if not cts_data.empty: - mode_headers = cts_data[CN.MODE_HEADER_FIELDS[1:]] - if not obj_data.empty: - mode_headers = pd.concat([mode_headers, obj_data[CN.MODE_HEADER_FIELDS[1:]]], - ignore_index=True, sort=False) - # restore to original order now that cts and obj are recombined - mode_headers = mode_headers.sort_values(by=[CN.DATA_FILE_ID, CN.LINENUMBER]) - # get unique values, keeping the first of the duplicate records - mode_headers.drop_duplicates(CN.MODE_HEADER_KEYS, keep='first', inplace=True) - mode_headers.reset_index(drop=True, inplace=True) - - # At first, we do not know if the headers already exist, so we have no keys - mode_headers[CN.MODE_HEADER_ID] = CN.NO_KEY - - # get the next valid mode header id. 
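# A standalone sketch (illustration only, not part of the patch) of how header rows
# are keyed when the header-db-check flag is off: every row still carrying the NO_KEY
# sentinel is numbered by adding the dataframe index to the next free id, and the
# "new" headers are then just the rows at or above that id.  Values are made up and
# NO_KEY stands in for CN.NO_KEY.
import pandas as pd

NO_KEY = -9999
mode_headers = pd.DataFrame({
    "model": ["GFS", "GFS", "HRRR"],
    "fcst_var": ["APCP_03", "REFC", "REFC"],
    "mode_header_id": [NO_KEY, NO_KEY, NO_KEY],
})
next_header_id = 100    # e.g. what RunSql.get_next_id() would report

mode_headers.loc[mode_headers.mode_header_id == NO_KEY, "mode_header_id"] = \
    mode_headers.index + next_header_id
new_headers = mode_headers[mode_headers.mode_header_id > (next_header_id - 1)]
print(new_headers)      # ids 100, 101, 102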
Set it to zero (first valid id) if no records yet - next_header_id = sql_met.get_next_id(CN.MODE_HEADER, CN.MODE_HEADER_ID, sql_cur, logger) - - # if the flag is set to check for duplicate headers, get ids from existing headers - if load_flags["mode_header_db_check"]: - - # For each header, query with unique fields to try to find a match in the database - for row_num, data_line in mode_headers.iterrows(): - data_line[CN.FCST_VALID] = \ - data_line[CN.FCST_VALID].strftime("%Y-%m-%d %H:%M:%S") - data_line[CN.FCST_INIT] = data_line[CN.FCST_INIT].strftime("%Y-%m-%d %H:%M:%S") - data_line[CN.OBS_VALID] = data_line[CN.OBS_VALID].strftime("%Y-%m-%d %H:%M:%S") - # when n_valid and grid_res are null, query needs 'is null' - if data_line[CN.N_VALID] == CN.MV_NULL and data_line[CN.GRID_RES] == CN.MV_NULL: - sql_cur.execute(CN.QN_MHEADER, - [data_line[CN.VERSION], - data_line[CN.MODEL]] + data_line.values[7:-1].tolist()) - else: - sql_cur.execute(CN.Q_MHEADER, data_line.values[3:-1].tolist()) - result = sql_cur.fetchone() - - # If you find a match, put the key into the mode_headers dataframe - if sql_cur.rowcount > 0: - mode_headers.loc[mode_headers.index[row_num], CN.MODE_HEADER_ID] = result[0] - # otherwise create the next id and put it in - else: - mode_headers.loc[mode_headers.index[row_num], CN.MODE_HEADER_ID] = \ - row_num + next_header_id - else: - # When all new headers, add the next id to the row number/index to make a new key - mode_headers.loc[mode_headers.mode_header_id == CN.NO_KEY, CN.MODE_HEADER_ID] = \ - mode_headers.index + next_header_id - - # get just the new headers with their keys - new_headers = mode_headers[mode_headers[CN.MODE_HEADER_ID] > (next_header_id - 1)] - logger.info("New mode headers: %s rows", str(len(new_headers.index))) - - # Write any new headers out to the sql database - if not new_headers.empty: - sql_met.write_to_sql(new_headers, CN.MODE_HEADER_FIELDS, CN.MODE_HEADER, - CN.INS_MHEADER, tmp_dir, sql_cur, local_infile, logger) - new_headers = new_headers.iloc[0:0] - - # -------------------- - # Write Line Data - # -------------------- - - # write the lines out to a CSV file, and then load them into database - - if not cts_data.empty: - # put the header ids back into the dataframes - cts_data = pd.merge(left=mode_headers, right=cts_data, on=CN.MODE_HEADER_KEYS) - - sql_met.write_to_sql(cts_data, CN.MODE_CTS_FIELDS, CN.MODE_CTS_T, - CN.INS_CHEADER, tmp_dir, sql_cur, local_infile, logger) - cts_data = cts_data.iloc[0:0] - - if not obj_data.empty: - # MET has a different column name than METviewer - obj_data = obj_data.rename(columns={'axis_ang': 'axis_avg'}) - # put the header ids back into the dataframes - obj_data = pd.merge(left=mode_headers, right=obj_data, on=CN.MODE_HEADER_KEYS) - mode_headers = mode_headers.iloc[0:0] - - # intensity values can be NA, which causes MySQL warning - # replace is done to achieve desired MySQL output of NULL - obj_data.replace({'intensity_10': CN.NOTAV, 'intensity_25': CN.NOTAV, - 'intensity_50': CN.NOTAV, 'intensity_75': CN.NOTAV, - 'intensity_90': CN.NOTAV, 'intensity_nn': CN.NOTAV}, - CN.MV_NULL, inplace=True) - - # pairs have an underscore in the object id - singles do not - all_pair = obj_data[obj_data[CN.OBJECT_ID].str.contains(CN.U_SCORE)].copy() - obj_data.drop(obj_data[obj_data[CN.OBJECT_ID].str.contains(CN.U_SCORE)].index, - inplace=True) - - # reset the index so mode_obj_ids are set correctly - obj_data.reset_index(drop=True, inplace=True) - - # get next valid mode object id. 
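# A standalone sketch (illustration only, not part of the patch) of how the
# simple/fcst/matched flags are derived from MODE object ids and categories below:
# a leading 'C' marks a cluster (not simple), an 'F' anywhere marks a forecast
# object, and a category containing neither an underscore nor '000' counts as
# matched.  The ids are made up, and '_' / '000' stand in for CN.U_SCORE / CN.T_ZERO.
import pandas as pd

obj = pd.DataFrame({
    "object_id":  ["F001", "O001", "CF001"],
    "object_cat": ["CF001", "CO000", "CF001"],
})
obj["simple_flag"] = 1
obj["fcst_flag"] = 0
obj["matched_flag"] = 0

obj.loc[obj.object_id.str.startswith('C'), "simple_flag"] = 0
obj.loc[obj.object_id.str.contains('F'), "fcst_flag"] = 1
obj.loc[~obj.object_cat.str.contains('_') &
        ~obj.object_cat.str.contains('000'), "matched_flag"] = 1
print(obj)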
Set it to zero (first valid id) if no records yet - next_line_id = sql_met.get_next_id(CN.MODE_SINGLE_T, CN.MODE_OBJ_ID, sql_cur, logger) - - # create the mode_obj_ids using the dataframe index and next valid id - obj_data[CN.MODE_OBJ_ID] = obj_data.index + next_line_id - - # create defaults for flags - obj_data[CN.SIMPLE_FLAG] = 1 - obj_data[CN.FCST_FLAG] = 0 - obj_data[CN.MATCHED_FLAG] = 0 - - # Set simple flag to zero if object id starts with C - if obj_data.object_id.str.startswith('C').any(): - obj_data.loc[obj_data.object_id.str.startswith('C'), - CN.SIMPLE_FLAG] = 0 - - # Set fcst flag to 1 if object id contains an F - if obj_data.object_id.str.contains('F').any(): - obj_data.loc[obj_data.object_id.str.contains('F'), - CN.FCST_FLAG] = 1 - - # Set matched flag to 1 if object cat has neither underscore nor 000 - if (~obj_data.object_cat.str.contains(CN.U_SCORE)).sum() > 0: - if (~obj_data.object_cat.str.contains(CN.T_ZERO)).sum() > 0: - obj_data.loc[~obj_data.object_cat.str.contains(CN.U_SCORE) & - ~obj_data.object_cat.str.contains(CN.T_ZERO), - CN.MATCHED_FLAG] = 1 - - # write out the mode single objects - sql_met.write_to_sql(obj_data, CN.MODE_SINGLE_FIELDS, CN.MODE_SINGLE_T, - CN.INS_SHEADER, tmp_dir, sql_cur, local_infile, logger) - - if not all_pair.empty: - - all_pair.reset_index(drop=True, inplace=True) - - # split out the paired object ids for processing - all_pair[[CN.F_OBJECT_ID, CN.O_OBJECT_ID]] = \ - all_pair[CN.OBJECT_ID].str.split(CN.U_SCORE, expand=True) - - # split out the paired cats for processing - all_pair[[CN.F_OBJECT_CAT, CN.O_OBJECT_CAT]] = \ - all_pair[CN.OBJECT_CAT].str.split(CN.U_SCORE, expand=True) - - # get only the single object columns needed to find mode object ids - obj_data = obj_data[[CN.MODE_HEADER_ID, CN.OBJECT_ID, CN.MODE_OBJ_ID]] - # rename the object id column to match forecasts - obj_data.columns = [CN.MODE_HEADER_ID, CN.F_OBJECT_ID, CN.MODE_OBJ_ID] - - # get mode objects ids for forecasts - all_pair = pd.merge(left=all_pair, right=obj_data, - on=[CN.MODE_HEADER_ID, CN.F_OBJECT_ID]) - all_pair.rename(columns={CN.MODE_OBJ_ID: CN.MODE_OBJ_FCST_ID}, inplace=True) - - # rename the object id column to match observations - obj_data.rename(columns={CN.F_OBJECT_ID: CN.O_OBJECT_ID}, inplace=True) - - # get mode objects ids for observations - all_pair = pd.merge(left=all_pair, right=obj_data, - on=[CN.MODE_HEADER_ID, CN.O_OBJECT_ID]) - all_pair.rename(columns={CN.MODE_OBJ_ID: CN.MODE_OBJ_OBS_ID}, inplace=True) - - obj_data = obj_data.iloc[0:0] - - all_pair[CN.SIMPLE_FLAG] = 1 - # Set simple flag to zero if object id starts with C - if all_pair.f_object_id.str.startswith('C').any() and \ - all_pair.o_object_id.str.startswith('C').any(): - all_pair.loc[all_pair.f_object_id.str.startswith('C') & - all_pair.o_object_id.str.startswith('C'), - CN.SIMPLE_FLAG] = 0 - - all_pair[CN.MATCHED_FLAG] = 0 - if (~all_pair.f_object_cat.str.contains(CN.T_ZERO)).sum() > 0: - if (all_pair.f_object_cat.str[2:] == all_pair.o_object_cat.str[2:]).any(): - all_pair.loc[~all_pair.f_object_cat.str.contains(CN.T_ZERO) & - (all_pair.f_object_cat.str[2:] == - all_pair.o_object_cat.str[2:]), - CN.MATCHED_FLAG] = 1 - - # write out the mode pair objects - sql_met.write_to_sql(all_pair, CN.MODE_PAIR_FIELDS, CN.MODE_PAIR_T, - CN.INS_PHEADER, tmp_dir, sql_cur, local_infile, logger) - all_pair = all_pair.iloc[0:0] - - except (RuntimeError, TypeError, NameError, KeyError): - logger.error("*** %s in write_mode_sql ***", sys.exc_info()[0]) - - write_time_end = time.perf_counter() - write_time = 
timedelta(seconds=write_time_end - write_time_start) - - logger.info(" >>> Write time Mode: %s", str(write_time)) - - logger.debug("[--- End write_mode_sql ---]") + logger.debug("[--- Start write_mode_sql ---]") + + write_time_start = time.perf_counter() + + try: + + all_pair = pd.DataFrame() + + sql_met = RunSql() + + # -------------------- + # Write Mode Headers + # -------------------- + + # get the unique mode headers from cts_data and obj_data + if not cts_data.empty: + mode_headers = cts_data[CN.MODE_HEADER_FIELDS[1:]] + if not obj_data.empty: + mode_headers = pd.concat([mode_headers, obj_data[CN.MODE_HEADER_FIELDS[1:]]], + ignore_index=True, sort=False) + # restore to original order now that cts and obj are recombined + mode_headers = mode_headers.sort_values( + by=[CN.DATA_FILE_ID, CN.LINENUMBER]) + # get unique values, keeping the first of the duplicate records + mode_headers.drop_duplicates( + CN.MODE_HEADER_KEYS, keep='first', inplace=True) + mode_headers.reset_index(drop=True, inplace=True) + + # At first, we do not know if the headers already exist, so we have no keys + mode_headers[CN.MODE_HEADER_ID] = CN.NO_KEY + + # get the next valid mode header id. Set it to zero (first valid id) if no records yet + next_header_id = sql_met.get_next_id( + CN.MODE_HEADER, CN.MODE_HEADER_ID, sql_cur, logger) + + # if the flag is set to check for duplicate headers, get ids from existing headers + if load_flags["mode_header_db_check"]: + + # For each header, query with unique fields to try to find a match in the database + for row_num, data_line in mode_headers.iterrows(): + data_line[CN.FCST_VALID] = \ + data_line[CN.FCST_VALID].strftime( + "%Y-%m-%d %H:%M:%S") + data_line[CN.FCST_INIT] = data_line[CN.FCST_INIT].strftime( + "%Y-%m-%d %H:%M:%S") + data_line[CN.OBS_VALID] = data_line[CN.OBS_VALID].strftime( + "%Y-%m-%d %H:%M:%S") + # when n_valid and grid_res are null, query needs 'is null' + if data_line[CN.N_VALID] == CN.MV_NULL and data_line[CN.GRID_RES] == CN.MV_NULL: + sql_cur.execute(CN.QN_MHEADER, + [data_line[CN.VERSION], + data_line[CN.MODEL]] + data_line.values[7:-1].tolist()) + else: + sql_cur.execute( + CN.Q_MHEADER, data_line.values[3:-1].tolist()) + result = sql_cur.fetchone() + + # If you find a match, put the key into the mode_headers dataframe + if sql_cur.rowcount > 0: + mode_headers.loc[mode_headers.index[row_num], + CN.MODE_HEADER_ID] = result[0] + # otherwise create the next id and put it in + else: + mode_headers.loc[mode_headers.index[row_num], CN.MODE_HEADER_ID] = \ + row_num + next_header_id + else: + # When all new headers, add the next id to the row number/index to make a new key + mode_headers.loc[mode_headers.mode_header_id == CN.NO_KEY, CN.MODE_HEADER_ID] = \ + mode_headers.index + next_header_id + + # get just the new headers with their keys + new_headers = mode_headers[mode_headers[CN.MODE_HEADER_ID] > ( + next_header_id - 1)] + logger.info("New mode headers: %s rows", + str(len(new_headers.index))) + + # Write any new headers out to the sql database + if not new_headers.empty: + sql_met.write_to_sql(new_headers, CN.MODE_HEADER_FIELDS, CN.MODE_HEADER, + CN.INS_MHEADER, tmp_dir, sql_cur, local_infile, logger) + new_headers = new_headers.iloc[0:0] + + # -------------------- + # Write Line Data + # -------------------- + + # write the lines out to a CSV file, and then load them into database + + if not cts_data.empty: + # put the header ids back into the dataframes + cts_data = pd.merge(left=mode_headers, + right=cts_data, on=CN.MODE_HEADER_KEYS) + + 
sql_met.write_to_sql(cts_data, CN.MODE_CTS_FIELDS, CN.MODE_CTS_T, + CN.INS_CHEADER, tmp_dir, sql_cur, local_infile, logger) + cts_data = cts_data.iloc[0:0] + + if not obj_data.empty: + # MET has a different column name than METviewer + obj_data = obj_data.rename( + columns={'axis_ang': 'axis_avg'}) + # put the header ids back into the dataframes + obj_data = pd.merge(left=mode_headers, + right=obj_data, on=CN.MODE_HEADER_KEYS) + mode_headers = mode_headers.iloc[0:0] + + # intensity values can be NA, which causes MySQL warning + # replace is done to achieve desired MySQL output of NULL + obj_data.replace({'intensity_10': CN.NOTAV, 'intensity_25': CN.NOTAV, + 'intensity_50': CN.NOTAV, 'intensity_75': CN.NOTAV, + 'intensity_90': CN.NOTAV, 'intensity_nn': CN.NOTAV}, + CN.MV_NULL, inplace=True) + + # pairs have an underscore in the object id - singles do not + all_pair = obj_data[obj_data[CN.OBJECT_ID].str.contains( + CN.U_SCORE)].copy() + obj_data.drop(obj_data[obj_data[CN.OBJECT_ID].str.contains(CN.U_SCORE)].index, + inplace=True) + + # reset the index so mode_obj_ids are set correctly + obj_data.reset_index(drop=True, inplace=True) + + # get next valid mode object id. Set it to zero (first valid id) if no records yet + next_line_id = sql_met.get_next_id( + CN.MODE_SINGLE_T, CN.MODE_OBJ_ID, sql_cur, logger) + + # create the mode_obj_ids using the dataframe index and next valid id + obj_data[CN.MODE_OBJ_ID] = obj_data.index + next_line_id + + # create defaults for flags + obj_data[CN.SIMPLE_FLAG] = 1 + obj_data[CN.FCST_FLAG] = 0 + obj_data[CN.MATCHED_FLAG] = 0 + + # Set simple flag to zero if object id starts with C + if obj_data.object_id.str.startswith('C').any(): + obj_data.loc[obj_data.object_id.str.startswith('C'), + CN.SIMPLE_FLAG] = 0 + + # Set fcst flag to 1 if object id contains an F + if obj_data.object_id.str.contains('F').any(): + obj_data.loc[obj_data.object_id.str.contains('F'), + CN.FCST_FLAG] = 1 + + # Set matched flag to 1 if object cat has neither underscore nor 000 + if (~obj_data.object_cat.str.contains(CN.U_SCORE)).sum() > 0: + if (~obj_data.object_cat.str.contains(CN.T_ZERO)).sum() > 0: + obj_data.loc[~obj_data.object_cat.str.contains(CN.U_SCORE) & + ~obj_data.object_cat.str.contains( + CN.T_ZERO), + CN.MATCHED_FLAG] = 1 + + # write out the mode single objects + sql_met.write_to_sql(obj_data, CN.MODE_SINGLE_FIELDS, CN.MODE_SINGLE_T, + CN.INS_SHEADER, tmp_dir, sql_cur, local_infile, logger) + + if not all_pair.empty: + + all_pair.reset_index(drop=True, inplace=True) + + # split out the paired object ids for processing + all_pair[[CN.F_OBJECT_ID, CN.O_OBJECT_ID]] = \ + all_pair[CN.OBJECT_ID].str.split( + CN.U_SCORE, expand=True) + + # split out the paired cats for processing + all_pair[[CN.F_OBJECT_CAT, CN.O_OBJECT_CAT]] = \ + all_pair[CN.OBJECT_CAT].str.split( + CN.U_SCORE, expand=True) + + # get only the single object columns needed to find mode object ids + obj_data = obj_data[[CN.MODE_HEADER_ID, + CN.OBJECT_ID, CN.MODE_OBJ_ID]] + # rename the object id column to match forecasts + obj_data.columns = [CN.MODE_HEADER_ID, + CN.F_OBJECT_ID, CN.MODE_OBJ_ID] + + # get mode objects ids for forecasts + all_pair = pd.merge(left=all_pair, right=obj_data, + on=[CN.MODE_HEADER_ID, CN.F_OBJECT_ID]) + all_pair.rename( + columns={CN.MODE_OBJ_ID: CN.MODE_OBJ_FCST_ID}, inplace=True) + + # rename the object id column to match observations + obj_data.rename( + columns={CN.F_OBJECT_ID: CN.O_OBJECT_ID}, inplace=True) + + # get mode objects ids for observations + all_pair = 
pd.merge(left=all_pair, right=obj_data, + on=[CN.MODE_HEADER_ID, CN.O_OBJECT_ID]) + all_pair.rename( + columns={CN.MODE_OBJ_ID: CN.MODE_OBJ_OBS_ID}, inplace=True) + + obj_data = obj_data.iloc[0:0] + + all_pair[CN.SIMPLE_FLAG] = 1 + # Set simple flag to zero if object id starts with C + if all_pair.f_object_id.str.startswith('C').any() and \ + all_pair.o_object_id.str.startswith('C').any(): + all_pair.loc[all_pair.f_object_id.str.startswith('C') & + all_pair.o_object_id.str.startswith('C'), + CN.SIMPLE_FLAG] = 0 + + all_pair[CN.MATCHED_FLAG] = 0 + if (~all_pair.f_object_cat.str.contains(CN.T_ZERO)).sum() > 0: + if (all_pair.f_object_cat.str[2:] == all_pair.o_object_cat.str[2:]).any(): + all_pair.loc[~all_pair.f_object_cat.str.contains(CN.T_ZERO) & + (all_pair.f_object_cat.str[2:] == + all_pair.o_object_cat.str[2:]), + CN.MATCHED_FLAG] = 1 + + # write out the mode pair objects + sql_met.write_to_sql(all_pair, CN.MODE_PAIR_FIELDS, CN.MODE_PAIR_T, + CN.INS_PHEADER, tmp_dir, sql_cur, local_infile, logger) + all_pair = all_pair.iloc[0:0] + + except (RuntimeError, TypeError, NameError, KeyError): + logger.error("*** %s in write_mode_sql ***", sys.exc_info()[0]) + sys.exit("*** Error writing MODE SQL") + + write_time_end = time.perf_counter() + write_time = timedelta(seconds=write_time_end - write_time_start) + + logger.info(" >>> Write time Mode: %s", str(write_time)) + + logger.debug("[--- End write_mode_sql ---]") + + except (RuntimeError, TypeError, NameError, KeyError, AttributeError): + self.logger.error( + "*** %s occurred in write_mode_data function ***", sys.exc_info()[0]) + sys.exit("*** Error in write_mode_data function") diff --git a/METdbLoad/ush/write_mtd_sql.py b/METdbLoad/ush/write_mtd_sql.py index a7839d99..5c08d671 100644 --- a/METdbLoad/ush/write_mtd_sql.py +++ b/METdbLoad/ush/write_mtd_sql.py @@ -38,225 +38,259 @@ def write_mtd_data(load_flags, m_2d_data, m_3d_single_data, m_3d_pair_data, N/A """ - logger.debug("[--- Start write_mtd_sql ---]") - - write_time_start = time.perf_counter() - - try: - - sql_met = RunSql() - - mtd_headers = pd.DataFrame() - new_headers = pd.DataFrame() - - # -------------------- - # Write MTD Headers - # -------------------- - - # get the unique MTD headers - if not m_2d_data.empty: - mtd_headers = m_2d_data[CN.MTD_HEADER_FIELDS[1:]] - if not m_3d_single_data.empty: - mtd_headers = pd.concat([mtd_headers, - m_3d_single_data[CN.MTD_HEADER_FIELDS[1:]]], - ignore_index=True, sort=False) - if not m_3d_pair_data.empty: - mtd_headers = pd.concat([mtd_headers, - m_3d_pair_data[CN.MTD_HEADER_FIELDS[1:]]], - ignore_index=True, sort=False) - - # get unique values, keeping the first of the duplicate records - mtd_headers = mtd_headers.drop_duplicates(CN.MTD_2D_HEADER_KEYS, keep='first') - mtd_headers.reset_index(drop=True, inplace=True) - - # make sure type of columns is consistent between headers and line data - mtd_headers.fcst_lead = mtd_headers.fcst_lead.astype('int64') - mtd_headers.obs_lead = mtd_headers.obs_lead.astype('int64') - - # At first, we do not know if the headers already exist, so we have no keys - mtd_headers[CN.MTD_HEADER_ID] = CN.NO_KEY - - # get the next valid MTD header id. 
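# A standalone sketch (illustration only, not part of the patch) of why the MTD
# lead-time and obs_valid columns are coerced before the header/line-data merges
# here: merge keys must have identical dtypes on both sides, so string leads become
# int64 and unparseable obs_valid values become NaT via errors='coerce'.  The column
# values are made up, and "-9999" stands in for the missing-value placeholder.
import pandas as pd

headers = pd.DataFrame({
    "fcst_lead": ["120000", "240000"],
    "obs_valid": ["2024-01-01 12:00:00", "-9999"],
})
lines = pd.DataFrame({"fcst_lead": [120000, 240000], "n_obj": [3, 5]})

headers["fcst_lead"] = headers["fcst_lead"].astype("int64")
headers["obs_valid"] = pd.to_datetime(headers["obs_valid"], errors="coerce")  # bad -> NaT

merged = pd.merge(left=headers, right=lines, on="fcst_lead")
print(merged)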
Set it to zero (first valid id) if no records yet - next_header_id = sql_met.get_next_id(CN.MTD_HEADER, CN.MTD_HEADER_ID, sql_cur, logger) - - # if the flag is set to check for duplicate headers, get ids from existing headers - if load_flags["mtd_header_db_check"]: - - # For each header, query with unique fields to try to find a match in the database - for row_num, data_line in mtd_headers.iterrows(): - data_line[CN.FCST_VALID] = \ - data_line[CN.FCST_VALID].strftime("%Y-%m-%d %H:%M:%S") - data_line[CN.FCST_INIT] = data_line[CN.FCST_INIT].strftime("%Y-%m-%d %H:%M:%S") - if data_line[CN.OBS_VALID] != CN.MV_NULL: - data_line[CN.OBS_VALID] = \ - data_line[CN.OBS_VALID].strftime("%Y-%m-%d %H:%M:%S") - if CN.MV_NULL not in data_line.values[4:-1].tolist(): - sql_cur.execute(CN.Q_MTDHEADER, data_line.values[4:-1].tolist()) - else: - sql_query = "SELECT mtd_header_id FROM mtd_header WHERE " + \ - "version=%s AND model=%s AND descr=%s AND fcst_lead=%s " + \ - "AND fcst_valid=%s AND fcst_init=%s AND obs_lead=%s " - data_values = data_line.values[4:11].tolist() - for mfield in CN.MTD_HEADER_KEYS[7:]: - if data_line[mfield] != CN.MV_NULL: - sql_query = sql_query + 'AND ' + mfield + '=%s ' - data_values.append(data_line[mfield]) - else: - sql_query = sql_query + 'AND ' + mfield + ' is NULL ' - sql_cur.execute(sql_query, data_values) - result = sql_cur.fetchone() - - # If you find a match, put the key into the mtd_headers dataframe - if sql_cur.rowcount > 0: - mtd_headers.loc[mtd_headers.index[row_num], CN.MTD_HEADER_ID] = result[0] - # otherwise create the next id and put it in - else: - mtd_headers.loc[mtd_headers.index[row_num], CN.MTD_HEADER_ID] = \ - row_num + next_header_id - else: - # When all new headers, add the next id to the row number/index to make a new key - mtd_headers.loc[mtd_headers.mtd_header_id == CN.NO_KEY, CN.MTD_HEADER_ID] = \ - mtd_headers.index + next_header_id - - # get just the new headers with their keys - new_headers = mtd_headers[mtd_headers[CN.MTD_HEADER_ID] > (next_header_id - 1)] - new_headers.obs_valid = pd.to_datetime(new_headers.obs_valid, errors='coerce') - logger.info("New MTD headers: %s rows", str(len(new_headers.index))) - - # Write any new headers out to the sql database - if not new_headers.empty: - # If there are any 2D revision files - if new_headers[CN.REVISION_ID].ne(CN.MV_NULL).any(): - # numbered revision ids must have max revision id added to be unique - next_rev_id = sql_met.get_next_id(CN.MTD_HEADER, CN.REVISION_ID, sql_cur, logger) - new_headers.loc[new_headers.revision_id != CN.MV_NULL, CN.REVISION_ID] = \ - new_headers.loc[new_headers.revision_id != CN.MV_NULL, CN.REVISION_ID] + \ - next_rev_id - new_headers.loc[new_headers.obs_valid.isnull(), CN.OBS_VALID] = CN.MV_NULL - sql_met.write_to_sql(new_headers, CN.MTD_HEADER_FIELDS, CN.MTD_HEADER, - CN.INS_MTDHEADER, tmp_dir, sql_cur, local_infile, logger) - new_headers = new_headers.iloc[0:0] - - mtd_headers.obs_valid = pd.to_datetime(mtd_headers.obs_valid, errors='coerce') - - except (RuntimeError, TypeError, NameError, KeyError): - logger.error("*** %s in write_mtd_sql write MTD headers ***", sys.exc_info()[0]) - try: - # -------------------- - # Write Line Data - # -------------------- - # write the lines out to a CSV file, and then load them into database - if not m_2d_data.empty: - # make sure type of columns is consistent between headers and line data - m_2d_data.obs_valid = pd.to_datetime(m_2d_data.obs_valid, - errors='coerce') - # put the header ids back into the dataframe - m_2d_data = 
pd.merge(left=mtd_headers, right=m_2d_data, on=CN.MTD_2D_HEADER_KEYS) - m_2d_data.loc[m_2d_data.obs_valid.isnull(), CN.OBS_VALID] = CN.MV_NULL - - # create defaults for flags - m_2d_data[CN.SIMPLE_FLAG] = 1 - m_2d_data[CN.FCST_FLAG] = 0 - m_2d_data[CN.MATCHED_FLAG] = 0 - - # Set simple flag to zero if object id starts with C - if m_2d_data.object_id.str.startswith('C').any(): - m_2d_data.loc[m_2d_data.object_id.str.startswith('C'), - CN.SIMPLE_FLAG] = 0 - - # Set fcst flag to 1 if object id contains an F - if m_2d_data.object_id.str.contains('F').any(): - m_2d_data.loc[m_2d_data.object_id.str.contains('F'), - CN.FCST_FLAG] = 1 - - # Set matched flag to 1 if object cat has neither underscore nor 000 - if ((~m_2d_data.object_cat.str.contains(CN.U_SCORE)).sum() > 0 and - (~m_2d_data.object_cat.str.contains(CN.T_ZERO)).sum() > 0): - m_2d_data.loc[~m_2d_data.object_cat.str.contains(CN.U_SCORE) & - ~m_2d_data.object_cat.str.contains(CN.T_ZERO), - CN.MATCHED_FLAG] = 1 - - sql_met.write_to_sql(m_2d_data, CN.MTD_2D_OBJ_FIELDS, CN.MTD_2D_T, - CN.INS_M2HEADER, tmp_dir, sql_cur, local_infile, logger) - m_2d_data = m_2d_data.iloc[0:0] - - if not m_3d_single_data.empty: - # make sure type of columns is consistent between headers and line data - m_3d_single_data.fcst_lead = m_3d_single_data.fcst_lead.astype('int64') - m_3d_single_data.obs_lead = m_3d_single_data.obs_lead.astype('int64') - m_3d_single_data.obs_valid = pd.to_datetime(m_3d_single_data.obs_valid, - errors='coerce') - - # put the header ids back into the dataframe - m_3d_single_data = pd.merge(left=mtd_headers, right=m_3d_single_data, - on=CN.MTD_HEADER_KEYS) - m_3d_single_data.loc[m_3d_single_data.obs_valid.isnull(), CN.OBS_VALID] = CN.MV_NULL - - # create defaults for flags - m_3d_single_data[CN.SIMPLE_FLAG] = 1 - m_3d_single_data[CN.FCST_FLAG] = 0 - m_3d_single_data[CN.MATCHED_FLAG] = 0 - - # Set simple flag to zero if object id starts with C - if m_3d_single_data.object_id.str.startswith('C').any(): - m_3d_single_data.loc[m_3d_single_data.object_id.str.startswith('C'), - CN.SIMPLE_FLAG] = 0 - - # Set fcst flag to 1 if object id contains an F - if m_3d_single_data.object_id.str.contains('F').any(): - m_3d_single_data.loc[m_3d_single_data.object_id.str.contains('F'), - CN.FCST_FLAG] = 1 - - # Set matched flag to 1 if object cat has neither underscore nor 000 - if (~m_3d_single_data.object_cat.str.contains(CN.U_SCORE)).sum() > 0: - if (~m_3d_single_data.object_cat.str.contains(CN.T_ZERO)).sum() > 0: - m_3d_single_data.loc[~m_3d_single_data.object_cat.str.contains(CN.U_SCORE) & - ~m_3d_single_data.object_cat.str.contains(CN.T_ZERO), - CN.MATCHED_FLAG] = 1 - - sql_met.write_to_sql(m_3d_single_data, CN.MTD_3D_OBJ_SINGLE_FIELDS, CN.MTD_SINGLE_T, - CN.INS_M3SHEADER, tmp_dir, sql_cur, local_infile, logger) - m_3d_single_data = m_3d_single_data.iloc[0:0] - - if not m_3d_pair_data.empty: - # make sure type of columns is consistent between headers and line data - m_3d_pair_data.fcst_lead = m_3d_pair_data.fcst_lead.astype('int64') - m_3d_pair_data.obs_lead = m_3d_pair_data.obs_lead.astype('int64') - m_3d_pair_data.obs_valid = pd.to_datetime(m_3d_pair_data.obs_valid, - errors='coerce') + logger.debug("[--- Start write_mtd_sql ---]") - # put the header ids back into the dataframe - m_3d_pair_data = pd.merge(left=mtd_headers, right=m_3d_pair_data, - on=CN.MTD_HEADER_KEYS) - m_3d_pair_data.loc[m_3d_pair_data.obs_valid.isnull(), CN.OBS_VALID] = CN.MV_NULL - mtd_headers = mtd_headers.iloc[0:0] + write_time_start = time.perf_counter() - # create defaults for 
flags - m_3d_pair_data[CN.SIMPLE_FLAG] = 1 - m_3d_pair_data[CN.MATCHED_FLAG] = 0 + try: - # Set simple flag to zero if object id starts with C - if m_3d_pair_data.object_id.str.startswith('C').any(): - m_3d_pair_data.loc[m_3d_pair_data.object_id.str.startswith('C'), - CN.SIMPLE_FLAG] = 0 + sql_met = RunSql() - # Set matched flag to 1 if object cat has no 000 - if (~m_3d_pair_data.object_cat.str.contains(CN.T_ZERO)).sum() > 0: - m_3d_pair_data.loc[~m_3d_pair_data.object_cat.str.contains(CN.T_ZERO), - CN.MATCHED_FLAG] = 1 + mtd_headers = pd.DataFrame() + new_headers = pd.DataFrame() - sql_met.write_to_sql(m_3d_pair_data, CN.MTD_3D_OBJ_PAIR_FIELDS, CN.MTD_PAIR_T, - CN.INS_M3PHEADER, tmp_dir, sql_cur, local_infile, logger) - m_3d_pair_data = m_3d_pair_data.iloc[0:0] + # -------------------- + # Write MTD Headers + # -------------------- - except (RuntimeError, TypeError, NameError, KeyError): - logger.error("*** %s in write_mtd_sql write line data ***", sys.exc_info()[0]) + # get the unique MTD headers + if not m_2d_data.empty: + mtd_headers = m_2d_data[CN.MTD_HEADER_FIELDS[1:]] + if not m_3d_single_data.empty: + mtd_headers = pd.concat([mtd_headers, + m_3d_single_data[CN.MTD_HEADER_FIELDS[1:]]], + ignore_index=True, sort=False) + if not m_3d_pair_data.empty: + mtd_headers = pd.concat([mtd_headers, + m_3d_pair_data[CN.MTD_HEADER_FIELDS[1:]]], + ignore_index=True, sort=False) - write_time_end = time.perf_counter() - write_time = timedelta(seconds=write_time_end - write_time_start) + # get unique values, keeping the first of the duplicate records + mtd_headers = mtd_headers.drop_duplicates( + CN.MTD_2D_HEADER_KEYS, keep='first') + mtd_headers.reset_index(drop=True, inplace=True) - logger.info(" >>> Write time MTD: %s", str(write_time)) - - logger.debug("[--- End write_mtd_sql ---]") + # make sure type of columns is consistent between headers and line data + mtd_headers.fcst_lead = mtd_headers.fcst_lead.astype('int64') + mtd_headers.obs_lead = mtd_headers.obs_lead.astype('int64') + + # At first, we do not know if the headers already exist, so we have no keys + mtd_headers[CN.MTD_HEADER_ID] = CN.NO_KEY + + # get the next valid MTD header id. 
Set it to zero (first valid id) if no records yet + next_header_id = sql_met.get_next_id( + CN.MTD_HEADER, CN.MTD_HEADER_ID, sql_cur, logger) + + # if the flag is set to check for duplicate headers, get ids from existing headers + if load_flags["mtd_header_db_check"]: + + # For each header, query with unique fields to try to find a match in the database + for row_num, data_line in mtd_headers.iterrows(): + data_line[CN.FCST_VALID] = \ + data_line[CN.FCST_VALID].strftime( + "%Y-%m-%d %H:%M:%S") + data_line[CN.FCST_INIT] = data_line[CN.FCST_INIT].strftime( + "%Y-%m-%d %H:%M:%S") + if data_line[CN.OBS_VALID] != CN.MV_NULL: + data_line[CN.OBS_VALID] = \ + data_line[CN.OBS_VALID].strftime( + "%Y-%m-%d %H:%M:%S") + if CN.MV_NULL not in data_line.values[4:-1].tolist(): + sql_cur.execute( + CN.Q_MTDHEADER, data_line.values[4:-1].tolist()) + else: + sql_query = "SELECT mtd_header_id FROM mtd_header WHERE " + \ + "version=%s AND model=%s AND descr=%s AND fcst_lead=%s " + \ + "AND fcst_valid=%s AND fcst_init=%s AND obs_lead=%s " + data_values = data_line.values[4:11].tolist() + for mfield in CN.MTD_HEADER_KEYS[7:]: + if data_line[mfield] != CN.MV_NULL: + sql_query = sql_query + 'AND ' + mfield + '=%s ' + data_values.append(data_line[mfield]) + else: + sql_query = sql_query + 'AND ' + mfield + ' is NULL ' + sql_cur.execute(sql_query, data_values) + result = sql_cur.fetchone() + + # If you find a match, put the key into the mtd_headers dataframe + if sql_cur.rowcount > 0: + mtd_headers.loc[mtd_headers.index[row_num], + CN.MTD_HEADER_ID] = result[0] + # otherwise create the next id and put it in + else: + mtd_headers.loc[mtd_headers.index[row_num], CN.MTD_HEADER_ID] = \ + row_num + next_header_id + else: + # When all new headers, add the next id to the row number/index to make a new key + mtd_headers.loc[mtd_headers.mtd_header_id == CN.NO_KEY, CN.MTD_HEADER_ID] = \ + mtd_headers.index + next_header_id + + # get just the new headers with their keys + new_headers = mtd_headers[mtd_headers[CN.MTD_HEADER_ID] > ( + next_header_id - 1)] + new_headers.obs_valid = pd.to_datetime( + new_headers.obs_valid, errors='coerce') + logger.info("New MTD headers: %s rows", + str(len(new_headers.index))) + + # Write any new headers out to the sql database + if not new_headers.empty: + # If there are any 2D revision files + if new_headers[CN.REVISION_ID].ne(CN.MV_NULL).any(): + # numbered revision ids must have max revision id added to be unique + next_rev_id = sql_met.get_next_id( + CN.MTD_HEADER, CN.REVISION_ID, sql_cur, logger) + new_headers.loc[new_headers.revision_id != CN.MV_NULL, CN.REVISION_ID] = \ + new_headers.loc[new_headers.revision_id != CN.MV_NULL, CN.REVISION_ID] + \ + next_rev_id + new_headers.loc[new_headers.obs_valid.isnull( + ), CN.OBS_VALID] = CN.MV_NULL + sql_met.write_to_sql(new_headers, CN.MTD_HEADER_FIELDS, CN.MTD_HEADER, + CN.INS_MTDHEADER, tmp_dir, sql_cur, local_infile, logger) + new_headers = new_headers.iloc[0:0] + + mtd_headers.obs_valid = pd.to_datetime( + mtd_headers.obs_valid, errors='coerce') + + except (RuntimeError, TypeError, NameError, KeyError): + logger.error( + "*** %s in write_mtd_sql write MTD headers ***", sys.exc_info()[0]) + sys.exit("*** Error writing MTD SQL headers") + + try: + # -------------------- + # Write Line Data + # -------------------- + + # write the lines out to a CSV file, and then load them into database + if not m_2d_data.empty: + # make sure type of columns is consistent between headers and line data + m_2d_data.obs_valid = pd.to_datetime(m_2d_data.obs_valid, + 
errors='coerce') + # put the header ids back into the dataframe + m_2d_data = pd.merge( + left=mtd_headers, right=m_2d_data, on=CN.MTD_2D_HEADER_KEYS) + m_2d_data.loc[m_2d_data.obs_valid.isnull(), + CN.OBS_VALID] = CN.MV_NULL + + # create defaults for flags + m_2d_data[CN.SIMPLE_FLAG] = 1 + m_2d_data[CN.FCST_FLAG] = 0 + m_2d_data[CN.MATCHED_FLAG] = 0 + + # Set simple flag to zero if object id starts with C + if m_2d_data.object_id.str.startswith('C').any(): + m_2d_data.loc[m_2d_data.object_id.str.startswith('C'), + CN.SIMPLE_FLAG] = 0 + + # Set fcst flag to 1 if object id contains an F + if m_2d_data.object_id.str.contains('F').any(): + m_2d_data.loc[m_2d_data.object_id.str.contains('F'), + CN.FCST_FLAG] = 1 + + # Set matched flag to 1 if object cat has neither underscore nor 000 + if ((~m_2d_data.object_cat.str.contains(CN.U_SCORE)).sum() > 0 and + (~m_2d_data.object_cat.str.contains(CN.T_ZERO)).sum() > 0): + m_2d_data.loc[~m_2d_data.object_cat.str.contains(CN.U_SCORE) & + ~m_2d_data.object_cat.str.contains( + CN.T_ZERO), + CN.MATCHED_FLAG] = 1 + + sql_met.write_to_sql(m_2d_data, CN.MTD_2D_OBJ_FIELDS, CN.MTD_2D_T, + CN.INS_M2HEADER, tmp_dir, sql_cur, local_infile, logger) + m_2d_data = m_2d_data.iloc[0:0] + + if not m_3d_single_data.empty: + # make sure type of columns is consistent between headers and line data + m_3d_single_data.fcst_lead = m_3d_single_data.fcst_lead.astype( + 'int64') + m_3d_single_data.obs_lead = m_3d_single_data.obs_lead.astype( + 'int64') + m_3d_single_data.obs_valid = pd.to_datetime(m_3d_single_data.obs_valid, + errors='coerce') + + # put the header ids back into the dataframe + m_3d_single_data = pd.merge(left=mtd_headers, right=m_3d_single_data, + on=CN.MTD_HEADER_KEYS) + m_3d_single_data.loc[m_3d_single_data.obs_valid.isnull( + ), CN.OBS_VALID] = CN.MV_NULL + + # create defaults for flags + m_3d_single_data[CN.SIMPLE_FLAG] = 1 + m_3d_single_data[CN.FCST_FLAG] = 0 + m_3d_single_data[CN.MATCHED_FLAG] = 0 + + # Set simple flag to zero if object id starts with C + if m_3d_single_data.object_id.str.startswith('C').any(): + m_3d_single_data.loc[m_3d_single_data.object_id.str.startswith('C'), + CN.SIMPLE_FLAG] = 0 + + # Set fcst flag to 1 if object id contains an F + if m_3d_single_data.object_id.str.contains('F').any(): + m_3d_single_data.loc[m_3d_single_data.object_id.str.contains('F'), + CN.FCST_FLAG] = 1 + + # Set matched flag to 1 if object cat has neither underscore nor 000 + if (~m_3d_single_data.object_cat.str.contains(CN.U_SCORE)).sum() > 0: + if (~m_3d_single_data.object_cat.str.contains(CN.T_ZERO)).sum() > 0: + m_3d_single_data.loc[~m_3d_single_data.object_cat.str.contains(CN.U_SCORE) & + ~m_3d_single_data.object_cat.str.contains( + CN.T_ZERO), + CN.MATCHED_FLAG] = 1 + + sql_met.write_to_sql(m_3d_single_data, CN.MTD_3D_OBJ_SINGLE_FIELDS, CN.MTD_SINGLE_T, + CN.INS_M3SHEADER, tmp_dir, sql_cur, local_infile, logger) + m_3d_single_data = m_3d_single_data.iloc[0:0] + + if not m_3d_pair_data.empty: + # make sure type of columns is consistent between headers and line data + m_3d_pair_data.fcst_lead = m_3d_pair_data.fcst_lead.astype( + 'int64') + m_3d_pair_data.obs_lead = m_3d_pair_data.obs_lead.astype( + 'int64') + m_3d_pair_data.obs_valid = pd.to_datetime(m_3d_pair_data.obs_valid, + errors='coerce') + + # put the header ids back into the dataframe + m_3d_pair_data = pd.merge(left=mtd_headers, right=m_3d_pair_data, + on=CN.MTD_HEADER_KEYS) + m_3d_pair_data.loc[m_3d_pair_data.obs_valid.isnull( + ), CN.OBS_VALID] = CN.MV_NULL + mtd_headers = mtd_headers.iloc[0:0] + + 
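The MTD 2D and 3D object writers above derive the simple, forecast, and matched flags from the object_id and object_cat strings with vectorized pandas operations. A minimal standalone sketch of that pattern follows; the sample values and lowercase column names are illustrative, and the underscore and "000" literals stand in for the CN.U_SCORE and CN.T_ZERO constants:

    import pandas as pd

    # Hypothetical sample of object lines; real values come from MET MTD output.
    obj = pd.DataFrame({
        "object_id": ["F001", "O001", "CF001", "CO001"],
        "object_cat": ["CF001", "CO000", "CF002", "CO002"],
    })

    # Defaults, mirroring the loader's conventions.
    obj["simple_flag"] = 1
    obj["fcst_flag"] = 0
    obj["matched_flag"] = 0

    # Cluster objects (ids starting with 'C') are not simple.
    obj.loc[obj.object_id.str.startswith("C"), "simple_flag"] = 0

    # Forecast objects contain an 'F' in the object id.
    obj.loc[obj.object_id.str.contains("F"), "fcst_flag"] = 1

    # Matched objects have a category containing neither an underscore nor '000'.
    obj.loc[~obj.object_cat.str.contains("_") &
            ~obj.object_cat.str.contains("000"), "matched_flag"] = 1

    print(obj)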
# create defaults for flags + m_3d_pair_data[CN.SIMPLE_FLAG] = 1 + m_3d_pair_data[CN.MATCHED_FLAG] = 0 + + # Set simple flag to zero if object id starts with C + if m_3d_pair_data.object_id.str.startswith('C').any(): + m_3d_pair_data.loc[m_3d_pair_data.object_id.str.startswith('C'), + CN.SIMPLE_FLAG] = 0 + + # Set matched flag to 1 if object cat has no 000 + if (~m_3d_pair_data.object_cat.str.contains(CN.T_ZERO)).sum() > 0: + m_3d_pair_data.loc[~m_3d_pair_data.object_cat.str.contains(CN.T_ZERO), + CN.MATCHED_FLAG] = 1 + + sql_met.write_to_sql(m_3d_pair_data, CN.MTD_3D_OBJ_PAIR_FIELDS, CN.MTD_PAIR_T, + CN.INS_M3PHEADER, tmp_dir, sql_cur, local_infile, logger) + m_3d_pair_data = m_3d_pair_data.iloc[0:0] + + except (RuntimeError, TypeError, NameError, KeyError): + logger.error( + "*** %s in write_mtd_sql write line data ***", sys.exc_info()[0]) + sys.exit("*** Error writing MTD SQL line data") + + write_time_end = time.perf_counter() + write_time = timedelta(seconds=write_time_end - write_time_start) + + logger.info(" >>> Write time MTD: %s", str(write_time)) + + logger.debug("[--- End write_mtd_sql ---]") + + except (RuntimeError, TypeError, NameError, KeyError, AttributeError): + self.logger.error( + "*** %s occurred in write_mtd_data function ***", sys.exc_info()[0]) + sys.exit("*** Error in write_mtd_data function") diff --git a/METdbLoad/ush/write_stat_sql.py b/METdbLoad/ush/write_stat_sql.py index 5eff413c..037e30e4 100644 --- a/METdbLoad/ush/write_stat_sql.py +++ b/METdbLoad/ush/write_stat_sql.py @@ -39,273 +39,310 @@ def write_stat_data(load_flags, stat_data, tmp_dir, sql_cur, local_infile, logge N/A """ - logger.debug("[--- Start write_stat_data ---]") + try: - write_time_start = time.perf_counter() + logger.debug("[--- Start write_stat_data ---]") - try: + write_time_start = time.perf_counter() - sql_met = RunSql() - - # -------------------- - # Write Stat Headers - # -------------------- - - # find the unique headers for this current load job - # Do not include Version, as MVLoad does not - stat_headers = stat_data[CN.STAT_HEADER_KEYS].copy() - stat_headers.drop_duplicates(CN.STAT_HEADER_KEYS[1:], keep='first', inplace=True) - stat_headers.reset_index(drop=True, inplace=True) - - # At first, we do not know if the headers already exist, so we have no keys - stat_headers[CN.STAT_HEADER_ID] = CN.NO_KEY - - # get the next valid stat header id. 
Set it to zero (first valid id) if no records yet - next_header_id = sql_met.get_next_id(CN.STAT_HEADER, CN.STAT_HEADER_ID, sql_cur, logger) - - # if the flag is set to check for duplicate headers, get ids from existing headers - if load_flags["stat_header_db_check"]: - - # For each header, query with unique fields to try to find a match in the database - for row_num, data_line in stat_headers.iterrows(): - sql_cur.execute(CN.Q_HEADER, data_line.values[1:-1].tolist()) - result = sql_cur.fetchone() - - # If you find a match, put the key into the stat_headers dataframe - if sql_cur.rowcount > 0: - stat_headers.loc[stat_headers.index[row_num], CN.STAT_HEADER_ID] = result[0] - else: - stat_headers.loc[stat_headers.index[row_num], CN.STAT_HEADER_ID] = \ - row_num + next_header_id - else: - # When all new headers, add the next id to the row number/index to make a new key - stat_headers.loc[stat_headers.stat_header_id == CN.NO_KEY, CN.STAT_HEADER_ID] = \ - stat_headers.index + next_header_id - - # get just the new headers with their keys - new_headers = stat_headers[stat_headers[CN.STAT_HEADER_ID] > (next_header_id - 1)] - logger.info("New headers: %s rows", str(len(new_headers.index))) - - # Write any new headers out to the sql database - if not new_headers.empty: - sql_met.write_to_sql(new_headers, CN.STAT_HEADER_FIELDS, CN.STAT_HEADER, - CN.INS_HEADER, tmp_dir, sql_cur, local_infile, logger) - - # put the header ids back into the dataframe of all the line data - stat_data = pd.merge(left=stat_data, right=stat_headers, on=CN.STAT_HEADER_KEYS[1:]) - # Merging with limited keys renames the version column, change it back - if 'version_x' in stat_data.columns: - stat_data = stat_data.rename(columns={'version_x': CN.VERSION}) - # Clean out the headers working dataframes - stat_headers = stat_headers.iloc[0:0] - new_headers = new_headers.iloc[0:0] - - except (RuntimeError, TypeError, NameError, KeyError): - logger.error("*** %s in top half of write_stat_data ***", sys.exc_info()[0]) + try: - try: + sql_met = RunSql() - # -------------------- - # Write Line Data - # -------------------- - - # find all of the line types in the data - line_types = stat_data.line_type.unique() - line_data = pd.DataFrame() - - # process one kind of line data at a time - for line_type in line_types: - - all_var = pd.DataFrame() - list_var = [] - - # use the UC line type to index into the list of table names - line_table = CN.LINE_TABLES[CN.UC_LINE_TYPES.index(line_type)] - - # get the line data of just this type and re-index - line_data = stat_data[stat_data[CN.LINE_TYPE] == line_type].copy() - line_data = line_data.reset_index(drop=True) - logger.info("%s: %s rows", line_type, str(len(line_data.index))) - - # change all Not Available values to METviewer not available (-9999) - line_data = line_data.replace(CN.NOTAV, CN.MV_NOTAV) - - # Only variable length lines have a line_data_id - if line_type in CN.VAR_LINE_TYPES: - # Get next valid line data id. 
Set it to zero (first valid id) if no records yet - next_line_id = \ - sql_met.get_next_id(line_table, CN.LINE_DATA_ID, sql_cur, logger) - logger.debug("next_line_id is %s", next_line_id) - - # try to keep order the same as MVLoad - line_data = line_data.sort_values(by=[CN.DATA_FILE_ID, CN.LINE_NUM], - ignore_index=True).copy() - - line_data[CN.LINE_DATA_ID] = line_data.index + next_line_id - - # index of the first column of the repeating variables - var_index = line_data.columns.get_loc(CN.LINE_VAR_COUNTER[line_type]) + 1 - - # There are 10 extra variables after n_thresh in PSTD records - if line_type == CN.PSTD: - var_index = var_index + 10 - - # need this later for old RHIST - orig_index = var_index - - # process each variable line one at a time for different versions - for row_num, file_line in line_data.iterrows(): - # how many sets of repeating variables - var_count = int(file_line[CN.LINE_VAR_COUNTER[line_type]]) - # these two variable line types are one group short - if line_type in [CN.PJC, CN.PRC]: - var_count = var_count - 1 - - # VSDB and STAT values for sets of repeating vars may be different - if line_type == 'CN.ECLV': - var_count = var_index - 1 - - # reset to original value - var_index = orig_index - - # older versions of RHIST have varying ECNT data in them - if line_type == CN.RHIST and file_line[CN.VERSION] in CN.RHIST_OLD: - var_count = int(file_line['3']) - var_index = orig_index + 2 - if file_line[CN.VERSION] in CN.RHIST_5: - var_index = var_index + 1 - if file_line[CN.VERSION] in CN.RHIST_6: - var_index = var_index + 2 - - # MCTC needs an i and a j counter - if line_type == CN.MCTC: - basic_count = var_count - var_count = var_count * var_count - - # The number of variables in the repeats - var_repeats = CN.LINE_VAR_REPEATS[line_type] - - # number of sets of variables times the number of variables in the sets - repeat_width = int(var_count * var_repeats) - - # If variable length record exceeds current line length, delete - if repeat_width > (len(file_line) - var_index): - file_name = sql_met.get_file_name(file_line[CN.DATA_FILE_ID], sql_cur, logger) - logger.error('*** Variable length record from file %s line %s ' - 'type %s deleted as longer than %s ***', - file_name, row_num, line_type, CN.MAX_COL + 25) - line_data.drop(line_data[line_data.line_data_id == row_num].index, - inplace=True) - else: - # pull out just the repeating data - list_var_data = file_line.iloc[var_index:var_index + repeat_width] - # put it into the right number of rows and columns - var_data = \ - pd.DataFrame(list_var_data.values.reshape(var_count, var_repeats)) + # -------------------- + # Write Stat Headers + # -------------------- - # for older versions of RHIST, blank out repeating fields in line data - if line_type == CN.RHIST and file_line[CN.VERSION] in CN.RHIST_OLD: - line_data.iloc[row_num, var_index:var_index + repeat_width] = \ - CN.MV_NOTAV + # find the unique headers for this current load job + # Do not include Version, as MVLoad does not + stat_headers = stat_data[CN.STAT_HEADER_KEYS].copy() + stat_headers.drop_duplicates( + CN.STAT_HEADER_KEYS[1:], keep='first', inplace=True) + stat_headers.reset_index(drop=True, inplace=True) - # for stat file versions of PSTD, blank out variable fields in line data - if line_type == CN.PSTD and file_line[CN.VERSION] != 'V01': - line_data.iloc[row_num, var_index:var_index + repeat_width] = \ - CN.MV_NOTAV + # At first, we do not know if the headers already exist, so we have no keys + stat_headers[CN.STAT_HEADER_ID] = CN.NO_KEY - # add on the 
first two fields - line data id, and i value - var_data.insert(0, CN.LINE_DATA_ID, file_line[CN.LINE_DATA_ID]) - var_data.insert(1, 'i_value', var_data.index + 1) + # get the next valid stat header id. Set it to zero (first valid id) if no records yet + next_header_id = sql_met.get_next_id( + CN.STAT_HEADER, CN.STAT_HEADER_ID, sql_cur, logger) - # MCTC has i and j counters where j increments faster + # if the flag is set to check for duplicate headers, get ids from existing headers + if load_flags["stat_header_db_check"]: + + # For each header, query with unique fields to try to find a match in the database + for row_num, data_line in stat_headers.iterrows(): + sql_cur.execute( + CN.Q_HEADER, data_line.values[1:-1].tolist()) + result = sql_cur.fetchone() + + # If you find a match, put the key into the stat_headers dataframe + if sql_cur.rowcount > 0: + stat_headers.loc[stat_headers.index[row_num], + CN.STAT_HEADER_ID] = result[0] + else: + stat_headers.loc[stat_headers.index[row_num], CN.STAT_HEADER_ID] = \ + row_num + next_header_id + else: + # When all new headers, add the next id to the row number/index to make a new key + stat_headers.loc[stat_headers.stat_header_id == CN.NO_KEY, CN.STAT_HEADER_ID] = \ + stat_headers.index + next_header_id + + # get just the new headers with their keys + new_headers = stat_headers[stat_headers[CN.STAT_HEADER_ID] > ( + next_header_id - 1)] + logger.info("New headers: %s rows", + str(len(new_headers.index))) + + # Write any new headers out to the sql database + if not new_headers.empty: + sql_met.write_to_sql(new_headers, CN.STAT_HEADER_FIELDS, CN.STAT_HEADER, + CN.INS_HEADER, tmp_dir, sql_cur, local_infile, logger) + + # put the header ids back into the dataframe of all the line data + stat_data = pd.merge( + left=stat_data, right=stat_headers, on=CN.STAT_HEADER_KEYS[1:]) + # Merging with limited keys renames the version column, change it back + if 'version_x' in stat_data.columns: + stat_data = stat_data.rename( + columns={'version_x': CN.VERSION}) + # Clean out the headers working dataframes + stat_headers = stat_headers.iloc[0:0] + new_headers = new_headers.iloc[0:0] + + except (RuntimeError, TypeError, NameError, KeyError): + logger.error( + "*** %s in write_stat_data write headers ***", sys.exc_info()[0]) + sys.exit("*** Error writing stat SQL headers") + + try: + + # -------------------- + # Write Line Data + # -------------------- + + # find all of the line types in the data + line_types = stat_data.line_type.unique() + line_data = pd.DataFrame() + + # process one kind of line data at a time + for line_type in line_types: + + all_var = pd.DataFrame() + list_var = [] + + # use the UC line type to index into the list of table names + line_table = CN.LINE_TABLES[CN.UC_LINE_TYPES.index( + line_type)] + + # get the line data of just this type and re-index + line_data = stat_data[stat_data[CN.LINE_TYPE] + == line_type].copy() + line_data = line_data.reset_index(drop=True) + logger.info("%s: %s rows", line_type, + str(len(line_data.index))) + + # change all Not Available values to METviewer not available (-9999) + line_data = line_data.replace(CN.NOTAV, CN.MV_NOTAV) + + # Only variable length lines have a line_data_id + if line_type in CN.VAR_LINE_TYPES: + # Get next valid line data id. 
Set it to zero (first valid id) if no records yet + next_line_id = \ + sql_met.get_next_id( + line_table, CN.LINE_DATA_ID, sql_cur, logger) + logger.debug("next_line_id is %s", next_line_id) + + # try to keep order the same as MVLoad + line_data = line_data.sort_values(by=[CN.DATA_FILE_ID, CN.LINE_NUM], + ignore_index=True).copy() + + line_data[CN.LINE_DATA_ID] = line_data.index + \ + next_line_id + + # index of the first column of the repeating variables + var_index = line_data.columns.get_loc( + CN.LINE_VAR_COUNTER[line_type]) + 1 + + # There are 10 extra variables after n_thresh in PSTD records + if line_type == CN.PSTD: + var_index = var_index + 10 + + # need this later for old RHIST + orig_index = var_index + + # process each variable line one at a time for different versions + for row_num, file_line in line_data.iterrows(): + # how many sets of repeating variables + var_count = int( + file_line[CN.LINE_VAR_COUNTER[line_type]]) + # these two variable line types are one group short + if line_type in [CN.PJC, CN.PRC]: + var_count = var_count - 1 + + # VSDB and STAT values for sets of repeating vars may be different + if line_type == 'CN.ECLV': + var_count = var_index - 1 + + # reset to original value + var_index = orig_index + + # older versions of RHIST have varying ECNT data in them + if line_type == CN.RHIST and file_line[CN.VERSION] in CN.RHIST_OLD: + var_count = int(file_line['3']) + var_index = orig_index + 2 + if file_line[CN.VERSION] in CN.RHIST_5: + var_index = var_index + 1 + if file_line[CN.VERSION] in CN.RHIST_6: + var_index = var_index + 2 + + # MCTC needs an i and a j counter if line_type == CN.MCTC: - var_data.loc[:, 'i_value'] = \ - np.repeat(np.array(range(1, basic_count + 1)), basic_count) - j_indices = np.resize(range(1, basic_count + 1), var_count) - var_data.insert(2, 'j_value', j_indices) - - - # Fill in ec_value if missing - 1/n_cat - if pd.isna(line_data.iloc[row_num, var_index + repeat_width]): - line_data.iloc[row_num, var_index + repeat_width] = \ - 1/line_data.iloc[row_num, var_index - 1] - - # Move field (ec_value) that was added later back to end of main line - line_data.iloc[row_num, var_index] = \ - line_data.iloc[row_num, var_index + repeat_width] - - if line_type == CN.ORANK: - # move the values after the variable length data to the left - var_end = var_index + repeat_width - line_data.iloc[row_num, var_index:var_index + 7] = \ - line_data.iloc[row_num, var_end:var_end + 7].values - - # collect all of the variable data for a line type - list_var.append(var_data) - - # end for row_num, file_line - if list_var: - all_var = pd.concat(list_var, ignore_index=True, sort=False) - list_var = [] - - if line_type == CN.RHIST: - # copy the RHIST columns and create ECNT lines from them - line_data2 = line_data[line_data[CN.VERSION].isin(CN.RHIST_OLD)].copy() - if not line_data2.empty: - line_data2.line_type = CN.ECNT - - # put the fields in the correct order for ECNT - line_data2 = \ - line_data2.rename(columns={'1': '2', '2': '4', - '3': '1', '4': '3', - '5': '7', '7': '5'}) - - # Write out the ECNT lines created from old RHIST lines - sql_met.write_to_sql(line_data2, CN.LINE_DATA_COLS[CN.ECNT], - CN.LINE_TABLES[CN.UC_LINE_TYPES.index(CN.ECNT)], - CN.LINE_DATA_Q[CN.ECNT], - tmp_dir, sql_cur, local_infile, logger) - line_data2 = line_data2.iloc[0:0] - - # copy the value of n_rank two columns earlier for old RHIST - line_data.loc[line_data[CN.VERSION].isin(CN.RHIST_OLD), '1'] = \ - line_data['3'] - - # write the lines out to a CSV file, and then load them into 
database - if not line_data.empty: - sql_met.write_to_sql(line_data, CN.LINE_DATA_COLS[line_type], line_table, - CN.LINE_DATA_Q[line_type], tmp_dir, sql_cur, local_infile, logger) - line_data = line_data.iloc[0:0] - - # if there are variable length records, write them out also - if not all_var.empty: - all_var.columns = CN.LINE_DATA_VAR_FIELDS[line_type] - sql_met.write_to_sql(all_var, CN.LINE_DATA_VAR_FIELDS[line_type], - CN.LINE_DATA_VAR_TABLES[line_type], - CN.LINE_DATA_VAR_Q[line_type], - tmp_dir, sql_cur, local_infile, logger) - all_var = all_var.iloc[0:0] - - # end for line_type - - # write out line_data_perc records - if CN.FCST_PERC in stat_data: - if stat_data[CN.FCST_PERC].ne(CN.MV_NOTAV).any(): - line_data2 = stat_data[stat_data[CN.FCST_PERC].ne(CN.MV_NOTAV) & - stat_data[CN.FCST_PERC].notnull()].copy() - - # Write out the PERC lines - sql_met.write_to_sql(line_data2, CN.LINE_DATA_COLS[CN.PERC], - CN.LINE_TABLES[CN.UC_LINE_TYPES.index(CN.PERC)], - CN.LINE_DATA_Q[CN.PERC], tmp_dir, sql_cur, local_infile, logger) - line_data2 = line_data2.iloc[0:0] - - except (RuntimeError, TypeError, NameError, KeyError): - logger.error("*** %s in lower half of write_stat_data ***", sys.exc_info()[0]) - - write_time_end = time.perf_counter() - write_time = timedelta(seconds=write_time_end - write_time_start) - - logger.info(" >>> Write time Stat: %s", str(write_time)) - - logger.debug("[--- End write_stat_data ---]") + basic_count = var_count + var_count = var_count * var_count + + # The number of variables in the repeats + var_repeats = CN.LINE_VAR_REPEATS[line_type] + + # number of sets of variables times the number of variables in the sets + repeat_width = int(var_count * var_repeats) + + # If variable length record exceeds current line length, delete + if repeat_width > (len(file_line) - var_index): + file_name = sql_met.get_file_name( + file_line[CN.DATA_FILE_ID], sql_cur, logger) + logger.error('*** Variable length record from file %s line %s ' + 'type %s deleted as longer than %s ***', + file_name, row_num, line_type, CN.MAX_COL + 25) + line_data.drop(line_data[line_data.line_data_id == row_num].index, + inplace=True) + else: + # pull out just the repeating data + list_var_data = file_line.iloc[var_index:var_index + repeat_width] + # put it into the right number of rows and columns + var_data = \ + pd.DataFrame(list_var_data.values.reshape( + var_count, var_repeats)) + + # for older versions of RHIST, blank out repeating fields in line data + if line_type == CN.RHIST and file_line[CN.VERSION] in CN.RHIST_OLD: + line_data.iloc[row_num, var_index:var_index + repeat_width] = \ + CN.MV_NOTAV + + # for stat file versions of PSTD, blank out variable fields in line data + if line_type == CN.PSTD and file_line[CN.VERSION] != 'V01': + line_data.iloc[row_num, var_index:var_index + repeat_width] = \ + CN.MV_NOTAV + + # add on the first two fields - line data id, and i value + var_data.insert( + 0, CN.LINE_DATA_ID, file_line[CN.LINE_DATA_ID]) + var_data.insert( + 1, 'i_value', var_data.index + 1) + + # MCTC has i and j counters where j increments faster + if line_type == CN.MCTC: + var_data.loc[:, 'i_value'] = \ + np.repeat( + np.array(range(1, basic_count + 1)), basic_count) + j_indices = np.resize( + range(1, basic_count + 1), var_count) + var_data.insert(2, 'j_value', j_indices) + + # Fill in ec_value if missing - 1/n_cat + if pd.isna(line_data.iloc[row_num, var_index + repeat_width]): + line_data.iloc[row_num, var_index + repeat_width] = \ + 1/line_data.iloc[row_num, var_index - 1] + + # Move 
field (ec_value) that was added later back to end of main line + line_data.iloc[row_num, var_index] = \ + line_data.iloc[row_num, + var_index + repeat_width] + + if line_type == CN.ORANK: + # move the values after the variable length data to the left + var_end = var_index + repeat_width + line_data.iloc[row_num, var_index:var_index + 7] = \ + line_data.iloc[row_num, + var_end:var_end + 7].values + + # collect all of the variable data for a line type + list_var.append(var_data) + + # end for row_num, file_line + if list_var: + all_var = pd.concat( + list_var, ignore_index=True, sort=False) + list_var = [] + + if line_type == CN.RHIST: + # copy the RHIST columns and create ECNT lines from them + line_data2 = line_data[line_data[CN.VERSION].isin( + CN.RHIST_OLD)].copy() + if not line_data2.empty: + line_data2.line_type = CN.ECNT + + # put the fields in the correct order for ECNT + line_data2 = \ + line_data2.rename(columns={'1': '2', '2': '4', + '3': '1', '4': '3', + '5': '7', '7': '5'}) + + # Write out the ECNT lines created from old RHIST lines + sql_met.write_to_sql(line_data2, CN.LINE_DATA_COLS[CN.ECNT], + CN.LINE_TABLES[CN.UC_LINE_TYPES.index( + CN.ECNT)], + CN.LINE_DATA_Q[CN.ECNT], + tmp_dir, sql_cur, local_infile, logger) + line_data2 = line_data2.iloc[0:0] + + # copy the value of n_rank two columns earlier for old RHIST + line_data.loc[line_data[CN.VERSION].isin(CN.RHIST_OLD), '1'] = \ + line_data['3'] + + # write the lines out to a CSV file, and then load them into database + if not line_data.empty: + sql_met.write_to_sql(line_data, CN.LINE_DATA_COLS[line_type], line_table, + CN.LINE_DATA_Q[line_type], tmp_dir, sql_cur, local_infile, logger) + line_data = line_data.iloc[0:0] + + # if there are variable length records, write them out also + if not all_var.empty: + all_var.columns = CN.LINE_DATA_VAR_FIELDS[line_type] + sql_met.write_to_sql(all_var, CN.LINE_DATA_VAR_FIELDS[line_type], + CN.LINE_DATA_VAR_TABLES[line_type], + CN.LINE_DATA_VAR_Q[line_type], + tmp_dir, sql_cur, local_infile, logger) + all_var = all_var.iloc[0:0] + + # end for line_type + + # write out line_data_perc records + if CN.FCST_PERC in stat_data: + if stat_data[CN.FCST_PERC].ne(CN.MV_NOTAV).any(): + line_data2 = stat_data[stat_data[CN.FCST_PERC].ne(CN.MV_NOTAV) & + stat_data[CN.FCST_PERC].notnull()].copy() + + # Write out the PERC lines + sql_met.write_to_sql(line_data2, CN.LINE_DATA_COLS[CN.PERC], + CN.LINE_TABLES[CN.UC_LINE_TYPES.index( + CN.PERC)], + CN.LINE_DATA_Q[CN.PERC], tmp_dir, sql_cur, local_infile, logger) + line_data2 = line_data2.iloc[0:0] + + except (RuntimeError, TypeError, NameError, KeyError): + logger.error( + "*** %s in write_stat_data write line data ***", sys.exc_info()[0]) + sys.exit("*** Error writing stat SQL line data") + + write_time_end = time.perf_counter() + write_time = timedelta(seconds=write_time_end - write_time_start) + + logger.info(" >>> Write time Stat: %s", str(write_time)) + + logger.debug("[--- End write_stat_data ---]") + + except (RuntimeError, TypeError, NameError, KeyError, AttributeError): + self.logger.error( + "*** %s occurred in write_stat_data function ***", sys.exc_info()[0]) + sys.exit("*** Error in write_stat_data function") diff --git a/METdbLoad/ush/write_tcst_sql.py b/METdbLoad/ush/write_tcst_sql.py index f0a7351a..4506aab1 100644 --- a/METdbLoad/ush/write_tcst_sql.py +++ b/METdbLoad/ush/write_tcst_sql.py @@ -39,170 +39,200 @@ def write_tcst_data(load_flags, tcst_data, tmp_dir, sql_cur, local_infile, logge N/A """ - logger.debug("[--- Start write_tcst_data 
---]") - - write_time_start = time.perf_counter() - - try: - - sql_met = RunSql() - - # -------------------- - # Write Tcst Headers - # -------------------- - - # find the unique headers for this current load job - # Do not include Version, as MVLoad does not - tcst_headers = tcst_data[CN.TCST_HEADER_KEYS].copy() - tcst_headers.drop_duplicates(CN.TCST_HEADER_KEYS[1:], keep='first', inplace=True) - tcst_headers.reset_index(drop=True, inplace=True) - - # At first, we do not know if the headers already exist, so we have no keys - tcst_headers[CN.TCST_HEADER_ID] = CN.NO_KEY - - # get the next valid tcst header id. Set it to zero (first valid id) if no records yet - next_header_id = sql_met.get_next_id(CN.TCST_HEADER, CN.TCST_HEADER_ID, sql_cur, logger) - - # if the flag is set to check for duplicate headers, get ids from existing headers - if load_flags["tcst_header_db_check"]: - - # For each header, query with unique fields to try to find a match in the database - for row_num, data_line in tcst_headers.iterrows(): - sql_cur.execute(CN.Q_HEADER_TCST, data_line.values[1:-1].tolist()) - result = sql_cur.fetchone() - - # If you find a match, put the key into the tcst_headers dataframe - if sql_cur.rowcount > 0: - tcst_headers.loc[tcst_headers.index[row_num], CN.TCST_HEADER_ID] = result[0] - else: - tcst_headers.loc[tcst_headers.index[row_num], CN.TCST_HEADER_ID] = \ - row_num + next_header_id - else: - # When all new headers, add the next id to the row number/index to make a new key - tcst_headers.loc[tcst_headers.tcst_header_id == CN.NO_KEY, CN.TCST_HEADER_ID] = \ - tcst_headers.index + next_header_id - - # get just the new headers with their keys - new_headers = tcst_headers[tcst_headers[CN.TCST_HEADER_ID] > (next_header_id - 1)] - logger.info("New headers: %s rows", str(len(new_headers.index))) - - # Write any new headers out to the sql database - if not new_headers.empty: - sql_met.write_to_sql(new_headers, CN.TCST_HEADER_FIELDS, CN.TCST_HEADER, - CN.INS_HEADER_TCST, tmp_dir, sql_cur, local_infile, logger) - - # put the header ids back into the dataframe of all the line data - tcst_data = pd.merge(left=tcst_data, right=tcst_headers, on=CN.TCST_HEADER_KEYS[1:]) - # Merging with limited keys renames the version column, change it back - if 'version_x' in tcst_data.columns: - tcst_data = tcst_data.rename(columns={'version_x': CN.VERSION}) - # Clean out the headers working dataframes - tcst_headers = tcst_headers.iloc[0:0] - new_headers = new_headers.iloc[0:0] - - except (RuntimeError, TypeError, NameError, KeyError): - logger.error("*** %s in write_tcst_data write tcst headers ***", sys.exc_info()[0]) - try: - - # -------------------- - # Write Line Data - # -------------------- - - # find all of the line types in the data - line_types = tcst_data.line_type.unique() - - # process one kind of line data at a time - for line_type in line_types: - - all_var = pd.DataFrame() - list_var = [] - - # use the UC line type to index into the list of table names - line_table = CN.LINE_TABLES_TCST[CN.UC_LINE_TYPES_TCST.index(line_type)] - - # get the line data of just this type and re-index - line_data = tcst_data[tcst_data[CN.LINE_TYPE] == line_type].copy() - line_data.reset_index(drop=True, inplace=True) - logger.info("%s: %s rows", line_type, str(len(line_data.index))) - - # change all Not Available numerical values to METviewer not available (-9999) - # replace adepth and bdepth NA -> X - if line_type == CN.TCMPR: - line_data['64'] = line_data['64'].replace(CN.NOTAV, 'X') - line_data['65'] = 
line_data['65'].replace(CN.NOTAV, 'X') - - # Change remaining NA values - line_data = line_data.replace(CN.NOTAV, CN.MV_NOTAV) - - # Only variable length lines have a line_data_id - if line_type in CN.VAR_LINE_TYPES_TCST: - # Get next valid line data id. Set it to zero (first valid id) if no records yet - next_line_id = \ - sql_met.get_next_id(line_table, CN.LINE_DATA_ID, sql_cur, logger) - logger.debug("next_line_id is %s", next_line_id) - - # try to keep order the same as MVLoad - line_data = line_data.sort_values(by=[CN.DATA_FILE_ID, CN.LINE_NUM], - ignore_index=True).copy() - - line_data[CN.LINE_DATA_ID] = line_data.index + next_line_id - - # index of the first column of the repeating variables - var_index = line_data.columns.get_loc(CN.LINE_VAR_COUNTER[line_type]) + 1 - - # process each variable line one at a time for different versions - for row_num, file_line in line_data.iterrows(): - # how many sets of repeating variables - var_count = int(file_line[CN.LINE_VAR_COUNTER[line_type]]) - - # The number of variables in the repeats - var_repeats = CN.LINE_VAR_REPEATS[line_type] - # number of sets of variables times the number of variables in the sets - repeat_width = int(var_count * var_repeats) - - # pull out just the repeating data - list_var_data = file_line.iloc[var_index:var_index + repeat_width] - # put it into the right number of rows and columns - var_data = \ - pd.DataFrame(list_var_data.values.reshape(var_count, var_repeats)) - - # add on the first two fields - line data id, and i value - var_data.insert(0, CN.LINE_DATA_ID, file_line[CN.LINE_DATA_ID]) - var_data.insert(1, 'i_value', var_data.index + 1) - - # collect all of the variable data for a line type - list_var.append(var_data) - - # end for row_num, file_line - if list_var: - all_var = pd.concat(list_var, ignore_index=True, sort=False) - list_var = [] - - # write the lines out to a CSV file, and then load them into database - if not line_data.empty: - sql_met.write_to_sql(line_data, CN.LINE_DATA_COLS_TCST[line_type], line_table, - CN.LINE_DATA_Q[line_type], tmp_dir, sql_cur, local_infile, logger) - line_data = line_data.iloc[0:0] - - # if there are variable length records, write them out also - if not all_var.empty: - all_var.columns = CN.LINE_DATA_VAR_FIELDS[line_type] - sql_met.write_to_sql(all_var, CN.LINE_DATA_VAR_FIELDS[line_type], - CN.LINE_DATA_VAR_TABLES[line_type], - CN.LINE_DATA_VAR_Q[line_type], - tmp_dir, sql_cur, local_infile, logger) - all_var = all_var.iloc[0:0] - - # end for line_type - - except (RuntimeError, TypeError, NameError, KeyError): - logger.error("*** %s in write_tcst_data write line data ***", sys.exc_info()[0]) - - write_time_end = time.perf_counter() - write_time = timedelta(seconds=write_time_end - write_time_start) - - logger.info(" >>> Write time Tcst: %s", str(write_time)) - - logger.debug("[--- End write_tcst_data ---]") + logger.debug("[--- Start write_tcst_data ---]") + + write_time_start = time.perf_counter() + + try: + + sql_met = RunSql() + + # -------------------- + # Write Tcst Headers + # -------------------- + + # find the unique headers for this current load job + # Do not include Version, as MVLoad does not + tcst_headers = tcst_data[CN.TCST_HEADER_KEYS].copy() + tcst_headers.drop_duplicates( + CN.TCST_HEADER_KEYS[1:], keep='first', inplace=True) + tcst_headers.reset_index(drop=True, inplace=True) + + # At first, we do not know if the headers already exist, so we have no keys + tcst_headers[CN.TCST_HEADER_ID] = CN.NO_KEY + + # get the next valid tcst header id. 
Set it to zero (first valid id) if no records yet + next_header_id = sql_met.get_next_id( + CN.TCST_HEADER, CN.TCST_HEADER_ID, sql_cur, logger) + + # if the flag is set to check for duplicate headers, get ids from existing headers + if load_flags["tcst_header_db_check"]: + + # For each header, query with unique fields to try to find a match in the database + for row_num, data_line in tcst_headers.iterrows(): + sql_cur.execute(CN.Q_HEADER_TCST, + data_line.values[1:-1].tolist()) + result = sql_cur.fetchone() + + # If you find a match, put the key into the tcst_headers dataframe + if sql_cur.rowcount > 0: + tcst_headers.loc[tcst_headers.index[row_num], + CN.TCST_HEADER_ID] = result[0] + else: + tcst_headers.loc[tcst_headers.index[row_num], CN.TCST_HEADER_ID] = \ + row_num + next_header_id + else: + # When all new headers, add the next id to the row number/index to make a new key + tcst_headers.loc[tcst_headers.tcst_header_id == CN.NO_KEY, CN.TCST_HEADER_ID] = \ + tcst_headers.index + next_header_id + + # get just the new headers with their keys + new_headers = tcst_headers[tcst_headers[CN.TCST_HEADER_ID] > ( + next_header_id - 1)] + logger.info("New headers: %s rows", + str(len(new_headers.index))) + + # Write any new headers out to the sql database + if not new_headers.empty: + sql_met.write_to_sql(new_headers, CN.TCST_HEADER_FIELDS, CN.TCST_HEADER, + CN.INS_HEADER_TCST, tmp_dir, sql_cur, local_infile, logger) + + # put the header ids back into the dataframe of all the line data + tcst_data = pd.merge( + left=tcst_data, right=tcst_headers, on=CN.TCST_HEADER_KEYS[1:]) + # Merging with limited keys renames the version column, change it back + if 'version_x' in tcst_data.columns: + tcst_data = tcst_data.rename( + columns={'version_x': CN.VERSION}) + # Clean out the headers working dataframes + tcst_headers = tcst_headers.iloc[0:0] + new_headers = new_headers.iloc[0:0] + + except (RuntimeError, TypeError, NameError, KeyError): + logger.error( + "*** %s in write_tcst_data write tcst headers ***", sys.exc_info()[0]) + sys.exit("*** Error writing tcst SQL headers") + + try: + + # -------------------- + # Write Line Data + # -------------------- + + # find all of the line types in the data + line_types = tcst_data.line_type.unique() + + # process one kind of line data at a time + for line_type in line_types: + + all_var = pd.DataFrame() + list_var = [] + + # use the UC line type to index into the list of table names + line_table = CN.LINE_TABLES_TCST[CN.UC_LINE_TYPES_TCST.index( + line_type)] + + # get the line data of just this type and re-index + line_data = tcst_data[tcst_data[CN.LINE_TYPE] + == line_type].copy() + line_data.reset_index(drop=True, inplace=True) + logger.info("%s: %s rows", line_type, + str(len(line_data.index))) + + # change all Not Available numerical values to METviewer not available (-9999) + # replace adepth and bdepth NA -> X + if line_type == CN.TCMPR: + line_data['64'] = line_data['64'].replace( + CN.NOTAV, 'X') + line_data['65'] = line_data['65'].replace( + CN.NOTAV, 'X') + + # Change remaining NA values + line_data = line_data.replace(CN.NOTAV, CN.MV_NOTAV) + + # Only variable length lines have a line_data_id + if line_type in CN.VAR_LINE_TYPES_TCST: + # Get next valid line data id. 
Set it to zero (first valid id) if no records yet + next_line_id = \ + sql_met.get_next_id( + line_table, CN.LINE_DATA_ID, sql_cur, logger) + logger.debug("next_line_id is %s", next_line_id) + + # try to keep order the same as MVLoad + line_data = line_data.sort_values(by=[CN.DATA_FILE_ID, CN.LINE_NUM], + ignore_index=True).copy() + + line_data[CN.LINE_DATA_ID] = line_data.index + \ + next_line_id + + # index of the first column of the repeating variables + var_index = line_data.columns.get_loc( + CN.LINE_VAR_COUNTER[line_type]) + 1 + + # process each variable line one at a time for different versions + for row_num, file_line in line_data.iterrows(): + # how many sets of repeating variables + var_count = int( + file_line[CN.LINE_VAR_COUNTER[line_type]]) + + # The number of variables in the repeats + var_repeats = CN.LINE_VAR_REPEATS[line_type] + # number of sets of variables times the number of variables in the sets + repeat_width = int(var_count * var_repeats) + + # pull out just the repeating data + list_var_data = file_line.iloc[var_index:var_index + repeat_width] + # put it into the right number of rows and columns + var_data = \ + pd.DataFrame(list_var_data.values.reshape( + var_count, var_repeats)) + + # add on the first two fields - line data id, and i value + var_data.insert(0, CN.LINE_DATA_ID, + file_line[CN.LINE_DATA_ID]) + var_data.insert(1, 'i_value', var_data.index + 1) + + # collect all of the variable data for a line type + list_var.append(var_data) + + # end for row_num, file_line + if list_var: + all_var = pd.concat( + list_var, ignore_index=True, sort=False) + list_var = [] + + # write the lines out to a CSV file, and then load them into database + if not line_data.empty: + sql_met.write_to_sql(line_data, CN.LINE_DATA_COLS_TCST[line_type], line_table, + CN.LINE_DATA_Q[line_type], tmp_dir, sql_cur, local_infile, logger) + line_data = line_data.iloc[0:0] + + # if there are variable length records, write them out also + if not all_var.empty: + all_var.columns = CN.LINE_DATA_VAR_FIELDS[line_type] + sql_met.write_to_sql(all_var, CN.LINE_DATA_VAR_FIELDS[line_type], + CN.LINE_DATA_VAR_TABLES[line_type], + CN.LINE_DATA_VAR_Q[line_type], + tmp_dir, sql_cur, local_infile, logger) + all_var = all_var.iloc[0:0] + + # end for line_type + + except (RuntimeError, TypeError, NameError, KeyError): + logger.error( + "*** %s in write_tcst_data write line data ***", sys.exc_info()[0]) + sys.exit("*** Error writing tcst SQL line data") + + write_time_end = time.perf_counter() + write_time = timedelta(seconds=write_time_end - write_time_start) + + logger.info(" >>> Write time Tcst: %s", str(write_time)) + + logger.debug("[--- End write_tcst_data ---]") + + except (RuntimeError, TypeError, NameError, KeyError, AttributeError): + self.logger.error( + "*** %s occurred in write_tcst_data function ***", sys.exc_info()[0]) + sys.exit("*** Error in write_tcst_data function") diff --git a/METreadnc/util/read_netcdf.py b/METreadnc/util/read_netcdf.py index 3a2320fc..5342677a 100644 --- a/METreadnc/util/read_netcdf.py +++ b/METreadnc/util/read_netcdf.py @@ -14,8 +14,8 @@ import pandas as pd import xarray as xr import yaml -#Setting PYTHONPATH to METcalcpy -#or pip install . in the directory METcalcpy makes this the better import +# Setting PYTHONPATH to METcalcpy +# or pip install . 
in the directory METcalcpy makes this the better import from metcalcpy.util.read_env_vars_in_config import parse_config @@ -27,9 +27,9 @@ class ReadNetCDF: def __init__(self): self.pandas_data = pd.DataFrame() - self.xarray_data = [] + self.xarray_data = [] - def readYAMLConfig(self,configFile): + def readYAMLConfig(self, configFile): """ Returns a file or list of files Args: @@ -44,11 +44,10 @@ def readYAMLConfig(self,configFile): # Use a configure file parser that handles environment variables files_dict = parse_config(configFile) - #parse_config returns a dictionary, read_data_files wants a list + # parse_config returns a dictionary, read_data_files wants a list files = files_dict['files'] return files - def read_into_pandas(self, load_files) -> pd.DataFrame: """ Read in data files as a list specified in yaml config or invoke directly with input provided as a list, tuple, or a single file. @@ -68,7 +67,7 @@ def read_into_pandas(self, load_files) -> pd.DataFrame: elif isinstance(load_files, str): # single file specified file_data = xr.open_dataset(load_files) - df = file_data.to_dataframe().reset_index() + df = file_data.to_dataframe().reset_index() return df def read_into_xarray(self, load_files) -> list: @@ -99,24 +98,29 @@ def main(): list of xarray Datasets and/or a pandas DataFrame """ - file_reader = ReadNetCDF() + try: + file_reader = ReadNetCDF() + + # Reading in the configuration file + parser = argparse.ArgumentParser(description='Read in config file') + parser.add_argument('Path', metavar='yaml_config_file', type=str, + help='the full path to the YAML config file') + args = parser.parse_args() + specified_config_file = args.Path + load_files = file_reader.readYAMLConfig(specified_config_file) - # Reading in the configuration file - parser = argparse.ArgumentParser(description='Read in config file') - parser.add_argument('Path', metavar='yaml_config_file', type=str, - help='the full path to the YAML config file') - args = parser.parse_args() - specified_config_file = args.Path - load_files = file_reader.readYAMLConfig(specified_config_file) + # Pandas dataframes are much larger than xarrays + # The read_into_pandas should be commented out if you are testing this + # on very large files + netcdf_data_frame = file_reader.read_into_pandas(load_files) - #Pandas dataframes are much larger than xarrays - #The read_into_pandas should be commented out if you are testing this - #on very large files - netcdf_data_frame = file_reader.read_into_pandas(load_files) + netcdf_data_set = file_reader.read_into_xarray(load_files) - netcdf_data_set = file_reader.read_into_xarray(load_files) + except RuntimeError: + print( + "*** %s occurred setting up read_netcdf ***", sys.exc_info()[0]) + sys.exit("*** Error setting up read_netcdf") if __name__ == "__main__": main() - diff --git a/METreformat/write_stat_ascii.py b/METreformat/write_stat_ascii.py index 31908bba..98dcfb32 100644 --- a/METreformat/write_stat_ascii.py +++ b/METreformat/write_stat_ascii.py @@ -36,7 +36,6 @@ from METdbLoad.ush.read_load_xml import XmlLoadFile - class WriteStatAscii: """ Class to write MET .stat files to an ASCII file that contains the reformatted input data @@ -44,22 +43,26 @@ class WriteStatAscii: a Pandas dataframe and creates an ascii file with reformatted data. """ - def __init__(self, parms, logger): - # Set up logging - - log_directory = parms['log_directory'] + try: + # Set up logging - # Create log directory if it doesn't already exist. 
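ReadNetCDF above loads one or more NetCDF files either into xarray Datasets or into a single pandas DataFrame by way of xarray. A minimal sketch of the pandas path, independent of the YAML configuration handling (the helper name and file path are placeholders):

    import pandas as pd
    import xarray as xr

    def netcdf_to_dataframe(load_files) -> pd.DataFrame:
        """Read a single file or a list of files into one pandas DataFrame."""
        if isinstance(load_files, str):
            load_files = [load_files]
        frames = []
        for netcdf_file in load_files:
            # Each Dataset is flattened to a DataFrame indexed by its coordinates.
            with xr.open_dataset(netcdf_file) as file_data:
                frames.append(file_data.to_dataframe().reset_index())
        return pd.concat(frames, ignore_index=True)

    # Example usage with a placeholder path:
    # df = netcdf_to_dataframe("/path/to/sample.nc")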
- log_filename = (str(parms['log_filename'])).upper() - if not os.path.exists(log_directory) and log_filename != 'STDOUT': - os.mkdir(parms['log_directory']) + log_directory = parms['log_directory'] - self.logger = logger - self.parms = parms + # Create log directory if it doesn't already exist. + log_filename = (str(parms['log_filename'])).upper() + if not os.path.exists(log_directory) and log_filename != 'STDOUT': + os.mkdir(parms['log_directory']) + self.logger = logger + self.parms = parms + except RuntimeError: + self.logger = logger + self.logger.error( + "*** %s occurred while initializing class WriteStatAscii ***", sys.exc_info()[0]) + sys.exit("*** Error initializing class WriteStatAscii") def write_stat_ascii(self, stat_data: pd.DataFrame, parms: dict) -> pd.DataFrame: """ For line types: FHO, CTC, CTS, SL1L2, ECNT, MCTS, and VCNT reformat the MET stat files (.stat) to another @@ -118,9 +121,11 @@ def write_stat_ascii(self, stat_data: pd.DataFrame, parms: dict) -> pd.DataFrame working_df = working_df.loc[(working_df['line_type'] == linetype_requested) | (working_df['line_type'] == cn.TCMPR)] else: - working_df = working_df.loc[working_df['line_type'] == linetype_requested] + working_df = working_df.loc[working_df['line_type'] + == linetype_requested] else: - self.logger.error("Requested line type is currently not supported for reformatting") + self.logger.error( + "Requested line type is currently not supported for reformatting") raise ValueError("Requested line type ", linetype_requested, " is currently not supported for reformatting") @@ -150,7 +155,8 @@ def write_stat_ascii(self, stat_data: pd.DataFrame, parms: dict) -> pd.DataFrame begin_reformat = time.perf_counter() try: - reformatted_df = self.process_by_stat_linetype(linetype_requested, working_df, is_aggregated) + reformatted_df = self.process_by_stat_linetype( + linetype_requested, working_df, is_aggregated) except NotImplementedError: sys.exit('NotImplementedError') @@ -160,29 +166,32 @@ def write_stat_ascii(self, stat_data: pd.DataFrame, parms: dict) -> pd.DataFrame self.logger.info(msg) # Write out to the tab-separated text file - output_file = os.path.join(parms['output_dir'], parms['output_filename']) + output_file = os.path.join( + parms['output_dir'], parms['output_filename']) _: pd.DataFrame = reformatted_df.to_csv(output_file, index=None, sep='\t', mode='a') except (TypeError, NameError, KeyError, NotImplementedError): - self.logger.error("*** %s in write_stat_ascii ***", sys.exc_info()[0]) + self.logger.error( + "*** %s in write_stat_ascii ***", sys.exc_info()[0]) write_time_end: float = time.perf_counter() write_time = write_time_end - write_time_start - self.logger.info("Total time to reformat and write ASCII: %s seconds", str(write_time)) + self.logger.info( + "Total time to reformat and write ASCII: %s seconds", str(write_time)) return reformatted_df def process_by_stat_linetype(self, linetype: str, stat_data: pd.DataFrame, is_aggregated=True): """ - + For MET .stat output, extract the relevant statistics information into the necessary format based on whether the data is already aggregated (via MET stat-analysis) or if the data is un-aggregated and requires the METcalcpy agg_stat module for performing the aggregation statistics calculations. **NOTE** Support for reformatting into agg_stat's required input format is currently available for the *ECNT* linetype. This support will be extended to the other supported linetypes. - + Args: @param linetype: The linetype of interest (i.e. CNT, CTS, FHO, TCMPR, etc.) 
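write_stat_ascii above filters the loaded .stat data down to the requested line type, hands it to process_by_stat_linetype, and appends the reformatted result to a tab-separated text file. A simplified sketch of that flow, with reformat_for_linetype standing in for the per-linetype processing and the parameter names chosen for illustration:

    import os
    import pandas as pd

    def reformat_for_linetype(linetype: str, working_df: pd.DataFrame) -> pd.DataFrame:
        # Stand-in for the per-linetype reformatting logic.
        return working_df

    def write_reformatted(stat_data: pd.DataFrame, linetype: str,
                          output_dir: str, output_filename: str) -> pd.DataFrame:
        # Keep only the rows of the requested line type.
        working_df = stat_data.loc[stat_data["line_type"] == linetype]

        reformatted_df = reformat_for_linetype(linetype, working_df)

        # Append the reformatted rows to a tab-separated text file.
        output_file = os.path.join(output_dir, output_filename)
        reformatted_df.to_csv(output_file, index=False, sep="\t", mode="a")
        return reformatted_df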
@@ -214,63 +223,72 @@ def process_by_stat_linetype(self, linetype: str, stat_data: pd.DataFrame, is_ag if is_aggregated: linetype_data: pd.DataFrame = self.process_fho(stat_data) else: - linetype_data: pd.DataFrame = self.process_fho_for_agg(stat_data) + linetype_data: pd.DataFrame = self.process_fho_for_agg( + stat_data) # CNT Continuous Statistics elif linetype == cn.CNT: if is_aggregated: linetype_data: pd.DataFrame = self.process_cnt(stat_data) else: - linetype_data: pd.DataFrame = self.process_cnt_for_agg(stat_data) + linetype_data: pd.DataFrame = self.process_cnt_for_agg( + stat_data) # VCNT Continuous Statistics elif linetype == cn.VCNT: if is_aggregated: linetype_data: pd.DataFrame = self.process_vcnt(stat_data) else: - linetype_data: pd.DataFrame = self.process_vcnt_for_agg(stat_data) + linetype_data: pd.DataFrame = self.process_vcnt_for_agg( + stat_data) # CTC Contingency Table Counts elif linetype == cn.CTC: if is_aggregated: linetype_data: pd.DataFrame = self.process_ctc(stat_data) else: - linetype_data: pd.DataFrame = self.process_ctc_for_agg(stat_data) + linetype_data: pd.DataFrame = self.process_ctc_for_agg( + stat_data) # CTS Contingency Table Statistics elif linetype == cn.CTS: if is_aggregated: linetype_data: pd.DataFrame = self.process_cts(stat_data) else: - linetype_data: pd.DataFrame = self.process_cts_for_agg(stat_data) + linetype_data: pd.DataFrame = self.process_cts_for_agg( + stat_data) # MCTS Contingency Table Statistics elif linetype == cn.MCTS: if is_aggregated: linetype_data: pd.DataFrame = self.process_mcts(stat_data) else: - linetype_data: pd.DataFrame = self.process_mcts_for_agg(stat_data) + linetype_data: pd.DataFrame = self.process_mcts_for_agg( + stat_data) # SL1L2 Scalar Partial sums elif linetype == cn.SL1L2: if is_aggregated: linetype_data: pd.DataFrame = self.process_sl1l2(stat_data) else: - linetype_data: pd.DataFrame = self.process_sl1l2_for_agg(stat_data) + linetype_data: pd.DataFrame = self.process_sl1l2_for_agg( + stat_data) # VL1L2 Scalar Partial sums elif linetype == cn.VL1L2: if is_aggregated: linetype_data: pd.DataFrame = self.process_vl1l2(stat_data) else: - linetype_data: pd.DataFrame = self.process_vl1l2_for_agg(stat_data) + linetype_data: pd.DataFrame = self.process_vl1l2_for_agg( + stat_data) # ECNT Ensemble Continuous statistics elif linetype == cn.ECNT: if is_aggregated: linetype_data: pd.DataFrame = self.process_ecnt(stat_data) else: - linetype_data: pd.DataFrame = self.process_ecnt_for_agg(stat_data) + linetype_data: pd.DataFrame = self.process_ecnt_for_agg( + stat_data) # PCT elif linetype == cn.PCT: @@ -340,10 +358,12 @@ def process_pct(self, stat_data: pd.DataFrame) -> pd.DataFrame: total_number_variable_columns = num_thresh * num_repeating_col_labels + 1 # Add 1 for the TOTAL column to get the total number of columns for this line type - total_number_relevant_columns = cn.NUM_STATIC_PCT_COLS + total_number_variable_columns + 1 + total_number_relevant_columns = cn.NUM_STATIC_PCT_COLS + \ + total_number_variable_columns + 1 # Get a list of names of the columns that correspond to the PCT linetype for this data - only_relevant_columns = stat_data_copy.columns.tolist()[0:total_number_relevant_columns] + only_relevant_columns = stat_data_copy.columns.tolist()[ + 0:total_number_relevant_columns] filtered_df = stat_data_copy[only_relevant_columns] headers = filtered_df.columns @@ -359,14 +379,16 @@ def process_pct(self, stat_data: pd.DataFrame) -> pd.DataFrame: gc.collect() # Replace the first two numbered labels (following the LINETYPE 
column) with the TOTAL and N_THRESH labels - working_df.rename(columns={'0': 'total', cn.LINE_VAR_COUNTER[cn.PCT]: 'n_thresh'}, inplace=True) + working_df.rename( + columns={'0': 'total', cn.LINE_VAR_COUNTER[cn.PCT]: 'n_thresh'}, inplace=True) # Relabel the remaining numbered column headers last_column_name = len(working_df.columns) - cn.NUM_STATIC_PCT_COLS # The THRESH_n column is the last column thresh_n_col_name = 'thresh_' + str(num_thresh + 1) - working_df.rename(columns={str(last_column_name): thresh_n_col_name}, inplace=True) + working_df.rename( + columns={str(last_column_name): thresh_n_col_name}, inplace=True) # Relabel the repeating columns (THRESH_i, OY_i, ON_i) # column names are numbered '1','2','3',...,etc. Give them descriptive labels: thresh_1, oy_1, on_1, etc. @@ -377,11 +399,13 @@ def process_pct(self, stat_data: pd.DataFrame) -> pd.DataFrame: for column in cn.LC_PCT_VARIABLE_HEADERS: column_name = str(column_name_value) column_label = "{label}_{idx}".format(label=column, idx=i) - working_df.rename(columns={column_name: column_label}, inplace=True) + working_df.rename( + columns={column_name: column_label}, inplace=True) column_name_value += 1 # Add a list used to facilitate creating the value_i column when reformatting. - ith_value_label.append("{label}_{idx}".format(label="value", idx=i)) + ith_value_label.append( + "{label}_{idx}".format(label="value", idx=i)) # Create a dataframe consisting only of the value_1, ..., value_n values and their corresponding index values # and concat to the working_df. @@ -409,7 +433,8 @@ def process_pct(self, stat_data: pd.DataFrame) -> pd.DataFrame: # Reindex working_df to match the value_df index. This ensures correct concatenation of # the working_df with the value_df working_df_reindexed = working_df.reset_index(drop=False) - working_df_reindexed = pd.concat([working_df_reindexed, value_df], axis=1) + working_df_reindexed = pd.concat( + [working_df_reindexed, value_df], axis=1) # Clean up working_df dataframe, it is no longer needed del working_df @@ -457,8 +482,10 @@ def process_pct(self, stat_data: pd.DataFrame) -> pd.DataFrame: common_list.append('n_thresh') df_thresh = working_copy_df.melt(id_vars=common_list, value_vars=thresh_cols, var_name='thresh', value_name='thresh_i') - df_oy = working_copy_df.melt(id_vars=common_list, value_vars=oy_cols, var_name='oy', value_name='oy_i') - df_on = working_copy_df.melt(id_vars=common_list, value_vars=on_cols, var_name='on', value_name='on_i') + df_oy = working_copy_df.melt( + id_vars=common_list, value_vars=oy_cols, var_name='oy', value_name='oy_i') + df_on = working_copy_df.melt( + id_vars=common_list, value_vars=on_cols, var_name='on', value_name='on_i') df_values = working_copy_df.melt(id_vars=common_list, value_vars=i_value, var_name='values', value_name='i_value') @@ -470,11 +497,16 @@ def process_pct(self, stat_data: pd.DataFrame) -> pd.DataFrame: # Reindex to use the common columns before concatenating the melted dataframes to avoid duplication of # common columns. 
- df_thresh_reindex = df_thresh.set_index(common_list, drop=True, append=False, inplace=False) - df_oy_reindex = df_oy.set_index(common_list, drop=True, append=False, inplace=False) - df_on_reindex = df_on.set_index(common_list, drop=True, append=False, inplace=False) - df_values_reindex = df_values.set_index(common_list, drop=True, append=False, inplace=False) - reformatted_df = pd.concat([df_thresh_reindex, df_oy_reindex, df_on_reindex, df_values_reindex], axis=1) + df_thresh_reindex = df_thresh.set_index( + common_list, drop=True, append=False, inplace=False) + df_oy_reindex = df_oy.set_index( + common_list, drop=True, append=False, inplace=False) + df_on_reindex = df_on.set_index( + common_list, drop=True, append=False, inplace=False) + df_values_reindex = df_values.set_index( + common_list, drop=True, append=False, inplace=False) + reformatted_df = pd.concat( + [df_thresh_reindex, df_oy_reindex, df_on_reindex, df_values_reindex], axis=1) # clean up del working_copy_df @@ -525,10 +557,12 @@ def process_rhist(self, stat_data: pd.DataFrame) -> pd.DataFrame: num_rank: int = int(stat_data_copy.iloc[0][cn.NUM_STATIC_RHIST_COLS]) # Add 1 for the TOTAL column to get the total number of columns for this line type - total_number_relevant_columns = cn.NUM_STATIC_RHIST_COLS + num_rank * num_repeating_col_labels + 1 + total_number_relevant_columns = cn.NUM_STATIC_RHIST_COLS + \ + num_rank * num_repeating_col_labels + 1 # Get a list of names of the columns that correspond to the RHIST linetype for this data - only_relevant_columns = stat_data_copy.columns.tolist()[0:total_number_relevant_columns] + only_relevant_columns = stat_data_copy.columns.tolist()[ + 0:total_number_relevant_columns] filtered_df = stat_data_copy[only_relevant_columns] headers = filtered_df.columns @@ -544,7 +578,8 @@ def process_rhist(self, stat_data: pd.DataFrame) -> pd.DataFrame: gc.collect() # Replace the first two numbered labels (following the LINETYPE column) with the TOTAL and N_RANK labels - working_df.rename(columns={'0': 'total', cn.LINE_VAR_COUNTER[cn.RHIST]: 'n_rank'}, inplace=True) + working_df.rename( + columns={'0': 'total', cn.LINE_VAR_COUNTER[cn.RHIST]: 'n_rank'}, inplace=True) # Relabel the repeating columns (RANK_1, ..., RANK_n) # column names are numbered '1','2','3',...,etc. by METdbLoad. @@ -556,11 +591,13 @@ def process_rhist(self, stat_data: pd.DataFrame) -> pd.DataFrame: for column in cn.LC_RHIST_VARIABLE_HEADERS: column_name = str(column_name_value) column_label = "{label}_{idx}".format(label=column, idx=i) - working_df.rename(columns={column_name: column_label}, inplace=True) + working_df.rename( + columns={column_name: column_label}, inplace=True) column_name_value += 1 # Add a list used to facilitate creating the value_i column when reformatting. - ith_value_label.append("{label}_{idx}".format(label="value", idx=i)) + ith_value_label.append( + "{label}_{idx}".format(label="value", idx=i)) # Create a dataframe consisting only of the value_1, ..., value_n values and their corresponding index values # and concat to the working_df. @@ -588,7 +625,8 @@ def process_rhist(self, stat_data: pd.DataFrame) -> pd.DataFrame: # Reindex working_df to match the value_df index. 
This ensures correct concatenation of # the working_df with the value_df working_df_reindexed = working_df.reset_index(drop=False) - working_df_reindexed = pd.concat([working_df_reindexed, value_df], axis=1) + working_df_reindexed = pd.concat( + [working_df_reindexed, value_df], axis=1) # Clean up working_df dataframe, it is no longer needed del working_df @@ -622,7 +660,8 @@ def process_rhist(self, stat_data: pd.DataFrame) -> pd.DataFrame: # Now apply melt to get the rank_i and i_value columns # include the n_rank column for indexing. common_list.append('n_rank') - df_rank = working_copy_df.melt(id_vars=common_list, value_vars=rank_cols, var_name='rank', value_name='rank_i') + df_rank = working_copy_df.melt( + id_vars=common_list, value_vars=rank_cols, var_name='rank', value_name='rank_i') df_values = working_copy_df.melt(id_vars=common_list, value_vars=i_value, var_name='values', value_name='i_value') @@ -632,9 +671,12 @@ def process_rhist(self, stat_data: pd.DataFrame) -> pd.DataFrame: # Reindex to use the common columns before concatenating the melted dataframes to avoid duplication of # common columns. - df_rank_reindex = df_rank.set_index(common_list, drop=True, append=False, inplace=False) - df_values_reindex = df_values.set_index(common_list, drop=True, append=False, inplace=False) - reformatted_df = pd.concat([df_rank_reindex, df_values_reindex], axis=1) + df_rank_reindex = df_rank.set_index( + common_list, drop=True, append=False, inplace=False) + df_values_reindex = df_values.set_index( + common_list, drop=True, append=False, inplace=False) + reformatted_df = pd.concat( + [df_rank_reindex, df_values_reindex], axis=1) # clean up del working_copy_df @@ -686,7 +728,7 @@ def process_fho(self, stat_data: pd.DataFrame) -> pd.DataFrame: # Subset original dataframe to another dataframe consisting of only the FHO # line type. 
fho_df: pd.DataFrame = stat_data[stat_data['line_type'] == linetype].iloc[:, - fho_columns_to_use] + fho_columns_to_use] # Add the stat columns header names for the FHO line type fho_columns: List[str] = cn.FHO_FULL_HEADER @@ -759,7 +801,7 @@ def process_cnt(self, stat_data: pd.DataFrame) -> pd.DataFrame: # Subset original dataframe to one containing only the CNT data cnt_df: pd.DataFrame = stat_data[stat_data['line_type'] == linetype].iloc[:, - cnt_columns_to_use] + cnt_columns_to_use] # Add the stat columns for the CNT line type cnt_columns: List[str] = cn.FULL_CNT_HEADER @@ -855,7 +897,7 @@ def process_vcnt(self, stat_data: pd.DataFrame) -> pd.DataFrame: # Subset original dataframe to one containing only the VCNT data vcnt_df: pd.DataFrame = stat_data[stat_data['line_type'] == linetype].iloc[:, - vcnt_columns_to_use] + vcnt_columns_to_use] # Add the stat columns for the CNT line type vcnt_columns: List[str] = cn.FULL_VCNT_HEADER @@ -950,7 +992,7 @@ def process_ctc(self, stat_data: pd.DataFrame) -> pd.DataFrame: # Subset original dataframe to one containing only the CTC data ctc_df: pd.DataFrame = stat_data[stat_data['line_type'] == linetype].iloc[:, - ctc_columns_to_use] + ctc_columns_to_use] # Add the stat columns header names for the CTC line type ctc_columns: List[str] = cn.CTC_HEADERS @@ -1017,7 +1059,7 @@ def process_cts(self, stat_data: pd.DataFrame) -> pd.DataFrame: # Subset original dataframe to one containing only the CTS data cts_df: pd.DataFrame = stat_data[stat_data['line_type'] == linetype].iloc[:, - cts_columns_to_use] + cts_columns_to_use] # Add all the columns header names for the CTS line type cts_columns: List[str] = cn.CTS_SPECIFIC_HEADERS @@ -1114,7 +1156,7 @@ def process_mcts(self, stat_data: pd.DataFrame) -> pd.DataFrame: # Subset original dataframe to one containing only the CTS data mcts_df: pd.DataFrame = stat_data[stat_data['line_type'] == linetype].iloc[:, - mcts_columns_to_use] + mcts_columns_to_use] # Add all the columns header names for the MCTS line type mcts_columns: List[str] = cn.MCTS_SPECIFIC_HEADERS @@ -1208,7 +1250,7 @@ def process_sl1l2(self, stat_data: pd.DataFrame) -> pd.DataFrame: # Subset original dataframe to one containing only the Sl1L2 data sl1l2_df: pd.DataFrame = stat_data[stat_data['line_type'] == linetype].iloc[:, - sl1l2_columns_to_use] + sl1l2_columns_to_use] # Add the stat columns header names for the SL1L2 line type sl1l2_columns: List[str] = cn.SL1L2_HEADERS @@ -1272,7 +1314,7 @@ def process_vl1l2(self, stat_data: pd.DataFrame) -> pd.DataFrame: # Subset original dataframe to one containing only the Sl1L2 data vl1l2_df: pd.DataFrame = stat_data[stat_data['line_type'] == linetype].iloc[:, - vl1l2_columns_to_use] + vl1l2_columns_to_use] # Add the stat columns header names for the SL1L2 line type vl1l2_columns: List[str] = cn.VL1L2_HEADERS @@ -1336,7 +1378,7 @@ def process_ecnt(self, stat_data: pd.DataFrame) -> pd.DataFrame: # Subset original dataframe to one containing only the ECNT data ecnt_df: pd.DataFrame = stat_data[stat_data['line_type'] == linetype].iloc[:, - ecnt_columns_to_use] + ecnt_columns_to_use] # Add the stat columns header names for the ECNT line type ecnt_columns: List[str] = cn.ECNT_HEADERS @@ -1411,7 +1453,7 @@ def process_ecnt_for_agg(self, stat_data: pd.DataFrame) -> pd.DataFrame: # Subset original dataframe to one containing only the ECNT data ecnt_df: pd.DataFrame = stat_data[stat_data['line_type'] == linetype].iloc[:, - ecnt_columns_to_use] + ecnt_columns_to_use] # Replace the column numbers with the name of the 
corresponding statistic as specified in MET # User's Guide for the ECNT linetype in the ensemble stat table. @@ -1424,7 +1466,8 @@ def process_ecnt_for_agg(self, stat_data: pd.DataFrame) -> pd.DataFrame: # each ECNT-specific statistic. This will result in a very large dataframe. linetype_str = linetype.upper() + '_' ecnt_headers = cn.LC_ECNT_SPECIFIC - renamed_ecnt = [linetype_str + cur_hdr.upper() for cur_hdr in ecnt_headers] + renamed_ecnt = [linetype_str + cur_hdr.upper() + for cur_hdr in ecnt_headers] # Create a list of dataframes, each corresponding to the ECNT statistics, then merge them # all into one final dataframe. @@ -1438,7 +1481,8 @@ def process_ecnt_for_agg(self, stat_data: pd.DataFrame) -> pd.DataFrame: # Merge all the statistics dataframes into one, then add the # stat_value column. Initialize the stat_values to NaN/NA. These # values will be filled by the METcalcpy agg_stat calculation. - merged_dfs: pd.DataFrame = pd.concat(dfs_to_merge, axis=0, ignore_index=True) + merged_dfs: pd.DataFrame = pd.concat( + dfs_to_merge, axis=0, ignore_index=True) merged_dfs['stat_value'] = np.nan merged_dfs.replace('N/A', pd.NA) @@ -1549,7 +1593,8 @@ def process_tcdiag(self, stat_data: pd.DataFrame) -> pd.DataFrame: # Join the TCMPR and TCDIAG dataframes into one and do some cleaning up of columns uc_long_header_tcst = [hdr.upper() for hdr in cn.LONG_HEADER_TCST] common_headers = uc_long_header_tcst[0:len(uc_long_header_tcst) - 1] - full_df = pd.merge(reformatted_tcmpr, all_tcdiag_reformatted, on=common_headers, how='inner') + full_df = pd.merge( + reformatted_tcmpr, all_tcdiag_reformatted, on=common_headers, how='inner') # Clean up extraneous columns: # TOTAL_x and TOTAL_y are identical, drop TOTAL_y and rename TOTAL_x to TOTAL @@ -1557,16 +1602,17 @@ def process_tcdiag(self, stat_data: pd.DataFrame) -> pd.DataFrame: cleanup_df = full_df.copy(deep=True) cleanup_df.drop('TOTAL_y', axis=1, inplace=True) cleanup_df.drop('LINE_TYPE_x', axis=1, inplace=True) - cleanup_df.rename({'TOTAL_x': 'TOTAL', 'LINE_TYPE_y': 'LINE_TYPE'}, axis=1, inplace=True) + cleanup_df.rename( + {'TOTAL_x': 'TOTAL', 'LINE_TYPE_y': 'LINE_TYPE'}, axis=1, inplace=True) end_tcdiag = time.perf_counter() time_to_process_tcdiag = end_tcdiag - begin_tcdiag - self.logger.info(f"Total time for processing the TCDiag matched pair linetype: {time_to_process_tcdiag} seconds") + self.logger.info( + f"Total time for processing the TCDiag matched pair linetype: {time_to_process_tcdiag} seconds") return cleanup_df def reformat_tcdiag(self, tcdiag_df: pd.DataFrame) -> pd.DataFrame: - """ Takes a TCDiag dataframe and reformats it by replacing the VALUE_i column with the value of the corresponding DIAG_i @@ -1589,7 +1635,8 @@ def reformat_tcdiag(self, tcdiag_df: pd.DataFrame) -> pd.DataFrame: """ begin_reformat = time.perf_counter() - self.logger.info("Reformat the TCDiag dataframe based on the DIAG_SOURCE ") + self.logger.info( + "Reformat the TCDiag dataframe based on the DIAG_SOURCE ") n_diag_col_name = cn.LINE_VAR_COUNTER[cn.TCDIAG] ds_df = tcdiag_df.copy(deep=True) @@ -1629,7 +1676,8 @@ def reformat_tcdiag(self, tcdiag_df: pd.DataFrame) -> pd.DataFrame: diag_name = diag_names[0] # Replace the VALUE_i column corresponding to the DIAG_i with the name of the diagnostic - ds_df.rename({start_value: diag_name}, axis='columns', inplace=True) + ds_df.rename({start_value: diag_name}, + axis='columns', inplace=True) diag_to_drop.append(start_diag) next_diag = str(int(start_diag) + 2) next_value = str(int(start_value) + 2) @@ -1649,15 +1697,20 
@@ def reformat_tcdiag(self, tcdiag_df: pd.DataFrame) -> pd.DataFrame:
         # for shear magnitude
         reformatted_cols = reformatted.columns.to_list()
         if 'SHR_MAG' in reformatted_cols:
-            reformatted.rename({'SHR_MAG': cn.TCDIAG_COMMON_NAMES['SHR_MAG']}, axis='columns', inplace=True)
+            reformatted.rename(
+                {'SHR_MAG': cn.TCDIAG_COMMON_NAMES['SHR_MAG']}, axis='columns', inplace=True)
         elif 'SHRD' in reformatted_cols:
-            reformatted.rename({'SHRD': cn.TCDIAG_COMMON_NAMES['SHRD']}, axis='columns', inplace=True)
+            reformatted.rename(
+                {'SHRD': cn.TCDIAG_COMMON_NAMES['SHRD']}, axis='columns', inplace=True)
         if 'LAND' in reformatted_cols:
-            reformatted.rename({'LAND': cn.TCDIAG_COMMON_NAMES['LAND']}, axis='columns', inplace=True)
+            reformatted.rename(
+                {'LAND': cn.TCDIAG_COMMON_NAMES['LAND']}, axis='columns', inplace=True)
         elif 'DTL' in reformatted_cols:
-            reformatted.rename({'DTL': cn.TCDIAG_COMMON_NAMES['DTL']}, axis='columns', inplace=True)
+            reformatted.rename(
+                {'DTL': cn.TCDIAG_COMMON_NAMES['DTL']}, axis='columns', inplace=True)
         if 'STM_SPD' in reformatted_cols:
-            reformatted.rename({'STM_SPD': cn.TCDIAG_COMMON_NAMES['STM_SPD']}, axis='columns', inplace=True)
+            reformatted.rename(
+                {'STM_SPD': cn.TCDIAG_COMMON_NAMES['STM_SPD']}, axis='columns', inplace=True)
 
         # Clean up intermediate dataframes
         del ds_df
@@ -1665,7 +1718,8 @@ def reformat_tcdiag(self, tcdiag_df: pd.DataFrame) -> pd.DataFrame:
 
         end_reformat = time.perf_counter()
         time_to_reformat = end_reformat - begin_reformat
-        self.logger.info(f"Finished reformatting TCDiag matched pair output in {time_to_reformat} seconds")
+        self.logger.info(
+            f"Finished reformatting TCDiag matched pair output in {time_to_reformat} seconds")
 
         return reformatted
 
@@ -1702,7 +1756,8 @@ def reformat_tcmpr(self, tcmpr_df: pd.DataFrame) -> pd.DataFrame:
 
         end_reformat = time.perf_counter()
         reformat_time = end_reformat - begin_reformat
-        self.logger.info("Reformatting the TCMPR dataframe took {reformat_time} seconds")
+        self.logger.info(
+            f"Reformatting the TCMPR dataframe took {reformat_time} seconds")
 
         return tcmpr_relevant
 
@@ -1747,7 +1802,7 @@ def process_mpr(self, stat_data: pd.DataFrame) -> pd.DataFrame:
         # Subset the original dataframe to another dataframe consisting of only the MPR
         # line type. The MPR specific columns will only have numbers at this point.
         mpr_df: pd.DataFrame = stat_data[stat_data['line_type'] == linetype].iloc[:,
-                               mpr_columns_to_use]
+                                                                                  mpr_columns_to_use]
 
         # Add the stat columns header names for the MPR line type
         mpr_columns: List[str] = cn.MPR_HEADERS
@@ -1788,17 +1843,17 @@ def process_mpr(self, stat_data: pd.DataFrame) -> pd.DataFrame:
         # key-value structure with variables in one column and their corresponding
         # values in another column). Omit the matched pair index.
variables_to_transform = list(cn.LC_MPR_SPECIFIC)[:] - self.logger.info(f"Variables to transform from wide to long: {cn.LC_MPR_SPECIFIC[1:]} ") + self.logger.info( + f"Variables to transform from wide to long: {cn.LC_MPR_SPECIFIC[1:]} ") melted: pd.DataFrame = pd.melt(mpr_df_copy, id_vars=columns_to_use[1:28], - value_vars=variables_to_transform, - var_name='stat_name', - value_name='stat_value', - ignore_index=True) + value_vars=variables_to_transform, + var_name='stat_name', + value_name='stat_value', + ignore_index=True) linetype_data = melted.copy(deep=True) - # The MPR line type doesn't have the bcl and bcu stat values; set these to NA na_column: List[str] = ['NA' for _ in range(0, linetype_data.shape[0])] @@ -1815,7 +1870,6 @@ def process_mpr(self, stat_data: pd.DataFrame) -> pd.DataFrame: return linetype_data - def rename_confidence_level_columns(self, confidence_level_columns: List[str]) -> \ List[str]: """ @@ -1838,7 +1892,8 @@ def rename_confidence_level_columns(self, confidence_level_columns: List[str]) - renamed: List[str] = [] for cur_col in confidence_level_columns: - match = re.match(r'(.+)_(BCL|bcl|BCU|bcu|NCL|ncl|NCU|ncu)', cur_col) + match = re.match( + r'(.+)_(BCL|bcl|BCU|bcu|NCL|ncl|NCU|ncu)', cur_col) if match: rearranged = match.group(2) + '_' + match.group(1) renamed.append(rearranged.upper()) @@ -1971,42 +2026,49 @@ def main(): stat_ncl, and stat_ncu columns. ''' - # Acquire the output file name and output directory information and location of - # the xml specification file - config_file: str = util.read_config_from_command_line() - with open(config_file, 'r') as stream: - try: - parms: dict = yaml.load(stream, Loader=yaml.FullLoader) - pathlib.Path(parms['output_dir']).mkdir(parents=True, exist_ok=True) - except yaml.YAMLError: - sys.exit(1) + try: + # Acquire the output file name and output directory information and location of + # the xml specification file + config_file: str = util.read_config_from_command_line() + with open(config_file, 'r') as stream: + try: + parms: dict = yaml.load(stream, Loader=yaml.FullLoader) + pathlib.Path(parms['output_dir']).mkdir( + parents=True, exist_ok=True) + except yaml.YAMLError: + sys.exit(1) - log_dir = parms['log_directory'] + log_dir = parms['log_directory'] - # Create the log directory if it doesn't alreaedy exist - try: - os.makedirs(log_dir) - except: - # ignore warning that is raised - # when the directory already exists - pass - - full_log_filename = os.path.join(log_dir, parms['log_filename']) - logger = util.get_common_logger(parms['log_level'],full_log_filename) - - file_df: pd.DataFrame = read_input(parms, logger) - - # Check if the output file already exists, if so, delete it to avoid - # appending output from subsequent runs into the same file. 
-    existing_output_file = os.path.join(parms['output_dir'], parms['output_filename'])
-    if os.path.exists(existing_output_file):
-        logger.info("Output file already exists, removing this file.")
-        os.remove(existing_output_file)
-
-    # Write stat file in ASCII format
-    stat_lines_obj: WriteStatAscii = WriteStatAscii(parms, logger)
-    # stat_lines_obj.write_stat_ascii(file_df, parms, logger)
-    stat_lines_obj.write_stat_ascii(file_df, parms)
+        # Create the log directory if it doesn't already exist
+        try:
+            os.makedirs(log_dir)
+        except OSError:
+            # ignore warning that is raised
+            # when the directory already exists
+            pass
+
+        full_log_filename = os.path.join(log_dir, parms['log_filename'])
+        logger = util.get_common_logger(parms['log_level'], full_log_filename)
+
+        file_df: pd.DataFrame = read_input(parms, logger)
+
+        # Check if the output file already exists, if so, delete it to avoid
+        # appending output from subsequent runs into the same file.
+        existing_output_file = os.path.join(
+            parms['output_dir'], parms['output_filename'])
+        if os.path.exists(existing_output_file):
+            logger.info("Output file already exists, removing this file.")
+            os.remove(existing_output_file)
+
+        # Write stat file in ASCII format
+        stat_lines_obj: WriteStatAscii = WriteStatAscii(parms, logger)
+        # stat_lines_obj.write_stat_ascii(file_df, parms, logger)
+        stat_lines_obj.write_stat_ascii(file_df, parms)
+    except RuntimeError:
+        print(
+            f"*** {sys.exc_info()[0]} occurred setting up write_stat_ascii ***")
+        sys.exit("*** Error setting up write_stat_ascii")
 
 
 if __name__ == "__main__":
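
A note on the PCT and RHIST reformatting hunks above: the repeating wide columns (thresh_1/oy_1/on_1, ..., rank_1, ...) are collapsed into long format with pandas.melt and the melted pieces are then stitched back together on the shared identifying columns. The sketch below is illustrative only; the toy frame and column names are invented, and it attaches the melted value columns by position rather than by the set_index/concat alignment the real code uses.

# Minimal sketch (illustrative, not project code) of the wide-to-long melt
# pattern used for PCT-style repeating columns.
import pandas as pd

wide = pd.DataFrame({
    'model': ['GFS', 'NAM'],
    'n_thresh': [2, 2],
    'thresh_1': [0.1, 0.1], 'oy_1': [10, 12], 'on_1': [90, 88],
    'thresh_2': [0.5, 0.5], 'oy_2': [4, 6], 'on_2': [96, 94],
})

common = ['model', 'n_thresh']
df_thresh = wide.melt(id_vars=common, value_vars=['thresh_1', 'thresh_2'],
                      var_name='thresh', value_name='thresh_i')
df_oy = wide.melt(id_vars=common, value_vars=['oy_1', 'oy_2'],
                  var_name='oy', value_name='oy_i')
df_on = wide.melt(id_vars=common, value_vars=['on_1', 'on_2'],
                  var_name='on', value_name='on_i')

# melt keeps the same row order for each piece, so the oy_i and on_i columns
# line up one-to-one with the threshold rows.
long_df = df_thresh.drop(columns=['thresh']).assign(
    oy_i=df_oy['oy_i'], on_i=df_on['on_i'])
print(long_df)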
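
The reformat_tcdiag hunks rename each VALUE_i column to the diagnostic name carried in its paired DIAG_i column and then drop the DIAG_i columns, with a final pass mapping source-specific names such as SHR_MAG/SHRD onto cn.TCDIAG_COMMON_NAMES. A hypothetical sketch of the renaming step follows; the storm IDs, diagnostics, values, and fixed pair count are made up for illustration.

# Hypothetical sketch of the DIAG_i/VALUE_i renaming idea (not project code).
import pandas as pd

tcdiag = pd.DataFrame({
    'STORM_ID': ['AL092023', 'AL092023'],
    'DIAG_1': ['SHR_MAG', 'SHR_MAG'], 'VALUE_1': [12.0, 14.0],
    'DIAG_2': ['STM_SPD', 'STM_SPD'], 'VALUE_2': [8.0, 9.0],
})

n_diag = 2
diag_cols = []
for i in range(1, n_diag + 1):
    diag_col, value_col = f'DIAG_{i}', f'VALUE_{i}'
    # Each DIAG_i column holds a single diagnostic name for these rows, so the
    # first entry is enough to label the paired VALUE_i column.
    diag_name = tcdiag[diag_col].iloc[0]
    tcdiag = tcdiag.rename(columns={value_col: diag_name})
    diag_cols.append(diag_col)

reformatted = tcdiag.drop(columns=diag_cols)
print(reformatted)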
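
The restructured main() above wraps the YAML configuration read, directory setup, and writer construction in a try/except so that setup failures exit with a message instead of a bare traceback. A sketch of that shape is below; it assumes a PyYAML config with the output_dir, log_directory, log_filename, and output_filename keys referenced in the patch, and load_settings is a placeholder name rather than the module's API.

# Sketch only: guarded setup in the style of the patched main().
import os
import pathlib
import sys

import yaml


def load_settings(config_file: str) -> dict:
    """Read the YAML settings and prepare the output and log directories."""
    with open(config_file, 'r') as stream:
        try:
            parms: dict = yaml.load(stream, Loader=yaml.FullLoader)
        except yaml.YAMLError as exc:
            sys.exit(f"*** Error parsing {config_file}: {exc}")

    pathlib.Path(parms['output_dir']).mkdir(parents=True, exist_ok=True)

    # exist_ok=True makes the makedirs call idempotent, so no OSError handler
    # is needed when the log directory is already present.
    os.makedirs(parms['log_directory'], exist_ok=True)

    # Remove a stale output file so repeated runs don't append to it.
    output_file = os.path.join(parms['output_dir'], parms['output_filename'])
    if os.path.exists(output_file):
        os.remove(output_file)

    return parms

Using exist_ok=True is one way the except OSError: pass block in the patch could be dropped entirely.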