-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #23 from Urban-Research-Group/refactor
Refactor
- Loading branch information
Showing
12 changed files
with
403 additions
and
324 deletions.
There are no files selected for viewing
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
from dataclasses import dataclass
from typing import Optional

import pandas as pd

from src import file_io
from src import file_utils
|
||
|
||
# TODO: check YAML formatting
@dataclass
class DataInfo:
    """Retains data parameters for processing instructions.

    Holds the values parsed from a county's YAML config, plus derived
    artifacts (county file list, split variable maps) that are filled
    in by ``create_data_info`` and default to ``None`` until then.
    """

    county_name: str  # county identifier from the config
    county_path: str  # directory containing the county's data files
    var_map_path: str  # path to the variable-map CSV
    operations: dict  # processing instructions from the config
    output: dict  # output settings from the config
    # Optional[...] rather than bare list/DataFrame: these default to
    # None, so the original annotations (e.g. ``list = None``) were wrong.
    county_files: Optional[list] = None
    var_map_non_derived: Optional[pd.DataFrame] = None
    var_map_derived: Optional[pd.DataFrame] = None
|
||
|
||
def create_data_info(config_path: str) -> DataInfo:
    """Reads a YAML config instruction set and initializes a
    DataInfo dataclass with values from the config.

    Args:
        config_path (str): path to config YAML

    Returns:
        DataInfo: dataclass containing config parameters, the list of
        county files, and the variable map split into derived and
        non-derived rows.
    """
    config = file_io.read_config(config_path)
    county_path = config["county-path"]
    var_map_path = config["var-map-path"]

    county_files = file_utils.get_all_files_in_dir(county_path)
    var_map = file_io.read_var_map(var_map_path)

    # Rows whose "derived" flag reads "false" are the non-derived
    # variables; everything else is treated as derived.
    # NOTE(review): the original assigned the slices the other way
    # around (non_derived got the ~mask rows), contradicting the mask's
    # meaning — confirm downstream consumers expect this corrected split.
    is_non_derived = var_map["derived"].str.lower().eq("false")
    var_map_non_derived = var_map[is_non_derived]
    var_map_derived = var_map[~is_non_derived]

    return DataInfo(
        county_name=config["county-name"],
        county_path=county_path,
        var_map_path=var_map_path,
        operations=config["operations"],
        output=config["output"],
        county_files=county_files,
        var_map_non_derived=var_map_non_derived,
        var_map_derived=var_map_derived,
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
import yaml | ||
import numpy as np | ||
import pandas as pd | ||
from src.logger import configure_logger, timing | ||
|
||
logger = configure_logger() | ||
|
||
|
||
class File: | ||
def __init__(self, file_path: str, format_file=None): | ||
self.file_path = file_path | ||
self.format_file = format_file | ||
|
||
@staticmethod | ||
def _read_format_file(path): | ||
return np.loadtxt(path, dtype=str) | ||
|
||
@staticmethod | ||
def _get_widths_from_format(format_file: np.array): | ||
return format_file[:, 2].astype(np.int16).tolist() | ||
|
||
@staticmethod | ||
def _get_column_headers_from_format(format_file: np.array): | ||
header_col = format_file[:, 0] | ||
return header_col | ||
|
||
def _get_fwf_paramters(self): | ||
format_file = File._read_format_file(self.format_file) | ||
widths = File._get_widths_from_format(format_file) | ||
column_headers = File._get_column_headers_from_format(format_file) | ||
return widths, column_headers | ||
|
||
@timing | ||
def read(self): | ||
"""Reads a file based on its extension and returns a dataframe""" | ||
file_type = self.file_path.split(".")[-1].lower() | ||
|
||
match file_type: | ||
case "csv": | ||
df = pd.read_csv(self.file_path, encoding="latin-1", low_memory=False) | ||
case "xlsx" | "xls": | ||
df = pd.read_excel(self.file_path) | ||
case "txt": | ||
widths, column_headers = self._get_fwf_paramters() | ||
df = pd.read_fwf( | ||
self.file_path, widths=widths, header=None, encoding="latin-1" | ||
) | ||
df.columns = column_headers | ||
case "dat": | ||
NotImplemented | ||
case _: | ||
error_msg = f"File type {file_type} not supported" | ||
logger.error(error_msg) | ||
raise ValueError(error_msg) | ||
|
||
return df | ||
|
||
|
||
def read_var_map(var_map_path: str) -> pd.DataFrame:
    """Reads the variable-map CSV into a DataFrame.

    Args:
        var_map_path (str): path to the variable-map CSV file

    Returns:
        pd.DataFrame: the parsed variable map

    Raises:
        Exception: re-raises whatever ``pd.read_csv`` raised, after logging.
    """
    try:
        return pd.read_csv(var_map_path)
    except Exception as e:
        # The original logged and swallowed the error, then hit an
        # UnboundLocalError on ``return var_map``; log and propagate.
        logger.error("Could not read var map file with error %s", e)
        raise
|
||
|
||
def read_config(config_path: str) -> dict:
    """Reads a YAML config file into a dict.

    Args:
        config_path (str): path to the config YAML file

    Returns:
        dict: parsed config contents

    Raises:
        Exception: re-raises open/parse errors after logging.
    """
    try:
        # Distinct names for the file handle and the parsed data; the
        # original reused ``config`` for both.
        with open(config_path, "r") as config_file:
            return yaml.safe_load(config_file)
    except Exception as e:
        # The original logged and swallowed the error, then hit an
        # UnboundLocalError on ``return config``; log and propagate.
        logger.error("Could not read config file with error %s", e)
        raise
|
||
|
||
class WriteOutput:
    """Writes a DataFrame to one or more output formats.

    Attributes:
        data: object with ``to_csv``/``to_parquet`` (e.g. a DataFrame)
        output_path: destination path without extension
        output_formats: iterable of format names ("csv", "parquet")
    """

    def __init__(self, data, output_path, output_formats):
        self.data = data
        self.output_path = output_path
        self.output_formats = output_formats

    @timing
    def write_output(self):
        """Writes ``self.data`` once per requested format; unknown
        formats are skipped with a warning."""
        for output_format in self.output_formats:
            if output_format == "csv":
                self.write_csv()
            elif output_format == "parquet":
                self.write_parquet()
            else:
                logger.warning("%s is not supported!", output_format)

    def write_csv(self):
        """Writes data to ``<output_path>.csv``, logging the outcome."""
        try:
            self.data.to_csv(self.output_path + ".csv")
        except Exception as e:
            self.log("CSV", e)
        else:
            # The original only called log() from except blocks, so the
            # success message in log() was unreachable; report success too.
            self.log("CSV", None)

    def write_parquet(self):
        """Writes data to ``<output_path>.parquet``, logging the outcome."""
        try:
            self.data.to_parquet(self.output_path + ".parquet")
        except Exception as e:
            self.log("Parquet", e)
        else:
            self.log("Parquet", None)

    def log(self, file_type, e):
        """Logs an error if ``e`` is truthy, otherwise a success message."""
        if e:
            logger.error("Error writing %s output: %s", file_type, e)
        else:
            logger.info("Saved to %s", self.output_path)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.