Merge pull request #135 from vfedotovs/staging-rc-1.4.7

Staging rc 1.4.7
vfedotovs committed Jun 7, 2022
2 parents c53b3f0 + 1516d2f commit b7c5ab1
Showing 17 changed files with 383 additions and 86 deletions.
92 changes: 92 additions & 0 deletions app/wsmodules/DataAnalyser.py
@@ -0,0 +1,92 @@
#!/usr/bin/env python3
""" DataAnalyzer creates statistical text segments and images for report
DataAnalyzer provides functionality to extract statistical data segments
and create statistical images from data frame and postgress database.
Module requires:
* cleaned-sorted-df.csv - contains scraped data
Module creates:
* daily_room_type_stats.txt
Todo:
* [ ] Create str segments (most of the algorithms are in Jupyter notebooks)
- [ ] room types %
- [ ] house floors
- [ ] apt locations
- [ ] sqm size ranges for each
- [ ] sqm price ranges for each
* [ ] Create images based on DF
- [ ] gen_image(data_frame, 'Size_sqm', "Price_in_eur") - created, but the df is not filtered by room count == 1
- [ ] gen_image('double_room_sqm_prices.png')
- [ ] gen_image('triple_room_sqm_prices.png')
- [ ] gen_image('quad_room_sqm_prices.png')
- [ ] gen_image('all_room_sqm_prices.png')
* Need an interface to connect to the DB, extract the historic dict, and save it to a df and CSV
"""
import pandas as pd


class DataFrameAnalyzer():

def __init__(self, df_file_name: str):
self.df_file_name = df_file_name

def analyze_df_room_types(self, file) -> None:
pass

def analyze_df_house_types(self, file) -> None:
pass

def analyze_df_apt_loc_types(self, file) -> None:
pass


def gen_image(self, data_frame: pd.DataFrame, xclmn: str, yclmn: str) -> None:
"""Generate scatter plot based x and y axsis as data frame column values,
include title and save to *.png file"""
img_title = 'All room sqm size to price relationships'
#file_name = '{}_{}.png'.format(xclmn, yclmn)
file_name = 'all_room_sqm_prices.png'
ax = data_frame.plot.scatter(
x=xclmn, y=yclmn, s=100, title=img_title, grid=True)
fig = ax.get_figure()
fig.savefig(file_name)


class DBAnalyzer():
pass


def main():
"""Run daily and monthly analytics."""
run_daily_analytics()
run_monthly_analytics()


def run_daily_analytics() -> None:
"""Generate daily statistics and images from cleaned-sorted-df.csv."""
data_frame = pd.read_csv('cleaned-sorted-df.csv')
dfa = DataFrameAnalyzer('cleaned-sorted-df.csv')
#dfa.analyze_df_room_types('daily_room_stats.txt')
#dfa.analyze_df_house_types('daily_house_stats.txt')
#dfa.analyze_df_apt_loc_types('daily_apt_loc_stats.txt')
#dfa.gen_image(data_frame, 'Size_sqm', "Price_in_eur")
#dfa.gen_image('double_room_sqm_prices.png')
#dfa.gen_image('triple_room_sqm_prices.png')
#dfa.gen_image('quad_room_sqm_prices.png')
dfa.gen_image(data_frame, 'Size_sqm', "Price_in_eur")


def run_monthly_analytics() -> None:
"""Generate monthly statistics (not implemented yet)."""
pass


if __name__ == "__main__":
    main()
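For context, a minimal usage sketch of the new module (the Size_sqm and Price_in_eur column names come from this diff; the import path and the Room_count filter are assumptions):

import pandas as pd
from app.wsmodules.DataAnalyser import DataFrameAnalyzer  # import path assumed from the repo layout

data_frame = pd.read_csv('cleaned-sorted-df.csv')      # produced by df_cleaner
dfa = DataFrameAnalyzer('cleaned-sorted-df.csv')
dfa.gen_image(data_frame, 'Size_sqm', 'Price_in_eur')  # writes all_room_sqm_prices.png
# Per the Todo list, per-room-type plots would first filter the frame,
# e.g. data_frame[data_frame['Room_count'] == 1] (column name assumed).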


49 changes: 49 additions & 0 deletions app/wsmodules/Report.py
@@ -0,0 +1,49 @@
""" FIXME module docstring """
from datetime import datetime
from fpdf import FPDF


class Report():
""" docsting """

def __init__(self, report_type, file_name):
self.report_type = report_type
self.file_name = file_name
self.pdf = FPDF()
self.pdf.add_page()
self.pdf.set_font('Arial', 'B', 16)
#self.insert_header(self.report_type)

def insert_header(self, report_type: str) -> None:
""" docstring """
todays_date = datetime.today().strftime('%Y-%m-%d %H:%M')
report_title = f"Ogre city apartments for sale {report_type}"
date_created = f"Report date: {todays_date}"
self.pdf.write(5, report_title)
self.pdf.ln(5)
self.pdf.write(5, date_created)
self.pdf.ln(5)

def insert_text_segment(self, text_lines: str) -> None:
""" docstring """
self.pdf.ln(5) # line break
self.pdf.write(5, text_lines)
self.pdf.ln(5)

def insert_error_msg(self, msg: str) -> None:
""" docstring """
self.pdf.ln(5) # line break
self.pdf.write(5, msg)
self.pdf.ln(5)

def insert_images(self, images: list) -> None:
""" Inserts images one below another at the current position """
for image in images:
# y is omitted so FPDF places each image at the current cursor
# position and advances below it, preventing overlap
self.pdf.image(image, x=10, w=100, h=100)
self.pdf.ln(5)

def save_report(self, file_name: str) -> None:
""" docstring """
self.pdf.output(file_name, 'F')
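A short usage sketch for the Report class (the report type, file name, stats text, and image name are hypothetical):

from app.wsmodules.Report import Report  # import path assumed

report = Report('monthly', 'ogre_monthly_report.pdf')
report.insert_header('monthly')                     # title plus creation date
report.insert_text_segment('Single rooms: 12 ads')  # hypothetical stats segment
report.insert_images(['all_room_sqm_prices.png'])   # image from DataAnalyser
report.save_report('ogre_monthly_report.pdf')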
2 changes: 1 addition & 1 deletion app/wsmodules/analytics.py
@@ -14,7 +14,7 @@


log = logging.getLogger('')
log.setLevel(logging.DEBUG)
log.setLevel(logging.INFO)
fa_log_format = logging.Formatter("%(asctime)s [%(threadName)-12.12s] [%(levelname)-5.5s] : %(funcName)s: %(lineno)d: %(message)s")
ch = logging.StreamHandler(sys.stdout)
ch.setFormatter(fa_log_format)
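This hunk raises the root logger threshold from DEBUG to INFO, so debug messages no longer reach stdout. A self-contained sketch of the same handler setup for reference:

import logging
import sys

log = logging.getLogger('')
log.setLevel(logging.INFO)  # DEBUG records are filtered out after this change
fa_log_format = logging.Formatter(
    "%(asctime)s [%(threadName)-12.12s] [%(levelname)-5.5s] : "
    "%(funcName)s: %(lineno)d: %(message)s")
ch = logging.StreamHandler(sys.stdout)
ch.setFormatter(fa_log_format)
log.addHandler(ch)

log.debug("suppressed")  # below the INFO threshold
log.info("printed to stdout")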
6 changes: 4 additions & 2 deletions app/wsmodules/data_formater_v14.py
@@ -90,8 +90,10 @@ def create_oneline_report(source_file: str):


def create_file_copy() -> None:
"""Creates report file copy in data folder"""
copy_cmd = 'mv cleaned-sorted-df.csv data/'
"""Creates file copy with date in name to data folder"""
todays_date = datetime.today().strftime('%Y-%m-%d')
dest_file = 'pandas_df_' + todays_date + '.csv'
copy_cmd = 'cp pandas_df.csv data/' + dest_file
if not os.path.exists('data'):
os.makedirs('data')
os.system(copy_cmd)
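The new create_file_copy shells out to cp via os.system. A sketch of the same behavior using shutil, which avoids the shell dependency (an alternative, not the committed code):

import os
import shutil
from datetime import datetime

def create_file_copy() -> None:
    """Copy pandas_df.csv to data/pandas_df_<YYYY-MM-DD>.csv."""
    todays_date = datetime.today().strftime('%Y-%m-%d')
    dest_file = 'pandas_df_' + todays_date + '.csv'
    os.makedirs('data', exist_ok=True)  # no error if the folder already exists
    shutil.copy('pandas_df.csv', os.path.join('data', dest_file))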
44 changes: 35 additions & 9 deletions app/wsmodules/db_worker.py
@@ -25,6 +25,7 @@
12. [ ] Check if it is report day based on the last X days count and generate the report
13. [ ] Write tests for db_worker module
"""
import os
import sys
import logging
from logging.handlers import RotatingFileHandler
@@ -78,16 +79,20 @@ def db_worker_main() -> None:

def check_files(file_names: list) -> None:
"""Testing if file exists and can be opened"""
for f in file_names:
cwd = os.getcwd()
for file_name in file_names:
try:
file = open(f, 'r')
logger.info(f'Checking if required module file {file_name} exists in {cwd}')
file = open(file_name, 'r')
except IOError:
logger.error(f'There was an error opening the file {f} or file does not exist!')
logger.error(f'There was an error opening the file {file_name} or file does not exist!')
sys.exit()


def load_csv_to_df(csv_file_name: str):
"""reads csv file and return pandas data frame"""
"""reads csv file and returns pandas data frame"""
cwd = os.getcwd()
logger.info(f'Loading {csv_file_name} from directory {cwd}')
df = pd.read_csv(csv_file_name)
logger.info(f'Loaded {csv_file_name} file to pandas data frame in memory')
return df
@@ -101,7 +106,8 @@ def extract_url_hashes_from_df(df_name) -> list:
for full_url in urls:
url_hash = extract_hash(full_url)
url_hashes.append(url_hash)
logger.info(f'Extracted {len(url_hashes)} url hashes from pandas data frame')
logger.info(f'Extracted {len(url_hashes)} url hashes from todays scraped data')
logger.info(f'Extracted {url_hashes} url hashes from todays scraped data')
return url_hashes


@@ -114,7 +120,7 @@ def extract_hash(full_url: str) -> str:
return url_hash


def extract_listed_url_hashes_from_db():
def extract_listed_url_hashes_from_db() -> list:
"""Iterate over all rows in listed_ads table and
extract each url hash column value and return as list of hashes"""
conn = None
@@ -130,7 +136,7 @@ def extract_listed_url_hashes_from_db():
row = cur.fetchone()
cur.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
logger.error(f'{error}')
finally:
if conn is not None:
conn.close()
@@ -141,6 +147,8 @@
clean_hash = clean_element.replace("(", "").replace(",", "")
clean_hashes.append(clean_hash)
logger.info(f'Extracted {len(clean_hashes)} hashes from database listed_ads table')
logger.info(f'Extracted clean hash count: {len(clean_hashes)}')
logger.info(f'Extracted clean hash list: {clean_hashes}')
return clean_hashes


@@ -150,7 +158,7 @@ def compare_df_to_db_hashes(df_hashes: list, db_hashes: list) -> list:
new_ads = []
existing_ads = []
removed_ads = []
logger.info(f'Comparing {len(df_hashes)} data frame hashes with {len(db_hashes)} listed table hashes')
logger.info(f'Comparing {len(df_hashes)} todays scraped data hashes with {len(db_hashes)} DB listed_ads table hashes')
for df_hash in df_hashes:
if df_hash in db_hashes:
existing_ads.append(df_hash)
@@ -163,12 +171,17 @@
hash_categories.append(existing_ads)
hash_categories.append(removed_ads)
logger.info(f'Result {len(new_ads)} new, {len(existing_ads)} still_listed, {len(removed_ads)} to_remove hashes ')
logger.info(f'New todays scraped hashes: {new_ads}')
logger.info(f'Hashes from DB listed_ads table: {existing_ads}')
logger.info(f'Hashes for DB removed_ads table: {removed_ads}')
return hash_categories
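compare_df_to_db_hashes partitions today's scraped hashes against the listed_ads table. The same three categories can be computed with set operations; a standalone sketch, not the committed implementation:

def partition_hashes(df_hashes: list, db_hashes: list) -> tuple:
    """Return (new, existing, removed) hash lists."""
    df_set, db_set = set(df_hashes), set(db_hashes)
    new_ads = sorted(df_set - db_set)        # scraped today, missing from DB
    existing_ads = sorted(df_set & db_set)   # still listed
    removed_ads = sorted(db_set - df_set)    # in DB but no longer scraped
    return new_ads, existing_ads, removed_ads

# partition_hashes(['a', 'b'], ['b', 'c']) == (['a'], ['b'], ['c'])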


def extract_new_msg_data(df, new_msg_hashes: list) -> dict:
""" Extract data from df and return as dict hash: (list column data for hash row)"""
data_dict = {}
logger.info(f'new_msg_hashes count {len(new_msg_hashes)}, hashes: {new_msg_hashes}')
logger.info('Starting to extract new ads from todays scraped data frame in memory')
for hash_str in new_msg_hashes:
for index, row in df.iterrows():
url = row['URL']
@@ -190,6 +203,9 @@
row_data.append(days_count)
if url_hash == hash_str:
data_dict[url_hash] = row_data
logger.info(f'Extracted new ad count from todays data frame: {len(data_dict)}')
for k, v in data_dict.items():
logger.info(f'{k} {v}')
return data_dict


@@ -271,6 +287,8 @@ def insert_data_to_listed_table(data: dict) -> None:
days_listed))
conn.commit()
cur.close()
for k, v in data.items():
logger.info(f'{k} {v}')
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
@@ -339,6 +357,7 @@ def extract_to_increment_msg_data(listed_url_hashes:list) -> list:
conn = None
to_increment_msg_data = {}
try:
logger.info(f'Connecting to DB to fetch data from listed_ads table')
params = config()
conn = psycopg2.connect(**params)
cur = conn.cursor()
@@ -361,12 +380,14 @@
data_values.append(dlv)
to_increment_msg_data[curr_row_hash] = data_values
cur.close()
logger.info(f'Extracted data from listed_ads table for {len(to_increment_msg_data)} messages')
for k, v in to_increment_msg_data.items():
logger.info(f'{k} {v}')
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
conn.close()
logger.info(f'Extracted data from listed_ads table for {len(to_increment_msg_data)} messages')
return to_increment_msg_data


@@ -417,6 +438,8 @@ def insert_data_to_removed_table(data: dict) -> None:
days_listed))
conn.commit()
cur.close()
for k, v in data.items():
logger.info(f'{k} {v}')
except (Exception, psycopg2.DatabaseError) as error:
logger.error(error)
print(error)
@@ -440,6 +463,7 @@ def delete_db_listed_table_rows(delisted_hashes: list) -> None:
cur.execute(full_cmd)
conn.commit()
cur.close()
logger.info(f'Deleted ads with hashes: {delisted_hashes} from listed_ads table')
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
@@ -462,6 +486,8 @@ def update_dlv_in_db_table(data: dict, todays_date: datetime) -> None:
if correct_dlv == days_listed:
pass
logger.info(f'Updated days_listed value for {dlv_count} messages in listed_ads table')
for k, v in data.items():
logger.info(f'{k} {v}')


def calc_valid_dlv(pub_date: str, todays_date: datetime) -> int:
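The database helpers in db_worker share one access pattern: read connection params via config(), connect, execute, and close the connection in a finally block. A condensed sketch of that pattern (the config import path, query, and column name are assumptions):

import psycopg2
from config import config  # assumed location of the module's config() helper

def fetch_listed_hashes() -> list:
    """Illustrative read of url hashes from the listed_ads table."""
    conn = None
    hashes = []
    try:
        params = config()  # DB credentials from the config file
        conn = psycopg2.connect(**params)
        cur = conn.cursor()
        cur.execute("SELECT url_hash FROM listed_ads")  # column name assumed
        hashes = [row[0] for row in cur.fetchall()]
        cur.close()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        if conn is not None:
            conn.close()
    return hashes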
19 changes: 15 additions & 4 deletions app/wsmodules/df_cleaner.py
@@ -6,10 +6,10 @@
3. Save as clean df in csv format
"""
import pandas as pd
import os
from datetime import datetime


print("Debug info: Starting data frame cleaning module ... ")
# loading data to dataframe from csv file
df_to_clean = pd.read_csv("pandas_df.csv")


def clean_data_frame(df_name):
@@ -111,17 +111,28 @@ def create_email_body(clean_data_frame, file_name: str) -> None:

def df_cleaner_main():
""" Cleans df, sorts df by price in EUR, save to csv file """
print("Debug info: Starting data frame cleaning module ... ")
df_to_clean = pd.read_csv("pandas_df.csv")
clean_df = clean_data_frame(df_to_clean)
clean_sqm_col = clean_sqm_column(clean_df)
clean_price_col = split_price_column(clean_sqm_col)
clean_df = clean_sqm_eur_col(clean_price_col)
sorted_df = clean_df.sort_values(by='Price_in_eur', ascending=True)
sorted_df.to_csv("cleaned-sorted-df.csv")
all_ads_df = pd.read_csv("cleaned-sorted-df.csv", index_col=False)
create_file_copy()
create_email_body(all_ads_df, 'email_body_txt_m4.txt')
print("Debug info: Completed dat_formater module ... ")


def create_file_copy() -> None:
"""Creates file copy in data folder"""
todays_date = datetime.today().strftime('%Y-%m-%d')
dest_file = 'cleaned-sorted-df-' + todays_date + '.csv'
copy_cmd = 'cp cleaned-sorted-df.csv data/' + dest_file
if not os.path.exists('data'):
os.makedirs('data')
os.system(copy_cmd)


# Main module code driver
if __name__ == "__main__":
    df_cleaner_main()
