Merge pull request #135 from vfedotovs/staging-rc-1.4.7

Staging rc 1.4.7
vfedotovs committed Jun 7, 2022
2 parents c53b3f0 + 1516d2f commit b7c5ab1
Showing 17 changed files with 383 additions and 86 deletions.
92 changes: 92 additions & 0 deletions app/wsmodules/DataAnalyser.py
@@ -0,0 +1,92 @@
#!/usr/bin/env python3
""" DataAnalyzer creates statistical text segments and images for report
DataAnalyzer provides functionality to extract statistical data segments
and create statistical images from data frame and postgress database.
Module requires:
* cleaned-sorted-df.csv - contains scraped data
Module creates:
* daily_room_type_stats.txt
Todo:
* [ ] Create str segments (most of the algorithms are in Jupyter notebooks)
- [ ] room types %
- [ ] house floors
- [ ] apt locations
- [ ] sqm size ranges for each
- [ ] sqm price ranges for each
* [ ] Create images based on DF
- [ ] gen_image(data_frame, 'Size_sqm', "Price_in_eur") - created, but the df is not filtered by room count == 1
- [ ] gen_image('double_room_sqm_prices.png')
- [ ] gen_image('triple_room_sqm_prices.png')
- [ ] gen_image('quad_room_sqm_prices.png')
- [ ] gen_image('all_room_sqm_prices.png')
* Need an interface to connect to the DB, extract the historic dict, and save it to a df and CSV
"""
import pandas as pd


class DataFrameAnalyzer():

def __init__(self, df_file_name: str):
self.df_file_name = df_file_name

def analyze_df_room_types(self, file) -> None:
pass

def analyze_df_house_types(self, file) -> None:
pass

def analyze_df_apt_loc_types(self, file) -> None:
pass


def gen_image(self, data_frame: pd.DataFrame, xclmn: str, yclmn: str) -> None:
"""Generate scatter plot based x and y axsis as data frame column values,
include title and save to *.png file"""
img_title = 'All room sqm size to price relationships'
#file_name = '{}_{}.png'.format(xclmn, yclmn)
file_name = 'all_room_sqm_prices.png'
ax = data_frame.plot.scatter(
x=xclmn, y=yclmn, s=100, title=img_title, grid=True)
fig = ax.get_figure()
fig.savefig(file_name)


class DBAnalyzer():
pass


def main():
"""Run daily and monthly analytics."""
run_daily_analytics()
run_monthly_analytics()


def run_daily_analytics() -> None:
"""Generate daily statistics and images from cleaned-sorted-df.csv."""
data_frame = pd.read_csv('cleaned-sorted-df.csv')
dfa = DataFrameAnalyzer('cleaned-sorted-df.csv')
#dfa.analyze_df_room_types('daily_room_stats.txt')
#dfa.analyze_df_house_types('daily_house_stats.txt')
#dfa.analyze_df_apt_loc_types('daily_apt_loc_stats.txt')
#dfa.gen_image(data_frame, 'Size_sqm', "Price_in_eur")
#dfa.gen_image('double_room_sqm_prices.png')
#dfa.gen_image('triple_room_sqm_prices.png')
#dfa.gen_image('quad_room_sqm_prices.png')
dfa.gen_image(data_frame, 'Size_sqm', "Price_in_eur")


def run_monthly_analytics() -> None:
"""Generate monthly statistics (not implemented yet)."""
pass


if __name__ == "__main__":
    main()
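For context, a minimal usage sketch of the new module (the Size_sqm and Price_in_eur column names come from this diff; the import path and the Room_count filter are assumptions):

import pandas as pd
from app.wsmodules.DataAnalyser import DataFrameAnalyzer  # import path assumed from the repo layout

data_frame = pd.read_csv('cleaned-sorted-df.csv')      # produced by df_cleaner
dfa = DataFrameAnalyzer('cleaned-sorted-df.csv')
dfa.gen_image(data_frame, 'Size_sqm', 'Price_in_eur')  # writes all_room_sqm_prices.png
# Per the Todo list, per-room-type plots would first filter the frame,
# e.g. data_frame[data_frame['Room_count'] == 1] (column name assumed).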


49 changes: 49 additions & 0 deletions app/wsmodules/Report.py
@@ -0,0 +1,49 @@
""" FIXME module docstring """
from datetime import datetime
from fpdf import FPDF


class Report():
""" docsting """

def __init__(self, report_type, file_name):
self.report_type = report_type
self.file_name = file_name
self.pdf = FPDF()
self.pdf.add_page()
self.pdf.set_font('Arial', 'B', 16)
#self.insert_header(self.report_type)

def insert_header(self, report_type: str) -> None:
""" docstring """
todays_date = datetime.today().strftime('%Y-%m-%d %H:%M')
report_title = f"Ogre city apartments for sale {report_type}"
date_created = f"Report date: {todays_date}"
self.pdf.write(5, report_title)
self.pdf.ln(5)
self.pdf.write(5, date_created)
self.pdf.ln(5)

def insert_text_segment(self, text_lines: str) -> None:
""" docstring """
self.pdf.ln(5) # line break
self.pdf.write(5, text_lines)
self.pdf.ln(5)

def insert_error_msg(self, msg: str) -> None:
""" docstring """
self.pdf.ln(5) # line break
self.pdf.write(5, msg)
self.pdf.ln(5)

def insert_images(self, images: list) -> None:
""" Inserts images one below another at the current position """
for image in images:
# y is omitted so FPDF places each image at the current cursor
# position and advances below it, preventing overlap
self.pdf.image(image, x=10, w=100, h=100)
self.pdf.ln(5)

def save_report(self, file_name: str) -> None:
""" docstring """
self.pdf.output(file_name, 'F')
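A short usage sketch for the Report class (the report type, file name, stats text, and image name are hypothetical):

from app.wsmodules.Report import Report  # import path assumed

report = Report('monthly', 'ogre_monthly_report.pdf')
report.insert_header('monthly')                     # title plus creation date
report.insert_text_segment('Single rooms: 12 ads')  # hypothetical stats segment
report.insert_images(['all_room_sqm_prices.png'])   # image from DataAnalyser
report.save_report('ogre_monthly_report.pdf')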
2 changes: 1 addition & 1 deletion app/wsmodules/analytics.py
@@ -14,7 +14,7 @@


log = logging.getLogger('')
log.setLevel(logging.DEBUG)
log.setLevel(logging.INFO)
fa_log_format = logging.Formatter("%(asctime)s [%(threadName)-12.12s] [%(levelname)-5.5s] : %(funcName)s: %(lineno)d: %(message)s")
ch = logging.StreamHandler(sys.stdout)
ch.setFormatter(fa_log_format)
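This hunk raises the root logger threshold from DEBUG to INFO, so debug messages no longer reach stdout. A self-contained sketch of the same handler setup for reference:

import logging
import sys

log = logging.getLogger('')
log.setLevel(logging.INFO)  # DEBUG records are filtered out after this change
fa_log_format = logging.Formatter(
    "%(asctime)s [%(threadName)-12.12s] [%(levelname)-5.5s] : "
    "%(funcName)s: %(lineno)d: %(message)s")
ch = logging.StreamHandler(sys.stdout)
ch.setFormatter(fa_log_format)
log.addHandler(ch)

log.debug("suppressed")  # below the INFO threshold
log.info("printed to stdout")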
6 changes: 4 additions & 2 deletions app/wsmodules/data_formater_v14.py
@@ -90,8 +90,10 @@ def create_oneline_report(source_file: str):


def create_file_copy() -> None:
"""Creates report file copy in data folder"""
copy_cmd = 'mv cleaned-sorted-df.csv data/'
"""Creates file copy with date in name to data folder"""
todays_date = datetime.today().strftime('%Y-%m-%d')
dest_file = 'pandas_df_' + todays_date + '.csv'
copy_cmd = 'cp pandas_df.csv data/' + dest_file
if not os.path.exists('data'):
os.makedirs('data')
os.system(copy_cmd)
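The new create_file_copy shells out to cp via os.system. A sketch of the same behavior using shutil, which avoids the shell dependency (an alternative, not the committed code):

import os
import shutil
from datetime import datetime

def create_file_copy() -> None:
    """Copy pandas_df.csv to data/pandas_df_<YYYY-MM-DD>.csv."""
    todays_date = datetime.today().strftime('%Y-%m-%d')
    dest_file = 'pandas_df_' + todays_date + '.csv'
    os.makedirs('data', exist_ok=True)  # no error if the folder already exists
    shutil.copy('pandas_df.csv', os.path.join('data', dest_file))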
44 changes: 35 additions & 9 deletions app/wsmodules/db_worker.py
@@ -25,6 +25,7 @@
12. [ ] Check if it is report day based on the last X days count and generate the report
13. [ ] Write tests for db_worker module
"""
import os
import sys
import logging
from logging.handlers import RotatingFileHandler
@@ -78,16 +79,20 @@ def db_worker_main() -> None:

def check_files(file_names: list) -> None:
"""Testing if file exists and can be opened"""
for f in file_names:
cwd = os.getcwd()
for file_name in file_names:
try:
file = open(f, 'r')
logger.info(f'Checking if required module file {file_name} exists in {cwd}')
file = open(file_name, 'r')
except IOError:
logger.error(f'There was an error opening the file {f} or file does not exist!')
logger.error(f'There was an error opening the file {file_name} or file does not exist!')
sys.exit()


def load_csv_to_df(csv_file_name: str):
"""reads csv file and return pandas data frame"""
"""reads csv file and returns pandas data frame"""
cwd = os.getcwd()
logger.info(f'Loading {csv_file_name} from directory {cwd}')
df = pd.read_csv(csv_file_name)
logger.info(f'Loaded {csv_file_name} file to pandas data frame in memory')
return df
@@ -101,7 +106,8 @@ def extract_url_hashes_from_df(df_name) -> list:
for full_url in urls:
url_hash = extract_hash(full_url)
url_hashes.append(url_hash)
logger.info(f'Extracted {len(url_hashes)} url hashes from pandas data frame')
logger.info(f'Extracted {len(url_hashes)} url hashes from todays scraped data')
logger.info(f'Extracted {url_hashes} url hashes from todays scraped data')
return url_hashes


@@ -114,7 +120,7 @@ def extract_hash(full_url: str) -> str:
return url_hash


def extract_listed_url_hashes_from_db():
def extract_listed_url_hashes_from_db() -> list:
"""Iterate over all rows in listed_ads table and
extract each url hash column value and return as list of hashes"""
conn = None
@@ -130,7 +136,7 @@ def extract_listed_url_hashes_from_db():
row = cur.fetchone()
cur.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
logger.error(f'{error}')
finally:
if conn is not None:
conn.close()
@@ -141,6 +147,8 @@
clean_hash = clean_element.replace("(", "").replace(",", "")
clean_hashes.append(clean_hash)
logger.info(f'Extracted {len(clean_hashes)} hashes from database listed_ads table')
logger.info(f'Extracted clean hash count: {len(clean_hashes)}')
logger.info(f'Extracted clean hash list: {clean_hashes}')
return clean_hashes


@@ -150,7 +158,7 @@ def compare_df_to_db_hashes(df_hashes: list, db_hashes: list) -> list:
new_ads = []
existing_ads = []
removed_ads = []
logger.info(f'Comparing {len(df_hashes)} data frame hashes with {len(db_hashes)} listed table hashes')
logger.info(f'Comparing {len(df_hashes)} todays scraped data hashes with {len(db_hashes)} DB listed_ads table hashes')
for df_hash in df_hashes:
if df_hash in db_hashes:
existing_ads.append(df_hash)
@@ -163,12 +171,17 @@
hash_categories.append(existing_ads)
hash_categories.append(removed_ads)
logger.info(f'Result {len(new_ads)} new, {len(existing_ads)} still_listed, {len(removed_ads)} to_remove hashes ')
logger.info(f'New todays scraped hashes: {new_ads}')
logger.info(f'Hashes from DB listed_ads table: {existing_ads}')
logger.info(f'Hashes for DB removed_ads table: {removed_ads}')
return hash_categories
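compare_df_to_db_hashes partitions today's scraped hashes against the listed_ads table. The same three categories can be computed with set operations; a standalone sketch, not the committed implementation:

def partition_hashes(df_hashes: list, db_hashes: list) -> tuple:
    """Return (new, existing, removed) hash lists."""
    df_set, db_set = set(df_hashes), set(db_hashes)
    new_ads = sorted(df_set - db_set)        # scraped today, missing from DB
    existing_ads = sorted(df_set & db_set)   # still listed
    removed_ads = sorted(db_set - df_set)    # in DB but no longer scraped
    return new_ads, existing_ads, removed_ads

# partition_hashes(['a', 'b'], ['b', 'c']) == (['a'], ['b'], ['c'])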


def extract_new_msg_data(df, new_msg_hashes: list) -> dict:
""" Extract data from df and return as dict hash: (list column data for hash row)"""
data_dict = {}
logger.info(f'new_msg_hashes count {len(new_msg_hashes)}, hashes: {new_msg_hashes}')
logger.info('Starting to extract new ads from todays scraped data frame in memory')
for hash_str in new_msg_hashes:
for index, row in df.iterrows():
url = row['URL']
@@ -190,6 +203,9 @@
row_data.append(days_count)
if url_hash == hash_str:
data_dict[url_hash] = row_data
logger.info(f'Extracted new ad count from todays data frame: {len(data_dict)}')
for k, v in data_dict.items():
logger.info(f'{k} {v}')
return data_dict


@@ -271,6 +287,8 @@ def insert_data_to_listed_table(data: dict) -> None:
days_listed))
conn.commit()
cur.close()
for k, v in data.items():
logger.info(f'{k} {v}')
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
@@ -339,6 +357,7 @@ def extract_to_increment_msg_data(listed_url_hashes:list) -> list:
conn = None
to_increment_msg_data = {}
try:
logger.info(f'Connecting to DB to fetch data from listed_ads table')
params = config()
conn = psycopg2.connect(**params)
cur = conn.cursor()
@@ -361,12 +380,14 @@
data_values.append(dlv)
to_increment_msg_data[curr_row_hash] = data_values
cur.close()
logger.info(f'Extracted data from listed_ads table for {len(to_increment_msg_data)} messages')
for k, v in to_increment_msg_data.items():
logger.info(f'{k} {v}')
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
conn.close()
logger.info(f'Extracted data from listed_ads table for {len(to_increment_msg_data)} messages')
return to_increment_msg_data


@@ -417,6 +438,8 @@ def insert_data_to_removed_table(data: dict) -> None:
days_listed))
conn.commit()
cur.close()
for k, v in data.items():
logger.info(f'{k} {v}')
except (Exception, psycopg2.DatabaseError) as error:
logger.error(error)
print(error)
@@ -440,6 +463,7 @@ def delete_db_listed_table_rows(delisted_hashes: list) -> None:
cur.execute(full_cmd)
conn.commit()
cur.close()
logger.info(f'Deleted ads with hashes: {delisted_hashes} from listed_ads table')
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
@@ -462,6 +486,8 @@ def update_dlv_in_db_table(data: dict, todays_date: datetime) -> None:
if correct_dlv == days_listed:
pass
logger.info(f'Updated days_listed value for {dlv_count} messages in listed_ads table')
for k, v in data.items():
logger.info(f'{k} {v}')


def calc_valid_dlv(pub_date: str, todays_date: datetime) -> int:
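The database helpers in db_worker share one access pattern: read connection params via config(), connect, execute, and close the connection in a finally block. A condensed sketch of that pattern (the config import path, query, and column name are assumptions):

import psycopg2
from config import config  # assumed location of the module's config() helper

def fetch_listed_hashes() -> list:
    """Illustrative read of url hashes from the listed_ads table."""
    conn = None
    hashes = []
    try:
        params = config()  # DB credentials from the config file
        conn = psycopg2.connect(**params)
        cur = conn.cursor()
        cur.execute("SELECT url_hash FROM listed_ads")  # column name assumed
        hashes = [row[0] for row in cur.fetchall()]
        cur.close()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        if conn is not None:
            conn.close()
    return hashes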
19 changes: 15 additions & 4 deletions app/wsmodules/df_cleaner.py
@@ -6,10 +6,10 @@
3. Save as clean df in csv format
"""
import pandas as pd
import os
from datetime import datetime


print("Debug info: Starting data frame cleaning module ... ")
# loading data to dataframe from csv file
df_to_clean = pd.read_csv("pandas_df.csv")


def clean_data_frame(df_name):
@@ -111,17 +111,28 @@ def create_email_body(clean_data_frame, file_name: str) -> None:

def df_cleaner_main():
""" Cleans df, sorts df by price in EUR, save to csv file """
print("Debug info: Starting data frame cleaning module ... ")
df_to_clean = pd.read_csv("pandas_df.csv")
clean_df = clean_data_frame(df_to_clean)
clean_sqm_col = clean_sqm_column(clean_df)
clean_price_col = split_price_column(clean_sqm_col)
clean_df = clean_sqm_eur_col(clean_price_col)
sorted_df = clean_df.sort_values(by='Price_in_eur', ascending=True)
sorted_df.to_csv("cleaned-sorted-df.csv")
all_ads_df = pd.read_csv("cleaned-sorted-df.csv", index_col=False)
create_file_copy()
create_email_body(all_ads_df, 'email_body_txt_m4.txt')
print("Debug info: Completed dat_formater module ... ")


def create_file_copy() -> None:
"""Creates file copy in data folder"""
todays_date = datetime.today().strftime('%Y-%m-%d')
dest_file = 'cleaned-sorted-df-' + todays_date + '.csv'
copy_cmd = 'cp cleaned-sorted-df.csv data/' + dest_file
if not os.path.exists('data'):
os.makedirs('data')
os.system(copy_cmd)


# Main module code driver
if __name__ == "__main__":
    df_cleaner_main()
