script additional modifications

data-for-change · May 21, 2024 · 3fb267c · 3fb267c
1 parent b4f56c7
commit 3fb267c
Show file tree

Hide file tree

Showing 4 changed files with 100 additions and 78 deletions.
diff --git a/anyway/parsers/compare_cbs_and_anyway_road_segments_accidents.py b/anyway/parsers/compare_cbs_and_anyway_road_segments_accidents.py
@@ -1,119 +1,141 @@
-from anyway.models import AccidentMarkerView
-from anyway.widgets.widget_utils import get_expression_for_fields, get_expression_for_road_segment_location_fields, split_location_fields_and_others
-from anyway.models import AccidentMarkerView, RoadSegments
-from anyway.app_and_db import db
-from sqlalchemy import func, and_
+import os
 import pandas as pd
 from tqdm import tqdm
+from anyway.models import AccidentMarkerView, RoadSegments
+from anyway.widgets.widget_utils import (
+    get_expression_for_fields,
+    get_expression_for_road_segment_location_fields,
+    split_location_fields_and_others,
+)
+from anyway.app_and_db import db
+from sqlalchemy import func
+
+# Input and output locations. Every path is relative, so it is resolved
+# against the current working directory when the script runs.
+# The two .xls files are read by get_cbs_count(); OUTPUT_FILE is written by parse().
+CBS_TYPE_1_SUMMARY_FILE = os.path.join("static", "data", "cbs_summary_files", "2022", "t01_type_1_for_segment_test.xls")
+CBS_TYPE_3_SUMMARY_FILE = os.path.join("static", "data", "cbs_summary_files", "2022", "t03_type_3_for_segment_test.xls")
+OUTPUT_DIR = os.path.join("static", "data", "cbs_summary_files", "2022", "comparison_output")
+OUTPUT_FILE = os.path.join(OUTPUT_DIR, "cbs_anyway_road_segments.csv")
 
-CBS_TYPE_1_SUMMARY_FILE = "static/data/cbs_summary_files/t01_type_1_for_segment_test.xls"
-CBS_TYPE_3_SUMMARY_FILE = "static/data/cbs_summary_files/t03_type_3_for_segment_test.xls"
+# Maps road_segment_id -> "<from_name> - <to_name>". Filled in by
+# get_anyway_count() and looked up by parse() to label the output rows.
 ROAD_SEGMENTS_DICT = {}
 
 
-def get_cbs_count():
-    df_type_1 = pd.read_excel(CBS_TYPE_1_SUMMARY_FILE, skiprows=4)
-    df_type_1.columns = ["segment", "road", "from", "to", "acc_per_milion_km", "total", "total_light", "total_severe", "total_fatal", "2022_total", "2022_light", "2022_severe", "2022_fatal", "2021_total", "2020_total", "avg", "length"]
-    df_type_1 = df_type_1.loc[df_type_1.segment.notna()]
-    df_type_1 = df_type_1.loc[df_type_1.segment.astype(str).str.isdigit()]
-    df_type_1["provider_code"] = 1
-    df_type_1["road_segment_name_cbs"] = df_type_1["from"] + "_" + df_type_1["to"]
-    df_type_1_total = df_type_1[["road_segment_name_cbs", "road" , "segment","provider_code", "2020_total","2021_total", "2022_total"]].copy()
-    df_type_1_total.columns = ["road_segment_name_cbs",  "road" , "segment", "provider_code",  "2020_cbs", "2021_cbs", "2022_cbs"]
-    df_type_1_total[["2020_cbs", "2021_cbs", "2022_cbs"]] = df_type_1_total[["2020_cbs", "2021_cbs", "2022_cbs"]].fillna(0)
+def read_excel_file(file_path, skip_rows, columns, segment_col="segment"):
+    """Load one spreadsheet and keep only the rows with an all-digit segment.
+
+    Args:
+        file_path: Path handed to ``pd.read_excel``.
+        skip_rows: Forwarded to ``pd.read_excel`` as ``skiprows``.
+        columns: Column names assigned to the loaded frame, replacing
+            whatever header was read from the file.
+        segment_col: Name of the column used to filter the rows.
+
+    Returns:
+        The filtered DataFrame. If any step raises, the error is printed and
+        an empty DataFrame is returned instead of propagating the exception.
+    """
+    try:
+        df = pd.read_excel(file_path, skiprows=skip_rows)
+        df.columns = columns
+        # Keep rows whose segment is not null and whose string form is all digits.
+        df = df.loc[df[segment_col].notna() & df[segment_col].astype(str).str.isdigit()]
+        return df
+    except Exception as e:
+        # NOTE(review): this also hides a mismatch between len(columns) and the
+        # sheet's real column count; callers only see an empty frame.
+        print(f"Error reading {file_path}: {e}")
+        return pd.DataFrame()
 
-    df_type_1_2022 = df_type_1[["road_segment_name_cbs", "road" , "segment", "provider_code", "2022_light", "2022_severe", "2022_fatal"]]
 
+def get_cbs_count():
+    """Load the CBS type 1 and type 3 summary sheets into one totals table.
+
+    Returns:
+        A DataFrame indexed by ("road", "segment", "provider_code") with the
+        columns "road_segment_name_cbs", "2020_cbs", "2021_cbs" and
+        "2022_cbs" (missing yearly totals are filled with 0). If the type 3
+        sheet yields no rows, only the type 1 rows are returned, with the
+        same index. If the type 1 sheet yields no rows, an empty DataFrame
+        is returned.
+    """
+    df_type_1_columns = [
+        "segment", "road", "from", "to", "acc_per_million_km", "total", "total_light",
+        "total_severe", "total_fatal", "2022_total", "2022_light", "2022_severe", "2022_fatal",
+        "2021_total", "2020_total", "avg", "length"
+    ]
+    df_type_3_columns = [
+        "segment", "road", "from", "to", "acc_per_million_km", "total", "2022_total",
+        "2021_total", "2020_total", "avg", "length"
+    ]
 
-    df_type_3 = pd.read_excel(CBS_TYPE_3_SUMMARY_FILE, skiprows=5)
-    df_type_3.columns = ["segment", "road", "from", "to", "acc_per_milion_km", "total", "2022_total", "2021_total", "2020_total", "avg", "length"]
-    df_type_3 = df_type_3.loc[df_type_3.segment.notna()]
-    df_type_3 = df_type_3.loc[df_type_3.segment.astype(str).str.isdigit()]
+    # Read and process type 1 data
+    df_type_1 = read_excel_file(CBS_TYPE_1_SUMMARY_FILE, 4, df_type_1_columns)
+    if df_type_1.empty:
+        return pd.DataFrame()
+    df_type_1["provider_code"] = 1
+    # NOTE(review): the separator here is " -" while type 3 below uses " - ".
+    # Left as is because the sheet formats are not visible here; confirm it is intended.
+    df_type_1["road_segment_name_cbs"] = df_type_1["from"].str.slice(start=1) + " -" + df_type_1["to"].str.slice(start=2)
+    df_type_1_total = df_type_1[["road_segment_name_cbs", "road", "segment", "provider_code", "2020_total", "2021_total", "2022_total"]].copy()
+    df_type_1_total.columns = ["road_segment_name_cbs", "road", "segment", "provider_code", "2020_cbs", "2021_cbs", "2022_cbs"]
+    df_type_1_total.fillna({"2020_cbs": 0, "2021_cbs": 0, "2022_cbs": 0}, inplace=True)
+
+    # Read and process type 3 data
+    df_type_3 = read_excel_file(CBS_TYPE_3_SUMMARY_FILE, 5, df_type_3_columns)
+    if df_type_3.empty:
+        # Index the fallback the same way as the combined result so that
+        # parse() can still merge on (road, segment, provider_code).
+        return df_type_1_total.set_index(["road", "segment", "provider_code"])
     df_type_3["provider_code"] = 3
-    df_type_3["road_segment_name_cbs"] = df_type_3["from"] + "_" + df_type_3["to"]
-    df_type_3_total = df_type_3[["road_segment_name_cbs", "road" , "segment", "provider_code", "2020_total","2021_total", "2022_total"]].copy()
-    df_type_3_total.columns = ["road_segment_name_cbs", "road" , "segment", "provider_code", "2020_cbs", "2021_cbs", "2022_cbs"]
-    df_type_3_total[["2020_cbs", "2021_cbs", "2022_cbs"]] = df_type_3_total[["2020_cbs", "2021_cbs", "2022_cbs"]].fillna(0)
+    df_type_3["road_segment_name_cbs"] = df_type_3["from"].str.slice(start=1) + " - " + df_type_3["to"].str.slice(start=2)
+    df_type_3_total = df_type_3[["road_segment_name_cbs", "road", "segment", "provider_code", "2020_total", "2021_total", "2022_total"]].copy()
+    df_type_3_total.columns = ["road_segment_name_cbs", "road", "segment", "provider_code", "2020_cbs", "2021_cbs", "2022_cbs"]
+    df_type_3_total.fillna({"2020_cbs": 0, "2021_cbs": 0, "2022_cbs": 0}, inplace=True)
 
+    # Combine type 1 and type 3 data
     df_cbs_total = pd.concat([df_type_1_total, df_type_3_total])
-    df_cbs_total.set_index(["road" , "segment", "provider_code"], inplace=True)
-    return df_cbs_total, df_type_1_2022
+    df_cbs_total.set_index(["road", "segment", "provider_code"], inplace=True)
+    return df_cbs_total
+
 
 def get_anyway_count():
+    """Count ANYWAY accidents per road segment, provider code and year.
+
+    Runs one grouped query against AccidentMarkerView for every RoadSegments
+    row, restricted to accident years 2020-2022, and records each segment's
+    "<from_name> - <to_name>" label in ROAD_SEGMENTS_DICT along the way.
+
+    Returns:
+        A DataFrame holding the query columns (provider code, accident year
+        and "anyway_count") plus "road_segment_id", "road", "segment" and
+        "road_segment_name", sorted by segment id, provider code and year.
+
+    NOTE(review): pd.concat raises ValueError when given an empty list, so
+    this fails if RoadSegments has no rows.
+    """
     dfs = []
-    for road_segment in tqdm(RoadSegments.query.all()):
+
+    road_segments = RoadSegments.query.all()
+    for road_segment in tqdm(road_segments, desc="Processing road segments"):
         road_segment_id = road_segment.segment_id
-        if road_segment_id != 97790010:
-            continue
         road = road_segment.road
         segment = road_segment.segment
-        road_segment_name = road_segment.from_name + ' - ' + road_segment.to_name
-        print(road_segment_name)
+        road_segment_name = f"{road_segment.from_name} - {road_segment.to_name}"
         ROAD_SEGMENTS_DICT[road_segment_id] = road_segment_name
-        filters={"road_segment_id": road_segment_id,
-                "accident_year": [2019, 2020,2021,2022, 2023]}
+
+        filters = {
+            "road_segment_id": road_segment_id,
+            "accident_year": [2020, 2021, 2022]
+        }
         query = db.session.query(AccidentMarkerView)
+        # The helper splits the filters into location keys and all other keys;
+        # each group is turned into its own filter expression below.
         location_fields, other_fields = split_location_fields_and_others(filters)
+
         if other_fields:
-            query = query.filter(get_expression_for_fields(other_fields, AccidentMarkerView, and_))
-        query = query.filter(
-            get_expression_for_road_segment_location_fields(location_fields, AccidentMarkerView)
-        )
-        test_query = query
-        test_query = test_query.group_by(AccidentMarkerView.location_accuracy_hebrew)
-        test_query = test_query.group_by(AccidentMarkerView.location_accuracy_hebrew, AccidentMarkerView.provider_code, AccidentMarkerView.accident_year)
-        test_query = test_query.with_entities(
-            AccidentMarkerView.provider_code,
-            AccidentMarkerView.location_accuracy_hebrew,
-            AccidentMarkerView.accident_year,
-            func.count(AccidentMarkerView.location_accuracy_hebrew))
-
-        df2 = pd.read_sql_query(test_query.statement, test_query.session.bind)
-        print(df2)
-        query = query.group_by(AccidentMarkerView.provider_code,
-                            AccidentMarkerView.accident_severity,
-                            AccidentMarkerView.accident_year)
+            query = query.filter(get_expression_for_fields(other_fields, AccidentMarkerView))
+        if location_fields:
+            query = query.filter(get_expression_for_road_segment_location_fields(location_fields, AccidentMarkerView))
 
+        # One result row per (provider_code, accident_year) with its row count.
+        query = query.group_by(AccidentMarkerView.provider_code, AccidentMarkerView.accident_year)
         query = query.with_entities(
             AccidentMarkerView.provider_code,
-            AccidentMarkerView.accident_severity,
             AccidentMarkerView.accident_year,
-            func.count(AccidentMarkerView.accident_severity),
+            func.count().label("anyway_count"),
         )
 
-
-
         df = pd.read_sql_query(query.statement, query.session.bind)
-        df.rename(columns={"count_1": "anyway_count"}, inplace=True)  # pylint: disable=no-member
+        # Tag every result row with the segment it belongs to.
         df["road_segment_id"] = road_segment_id
         df["road"] = road
         df["segment"] = segment
         df["road_segment_name"] = road_segment_name
         dfs.append(df)
 
-    df_alls_segments = pd.concat(dfs)
-    df_alls_segments.sort_values(['road_segment_id', 'provider_code', 'accident_year'], inplace=True)
-    return df_alls_segments
+    df_all_segments = pd.concat(dfs)
+    df_all_segments.sort_values(["road_segment_id", "provider_code", "accident_year"], inplace=True)
+    return df_all_segments
 
 
 def parse():
+    """Compare CBS and ANYWAY yearly accident counts and write them to a CSV.
+
+    Pivots the ANYWAY counts into one column per accident year, outer-merges
+    them with the CBS totals on (road, segment, provider_code), adds per-year
+    match flags, a name-match flag and the overall count difference, then
+    writes the table to OUTPUT_FILE (creating OUTPUT_DIR if needed).
+
+    Returns:
+        None. Nothing is written when get_cbs_count() returns an empty frame.
+    """
     df_anyway = get_anyway_count()
-    df_anyway["road_segment_id"] = df_anyway["road_segment_id"].astype(int)
-    df_anyway_total = df_anyway.groupby(["road", "segment" ,"provider_code", "road_segment_id", "accident_year"])["anyway_count"].sum()
-    df_anyway_total = df_anyway_total.unstack(-1)
-    df_anyway_total.fillna(0, inplace=True)
-    df_cbs_total, df_type_1_2022 = get_cbs_count()
-    df_anyway_total.reset_index(inplace=True)
-    df_anyway_total.set_index(["road" , "segment", "provider_code"], inplace=True)
+    # Pivot accident_year (the last group key) into columns; absent years become 0.
+    df_anyway_total = df_anyway.groupby(["road", "segment", "provider_code", "road_segment_id", "accident_year"])["anyway_count"].sum().unstack(fill_value=0).reset_index()
+    df_anyway_total.set_index(["road", "segment", "provider_code"], inplace=True)
+
+    df_cbs_total = get_cbs_count()
+    if df_cbs_total.empty:
+        return
+
     df_total = pd.merge(df_cbs_total, df_anyway_total, left_index=True, right_index=True, how="outer")
     df_total.reset_index(inplace=True)
-    df_total["road_segment_name"] = df_total.road_segment_id.apply(lambda s: ROAD_SEGMENTS_DICT.get(s))
-    df_total = df_total.rename(columns = {2020: "2020_anyway", 2021: "2021_anyway", 2022: "2022_anyway"})
-    df_total = df_total[["road_segment_name_cbs", "road_segment_name", "road_segment_id", "road", "segment",  "provider_code", "2020_cbs", "2020_anyway", "2021_cbs",  "2021_anyway", "2022_cbs",  "2022_anyway"]]
-    df_total["2020_mismatch"] = df_total["2020_cbs"] != df_total["2020_anyway"]
-    df_total["2021_mismatch"] = df_total["2021_cbs"] != df_total["2021_anyway"]
-    df_total["2022_mismatch"] = df_total["2022_cbs"] != df_total["2022_anyway"]
-    df_total["any_mismatch"] = df_total[["2020_mismatch",
-                                         "2021_mismatch",
-                                         "2022_mismatch"]].any(axis=1)
-    df_total.to_csv("cbs_anyway_road_segments.csv", index=False)
+    df_total["road_segment_name"] = df_total["road_segment_id"].map(ROAD_SEGMENTS_DICT)
+    df_total.rename(columns={2020: "2020_anyway", 2021: "2021_anyway", 2022: "2022_anyway"}, inplace=True)
+    # Take an explicit copy: the columns added below are then assigned to an
+    # independent frame rather than to the result of a column selection.
+    df_total = df_total[[
+        "road_segment_name_cbs", "road_segment_name", "road_segment_id", "road", "segment", "provider_code",
+        "2020_cbs", "2020_anyway", "2021_cbs", "2021_anyway", "2022_cbs", "2022_anyway"
+    ]].copy()
+    # Trim and collapse whitespace in the CBS name before comparing the names.
+    df_total["road_segment_name_cbs"] = df_total["road_segment_name_cbs"].str.strip()
+    df_total["road_segment_name_cbs"] = df_total["road_segment_name_cbs"].replace(r'\s+', ' ', regex=True)
+    df_total["road_names_matches"] = df_total["road_segment_name_cbs"] == df_total["road_segment_name"]
+    df_total["2020_match"] = df_total["2020_cbs"] == df_total["2020_anyway"]
+    df_total["2021_match"] = df_total["2021_cbs"] == df_total["2021_anyway"]
+    df_total["2022_match"] = df_total["2022_cbs"] == df_total["2022_anyway"]
+    df_total["all_match"] = df_total[["2020_match", "2021_match", "2022_match"]].all(axis=1)
+    # Total ANYWAY count over the three years minus the total CBS count.
+    df_total["diff_anyway_cbs"] = df_total[["2020_anyway", "2021_anyway", "2022_anyway"]].sum(axis=1) - df_total[["2020_cbs", "2021_cbs", "2022_cbs"]].sum(axis=1)
+
+    os.makedirs(OUTPUT_DIR, exist_ok=True)
+    df_total.to_csv(OUTPUT_FILE, index=False)
+    print(f"Output saved to {OUTPUT_FILE}")
diff --git a/static/data/cbs_summary_files/README.md → static/data/cbs_summary_files/2022/README.md b/static/data/cbs_summary_files/README.md → static/data/cbs_summary_files/2022/README.md
diff --git a/static/data/cbs_summary_files/t01_type_1_for_segment_test.xls → static/data/cbs_summary_files/2022/t01_type_1_for_segment_test.xls b/static/data/cbs_summary_files/t01_type_1_for_segment_test.xls → static/data/cbs_summary_files/2022/t01_type_1_for_segment_test.xls
diff --git a/static/data/cbs_summary_files/t03_type_3_for_segment_test.xls → static/data/cbs_summary_files/2022/t03_type_3_for_segment_test.xls b/static/data/cbs_summary_files/t03_type_3_for_segment_test.xls → static/data/cbs_summary_files/2022/t03_type_3_for_segment_test.xls