# global_landslide_data_analysis_&_visualization.py

# -*- coding: utf-8 -*-
"""Global Landslide Data Analysis & Visualization.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1xcBJsdaNC9SGPeq9B6jUOYgU_cGAZvjO

# **Global Landslides Evaluation & Visualization**

## **Import Libraries**
"""

# Commented out IPython magic to ensure Python compatibility.
# import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# %matplotlib inline
import seaborn as sns

import folium
from folium import plugins
from folium import Marker
from folium.plugins import MarkerCluster, HeatMap

import math
import warnings
warnings.filterwarnings("ignore")

"""---

## **Getting To Know The Dataset**
"""

# Load the landslide catalog CSV export into a pandas DataFrame
csv_path = '/content/drive/MyDrive/Colab Materials/Land Slide Datset NASA/Global_Landslide_Catalog_Export.csv'
df = pd.read_csv(csv_path)

# Preview the first five rows
df.head(5)

# Preview the last five rows
df.tail(5)

# (rows, columns) of the dataset
df.shape

# Data type of every column
df.dtypes

"""---

## **Data Cleaning**
"""

# Columns that are not used anywhere in the analysis below
unwanted_columns = [
    'event_id', 'event_time', 'location_description', 'event_title',
    'event_description', 'photo_link', 'notes', 'event_import_source',
    'event_import_id', 'country_code', 'submitted_date', 'created_date',
    'last_edited_date',
]
df.drop(columns=unwanted_columns, inplace=True)

# List the columns that remain after the drop
for column_name in df.columns:
  print(column_name)

# Number of missing values per column
df.isnull().sum()

# Parse 'event_date' once, then split it into separate date and time columns
event_timestamps = pd.to_datetime(df['event_date'])
df['Date'] = event_timestamps.dt.date
df['Time'] = event_timestamps.dt.time

# The raw timestamp column is no longer needed
df.drop(columns=['event_date'], inplace=True)

# Show the result
df.head(2)

"""---

## **Exploratory Data Analysis & Visualization**

## **Event Reported Source**
"""

# Fifteen most frequent reporting sources and how often each one appears
top_sources = df['source_name'].value_counts().head(15)
Reported_source = pd.DataFrame({
    'Source Name': top_sources.index,
    'Reported Count': top_sources.values,
})
Reported_source

# Horizontal bar chart of the report counts per source
plt.figure(figsize=(18,12))
sns.barplot(data=Reported_source,
            x="Reported Count",
            y="Source Name",
            palette="gist_earth")

plt.title('Reported Sources By Number Of Reported Times',size=16)
plt.xlabel('Reported Times',size=10)
plt.xticks(size=12)
plt.show()

"""> ## *Oregon DOT Has Reported A Huge Number Of Events During This Time Period ( 1988 - 2017 )*

## **Geospatial Visualization Of Globally Events**

---

> ### **Open Street Map Style**
"""

# World map with one clustered marker per event that has valid coordinates
map_1 = folium.Map(location=[51.1657,10.4515], tiles='cartodbpositron', zoom_start=2)

mc1 = MarkerCluster()

for lat, lon in zip(df['latitude'], df['longitude']):
  # skip events with a missing coordinate
  if math.isnan(lon) or math.isnan(lat):
    continue
  mc1.add_child(Marker(location=[lat, lon]))

# attach the cluster layer to the map
map_1.add_child(mc1)

# Display the map
map_1

"""> ### **Heat Map Style**"""

# Create the map
map_2 = folium.Map(location=[51.1657,10.4515], zoom_start=2)

# [latitude, longitude] pairs for the heat layer. Rows with a missing
# coordinate are dropped first (the marker map applies the same guard),
# because HeatMap does not accept NaN values.
heat_data = df[['latitude','longitude']].dropna().values.tolist()

# Plot it on the map
HeatMap(heat_data).add_to(map_2)

# Small overview map in the corner
minimap = plugins.MiniMap()
map_2.add_child(minimap)


# Display the map
map_2

"""> ## **According to the above maps, we can determine that a lot of landslide events happened in:** 
- Indian Ocean  
- North America
- South America

---

## **Events By Years**
"""

# latest event date in the data
df['Date'].max()

# earliest event date in the data
df['Date'].min()

# extract the calendar year of every event
df['year'] = pd.to_datetime(df['Date']).dt.year

# verify the result
df.head(2)

# Count events per year. size() counts rows, so events whose source_name
# is missing are included (count() on that column would skip them).
gr_by_years = df.groupby('year').size().reset_index(name='Occured_Events')
gr_by_years

# Visualize occurred events by year
fig, ax = plt.subplots(1, 1, figsize=[18, 8])
ax.plot(gr_by_years['year'], gr_by_years['Occured_Events'])

plt.xlabel('Years',size=15)
plt.ylabel('Count',size=15)

plt.legend(['Occured Events'], loc=2)
ax.set_title('Occured Events By Years',size=17)

"""## **Events By Months**"""

# extract the calendar month of every event
df['month'] = pd.to_datetime(df['Date']).dt.month

# check the result
df.head(2)

# Count events per month. size() counts rows, so events whose source_name
# is missing are included (count() on that column would skip them).
gr_by_months = df.groupby('month').size().reset_index(name='Occured_Events')
gr_by_months

# Visualize occurred events by month
fig, ax = plt.subplots(1, 1, figsize=[18, 8])
ax.plot(gr_by_months['month'], gr_by_months['Occured_Events'])

plt.xlabel('Months',size=15)
plt.ylabel('Count',size=15)

plt.legend(['Occured Events'], loc=2)
ax.set_title('Occured Events By Months',size=17)

"""> ## *We Can See That Most Events Occurred During The 3rd Quarter Of The Year*

### Events In 2010
"""

# Keep only the rows whose event year is 2010
is_2010 = df['year'] == 2010
year_2010 = pd.DataFrame(df[is_2010])
year_2010

"""### Visualize Events Occurred In 2010 In Geospatial Map"""

# Create the map
map_3 = folium.Map(location=[51.1657,10.4515], zoom_start=2)

# [latitude, longitude] pairs of the 2010 events; rows with a missing
# coordinate are dropped because HeatMap does not accept NaN values
heat_data2 = year_2010[['latitude','longitude']].dropna().values.tolist()

# Plot it on the map
HeatMap(heat_data2).add_to(map_3)

# Display the map
map_3

"""### **In 2010 More Events Occured In Indian Ocean**

### Events In 2017
"""

# Keep only the rows whose event year is 2017
is_2017 = df['year'] == 2017
year_2017 = pd.DataFrame(df[is_2017])
year_2017

"""### **Visualize Events Occurred In 2017 In Geospatial Map**"""

# Create the map
map_4 = folium.Map(location=[51.1657,10.4515], zoom_start=2)

# [latitude, longitude] pairs of the 2017 events; rows with a missing
# coordinate are dropped because HeatMap does not accept NaN values
heat_data3 = year_2017[['latitude','longitude']].dropna().values.tolist()

# Plot it on the map
HeatMap(heat_data3).add_to(map_4)

# Display the map
map_4

"""### *In 2017 Most Events Occurred In Indian Ocean & Also Have Higher Count In North and South America*

## **Events By Category**
"""

def event_by_category():
  """Print the number of events per landslide category and draw a count plot.

  Reads the module-level ``df`` and displays the figure with ``plt.show()``.
  """
  categories = df['landslide_category']

  # tabular counts per category
  print(categories.value_counts())

  # Pass the series as ``x`` explicitly: current seaborn releases treat the
  # first positional argument as ``data``, not as the variable to count.
  plt.figure(figsize=(18,8.5))
  sns.countplot(x=categories, palette='Set2')
  plt.xticks(rotation='vertical',size=15)
  plt.title('By Landslide Category',size=16)
  plt.xlabel('',size=10)
  plt.show()


event_by_category()

"""## **Events By Size**"""

def event_by_size():
  """Print the number of events per landslide size and draw a count plot.

  Reads the module-level ``df`` and displays the figure with ``plt.show()``.
  """
  sizes = df['landslide_size']

  # tabular counts per size class
  print(sizes.value_counts())

  # Pass the series as ``x`` explicitly: current seaborn releases treat the
  # first positional argument as ``data``, not as the variable to count.
  plt.figure(figsize=(18,8.5))
  sns.countplot(x=sizes, palette='Set2_r')
  plt.xticks(rotation='vertical',size=15)
  plt.title('By Size',size=16)
  plt.xlabel('',size=10)
  plt.show()


event_by_size()

"""### **Very Large Events Geospatial Visualization**"""

# Events recorded with the 'very_large' size class
size_large = pd.DataFrame(df[(df['landslide_size'] == 'very_large')])
size_large

map_5 = folium.Map(location=[51.1657,10.4515], zoom_start=2)

# [latitude, longitude] pairs of those events; rows with a missing
# coordinate are dropped because HeatMap does not accept NaN values
heat_data4 = size_large[['latitude','longitude']].dropna().values.tolist()

# Plot it on the map
HeatMap(heat_data4).add_to(map_5)

# Display the map
map_5

"""## **Events By Settings Type**"""

def event_by_setting():
  """Print the number of events per landslide setting and draw a count plot.

  Reads the module-level ``df`` and displays the figure with ``plt.show()``.
  """
  settings = df['landslide_setting']

  # tabular counts per setting
  print(settings.value_counts())

  # Pass the series as ``x`` explicitly: current seaborn releases treat the
  # first positional argument as ``data``, not as the variable to count.
  plt.figure(figsize=(18,8.5))
  sns.countplot(x=settings, palette='YlGnBu_r')
  plt.xticks(rotation='vertical',size=15)
  plt.title('Event By Settings Type',size=16)
  plt.xlabel('',size=10)
  plt.show()


event_by_setting()

"""> ## *According to the above chart we can determine that lots of events happened in the "above road" setting, so road construction is the most common cause of these events. If we construct roads with more safety measures & prior analysis, we can reduce these events.*

## **By Event Trigger Type**
"""

def event_by_triger():
  """Print the number of events per landslide trigger and draw a count plot.

  Reads the module-level ``df`` and displays the figure with ``plt.show()``.
  """
  triggers = df['landslide_trigger']

  # tabular counts per trigger
  print(triggers.value_counts())

  # Pass the series as ``x`` explicitly: current seaborn releases treat the
  # first positional argument as ``data``, not as the variable to count.
  plt.figure(figsize=(18,8.5))
  sns.countplot(x=triggers, palette='crest')
  plt.xticks(rotation='vertical',size=15)
  plt.title('Event By Triger Type',size=16)
  plt.xlabel('',size=10)
  plt.show()


event_by_triger()

"""## **Events Occurred Due To Earth Quakes**"""

# Events whose recorded trigger is an earthquake
size_eth = pd.DataFrame(df[(df['landslide_trigger'] == 'earthquake')])
size_eth

"""## **Number Of Earth Quakes Events**"""

# DataFrame.size is rows x columns; the number of events is the row count
len(size_eth)

"""## **Earth Quakes Triggered Types**"""

def earth_quake_setting():
  """Print and plot the landslide settings of the earthquake-triggered events.

  Reads the module-level ``size_eth`` and displays the figure with
  ``plt.show()``.
  """
  settings = size_eth['landslide_setting']

  # tabular counts per setting
  print(settings.value_counts())

  # Pass the series as ``x`` explicitly: current seaborn releases treat the
  # first positional argument as ``data``, not as the variable to count.
  plt.figure(figsize=(18,8.5))
  sns.countplot(x=settings, palette='Set2_r')
  plt.xticks(rotation='vertical',size=15)
  plt.title('Earth Quakes Trigerd settings',size=16)
  plt.xlabel('',size=10)
  plt.show()


earth_quake_setting()

"""### *According to the above graph, apart from the unknown entries, the highest numbers are in "above road" & "natural slope"*

### Earth Quakes Events Geospatial Visualization
"""

map_6 = folium.Map(location=[51.1657,10.4515], zoom_start=2)

# [latitude, longitude] pairs of the earthquake-triggered events; rows with a
# missing coordinate are dropped because HeatMap does not accept NaN values
heat_data5 = size_eth[['latitude','longitude']].dropna().values.tolist()

# Plot it on the map
HeatMap(heat_data5).add_to(map_6)

# Display the map
map_6

"""> ### *According to the above map we can see these earthquakes are triggered near the earth plates & a large number of earthquakes happened with an unknown reason. We can attribute this unknown reason to earth plate movements because these events happened between them. So **earth plates** are the most common cause of triggered earthquakes*

## **Top 10 Countries By Events**
"""

# Ten countries with the most recorded events
country_counts = df['country_name'].value_counts()
country_counts.iloc[:10]

"""# **Fatalities & Injuries**

## **Highest Fatalities Event Details**
"""

# Row(s) whose fatality count equals the dataset maximum
max_fatalities = df['fatality_count'].max()
df[df['fatality_count'] == max_fatalities]

"""## **Highest injuries Event Details**

"""

# Row(s) whose injury count equals the dataset maximum
max_injuries = df['injury_count'].max()
df[df['injury_count'] == max_injuries]

"""## **Fatalities & Injuries Descriptive Statistics**"""

# Summary statistics of both casualty columns
casualty_columns = ['fatality_count','injury_count']
df[casualty_columns].describe()

"""## **Top 15 Countries With Events Fatalities & Injuries**"""

# Total fatalities and injuries per country. The two numeric columns are
# selected BEFORE aggregating so that the remaining columns (text, and the
# object-typed Date/Time columns) are never passed to sum().
group_by_country = df.groupby('country_name')[['fatality_count','injury_count']].sum().reset_index()

# Countries ordered by total fatalities, highest first
group_by_country_sort = group_by_country.sort_values('fatality_count',ascending=False)
group_by_country_sort.head(15)

"""## **Fatalities & Injuries By Each Landslide Category**"""

# Total fatalities and injuries per landslide category. The two numeric
# columns are selected BEFORE aggregating so that the remaining columns
# (text, and the object-typed Date/Time columns) are never passed to sum().
group_by_tr = df.groupby('landslide_category')[['fatality_count','injury_count']].sum().reset_index()

# Categories ordered by total fatalities, highest first
group_by_tr_sort = group_by_tr.sort_values('fatality_count',ascending=False)
group_by_tr_sort

group_by_tr_sort.plot(x="landslide_category", y=["fatality_count", "injury_count"], kind="bar",figsize=(22,8))
plt.xticks(rotation='vertical',size=15)
plt.title('Fatalities & Injuries By Each Event Caregory',size=15)
# plt.show must be called; the bare attribute reference did nothing
plt.show()

"""---

## **Event Occured Due to Storms**
"""

# Events that have a storm name recorded
has_storm_name = df['storm_name'].notna()
storms = pd.DataFrame(df[has_storm_name])
storms

"""### **Storms Events Geospatial Visualzation**"""

# NOTE(review): the built-in 'Stamen Toner' tiles may not be available in
# newer folium releases - confirm against the installed version.
map_7 = folium.Map(location=[51.1657,10.4515], tiles='Stamen Toner', zoom_start=2)

# [latitude, longitude] pairs of the storm-related events; rows with a
# missing coordinate are dropped because HeatMap does not accept NaN values
heat_data6 = storms[['latitude','longitude']].dropna().values.tolist()

# Plot it on the map
HeatMap(heat_data6).add_to(map_7)

# Display the map
map_7

"""## **Top 10 Storms That Caused Events**"""

# Ten storm names attached to the most events
storm_counts = df['storm_name'].value_counts()
storm_counts.iloc[:10]

"""## **Fatalities & Injuries Due To The Top 10 Storms**"""

# Hand-picked list of the storm names analysed below
storms = ['Supertyphoon Juan (Megi)','Tropical Depression Parma',
          'Agaton','Tropical Depression Urduja','Tropical Storm Tomas',
          'Hurricane Tomas','Tropical Cyclone Agatha','Trami','Lawin','Utor']

# Events whose storm name is in the list. isin() replaces ten chained
# equality tests and keeps working if the list grows or shrinks.
storms_df = pd.DataFrame(df[df['storm_name'].isin(storms)])
storms_df

# Create the map
map_8 = folium.Map(location=[51.1657,10.4515], tiles='cartodbpositron', zoom_start=3)

# One clustered marker per event, labelled with its storm name
mc2 = MarkerCluster()

for lat, lon, storm_name in zip(storms_df['latitude'], storms_df['longitude'], storms_df['storm_name']):
  # skip events with a missing coordinate
  if math.isnan(lon) or math.isnan(lat):
    continue
  mc2.add_child(Marker(location=[lat, lon], tooltip=storm_name))

# attach the cluster layer to the map
map_8.add_child(mc2)

# Display the map
map_8

"""#### **Alternative Visualization In Heat Map For Better View**"""

# NOTE(review): the built-in 'Stamen Toner' tiles may not be available in
# newer folium releases - confirm against the installed version.
map_9 = folium.Map(location=[51.1657,10.4515], tiles='Stamen Toner', zoom_start=3)

# [latitude, longitude] pairs of the selected storm events; rows with a
# missing coordinate are dropped because HeatMap does not accept NaN values
heat_data7 = storms_df[['latitude','longitude']].dropna().values.tolist()

# Plot it on the map
HeatMap(heat_data7).add_to(map_9)

# Display the map
map_9

"""### *Most Events Caused By Storms Occurred Around The Philippines & The Caribbean Sea*

## **Fatalities & Injuries Due To Top 10 Storms**

#### **Total Fatalities & Injuries Due To Top 10 Storms**
"""

# Total fatalities and injuries per storm. The numeric columns are selected
# BEFORE aggregating so that text and date columns are never passed to sum().
group_by_storms_df = storms_df.groupby('storm_name')[['fatality_count','injury_count']].sum().reset_index()

group_by_storms_df_sort = group_by_storms_df.sort_values('fatality_count',ascending=False)
group_by_storms_df_sort

"""### **Average Fatalities & Injuries Due To Top 10 Storms**"""

# Mean fatalities and injuries per storm, again aggregating only the two
# numeric columns (mean() is not defined for the text/date columns).
group_by_storms_df_avg = storms_df.groupby('storm_name')[['fatality_count','injury_count']].mean().reset_index()

group_by_storms_df_avg_sort = group_by_storms_df_avg.sort_values('fatality_count',ascending=False)
group_by_storms_df_avg_sort

"""## **Fatalities & Injuries By Years**"""

# Total fatalities and injuries per year. The numeric columns are selected
# BEFORE aggregating so that text and date columns are never passed to sum().
group_by_years = df.groupby('year')[['fatality_count','injury_count']].sum().reset_index()

# years ordered by total fatalities, highest first
group_by_years_sort = group_by_years.sort_values('fatality_count',ascending=False)
group_by_years_sort

"""## **Time Series Of Fatalities & Injuries**"""

# Yearly totals of fatalities and injuries as two lines on one axis
fig, ax = plt.subplots(1, 1, figsize=[22, 9])

for casualty_column in ('fatality_count', 'injury_count'):
  ax.plot(group_by_years['year'], group_by_years[casualty_column])

ax.set_title('Time Series Of Fatalities & Injuries ',size=17)
plt.xlabel('Years',size=15)
plt.ylabel('Count',size=15)
plt.legend(['Fatalities Count','Injuries Count'], loc=2)

"""## **Time Series Of Fatalities & Injuries Vs Occurred Events By Years**"""

# Same two lines, preceded by the yearly number of events
fig, ax = plt.subplots(1, 1, figsize=[22, 9])

ax.plot(gr_by_years['year'], gr_by_years['Occured_Events'])
for casualty_column in ('fatality_count', 'injury_count'):
  ax.plot(group_by_years['year'], group_by_years[casualty_column])

ax.set_title('Time Series Of Fatalities & Injuries Vs Occured Events By Years',size=17)
plt.xlabel('Years',size=15)
plt.ylabel('Count',size=15)
plt.legend(['Occured Events Count','Fatalities Count','Injuries Count'], loc=2)

"""---

# **Exploratory Data Analysis & Visualization In Sri Lanka**
"""

# Select events inside a latitude/longitude box (6-8 N, 80-82 E, bounds exclusive).
# NOTE(review): this box is only a rough approximation of Sri Lanka (the
# island extends north of 8 N) - confirm whether filtering on
# 'country_name' would be preferable.
lon_in_box = (df['longitude'] < 82.0000) & (df['longitude'] > 80.0000)
lat_in_box = (df['latitude'] < 8.0000) & (df['latitude'] > 6.0000)
srilanka_df = pd.DataFrame(df[lon_in_box & lat_in_box])

# first rows of the selection
srilanka_df.head(5)

# missing values per column within the selection
srilanka_df.isnull().sum()

# (rows, columns) of the selection
srilanka_df.shape

"""## *There Are **86** Events Occurred In Sri Lanka During 1988 To 2017*

---

## **Occured Events In Sri Lanka Geospatial Visualization**
"""

# Two synchronized maps side by side, centred on Sri Lanka
m11 = plugins.DualMap(location=(7.8731,80.7718), tiles=None, zoom_start=7.5)

folium.TileLayer("openstreetmap").add_to(m11.m1)
# NOTE(review): the built-in 'Stamen Terrain' tiles may not be available in
# newer folium releases - confirm against the installed version.
folium.TileLayer("Stamen Terrain").add_to(m11.m2)

# map 1: one clustered marker per event with a descriptive tooltip
mc15 = MarkerCluster()
for idx, row in srilanka_df.iterrows():
  # skip events with a missing coordinate
  if math.isnan(row['longitude']) or math.isnan(row['latitude']):
    continue

  # str.format tolerates missing (NaN) category/trigger/size values, which
  # plain string concatenation would reject with a TypeError
  tooltip_html = "<b> Category : </b>{}<br> <b> Trigger By : </b>{}<br> <b> Size : </b>{}".format(
      row['landslide_category'], row['landslide_trigger'], row['landslide_size'])
  mc15.add_child(Marker(location=[row['latitude'], row['longitude']], tooltip=tooltip_html))
# popup="<b> Category </b>"+ row['landslide_category']+"<br> <b> Trigger </b>" +row['landslide_trigger']+"<br> <b> Size </b>" +row['landslide_size']
# attach the cluster layer to the left map
m11.m1.add_child(mc15)

# map 2: heat map of the same events; rows with a missing coordinate are
# dropped because HeatMap does not accept NaN values
heat_data9 = srilanka_df[['latitude','longitude']].dropna().values.tolist()
HeatMap(heat_data9).add_to(m11.m2)


m11

"""> ## **According To The Above Map,**
- *We Can See That In Sri Lanka Most Of The Events Occurred In Mountainous Areas, Such As Uva Province, Central Province & Sabaragamuwa Province*

## Lets Analyse These Events Deeply

---

## **Time Series Of Occurred Events By Years In Sri Lanka**
"""

# Count Sri Lankan events per year. size() counts rows, so events whose
# source_name is missing are included (count() on that column would skip them).
gr_by_years_sl = srilanka_df.groupby('year').size().reset_index(name='Occured_Events')
gr_by_years_sl

# Visualize events occurred by year

fig, ax = plt.subplots(1, 1, figsize=[18, 8])
ax.plot(gr_by_years_sl['year'], gr_by_years_sl['Occured_Events'])

plt.xlabel('Years',size=15)
plt.ylabel('Ocured Times',size=15)

plt.legend(['Event Occurred Times'], loc=2)
ax.set_title('Event Occurred Times By Years',size=17)

"""## **Time Series Of Occurred Events By Months In Sri Lanka**"""

# Count Sri Lankan events per month. size() counts rows, so events whose
# source_name is missing are included (count() on that column would skip them).
gr_by_month_sl = srilanka_df.groupby('month').size().reset_index(name='Occured_Events')
gr_by_month_sl

# Visualize event count by month

fig, ax = plt.subplots(1, 1, figsize=[18, 8])
ax.plot(gr_by_month_sl['month'], gr_by_month_sl['Occured_Events'])

plt.xlabel('Months',size=15)
plt.ylabel('Count',size=15)

plt.legend(['Occurred Events'], loc=2)
ax.set_title('Occurred Events By Months',size=17)

"""## **Time Series Of Fatalities & Injuries Comparing To Events Occurred Times In Sri Lanka**"""

# Total fatalities and injuries per year in Sri Lanka. The numeric columns are
# selected BEFORE aggregating so that text and date columns are never summed.
gr_by_ft_in_sl_yr = srilanka_df.groupby('year')[['fatality_count','injury_count']].sum().reset_index()

# years ordered by total fatalities, highest first
gr_by_ft_in_sl_yr_sort = gr_by_ft_in_sl_yr.sort_values('fatality_count',ascending=False)
gr_by_ft_in_sl_yr_sort

# Visualize fatalities & injuries against the number of events per year
fig, ax = plt.subplots(1, 1, figsize=[22, 9])

ax.plot(gr_by_years_sl['year'], gr_by_years_sl['Occured_Events'])
ax.plot(gr_by_ft_in_sl_yr['year'], gr_by_ft_in_sl_yr['fatality_count'])
ax.plot(gr_by_ft_in_sl_yr['year'], gr_by_ft_in_sl_yr['injury_count'])

plt.xlabel('Years',size=15)
plt.ylabel('Count',size=15)

plt.legend(['Occured Events Count','Fatalities Count','Injuries Count'], loc=2)
ax.set_title('Time Series Of Fatalities & Injuries Vs Occured Events By Years In Sri Lanka',size=17)

"""---

## **Details About Occurred Event With Highest Fatalities**
"""

# Sri Lankan event(s) with the largest fatality count
sl_max_fatalities = srilanka_df['fatality_count'].max()
high_ft = pd.DataFrame(srilanka_df[srilanka_df['fatality_count'] == sl_max_fatalities])
high_ft

"""## *Sri Lanka Hit a Biggest Landslide Disaster In 2016 You Can Read The Article Here https://en.wikipedia.org/wiki/2016_Sri_Lankan_floods*

## **Top 10 Fatalities Events**
"""

# Sri Lankan events ordered by fatalities, highest first; show the top ten
sl_ft_sort = srilanka_df.sort_values(by='fatality_count', ascending=False)
sl_ft_sort.head(10)

"""## **Distribution Of Categories In Occurred Events In Sri Lanka**"""

def event_by_category_sl():
  """Print and plot the number of Sri Lankan events per landslide category.

  Reads the module-level ``srilanka_df`` and displays the figure with
  ``plt.show()``.
  """
  categories = srilanka_df['landslide_category']

  # tabular counts per category
  print(categories.value_counts())

  # Pass the series as ``x`` explicitly: current seaborn releases treat the
  # first positional argument as ``data``, not as the variable to count.
  plt.figure(figsize=(18,8.5))
  sns.countplot(x=categories, palette='YlGnBu_r')
  plt.xticks(rotation='vertical',size=15)
  plt.title('Event Category Distribution In Sri Lanka',size=16)
  plt.xlabel('',size=10)
  plt.show()


event_by_category_sl()

"""## **Distribution Of Trigger Type In Occurred Events In Sri Lanka**"""

def event_by_triger_sl():
  """Print and plot the number of Sri Lankan events per landslide trigger.

  Reads the module-level ``srilanka_df`` and displays the figure with
  ``plt.show()``.
  """
  triggers = srilanka_df['landslide_trigger']

  # tabular counts per trigger
  print(triggers.value_counts())

  # Pass the series as ``x`` explicitly: current seaborn releases treat the
  # first positional argument as ``data``, not as the variable to count.
  plt.figure(figsize=(18,8.5))
  sns.countplot(x=triggers, palette='crest')
  plt.xticks(rotation='vertical',size=15)
  plt.title('Event By Triger Type In Sri Lanka',size=16)
  plt.xlabel('',size=10)
  plt.show()


event_by_triger_sl()

def event_by_size_sl():
  """Print and plot the number of Sri Lankan events per landslide size.

  Reads the module-level ``srilanka_df`` and displays the figure with
  ``plt.show()``.
  """
  sizes = srilanka_df['landslide_size']

  # tabular counts per size class
  print(sizes.value_counts())

  # Pass the series as ``x`` explicitly: current seaborn releases treat the
  # first positional argument as ``data``, not as the variable to count.
  plt.figure(figsize=(18,8.5))
  sns.countplot(x=sizes, palette='Set2_r')
  plt.xticks(rotation='vertical',size=15)
  plt.title('By Size In Sri Lanka',size=16)
  plt.xlabel('',size=10)
  plt.show()


event_by_size_sl()

"""## **Event Reported Sources About In Sri Lanka**"""

# Fifteen most frequent reporting sources among the Sri Lankan events
top_sources_sl = srilanka_df['source_name'].value_counts().head(15)
Reported_source_sl = pd.DataFrame({
    'Source Name': top_sources_sl.index,
    'Reported Count': top_sources_sl.values,
})
Reported_source_sl

# Horizontal bar chart of the report counts per source
plt.figure(figsize=(18,12))
sns.barplot(data=Reported_source_sl,
            x="Reported Count",
            y="Source Name",
            palette="gist_earth")

plt.title('Reported Sources By Number Of Reported Times About In Sri Lanka',size=16)
plt.xlabel('Reported Times',size=10)
plt.xticks(size=12)
plt.show()

"""> ## *Print.Daily Mirror Has Done A Good Job Reporting Events*"""

# Commented out IPython magic to ensure Python compatibility.
# '''
# %%shell
# jupyter nbconvert --to html '/content/Global_Landslide_Data_Analysis_&_Visualization.ipynb'
# 
# ''''