# zomato_bangalore_restaurants_final.py

# -*- coding: utf-8 -*-
"""Zomato Bangalore Restaurants_Final.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/12TAewgK-I-bOHDvFxu-0bUnNtumSBxWR

# ZOMATO RESTAURANTS BANGALORE ANALYSIS AND PREDICTION

Dataset - zomato.csv <br>
1. Rows - 51717 <br>
2. Columns - 17 <br>
3. Categorical Variables -  14 <br> 
4. Numerical Variables -  3<br>  
5. %of missing data - 4.28%

# Importing required libraries and dataset
"""

#importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as  plt
import sklearn as sk
import seaborn as sns
from google.colab import drive
import warnings
warnings.filterwarnings('ignore')

# Mount Google Drive inside the Colab runtime so the CSV stored there can be read.
drive.mount('/content/gdrive')

# Load the raw dataset. `dataframe` is kept as read from disk; the analysis
# below works on the independent copy `df`, and later sections take fresh
# copies of `dataframe` again.
dataframe =pd.read_csv('/content/gdrive/My Drive/zomato.csv')
df = dataframe.copy()
df.head()

"""# Data exploration

### Column  description
* url : zomato url for the restaurants

* address : complete location of the restaurant

* name : name of the restaurant

* online_order : whether restaurant accepts online order

* book_table : whether restaurant provides option for booking table

* rating : restaurants rating on zomato website

* votes : number of individuals who voted for the restaurant

* phone : contact details of the restaurant

* location : area where restaurant is situated

* rest_type : Type of restaurants (Categorical value)

* dish_liked : what are all dishes of the restaurant that people liked

* cuisines : cuisines offered by the restaurant

* approx_cost(for two people) : average cost for two people
* reviews_list : reviews of the restaurant on zomato website
* menu_item : menu items available in the restaurant
* listed_in(type) : type of the restaurant
* listed_in(city) : locality of the restaurant position
"""

# Characteristics of the dataset: shape, column names and share of missing cells.
print("Rows x Columns:")
print(df.shape)
print("\n")
print("Various attributes:")
# Walk the actual column index instead of assuming exactly 17 columns, so a
# CSV with fewer columns no longer raises IndexError and extra ones are listed.
for position, column_name in enumerate(df.columns, start=1):
    print(position, column_name)

print("Number of null in each colums")
df.isnull().sum()

totalnull = df.isnull().sum().sum()
print(totalnull)
# df.size is rows * columns of the frame that was really loaded; the former
# hard-coded 51717*17 silently gave a wrong percentage for any other shape.
percent_null = (totalnull / df.size) * 100 if df.size else 0.0
print("Percentage of null values:",percent_null)

"""# Data cleaning"""

# Give the two awkwardly named columns short names.
short_names = {
    'approx_cost(for two people)': 'approx_cost',
    'listed_in(type)': 'listed_type',
}
df.rename(columns=short_names, inplace=True)

# Remove the columns that are not analysed. The original 'location' column has
# to be dropped *before* 'listed_in(city)' is renamed to 'location', otherwise
# two columns would share that label.
unused_columns = ['location', 'address', 'url', 'phone', 'menu_item', 'reviews_list']
df.drop(columns=unused_columns, inplace=True)
df.rename(columns={'listed_in(city)': 'location'}, inplace=True)
df.head()
# NOTE: menu_item is removed above; dish_liked is still kept at this point
# although removing it as well was considered.

# Cleaning the rating field: rename 'rate' to 'rating', remove spaces, treat
# the placeholders '', '-' and 'NEW' as missing, strip the '/5' suffix and
# convert what is left to float.
df.rename(columns={'rate': 'rating'}, inplace=True)
df['rating'] = df['rating'].replace('[ ]', '', regex=True)
# np.nan is the canonical spelling; the np.NaN alias used before was removed
# in NumPy 2.0 and raises AttributeError there.
df['rating'] = df['rating'].replace(['', '-', 'NEW'], np.nan)

# astype(str) turns missing values into the text 'nan', which the final float
# conversion parses back to NaN, so missing ratings survive this step.
df['rating'] = (
    df['rating']
    .astype(str)
    .str.replace('/5', '', regex=False)
    .astype(float)
)

# Strip the thousands separator from the cost values. The column stays text
# here (missing values become the string 'nan'); the numeric conversion
# happens further down.
df['approx_cost'] = df['approx_cost'].astype(str).str.replace(',', '', regex=False)
df.isnull().sum()

# Restaurant names contain mis-decoded characters (for example around the
# accented letter in "Cafe"); remove each stray character in turn.
cleaned_names = df['name'].astype(str)
for stray in ('Ã', 'Â', "\\", "Â", "©"):
    cleaned_names = cleaned_names.str.replace(stray, '', regex=False)
df['name'] = cleaned_names

df.isnull().sum()

# Handling the empty or missing values.
# Categorical gaps are filled with the most frequent value of the column.
for categorical in ('rest_type', 'cuisines'):
    most_common = df[categorical].mode()[0]
    df[categorical] = df[categorical].fillna(most_common)

print('Missing values in "location"  column: ',df['location'].isna().sum())
print('Missing values in "rest_type" column: ',df['rest_type'].isna().sum())
print('Missing values in "cuisines"  column: ',df['cuisines'].isna().sum())

# Missing costs are replaced by the mean cost. The column is stored as text
# again afterwards; it is converted to float after the cost plots.
numeric_cost = (
    df['approx_cost']
    .astype(str)
    .str.replace(',', '', regex=False)
    .astype(float)
)
df['approx_cost'] = numeric_cost.fillna(numeric_cost.mean()).astype(str)

print(df.isna().sum())


"""# data visualization"""

# Bar chart of the 15 locations with the most listed restaurants.
# value_counts() is evaluated once and reused for both labels and heights.
top_locations = df['location'].value_counts()[:15]
names = top_locations.index
values = top_locations.values
# x / y must be passed by keyword: seaborn >= 0.12 no longer accepts the data
# vectors positionally and raises TypeError for sns.barplot(names, values).
sns.barplot(x=names, y=values)
plt.title("Top 15 locations with large number of restaurants", weight = 'bold')
plt.xticks(rotation=90)
plt.show()

names

values

#Inference - Max number of restaurants in BTM

# Rows that have at least one liked dish. The explicit .copy() makes
# dishes_data an independent frame, so the assignment below is not a chained
# assignment on a slice of df (SettingWithCopyWarning / possibly lost write).
dishes_data = df[df.dish_liked.notnull()].copy()
dishes_data['dish_liked'] = dishes_data['dish_liked'].apply(lambda x: x.lower().strip())

dishes_data.dish_liked.isnull().sum()

# dish_liked is a comma-separated list; flatten it into one entry per dish.
dish_count = [
    dish.strip()
    for liked in dishes_data.dish_liked
    for dish in liked.split(',')
]

plt.figure(figsize=(12,6))
pd.Series(dish_count).value_counts()[:15].plot(kind='barh',color= 'r')
plt.suptitle('Top 15 dishes liked in Bangalore',fontweight='bold')
# Horizontal bars: the counts run along x and the dish names along y
# (the two labels were previously swapped).
plt.xlabel('Count')
plt.ylabel('Dish')

#Inference - The most liked dish in Bangalore is Pasta

# Distribution of the online_order flag.
df.online_order.value_counts()

#No null values

# Pass the column as x=...: newer seaborn releases treat the first positional
# argument as `data`, so sns.countplot(series) no longer draws one bar per
# category.
ax = sns.countplot(x=df['online_order'])
plt.title('Number of Restaurants accepting online orders', weight='bold')
plt.xlabel('Online orders',size = 12)
plt.ylabel('No. of restaurants',size = 12)

#Inference - The online_order column has no null values.
#Most restaurants accept online orders, i.e. 30444 of them

# Donut chart of the ten most frequent cost-for-two values.
colors = ('gold', 'red', 'lightcoral', 'lightskyblue','blue','green','silver')
fig = plt.figure(figsize=(17, 8))
# Only the first wedge (the most frequent cost) is pulled out of the ring.
explode = (0.1,) + (0,) * 9
top_costs = df['approx_cost'].value_counts()[:10]
delplot = top_costs.plot(
    kind='pie',
    autopct='%1.1f%%',
    fontsize=20,
    shadow=True,
    explode=explode,
    colors=colors,
)
# A white disc drawn over the centre turns the pie into a donut.
centre_circle = plt.Circle((0,0),0.80,fc='w')
fig = plt.gcf()
fig.gca().add_artist(centre_circle)

plt.title("Average cost for 2 people in Rupees",fontsize = 15,weight = 'bold')

# Histogram and summary statistics of the cost for two. The column is still
# stored as text at this point, so it is converted once and reused.
cost_as_float = df.approx_cost.astype(float)

plt.figure(figsize=(17, 8))
cost_as_float.hist(color='r', range=(0,3000), bins=30)
# Dashed vertical line marks the mean cost.
plt.axvline(x=cost_as_float.mean(), ls='--', color='yellow')
plt.suptitle('Average cost for a meal in Bangalore Restaurants',fontweight='bold')
plt.xlabel('Average cost')
plt.ylabel('No of Restaurants')
print('Mean = ', cost_as_float.mean())
print('Standard Deviation = ', cost_as_float.std())
print('Median = ', cost_as_float.median())
print('Mode = ', cost_as_float.mode())
# From here on the column is numeric.
df['approx_cost'] = cost_as_float

print(df.isna().sum())

# Remove rows without a rating; the remaining analysis needs a rating.
df.dropna(how='any', inplace=True, subset=['rating'])
df.isna().sum()

# Histogram of the ratings with a dashed line at the mean.
df.rating.hist(color='c')
plt.axvline(x= df.rating.mean(),ls='--',color='yellow')
plt.suptitle('Average rating for Bangalore Restaurants',fontweight='bold')
plt.xlabel('rating')
plt.ylabel('No of Restaurants')
print('Mean = ',df.rating.mean())
print('Median =',df.rating.median())
# The line labelled "Mode" used to print the median again; report the real
# mode (a Series, because a distribution can have several modes).
print('Mode =',df.rating.mode())
print('Standard Deviation = ',df.rating.std())

# Count of restaurants per rating value, split by the online_order flag.
plt.figure(figsize=(12, 6))
sns.countplot(data=df, x='rating', hue='online_order')
plt.ylabel("Restaurants that Accept/Not Accepting online orders")
plt.title("rating vs Online order", weight='bold')

# Sanity check that no missing values are left in the columns used so far.
remaining_nulls = df.isna().sum()
print(remaining_nulls)
df.head()

"""# Normalization, standardizing"""

from sklearn import preprocessing

# Work on a copy so the encoded values do not leak back into df.
norma_data = df.copy()
le = preprocessing.LabelEncoder()

# Multi-valued text columns: drop the commas and sort the words so that the
# same set of words always yields the same string, whatever the original order.
for text_column in ('rest_type', 'cuisines', 'dish_liked'):
    without_commas = norma_data[text_column].str.replace(',' , '').astype(str)
    norma_data[text_column] = without_commas.apply(
        lambda text: ' '.join(sorted(text.split()))
    )
norma_data['location'] = norma_data['location'].astype(str)

# Two-valued flags become a single indicator column (first category dropped).
for flag_column in ('online_order', 'book_table'):
    norma_data[flag_column] = pd.get_dummies(norma_data[flag_column], drop_first=True)

# Remaining categorical columns are mapped to integer codes. The encoder is
# refitted for every column, so `le` ends up fitted on dish_liked.
for label_column in ('location', 'rest_type', 'cuisines', 'dish_liked'):
    norma_data[label_column] = le.fit_transform(norma_data[label_column])

norma_data.head()

from sklearn import preprocessing

# Min-max scaling of the three numeric columns, each one on its own.
# The scaler is refitted per column, so it ends up fitted on 'votes'.
scaler = preprocessing.MinMaxScaler()
df1 = norma_data  # alias of norma_data, not a copy

x1 = df1[['rating']].astype(float).values
x1_scaled = scaler.fit_transform(x1)
df_normalized1 = pd.DataFrame(x1_scaled)

x2 = df1[['approx_cost']].astype(float).values
x2_scaled = scaler.fit_transform(x2)
df_normalized2 = pd.DataFrame(x2_scaled)

x3 = df1[['votes']].astype(float).values
x3_scaled = scaler.fit_transform(x3)
df_normalized3 = pd.DataFrame(x3_scaled)

# Distribution of the scaled rating (column 0 of the one-column frame).
# NOTE(review): sns.distplot is deprecated in seaborn; kept to preserve output.
sns.distplot(df_normalized1[0])
plt.xlabel('Normalised rating')
plt.ylabel('')

# Standardise the float64 / int64 / int32 columns of norma_data in place.
# This includes the label-encoded columns whenever they have one of those dtypes.
num_values1 = norma_data.select_dtypes(include=['float64', 'int64', 'int32']).columns
scaler = sk.preprocessing.StandardScaler().fit(norma_data[num_values1])
standardised_values = scaler.transform(norma_data[num_values1])
norma_data[num_values1] = standardised_values

# Distribution of the standardised rating.
sns.distplot(norma_data.rating)
plt.suptitle('Average rating of Bangalore Restaurants',fontweight='bold')
plt.xlabel('standerdised rating')

"""# Hypothesis testing"""

import pandas as pd
from scipy import stats
from statsmodels.stats import weightstats as stests

# One-sample, left-tailed z-test on the mean rating.
#   H0: mu >= 3.75    Ha: mu < 3.75    alpha = 0.05
# The sample is drawn with a fixed seed: an unseeded sample gave a different
# test statistic on every run, so a written conclusion could not be verified.
# min() keeps the cell working on frames with fewer than 50 rated restaurants.
sample = df['rating'].sample(n=min(50, len(df)), random_state=33)
print('Sample Mean: ',float(sample.mean()))
ztest ,pval = stests.ztest(sample, x2=None, value=3.75,alternative='smaller')
#value is the assumed mean value we take for H0
print('z Test Statistic:' ,ztest)
print("p value =",pval)
# State the decision from the data that was actually sampled.
alpha = 0.05
decision = 'Reject H0' if pval < alpha else 'Fail to reject H0'
print('Decision at alpha =', alpha, ':', decision)

"""#### We claim that the minimum average rating of restaurants in Bangalore is 3.75 with a standard deviation of 1.1. To verify this hypothesis, a sample of 50 restaurants is chosen at random. The sample mean is as calculated above. Is there sufficient evidence in the sample to indicate, at a 5% significance level, that the assumption is true?

Here, H0: μ >= 3.75
      Ha: μ < 3.75  @ α = 0.05

The sample is large (n = 50 > 30) and the sample standard deviation is known. Thus the test statistic can be calculated as:

    z = (x̄ - μ0) / (s / √n),   with μ0 = 3.75 and n = 50

This is a one-tailed test. To be more specific, it's a left-tailed test, so there is a single critical value -zα = -z(0.05),
which is equal to -1.645. Thus, the rejection region is (-inf, -1.645]

The test statistic does not fall in the rejection region
Decision : Fail to Reject Ho
In the context of our problem our conclusion is:
###### Conclusion - There is not sufficient evidence that the average rating of restaurants in Bangalore is less than 3.75

# Correlations
"""

# Turn the two flags into single indicator columns (first category dropped)
# so they can take part in the correlation matrix.
df['online_order']= pd.get_dummies(df.online_order, drop_first=True)
df['book_table']= pd.get_dummies(df.book_table, drop_first=True)

# Relationship between rating and number of votes. This plot used to be
# drawn twice by an exact copy of the same three lines; one copy is enough.
plt.figure(figsize=(20,10))
sns.scatterplot(x='rating',y='votes',data=df)
plt.show()

plt.figure(figsize=(10,5))
# Restrict to numeric/boolean columns explicitly: df still contains text
# columns, and DataFrame.corr() raises on those in pandas >= 2.0 instead of
# silently skipping them as older releases did.
c = df.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(c,cmap="inferno",annot=True)
c

"""# Data modeling"""

# TODO: write down the inference drawn from the correlation matrix.
df.shape


# Preparing the modelling frame: start again from the untouched raw data.
data = dataframe.copy()
data.dtypes

# Flags become single indicator columns (first category dropped).
for flag_column in ('online_order', 'book_table'):
    data[flag_column] = pd.get_dummies(data[flag_column], drop_first=True)
data.head()

not_modelled = ['dish_liked', 'reviews_list', 'menu_item', 'listed_in(type)']
data.drop(columns=not_modelled, inplace=True)

# Drop the commas and sort the words so that the same set of words always
# yields the same string, whatever the original order.
for text_column in ('rest_type', 'cuisines'):
    without_commas = data[text_column].str.replace(',' , '').astype(str)
    data[text_column] = without_commas.apply(
        lambda text: ' '.join(sorted(text.split()))
    )

data['rest_type'].value_counts().head()
data['cuisines'].value_counts().head()

data['location'] = data['location'].astype(str)

"""


Maybe we can add a few more graphs here


"""

from sklearn import preprocessing

# Map each categorical column to integer codes. The encoder is refitted per
# column, so `le` ends up fitted on 'cuisines'.
le = preprocessing.LabelEncoder()
for label_column in ('location', 'rest_type', 'cuisines'):
    data[label_column] = le.fit_transform(data[label_column])

# The cost is read as text with thousands separators; make it numeric.
cost_column = 'approx_cost(for two people)'
data[cost_column] = data[cost_column].str.replace(',','').astype('float')
data.tail()

# Remove identifier-like columns and give cost / rating their final names.
data.drop(columns =['url','address','phone','listed_in(city)'], inplace = True)
data.rename(columns={'approx_cost(for two people)': 'average_cost','rate':'rating'}, inplace=True)

# Remove spaces, then treat the placeholders '', 'NEW' and '-' as missing.
# np.nan replaces the np.NaN alias, which was removed in NumPy 2.0.
data['rating'] = data['rating'].replace('[ ]','',regex = True)
data['rating'] = data['rating'].replace(['', 'NEW', '-'], np.nan)

# Drop every row that still has a missing value. This must happen before the
# string conversion below, which would turn NaN into the text 'nan'.
data.dropna(how = 'any', inplace = True)

# Strip the '/5' suffix and convert the rating to float.
data['rating'] = (
    data['rating']
    .astype(str)
    .str.replace('/5', '', regex=False)
    .astype(float)
)
data

from sklearn import model_selection

# Features are every column except the target and the restaurant name.
x = data.drop(columns=['rating', 'name'])
y = data['rating']

# 70 / 30 train-test split with a fixed seed.
split_parts = model_selection.train_test_split(x, y, test_size=0.3, random_state=33)
X_train, X_test, Y_train, Y_test = split_parts
data.info()

#MODELS

# Linear regression baseline.
from sklearn import linear_model
lr = linear_model.LinearRegression().fit(X_train, Y_train)
y_pred_lr = lr.predict(X_test)

# score() of a regressor is the R^2 on the held-out data, shown as a percentage.
lr.score(X_test, Y_test) * 100

# Random forest regressor with default settings (no fixed seed, so the score
# varies slightly between runs).
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor().fit(X_train, Y_train)
y_pred_rfr = rfr.predict(X_test)

# R^2 on the held-out data, shown as a percentage.
rfr.score(X_test, Y_test) * 100

# One-hot variant of the model. `dataframe` still holds the raw CSV exactly as
# it was read (only copies of it are modified above), so take a fresh copy
# instead of reading the same file from Drive a second time.
onehot = dataframe.copy()
onehot.head()

onehot.rename(columns={'rate':'rating'},inplace=True)
# Placeholders count as missing. np.nan replaces the np.NaN alias, which was
# removed in NumPy 2.0.
onehot['rating'] = onehot['rating'].replace(['NEW', '-'], np.nan)
# Keep only rows that are complete in every column.
onehot.dropna(how = 'any', inplace = True)

# Remove spaces and the '/5' suffix, then convert the rating to float.
onehot['rating'] = onehot['rating'].replace('[ ]','',regex = True)
onehot['rating'] = (
    onehot['rating']
    .astype(str)
    .str.replace('/5', '', regex=False)
    .astype(float)
)

# Drop the commas and sort the words of each multi-valued text column so that
# the same set of words always yields the same string before dummy encoding.
for text_column in ('cuisines', 'rest_type', 'dish_liked'):
    without_commas = onehot[text_column].str.replace(',' , '')
    onehot[text_column] = without_commas.astype(str).apply(
        lambda text: ' '.join(sorted(text.split()))
    )

# Quick looks at the normalised values.
onehot['cuisines'].unique()
onehot['rest_type'].value_counts().head()
onehot['dish_liked'].value_counts().head()

# One indicator column per distinct value of each categorical column.
# Every group gets its source column as prefix: the same text can occur in
# more than one of these columns, and unprefixed dummies would then produce
# duplicate column labels in `final`, making the features ambiguous.
dummy_rest_type = pd.get_dummies(onehot['rest_type'], prefix='rest_type')
dummy_city = pd.get_dummies(onehot['location'], prefix='location')
dummy_cuisines = pd.get_dummies(onehot['cuisines'], prefix='cuisines')
dummy_dishliked = pd.get_dummies(onehot['dish_liked'], prefix='dish_liked')

# Original columns followed by all indicator columns, aligned on the row index.
final = pd.concat(
    [onehot, dummy_rest_type, dummy_city, dummy_cuisines, dummy_dishliked],
    axis=1,
)

final

# The categorical source columns are now represented by their indicator
# columns; the remaining text / identifier columns are not used as features.
final.drop(columns=['rest_type','location','cuisines','dish_liked','name','phone'] , inplace=True)
final.drop(columns=['reviews_list','menu_item','listed_in(type)','listed_in(city)'], inplace=True)
final.drop(columns=['url','address'], inplace=True)

# drop_first=True leaves one indicator column per flag, as in the other
# encodings in this file. Without it get_dummies returns one column per
# category, and a multi-column frame cannot be assigned to a single column.
final['online_order'] = pd.get_dummies(final['online_order'], drop_first=True)
final['book_table'] = pd.get_dummies(final['book_table'], drop_first=True)
final

# Remove the thousands separator AND convert to float. The column used to stay
# text, which left the numeric conversion to scikit-learn's input validation.
final['approx_cost(for two people)'] = (
    final['approx_cost(for two people)']
    .astype(str)
    .str.replace(',', '', regex=False)
    .astype(float)
)
x = final.drop(['rating'],axis=1)
y = final['rating']

# Use the names imported here directly. The sk.model_selection /
# sk.ensemble spellings only worked because earlier cells had already
# imported those submodules.
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(x,y,test_size = 0.3,random_state = 33)

from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor()
rfr.fit(X_train,y_train)
y_pred_rfr = rfr.predict(X_test)

# R^2 on the held-out data, shown as a percentage.
rfr.score(X_test,y_test)*100