-
Notifications
You must be signed in to change notification settings - Fork 0
/
task_1(iris).py
157 lines (97 loc) · 4.83 KB
/
task_1(iris).py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# -*- coding: utf-8 -*-
"""Task-1(Iris).ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1BZdaRZ8ek0nCkVLHmHr2LxedKHh4cKM7
Iris flower classification using machine learning; the iris dataset
(originally from the scikit-learn/UCI collection) is loaded via Seaborn's bundled datasets.
"""
import seaborn as sns
"""1. **Data Collection**"""
# Load the bundled iris dataset (one row per flower: sepal/petal
# length & width plus the species label).
a=sns.load_dataset('iris')
# NOTE: bare expressions like `a.head(3)` only display inside a notebook;
# as a plain script their results were silently discarded. Print them so
# the inspection steps below actually produce output.
print(a.head(3))
print(a.tail(3))
"""Checking for null values"""
a.info() # info() prints directly; output shows no null values in the dataset
"""Checking for duplicates"""
# Print a count instead of the raw boolean Series (which truncates when shown).
# NOTE(review): the original comment claimed "no duplicates" — verify against
# this printed count rather than assuming it.
print('duplicate rows:', a.duplicated().sum())
"""2. **Data Visualization**"""
sns.scatterplot(data=a,x='sepal_length',y='petal_length',hue='species')
sns.histplot(data=a['sepal_length'],bins=3)
sns.lineplot(data=a,x='sepal_length',y='sepal_width',errorbar=None)
"""Pairplot is created to view the relationship between each of the variable with others present in the data."""
sns.pairplot(data=a,hue='species')
"""Data based on each species"""
import matplotlib.pyplot as plt
sns.barplot(data=a,x='species',y='sepal_length') # Virginica is the species with highest sepal length
plt.xlabel('Species of Iris Flower')
plt.ylabel('Length of sepal')
plt.title("Species vs Sepal(length)")
plt.show()
sns.barplot(data=a,x='species',y='petal_length')# Virginica is the species with highest petal length
plt.xlabel('Species of Iris Flower')
plt.ylabel('Length of Petal')
plt.title("Species vs Petal(length)")
plt.show()
sns.barplot(data=a,x='species',y='sepal_width') # Setosa is the species with highest sepal width
plt.xlabel('Species of Iris Flower')
plt.ylabel('Width of sepal')
plt.title("Species vs Sepal(width)")
plt.show()
sns.barplot(data=a,x='species',y='petal_width') # Virginica is the species with highest petal width
plt.xlabel('Species of Iris Flower')
plt.ylabel('Width of petal')
plt.title("Species vs Petal(width)")
plt.show()
"""From above 4 graphs, we can conclude that the species Virginica is higher in terms of size.
3. **Data Pre-processing**
Splitting the data into input and output
"""
x=a.drop(columns='species') # x- input
y=a['species'] # y - output
"""Since, the data has to be classified use of categorical value is to be noted. So, to make easier, the 3 classes are converted into numerics.i.e, 'setosa'=1, 'versicolor'=2, 'virginica'=3."""
y=y.replace({'setosa':1, 'versicolor':2, 'virginica':3})
"""Standardising the input data"""
from sklearn.preprocessing import StandardScaler
std=StandardScaler()
import pandas as pd
x=pd.DataFrame(data=std.fit_transform(x),columns=x.columns)
"""Data gets divided into training and testing data"""
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)
"""4. **Model building**
As, the data is composed of multi class variable, 2 algorithms are used for model training - (1) K Nearest Classification; (2) Random Forests
(1) K Nearest Classification
"""
from sklearn.neighbors import KNeighborsClassifier
kc=KNeighborsClassifier(n_neighbors=2) # defining a model
kc.fit(x_train,y_train) # training the model
score_1=kc.score(x_train,y_train)
print('Accuracy Score for KNeighborsClassifier model with training data =',score_1)
score_2=kc.score(x_test,y_test)
print('Accuracy Score for KNeighborsClassifier model with test data =',score_2)
kc_a=KNeighborsClassifier(n_neighbors=5) # same model with increased number of neighbors
kc_a.fit(x_train,y_train)# training
score_1=kc_a.score(x_train,y_train)
print('Accuracy Score for KNeighborsClassifier model with training data =',score_1)
score_2=kc_a.score(x_test,y_test)
print('Accuracy Score for KNeighborsClassifier model with test data =',score_2)
"""(2) Random Forests"""
from sklearn.ensemble import RandomForestClassifier
rfc=RandomForestClassifier(n_estimators=10) # building the model
rfc.fit(x_train,y_train)# training the model
rfc.fit(x_train,y_train)
scorea=rfc.score(x_train,y_train)
print('Accuracy Score for RandomForestClassifier model with training data =',scorea)
scoreb=rfc.score(x_test,y_test)
print('Accuracy Score for RandomForestClassifier model with test data =',scoreb)
rfc_a=RandomForestClassifier(n_estimators=50)
rfc_a.fit(x_train,y_train)
score_a=rfc_a.score(x_train,y_train)
print('Accuracy Score for RandomForestClassifier model with training data =',score_a)
score_b=rfc_a.score(x_test,y_test)
print('Accuracy Score for RandomForestClassifier model with test data =',score_b)
"""By comparing (1) and (2), when using random forest classifier,overfitting of model is observed. But the model "kc_a" built using KNeighborsClassifier algorithm with n_neighbors=5 is better, and hence selected for flower iris classification."""
import pickle
pickle.dump(kc_a,open('/content/drive/MyDrive/ONE/iris_classification_model.pkl','wb'))
"""The model chosen for iris flower classification is saved using the module pickle in Google Drive which can be later used for the same."""