-
Notifications
You must be signed in to change notification settings - Fork 9
/
svm.py
177 lines (162 loc) · 5.5 KB
/
svm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
"""
Title: Detection of Forest Cover Type using Linear SVM
Author: Abhinav Thukral
Description:
Built Linear SVM from scratch in Python 3.6.1 for multi-class classification of forest cover types.
Implementation of Linear SVM (Support Vector Machine Classification), Gradient Descent, one v/s all classification, feature scaling and cross validations from scratch.
"""
import pandas as pd
from random import seed
from random import randrange
from math import exp
from math import log
from math import floor
#Function to split data into train and test
def cross_val_split(data_X, data_Y, test_size, seed_val):
    """Randomly partition paired features/labels into train and test sets.

    Seeds the RNG with seed_val for reproducibility, then pops random
    rows into the training set until it holds floor((1 - test_size) * n)
    rows; whatever remains becomes the test set.
    Returns (train_x, train_y, test_x, test_y).
    """
    remaining_x = data_X.tolist()
    remaining_y = data_Y.tolist()
    seed(seed_val)
    target = floor((1 - test_size) * len(remaining_x))
    train_x, train_y = [], []
    while len(train_x) < target:
        pick = randrange(len(remaining_x))
        train_x.append(remaining_x.pop(pick))
        train_y.append(remaining_y.pop(pick))
    return train_x, train_y, remaining_x, remaining_y
#Function to return columnwise max-min statistics for scaling
def statistics(x):
    """Return a [min, max] pair for every column of the row-major table x."""
    return [[min(column), max(column)] for column in zip(*x)]
#Function to scale the features
def scale(x, stat):
    """Min-max scale x in place using the per-column [min, max] pairs in stat.

    Each value becomes (v - min) / (max - min), i.e. mapped into [0, 1].
    Fix: a constant column (max == min) made the original divide by zero;
    such a column is now mapped to 0.0 instead.
    """
    for row in x:
        for i in range(len(row)):
            lo, hi = stat[i][0], stat[i][1]
            span = hi - lo
            # Guard degenerate (constant) columns against ZeroDivisionError.
            row[i] = (row[i] - lo) / span if span else 0.0
#Function to convert different classes into different columns to implement one v/s all
def one_vs_all_cols(s):
    """One-hot encode the label list s in place, for one-vs-all training.

    Each element of s is replaced by a 0/1 indicator list over the sorted
    distinct labels. Returns the sorted label list so original class names
    can be recovered from column indices later.
    """
    labels = sorted(set(s))
    position = {label: k for k, label in enumerate(labels)}
    for idx, value in enumerate(s):
        row = [0] * len(labels)
        row[position[value]] = 1
        s[idx] = row
    return labels
#Function to compute Theta transpose x Feature Vector
def ThetaTX(Q, X):
    """Return the dot product of weight vector Q with feature vector X (as float)."""
    total = 0.0  # float accumulator so the result is always a float
    for weight, feature in zip(Q, X):
        total += weight * feature
    return total
#Function to compute cost for negative class (classs = 0)
def LinearSVM_cost0(z):
    """Hinge cost for a negative (y = 0) example: zero beyond the margin, else z + 1."""
    return 0 if z < -1 else z + 1
#Function to compute cost for positive class (classs = 1)
def LinearSVM_cost1(z):
    """Hinge cost for a positive (y = 1) example: zero beyond the margin, else 1 - z."""
    return 0 if z > 1 else 1 - z
#function to calculate sigmoid
def sigmoid(z):
    """Numerically stable logistic function 1 / (1 + e^(-z)).

    Fix: the naive form raises OverflowError in math.exp for z below
    roughly -709. Splitting on the sign keeps the exponent non-positive,
    so exp can only underflow to 0.0, never overflow.
    """
    if z >= 0:
        return 1.0 / (1.0 + exp(-z))
    ez = exp(z)
    return ez / (1.0 + ez)
#Function to calculate SVM cost
def cost(theta, c, x, y):
    """Total (unregularized) hinge cost of the class-c weights over (x, y).

    y holds 0/1 indicators for class c: each example contributes the
    positive-class hinge when its label is 1 and the negative-class hinge
    when it is 0.
    """
    total = 0.0
    for features, label in zip(x, y):
        margin = ThetaTX(theta[c], features)
        total += label * LinearSVM_cost1(margin) + (1 - label) * LinearSVM_cost0(margin)
    return total
#Function to perform Gradient Descent on the weights/parameters
def gradDescent(theta, c, x, y, learning_rate):
    """One batch gradient-descent step on the class-c weight vector, in place.

    Fix: the original bound oldTheta = theta[c], which is an alias rather
    than a copy, so partial derivatives for later components were computed
    against weights already updated earlier in the same step. Copying the
    vector snapshots the pre-step weights (a true simultaneous update).
    The per-example residuals are also hoisted out of the per-weight loop,
    since they do not depend on which component is being updated.
    """
    old_theta = theta[c][:]  # snapshot: all derivatives use pre-step weights
    # Residual (prediction - label) per training example, computed once.
    residuals = [sigmoid(ThetaTX(old_theta, xi)) - yi for xi, yi in zip(x, y)]
    for q in range(len(theta[c])):
        derivative_sum = sum(res * xi[q] for res, xi in zip(residuals, x))
        theta[c][q] -= learning_rate * derivative_sum
#Function to return predictions using trained weights
def predict(data, theta):
    """Return a one-hot prediction list for each row of data.

    Scores every class with sigmoid(theta[c] . row) and sets a 1 at the
    index of the highest score (argmax over classes).
    Fix: removed the original's unused local counter `count`.
    """
    predictions = []
    for row in data:
        scores = [sigmoid(ThetaTX(row, theta[c])) for c in range(len(theta))]
        one_hot = [0] * len(theta)
        one_hot[scores.index(max(scores))] = 1
        predictions.append(one_hot)
    return predictions
#Function to return accuracy
def accuracy(predicted, actual):
    """Return the fraction of positions where predicted[i] == actual[i].

    Fix: the original raised ZeroDivisionError for empty input; an empty
    prediction list now yields 0.0.
    """
    n = len(predicted)
    if n == 0:
        return 0.0
    correct = sum(1 for p, a in zip(predicted, actual) if p == a)
    return correct / n
#Function to perform cross validation
def cross_validation(x, y, test_data_size, validations, learning_rate, epoch):
    """Run `validations` randomized train/test splits and return mean accuracy.

    y must already be one-hot encoded (see one_vs_all_cols). Each split is
    seeded with its validation number, so runs are reproducible.
    Fixes: the class count was taken from the module-level global
    `label_map`; it is now derived from the one-hot width of y_train, so
    the function no longer depends on script-level state. The progress
    interval uses integer arithmetic (`epoch // 10`, at least 1) instead of
    a float modulus.
    """
    print("No. of validation checks to be performed: ", validations)
    print("No. of Iterations per validation: ", epoch)
    accuracies = []
    for valid in range(validations):
        print("\nRunning Validation", valid + 1)
        x_train, y_train, x_test, y_test = cross_val_split(x, y, test_data_size, valid + 1)
        # Transpose the one-hot rows into one 0/1 label column per class.
        n_classes = len(y_train[0])
        classes = [[row[i] for row in y_train] for i in range(n_classes)]
        # Initialising Theta (Weights): one zero vector per class.
        theta = [[0] * len(x_train[0]) for _ in range(n_classes)]
        # Training the model; progress is reported every ~10% of epochs.
        report_every = max(1, epoch // 10)
        for i in range(epoch):
            for class_type in range(n_classes):
                gradDescent(theta, class_type, x_train, classes[class_type], learning_rate)
            if i % report_every == 0:
                print("Processed", i * 100 / epoch, "%")
        print("Completed")
        # Predicting using test data
        y_pred = predict(x_test, theta)
        # Calculating accuracy
        accuracies.append(accuracy(y_pred, y_test))
        print("Validation", valid + 1, "accuracy score: ", accuracies[valid])
    return sum(accuracies) / len(accuracies)
#Dataset url to be imported
# Script entry: load the CSV, scale features, one-hot the labels, then
# cross-validate the one-vs-all linear SVM and print a summary report.
print("Running Forest Cover Detection using Linear SVM\n")
url = "dataset.csv"
dataset = pd.read_csv(url)
data = dataset.values
#Assigning x and y - features and classes
# NOTE(review): columns 0-25 are the features and column 27 the label, so
# column 26 is silently skipped — presumably intentional for this dataset's
# layout; verify against dataset.csv.
x = data[:,:26]
y = data[:,27]
#Feature Scaling by using column wise max, min stats
# scale() mutates x in place; iterating a 2-D ndarray yields row views, so
# the writes land in the underlying array.
stats = statistics(x)
scale(x,stats)
#Converting different labels to columns
#label_map can be used later to retrieve the predicted class label in the original form (string format)
# NOTE(review): one_vs_all_cols assigns a Python list into each y element,
# which assumes y has object dtype (e.g. string class labels) — confirm.
label_map = one_vs_all_cols(y)
#Splitting dataset into training and testing data
# Hyperparameters: 80/20 split, 5 seeded validation runs, 500 GD epochs.
test_data_size = 0.2
learning_rate = 0.01
epoch = 500
validations = 5
final_score = cross_validation(x,y,test_data_size,validations,learning_rate,epoch)
#Printing Final Stats
print("\nReport")
print("Model used: ","Linear SVM using Gradient Descent")
print("Learning rate: ", learning_rate)
print("No. of iterations: ",epoch)
print("No. of features: ", len(x[0]))
# Train/test sizes recomputed the same way cross_val_split derives them.
print("Training data size: ", floor(len(x)*(1 - test_data_size)))
print("Test data size: ", len(x) - floor(len(x)*(1 - test_data_size)))
print("No. of validation tests performed: ", validations)
# final_score is the mean accuracy over all validation runs, in [0, 1].
print("Accuracy: ",final_score*100,"%")