-
Notifications
You must be signed in to change notification settings - Fork 0
/
lr.py
180 lines (134 loc) · 4.37 KB
/
lr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
#Goal : implement Natural Language Processing (NLP) system
#lr.py : implements a sentiment polarity analyzer using binary logistic regression
#predict a sentiment polarity for the corresponding feature vector of each movie review
import sys
import csv
from collections import OrderedDict
import math
import matplotlib.pyplot as plt
#store dictionary
def get_dict(dict_input):
f = open(dict_input, 'r')
input_f = csv.reader(f, delimiter =' ')
dictionary = OrderedDict()
for ith_row in input_f:
dictionary[ith_row[0]] = ith_row[1]
return dictionary
#store data
def store_data(input_file):
f = open(input_file, 'r')
input_f = csv.reader(f, delimiter = '\t')
X = []
Y = []
for ith_row in input_f:
Y.append(int(ith_row[0]))
x_i = OrderedDict()
x_i[-1] = 1 #bias term added
for i in range(1, len(ith_row)):
key, value = ith_row[i].split(":")
x_i[int(key)] = int(value)
X.append(x_i)
return [Y, X]
def sparse_dot(X, theta):
product = 0.0
for i,v in X.items():
product += v * theta[i]
return product
def single_SGD(Y_i, X_i, theta, learn_rate):
val = sparse_dot(X_i, theta) #thetaT x_i
for k,v in X_i.items():
step = learn_rate * v * (Y_i - (math.exp(val) / (1 + math.exp(val))))
theta[k] = theta[k] + step
return theta
def neg_LL(Y, X, theta):
tally = 0
for i in range(len(Y)):
val = sparse_dot(X[i], theta) #thetaT x_i
tally += -Y[i] * val + math.log(1 + math.exp(val))
return tally
#dictionary = W
def train(train_data, valid_data, dictionary ,num_epoch):
Y = train_data[0]
X = train_data[1]
valid_Y = valid_data[0]
valid_X = valid_data[1]
train_avg_NLLs = []
valid_avg_NLLs = []
epochs = []
learn_rate = 0.1
#initialize
theta = [0] * (len(dictionary) + 1) #bias term added
for epoch in range(num_epoch): #epoch
epochs.append(epoch)
for i in range(len(Y)): #N
theta = single_SGD(Y[i], X[i], theta, learn_rate)
train_tally = neg_LL(Y, X, theta)
valid_tally = neg_LL(valid_Y, valid_X, theta)
train_avg = train_tally / len(Y)
valid_avg = valid_tally / len(valid_Y)
train_avg_NLLs.append(train_avg)
valid_avg_NLLs.append(valid_avg)
return (theta, train_avg_NLLs, valid_avg_NLLs, epochs)
def plot_analysis(train_avg_NLLs,valid_avg_NLLs , epochs):
# plot analysis
fig = plt.figure()
#train
plt.errorbar(epochs, train_avg_NLLs, label='Train Data')
#validation
plt.errorbar(epochs, valid_avg_NLLs, label='Validation Data')
plt.legend(loc='lower right')
plt.show()
#predict
def predict(data, theta, out_file):
out = open(out_file, 'w')
error = 0.0
Y = data[0]
X = data[1]
for i in range(len(Y)):
val = sparse_dot(X[i], theta)
sigmoid = 1.0 / (1.0 + math.exp(-val))
if sigmoid > 0.5:
label = 1
else:
label = 0
txt = str(label) + "\n"
out.write(txt)
if label != Y[i]:
error += 1.0
return error / len(Y)
if __name__ == '__main__':
#path to the formatted .tsv file
formatted_train_input = sys.argv[1]
formatted_validation_input = sys.argv[2]
formatted_test_input = sys.argv[3]
#dictionary
dict_input = sys.argv[4]
#path to .labels file
train_out = sys.argv[5]
test_out = sys.argv[6]
metrics_out = sys.argv[7]
#number of times SGD loops through all of the training data
num_epoch = int(sys.argv[8])
#get dictionary
dictionary = get_dict(dict_input)
#store data
train_data = store_data(formatted_train_input)
valid_data = store_data(formatted_validation_input)
test_data = store_data(formatted_test_input)
#train
vals = train(train_data, valid_data, dictionary ,num_epoch)
theta = vals[0]
train_avg_NLLs = vals[1]
valid_avg_NLLs = vals[2]
epochs = vals[3]
#validation
plot_analysis(train_avg_NLLs,valid_avg_NLLs , epochs)
#predict
train_error = predict(train_data, theta, train_out)
test_error = predict(test_data, theta, test_out)
#metrics
metrics_out_file = open(metrics_out, 'w')
txts = "error (train) : " + str(train_error) + "\n" + "error (test) : " + str(test_error)
print(txts)
metrics_out_file.write(txts)
metrics_out_file.close()