# Sentiment Analysis using RNN.py — 246 lines (192 loc), 9.14 KB
# (GitHub page chrome and rendered line-number gutter from the scrape removed;
# the script itself begins below.)
# -*- coding: utf-8 -*-
"""
Created on Sat Oct 27 17:41:20 2018
@author: Mohammad Wasil Saleem.
"""
import re
import pickle
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.metrics import confusion_matrix
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Dropout, Conv1D, MaxPool1D, GRU, LSTM, Dense
def reviewWords(review, method):
    """Clean one raw review string and optionally stem it or drop stop words.

    Parameters
    ----------
    review : str
        Raw review text; may contain HTML tags, digits, and punctuation.
    method : str
        "Porter Stemming" to stem tokens, "Stop Words" to remove English
        stop words, anything else to return the cleaned text unchanged.

    Returns
    -------
    str
        Space-joined, lower-cased, letters-only text.
    """
    # Strip HTML tags, collapse digit runs into the literal token "number",
    # lower-case, then replace every remaining non-letter with a space.
    text = re.sub(r'<[^<>]+>', " ", review)
    text = re.sub(r'[0-9]+', 'number', text)
    text = text.lower()
    text = re.sub(r"[^a-zA-Z]", " ", text)

    if method == "Porter Stemming":
        # Instantiate the stemmer locally so this branch works even when the
        # module-level `ps` was never created (it is only built when the user
        # selects this option at the interactive prompt).
        stemmer = PorterStemmer()
        stemmed = [stemmer.stem(word) for word in re.findall(r"\w+", text)]
        return " ".join(stemmed)
    elif method == "Stop Words":
        # NOTE: stop-word removal raised training accuracy but lowered the
        # Kaggle test score — it appears to overfit the training data.
        stop_words = set(stopwords.words("english"))
        meaningful = [w for w in text.split() if w not in stop_words]
        return " ".join(meaningful)
    # "Nothing" (or any unrecognized value): return the cleaned text as-is
    # instead of silently falling through and returning None.
    return text
def training_Validation_Data(cleanWords, data_train):
    """Split reviews and labels into an 80/20 train/validation partition.

    The first 80% of rows (in file order, no shuffling) become the training
    set; the remaining 20% become the validation set.

    Parameters
    ----------
    cleanWords : list of str
        Preprocessed review texts, aligned row-for-row with `data_train`.
    data_train : pandas.DataFrame
        Must contain a "sentiment" column with the 0/1 labels.

    Returns
    -------
    tuple
        (x_train, y_train, x_val, y_val)
    """
    labels = data_train["sentiment"]
    split_at = int(data_train.shape[0] * .8)
    return (cleanWords[:split_at], labels[:split_at],
            cleanWords[split_at:], labels[split_at:])
# Reading the Data.
# NOTE(review): ".../" looks like a placeholder — point these at the real
# location of the Kaggle "Bag of Words Meets Bags of Popcorn" TSV files.
data_train = pd.read_csv(".../labeledTrainData.tsv", delimiter = "\t")
data_test = pd.read_csv(".../testData.tsv", delimiter = "\t")
# Ask the user which preprocessing variant to apply to the reviews.
print("Input 'Porter Stemming' for porter stemming, 'Stop Words' for stop words, or anywords for Neither of them: ")
preprocessingInput = input("Do you want to include porter stemming or stop word?\n")
if preprocessingInput == "Porter Stemming":
    method = "Porter Stemming"
    ps = PorterStemmer()  # module-level stemmer consumed by reviewWords()
elif preprocessingInput == "Stop Words":
    method = "Stop Words"
else:
    # Any other answer means: clean the text only, no stemming or stop words.
    method = "Nothing"
# Ask which recurrent cell to use when building the model.
print("Input 'LSTM' for LSTM RNN, 'GRU' for GRU RNN ")
modelInput= input("Do you want to compile the model using LSTM RNN or GRU RNN?\n")
if modelInput == "LSTM":
    lstm = True
else:
    lstm = False
# Clean every training review with the selected preprocessing method.
cleanWords = []
for review in data_train["review"]:
    cleanWords.append(reviewWords(review, method))
print("---Review Processing Done!---\n")

# Split the cleaned reviews 80/20 into training and validation sets.
x_train, y_train, x_val, y_val = training_Validation_Data(cleanWords, data_train)

# The test-set ids leak the label: an id such as "123_8" encodes the rating
# (8), and ratings >= 5 are positive — so ground truth can be reconstructed
# here purely for offline scoring.
data_test["sentiment"] = data_test["id"].map(lambda x: 1 if int(x.strip('"').split("_")[1]) >= 5 else 0)
y_test = data_test["sentiment"]

# Clean every TEST review. BUG FIX: the original loop ranged over
# data_train's size while indexing data_test, which raises (or silently
# truncates) whenever the two frames differ in length.
testcleanWords = []
for review in data_test["review"]:
    testcleanWords.append(reviewWords(review, method))
print("---Test Review Processing Done!---\n")
# Generate fixed-length integer sequences for the RNN.
np.random.seed(1000)
num_most_freq_words_to_include = 5000   # vocabulary size kept by the tokenizer
MAX_REVIEW_LENGTH_FOR_KERAS_RNN = 500   # every review padded/truncated to this length
embedding_vector_length = 32            # dimensionality of the learned word embeddings
# Fit the tokenizer on train + validation text only; test text is excluded.
all_review_list = x_train + x_val
tokenizer = Tokenizer(num_words = num_most_freq_words_to_include)
tokenizer.fit_on_texts(all_review_list)
# Tokenize and pad the training data.
train_reviews_tokenized = tokenizer.texts_to_sequences(x_train)
x_train = pad_sequences(train_reviews_tokenized, maxlen = MAX_REVIEW_LENGTH_FOR_KERAS_RNN) # 20,000 x 500
# Tokenize and pad the validation data.
val_review_tokenized = tokenizer.texts_to_sequences(x_val)
x_val = pad_sequences(val_review_tokenized, maxlen = MAX_REVIEW_LENGTH_FOR_KERAS_RNN) # 5,000 x 500
# Tokenize and pad the test data.
test_review_tokenized = tokenizer.texts_to_sequences(testcleanWords)
x_test = pad_sequences(test_review_tokenized, maxlen = MAX_REVIEW_LENGTH_FOR_KERAS_RNN) # 25,000 x 500 -- presumably; verify against testData.tsv
# Persist the tokenizer so future predictions reuse the same word->index map.
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
def RNNModel(lstm = False):
    """Build and compile the sentiment classifier.

    Architecture: Embedding -> Dropout -> Conv1D -> MaxPool1D ->
    (LSTM or GRU) -> Dropout -> sigmoid Dense(1), compiled with
    binary cross-entropy loss and the Adam optimizer.

    Parameters
    ----------
    lstm : bool
        True selects an LSTM(100) recurrent layer; False selects GRU(100).

    Returns
    -------
    keras.models.Sequential
        The compiled model, ready for fit().
    """
    recurrent_layer = LSTM(100) if lstm else GRU(100)
    model = Sequential([
        Embedding(input_dim = num_most_freq_words_to_include,
                  output_dim = embedding_vector_length,
                  input_length = MAX_REVIEW_LENGTH_FOR_KERAS_RNN),
        Dropout(0.2),
        Conv1D(filters = 32, kernel_size = 3, padding = 'same', activation = 'relu'),
        MaxPool1D(pool_size = 2),
        recurrent_layer,
        Dropout(0.2),
        Dense(1, activation = 'sigmoid'),
    ])
    model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    return model
# Build the chosen model, show its layer summary, and train for 3 epochs.
themodel = RNNModel(lstm)
themodel.summary()
themodel.fit(x_train, y_train, batch_size=64, epochs=3, validation_data=[x_val, y_val])
# Observed training accuracies:
# LSTM: 91.70 (no preprocessing), 92.59 (stop words), 91.57 (porter stemming only)
# GRU : 91.93 (no preprocessing), 92.76 (stop words), 92.07 (porter stemming)
# Build the output filename from the selected cell type and method.
if lstm == True:
    modelSelected = "LSTM"
else:
    modelSelected = "GRU"
# NOTE(review): there is no separator between "model" and method, so the file
# comes out like "RNN GRU modelStop Words.h5" — confirm this is intentional.
fileName = "RNN " + modelSelected + " model" + method + ".h5"
# Saving the model for future reference.
themodel.save(fileName)
# Predict sentiment probabilities for the test reviews.
ytest_prediction = themodel.predict(x_test)

from sklearn.metrics import roc_auc_score
# ROC AUC is computed on the raw sigmoid probabilities, before thresholding.
# FIX: the message previously hard-coded "GRU(using porter stemming)" (and
# misspelled "score") regardless of the configuration actually run.
print("The roc AUC score for the %s (%s) model is : %.4f."
      % (modelSelected, method, roc_auc_score(y_test, ytest_prediction)))
# Observed ROC AUC:
# LSTM: 94.71 (no preprocessing), 94.23 (stop words), 94.65 (porter stemming only)
# GRU : 94.52 (no preprocessing), 94.12 (stop words), 94.20 (porter stemming only)

# Flatten to 1-D. FIX: reshape(-1) works for any test-set size instead of
# the hard-coded (25000,).
ytest_prediction = np.asarray(ytest_prediction).reshape(-1)
# Threshold the probabilities at 0.5 to get hard 0/1 labels in one
# vectorized step (replaces the per-element round() loop, whose banker's
# rounding mapped an exact 0.5 to 0).
ytest_prediction = (ytest_prediction >= 0.5).astype(int)

# Write the predictions as a Kaggle submission: one id column, one sentiment column.
output = pd.DataFrame(data = {"id": data_test["id"], "sentiment": ytest_prediction} )
outputName = "Predicted RNN " + modelSelected + " model" + method + ".csv"
output.to_csv(outputName, index = False, quoting = 3 )
# Kaggle leaderboard scores:
# LSTM: 0.87240 (no preprocessing), 0.86964 (stop words), 0.87896 (porter stemming only)
# GRU : 0.87444 (no preprocessing), 0.86768 (stop words), 0.86944 (porter stemming only)
cm = confusion_matrix(y_test, ytest_prediction)
print(cm)
### Observed confusion matrices per configuration ###
# GRU, no stop words / no stemming:
#   [[10715  1785]
#    [ 1358 11142]]  -> 3,143 misclassified
# GRU, stop words only:
#   [[10971  1529]
#    [ 1779 10721]]  -> 3,308 misclassified
#   Stop-word removal appears to overfit the training set: the model
#   generalizes worse, so test accuracy drops.
# GRU, porter stemming only:
#   [[10653  1847]
#    [ 1417 11083]]  -> 3,264 misclassified
# LSTM, no stop words / no stemming:
#   [[11465  1035]
#    [ 2261 10239]]  -> 3,296 misclassified
#######################