-
Notifications
You must be signed in to change notification settings - Fork 32
/
ml_models.py
140 lines (111 loc) · 6.47 KB
/
ml_models.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import os, time, itertools
from sklearn import preprocessing
import extract_baseline_features
import extract_ml_features2 as extract_features
import utils, classifiers
import data_processing as data_proc
# Settings for the up-coming ML model
pragmatic = True
lexical = True
pos_grams = True
sentiment = True
topic = True
similarity = True
pos_ngram_list = [1]
ngram_list = [1]
embedding_dim = 100
word2vec_map = utils.load_vectors(filename='glove.6B.%dd.txt' % embedding_dim)
# Set the values for the portion fo data
n_train = 3000
n_test = 500
def baseline(tweets_train, train_labels, tweets_test, test_labels):
# Import the subjectivity lexicon
subj_dict = data_proc.get_subj_lexicon()
types_of_features = ['1', '2', '3', 'ngrams']
for t in types_of_features:
start = time.time()
utils.print_model_title("Classification using feature type " + t)
if t is '1':
x_train_features = extract_baseline_features.get_features1(tweets_train, subj_dict)
x_test_features = extract_baseline_features.get_features1(tweets_test, subj_dict)
if t is '2':
x_train_features = extract_baseline_features.get_features2(tweets_train, subj_dict)
x_test_features = extract_baseline_features.get_features2(tweets_test, subj_dict)
if t is '3':
x_train_features = extract_baseline_features.get_features3(tweets_train, subj_dict)
x_test_features = extract_baseline_features.get_features3(tweets_test, subj_dict)
if t is 'ngrams':
ngram_map, x_train_features = extract_baseline_features.get_ngram_features(tweets_train, n=1)
x_test_features = extract_baseline_features.get_ngram_features_from_map(tweets_test, ngram_map, n=1)
# Get the class ratio
class_ratio = utils.get_classes_ratio_as_dict(train_labels)
# Train on a Linear Support Vector Classifier
print("\nEvaluating a linear SVM model...")
classifiers.linear_svm(x_train_features, train_labels, x_test_features, test_labels, class_ratio)
# Train on a Logistic Regression Classifier
print("\nEvaluating a logistic regression model...")
classifiers.logistic_regression(x_train_features, train_labels, x_test_features, test_labels, class_ratio)
end = time.time()
print("Completion time of the baseline model with features type %s: %.3f s = %.3f min"
% (t, (end - start), (end - start) / 60.0))
def ml_model(train_tokens, train_pos, y_train, test_tokens, test_pos, y_test):
print("Processing TRAIN SET features...\n")
start = time.time()
train_pragmatic, train_lexical, train_pos, train_sent, train_topic, train_sim = extract_features.get_feature_set\
(train_tokens, train_pos, pragmatic=pragmatic, lexical=lexical,
ngram_list=ngram_list, pos_grams=pos_grams, pos_ngram_list=pos_ngram_list,
sentiment=sentiment, topic=topic, similarity=similarity, word2vec_map=word2vec_map)
end = time.time()
print("Completion time of extracting train models: %.3f s = %.3f min" % ((end - start), (end - start) / 60.0))
print("Processing TEST SET features...\n")
start = time.time()
test_pragmatic, test_lexical, test_pos, test_sent, test_topic, test_sim = extract_features.get_feature_set \
(test_tokens, test_pos, pragmatic=pragmatic, lexical=lexical,
ngram_list=ngram_list, pos_grams=pos_grams, pos_ngram_list=pos_ngram_list,
sentiment=sentiment, topic=topic, similarity=similarity, word2vec_map=word2vec_map)
end = time.time()
print("Completion time of extracting train models: %.3f s = %.3f min" % ((end - start), (end - start) / 60.0))
# Get all features together
all_train_features = [train_pragmatic, train_lexical, train_pos, train_sent, train_topic, train_sim]
all_test_features = [test_pragmatic, test_lexical, test_pos, test_sent, test_topic, test_sim]
# Choose your feature options: you can run on all possible combinations of features
sets_of_features = 6
feature_options = list(itertools.product([False, True], repeat=sets_of_features))
feature_options = feature_options[1:] # skip over the option in which all entries are false
# OR Can select just the features that you want
# From left to right, set to true if you want the feature to be active:
# [Pragmatic, Lexical-grams, POS-grams, Sentiment, LDA topics, Similarity]
# feature_options = [[True, True, True, True, True, True]]
for option in feature_options:
train_features = [{} for _ in range(len(train_tokens))]
test_features = [{} for _ in range(len(test_tokens))]
utils.print_features(option, ['Pragmatic', 'Lexical-grams', 'POS-grams', 'Sentiment', 'LDA topics', 'Similarity'])
# Make a feature selection based on the current feature_option choice
for i, o in enumerate(option):
if o:
for j, example in enumerate(all_train_features[i]):
train_features[j] = utils.merge_dicts(train_features[j], example)
for j, example in enumerate(all_test_features[i]):
test_features[j] = utils.merge_dicts(test_features[j], example)
# Vectorize and scale the features
x_train, x_test = utils.extract_features_from_dict(train_features, test_features)
x_train_scaled = preprocessing.scale(x_train, axis=0)
x_test_scaled = preprocessing.scale(x_test, axis=0)
print("Shape of the x train set (%d, %d)" % (len(x_train_scaled), len(x_train_scaled[0])))
print("Shape of the x test set (%d, %d)" % (len(x_test_scaled), len(x_test_scaled[0])))
# Run the model on the selection of features made
start = time.time()
utils.run_supervised_learning_models(x_train_scaled, y_train, x_test_scaled, y_test)
end = time.time()
print("Completion time of the Linear SVM model: %.3f s = %.3f min" % ((end - start), (end - start) / 60.0))
if __name__ == "__main__":
path = os.getcwd()[:os.getcwd().rfind('/')]
to_write_filename = path + '/stats/ml_analysis.txt'
utils.initialize_writer(to_write_filename)
dataset = "ghosh" # can be "ghosh", "riloff", "sarcasmdetection" and "ptacek"
train_tokens, train_pos, train_labels, test_tokens, test_pos, test_labels = data_proc.get_dataset(dataset)
run_baseline = False
if run_baseline:
baseline(train_tokens, train_labels, test_tokens, test_labels)
else:
ml_model(train_tokens, train_pos, train_labels, test_tokens, test_pos, test_labels)