import emoji, re, string, time, os
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import PorterStemmer, ngrams
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import LdaModel
from gensim.corpora import MmCorpus, Dictionary
import data_processing as data_proc
import vocab_helpers as helper
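# NOTE: emoji.UNICODE_EMOJI, used below, exists in emoji library versions before 2.0;
# newer releases renamed this lookup table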
# Extract 13 pragmatic features: binary flags for the presence of laughter, capitalized words and user mentions,
# counts of hashtags, emojis, punctuation, strong affirmatives, negations, interjections and intensifiers,
# plus the tweet length in characters and in tokens and the average token length
def get_pragmatic_features(tweet_tokens):
capitalized_words = laughter = user_mentions = negations = affirmatives \
= interjections = intensifiers = punctuation = emojis = tweet_len_ch = hashtags = 0
for t in tweet_tokens:
tweet_len_ch += len(t)
if t.isupper():
capitalized_words = 1 # binary feature marking the presence of capitalized words
if t.startswith("@"):
user_mentions = 1 # binary feature for the presence of user mentions
if t.startswith("#"):
hashtags += 1 # count-based feature of hashtags used (excluding sarcasm or sarcastic)
if t.lower().startswith("haha") or re.match('l(o)+l$', t.lower()):
laughter = 1 # binary feature marking the presence of laughter
if t in helper.strong_negations:
negations += 1 # count-based feature of strong negations
if t in helper.strong_affirmatives:
affirmatives += 1 # count-based feature of strong affirmatives
if t in helper.interjections:
interjections += 1 # count-based feature of relevant interjections
if t in helper.intensifiers:
intensifiers += 1 # count-based feature of relevant intensifiers
if t in helper.punctuation:
punctuation += 1 # count-based feature of relevant punctuation signs
if t in emoji.UNICODE_EMOJI:
            emojis += 1  # count-based feature of emojis used
tweet_len_tokens = len(tweet_tokens) # get the length of the tweet in tokens
    average_token_length = float(tweet_len_ch) / max(1.0, float(tweet_len_tokens))  # average token length, in characters
feature_list = {'tw_len_ch': tweet_len_ch, 'tw_len_tok': tweet_len_tokens, 'avg_len': average_token_length,
'capitalized': capitalized_words, 'laughter': laughter, 'user_mentions': user_mentions,
'negations': negations, 'affirmatives': affirmatives, 'interjections': interjections,
                    'intensifiers': intensifiers, 'punctuation': punctuation, 'emojis': emojis, 'hashtags': hashtags}
return feature_list
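# Example (hypothetical token list): get_pragmatic_features(['LOL', '@user', '#cats', 'YES!'])
# sets the binary flags laughter, user_mentions and capitalized to 1 and counts one hashtag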
# Count occurrences of the 25 POS tags of the CMU Twitter Part-of-Speech Tagger, as defined in
# "Part-of-Speech Tagging for Twitter: Annotation, Features, and Experiments" by Gimpel et al.
def get_pos_features(pos_tweet):
pos_dict = dict.fromkeys(['N', 'O', 'S', '^', 'Z', 'L', 'M', 'V', 'A', 'R', '!', 'D', 'P',
'&', 'T', 'X', 'Y', '#', '@', '~', 'U', 'E', '$', ',', 'G'], 0)
for pos in pos_tweet:
pos_dict[pos] += 1
return pos_dict
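# Example (hypothetical): get_pos_features(['N', 'V', 'N']) returns the 25-key dict
# with 'N': 2, 'V': 1 and every other tag count set to 0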
# Extract the n-grams, with the orders specified as a list n = [1, 2, 3, ...]
# e.g. if n = [1, 2, 3] then ngram_features is a dictionary of all unigram, bigram and trigram counts
# This n-gram extractor works for any kind of tokens, i.e. both words and POS tags
def get_ngrams(tokens, n, use_just_words=False, stem=False, for_semantics=False):
if len(n) < 1:
return {}
if not for_semantics:
if stem:
porter = PorterStemmer()
tokens = [porter.stem(t.lower()) for t in tokens]
if use_just_words:
tokens = [t.lower() for t in tokens if not t.startswith('@') and not t.startswith('#')
and t not in string.punctuation]
    ngram_tokens = []
    for i in n:
        for gram in ngrams(tokens, i):
            # Prefix every n-gram with 'gram' so these features cannot collide with other feature names
            ngram_tokens.append('gram ' + ' '.join(gram))
    ngram_features = {g: ngram_tokens.count(g) for g in set(ngram_tokens)}
return ngram_features
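# Example (hypothetical): get_ngrams(['i', 'love', 'cats'], n=[1, 2]) returns counts keyed
# 'gram i', 'gram love', 'gram cats', 'gram i love', 'gram love cats', each mapped to 1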
# Get the sentiment features -- 18 features in total
# Emoji features: counts of positive, negative and neutral emojis,
# along with the ratios of positive to negative and of neutral to negative emojis
# MPQA subjectivity lexicon features: each word is looked up by its part of speech to obtain
# counts of positive, negative and neutral words, counts of strong and weak subjectivity clues,
# their ratios, and the total number of sentiment words
# VADER features: the negative, positive, neutral and compound sentiment scores of the tweet (4 features)
def get_sentiment_features(tweet, tweet_tokens, tweet_pos, emoji_sent_dict, subj_dict):
sent_features = dict.fromkeys(["positive emoji", "negative emoji", "neutral emoji",
"emojis pos:neg", "emojis neutral:neg",
"subjlexicon weaksubj", "subjlexicon strongsubj",
"subjlexicon positive", "subjlexicon negative",
"subjlexicon neutral", "words pos:neg", "words neutral:neg",
"subjectivity strong:weak", "total sentiment words"], 0.0)
for t in tweet_tokens:
        if t in emoji_sent_dict:
sent_features['negative emoji'] += float(emoji_sent_dict[t][0])
sent_features['neutral emoji'] += float(emoji_sent_dict[t][1])
sent_features['positive emoji'] += float(emoji_sent_dict[t][2])
    # Obtain the ratio of positive to negative emojis
if sent_features['negative emoji'] != 0:
if sent_features['positive emoji'] == 0:
sent_features['emojis pos:neg'] = -1.0 / float(sent_features['negative emoji'])
else:
sent_features['emojis pos:neg'] = sent_features['positive emoji'] \
/ float(sent_features['negative emoji'])
    # Obtain the ratio of neutral to negative emojis
if sent_features['negative emoji'] != 0:
if sent_features['neutral emoji'] == 0:
sent_features['emojis neutral:neg'] = -1.0 / float(sent_features['negative emoji'])
else:
sent_features['emojis neutral:neg'] = sent_features['neutral emoji'] \
/ float(sent_features['negative emoji'])
lemmatizer = WordNetLemmatizer()
    pos_translation = {'N': 'noun', 'V': 'verb', 'A': 'adj', 'R': 'adverb'}  # CMU tag -> MPQA part of speech
    for index in range(len(tweet_tokens)):
        lemma = lemmatizer.lemmatize(tweet_tokens[index], 'v')
        if lemma in subj_dict:
            # Prefer the lexicon entry matching this word's part of speech, falling back to 'anypos'
            translated_pos = pos_translation.get(tweet_pos[index])
            if translated_pos is not None and translated_pos in subj_dict[lemma]:
                entry = subj_dict[lemma][translated_pos]
            elif 'anypos' in subj_dict[lemma]:
                entry = subj_dict[lemma]['anypos']
            else:
                continue
            # Count the type of subjectivity (strong or weak) of this lemma
            sent_features['subjlexicon ' + entry[0]] += 1
            # Count the polarity (positive, negative, neutral) of this lemma
            if entry[1] == 'both':
                sent_features['subjlexicon positive'] += 1
                sent_features['subjlexicon negative'] += 1
            else:
                sent_features['subjlexicon ' + entry[1]] += 1
# Use the total number of sentiment words as a feature
sent_features["total sentiment words"] = sent_features["subjlexicon positive"] \
+ sent_features["subjlexicon negative"] \
+ sent_features["subjlexicon neutral"]
if sent_features["subjlexicon negative"] != 0:
if sent_features["subjlexicon positive"] == 0:
sent_features["words pos:neg"] = -1.0 / float(sent_features["subjlexicon negative"])
else:
sent_features["words pos:neg"] = sent_features["subjlexicon positive"] \
/ float(sent_features["subjlexicon negative"])
if sent_features["subjlexicon negative"] != 0:
if sent_features["subjlexicon neutral"] == 0:
sent_features["words neutral:neg"] = -1.0 / float(sent_features["subjlexicon negative"])
else:
sent_features["words neutral:neg"] = sent_features["subjlexicon neutral"] \
/ float(sent_features["subjlexicon negative"])
if sent_features["subjlexicon weaksubj"] != 0:
if sent_features["subjlexicon strongsubj"] == 0:
sent_features["subjectivity strong:weak"] = -1.0 / float(sent_features["subjlexicon weaksubj"])
else:
sent_features["subjectivity strong:weak"] = sent_features["subjlexicon strongsubj"] \
/ float(sent_features["subjlexicon weaksubj"])
# Vader Sentiment Analyser
# Obtain the negative, positive, neutral and compound scores of a tweet
sia = SentimentIntensityAnalyzer()
polarity_scores = sia.polarity_scores(tweet)
for name, score in polarity_scores.items():
sent_features["Vader score " + name] = score
return sent_features
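# Note on the assumed inputs (as built in data_processing and used above): emoji_sent_dict maps
# an emoji to its (negative, neutral, positive) scores, and subj_dict maps a word to
# {part_of_speech or 'anypos': (subjectivity_type, polarity)}, e.g. a hypothetical entry
# subj_dict['love'] = {'verb': ('strongsubj', 'positive')}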
# Build an LDA topic model over the clean noun and verb phrases (lemmatized, lower-cased)
# extracted from the tweets; tokenization and POS labelling follow the CMU Twitter tagger conventions
def build_lda_model(tokens_tags, pos_tags, use_nouns=True, use_verbs=True, use_all=False,
num_of_topics=8, passes=25, verbose=True):
    path = os.path.dirname(os.getcwd())  # parent of the current working directory
topics_filename = str(num_of_topics) + "topics"
if use_nouns:
topics_filename += "_nouns"
if use_verbs:
topics_filename += "_verbs"
if use_all:
topics_filename += "_all"
# Set the LDA, Dictionary and Corpus filenames
lda_filename = path + "/models/topic_models/lda_" + topics_filename + ".model"
dict_filename = path + "/res/topic_data/dict/dict_" + topics_filename + ".dict"
corpus_filename = path + "/res/topic_data/corpus/corpus_" + topics_filename + ".mm"
# Build a topic model if it wasn't created yet
if not os.path.exists(lda_filename):
# Extract the lemmatized documents
docs = []
for index in range(len(tokens_tags)):
tokens = tokens_tags[index].split()
pos = pos_tags[index].split()
docs.append(data_proc.extract_lemmatized_tweet(tokens, pos, use_verbs, use_nouns, use_all))
# Compute the dictionary and save it
dictionary = Dictionary(docs)
dictionary.filter_extremes(keep_n=60000)
dictionary.compactify()
        dictionary.save(dict_filename)
# Compute the bow corpus and save it
corpus = [dictionary.doc2bow(d) for d in docs]
MmCorpus.serialize(corpus_filename, corpus)
if verbose:
print("\nCleaned documents:")
print(docs)
print("\nDictionary:")
print(dictionary)
print("\nCorpus in BoW form:")
print(corpus)
# Start training an LDA Model
start = time.time()
print("\nBuilding the LDA topic model...")
lda_model = LdaModel(corpus=corpus, num_topics=num_of_topics, passes=passes, id2word=dictionary)
lda_model.save(lda_filename)
end = time.time()
print("Completion time for building LDA model: %.3f s = %.3f min" % ((end - start), (end - start) / 60.0))
if verbose:
print("\nList of words associated with each topic:")
lda_topics = lda_model.show_topics(formatted=False)
lda_topics_list = [[word for word, prob in topic] for topic_id, topic in lda_topics]
            print(lda_topics_list)
# Load the previously saved dictionary
dictionary = Dictionary.load(dict_filename)
# Load the previously saved corpus
mm_corpus = MmCorpus(corpus_filename)
# Load the previously saved LDA model
lda_model = LdaModel.load(lda_filename)
# Print the top 10 words for each topic
for topic_id in range(num_of_topics):
print("\nTop 10 words for topic ", topic_id)
print([dictionary[word_id] for (word_id, prob) in lda_model.get_topic_terms(topic_id, topn=10)])
index = 0
if verbose:
for doc_topics, word_topics, word_phis in lda_model.get_document_topics(mm_corpus, per_word_topics=True):
print('Index ', index)
print('Document topics:', doc_topics)
print('Word topics:', word_topics)
print('Phi values:', word_phis)
print('-------------- \n')
index += 1
return dictionary, mm_corpus, lda_model
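# Minimal usage sketch (hypothetical data; assumes tokens_tags and pos_tags are parallel lists
# of whitespace-separated strings, as produced by the CMU tagger pipeline in data_processing):
# dictionary, corpus, lda_model = build_lda_model(train_tokens, train_pos, num_of_topics=8, passes=25)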
# Predict the topic of an unseen testing example based on the LDA model built on the train set
def get_topic_features_for_unseen_tweet(dictionary, lda_model, tokens_tags, pos_tags,
use_nouns=True, use_verbs=True, use_all=False, verbose=False):
# Extract the lemmatized documents
docs = data_proc.extract_lemmatized_tweet(tokens_tags, pos_tags, use_verbs, use_nouns, use_all)
tweet_bow = dictionary.doc2bow(docs)
topic_prediction = lda_model[tweet_bow]
topic_features = {}
if verbose:
print("\nTopic prediction\n")
for t in topic_prediction:
print(t)
    # If the prediction comes back as a nested structure, its first element holds the topic distribution
    if any(isinstance(topic_list, list) for topic_list in topic_prediction):
        topic_prediction = topic_prediction[0]
for topic in topic_prediction:
topic_features['topic ' + str(topic[0])] = topic[1]
return topic_features
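# Example (hypothetical): for a tweet dominated by one topic this returns e.g. {'topic 2': 0.91},
# where the value is the probability the LDA model assigns to that topic for the unseen tweet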
# Use the distributions of topics in a tweet as features
def get_topic_features(corpus, lda_model, index):
topic_features = {}
doc_topics, word_topic, phi_values = lda_model.get_document_topics(corpus, per_word_topics=True)[index]
for topic in doc_topics:
topic_features['topic ' + str(topic[0])] = topic[1]
return topic_features
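# Example (hypothetical): for the training document at position index in the corpus, this returns
# e.g. {'topic 0': 0.7, 'topic 4': 0.3} -- the LDA topic mixture used directly as features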
# Collect all features
def get_feature_set(tweets_tokens, tweets_pos, pragmatic=False, pos_unigrams=False, pos_bigrams=False,
lexical=False, ngram_list=[1], sentiment=False, topic=True):
features = []
if sentiment:
# Emoji lexicon - underlying sentiment (pos, neutral, neg)
emoji_dict = data_proc.build_emoji_sentiment_dictionary()
# Obtain subjectivity features from the MPQA lexicon and build the subjectivity lexicon
subj_dict = data_proc.get_subj_lexicon()
if topic:
use_nouns = True
use_verbs = True
use_all = False
dictionary, corpus, lda_model = build_lda_model(tweets_tokens, tweets_pos,
use_nouns=use_nouns, use_verbs=use_verbs, use_all=use_all,
num_of_topics=6, passes=20, verbose=False)
for index in range(len(tweets_tokens)):
tokens_this_tweet = tweets_tokens[index].split()
pos_this_tweet = tweets_pos[index].split()
pragmatic_features = {}
pos_unigrams_features = {}
pos_bigrams_features = {}
words_ngrams = {}
sentiment_features = {}
topic_features = {}
if pragmatic:
pragmatic_features = get_pragmatic_features(tokens_this_tweet)
if pos_unigrams:
pos_unigrams_features = get_pos_features(pos_this_tweet)
if pos_bigrams:
            pos_bigrams_features = get_ngrams(pos_this_tweet, n=[2], for_semantics=True)
if lexical:
words_ngrams = get_ngrams(tokens_this_tweet, n=ngram_list, use_just_words=True)
if sentiment:
sentiment_features = get_sentiment_features(tweets_tokens[index], tokens_this_tweet,
pos_this_tweet, emoji_dict, subj_dict)
if topic:
            topic_features = get_topic_features_for_unseen_tweet(
                dictionary, lda_model, tokens_this_tweet, pos_this_tweet,
                use_nouns=use_nouns, use_verbs=use_verbs, use_all=use_all)
# Append all extracted features of this tweet to the final list of all features
features.append({**pragmatic_features, **pos_unigrams_features, **pos_bigrams_features,
**words_ngrams, **sentiment_features, **topic_features})
return features
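
# Minimal usage sketch (hypothetical toy data; real inputs come from data_processing, with tweets
# tokenized and POS-tagged by the CMU tagger into parallel whitespace-separated strings):
if __name__ == '__main__':
    sample_tokens = ["LOL I love mondays #not", "@friend YES totally agree !"]
    sample_pos = ["! O V N #", "@ ! R V ,"]
    # Pragmatic features alone avoid the lexicon and LDA dependencies of the full feature set
    for feature_dict in get_feature_set(sample_tokens, sample_pos, pragmatic=True, topic=False):
        print(feature_dict)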