data_prep_for_visualization.py
from __future__ import print_function

import os

import keras
import numpy as np
from glob import glob
from natsort import natsorted
from keras.models import load_model, Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint
from keras.layers import Embedding, LSTM, Dense, Flatten

import utils
from visualize_hidden_units import get_activations, visualize_activations
# Define some parameters
MODEL_PATH = os.getcwd()[:os.getcwd().rfind('/')] + '/models/dnn_models/vis_checkpoints/'
BATCH_SIZE = 256
EPOCHS = 10
EMBEDDING_DIM = 32
HIDDEN_UNITS = 32
DENSE_UNITS = 128
NO_OF_CLASSES = 2
SHUFFLE = False


# Prepare data for the visualizations (attention and LSTM)
def prepare_data(shuffle=False, labels_to_categorical=True):
    path = os.getcwd()[:os.getcwd().rfind("/")]
    to_write_filename = path + "/stats/data_prep_for_lstm_visualization.txt"
    utils.initialize_writer(to_write_filename)

    train_filename = "train.txt"
    test_filename = "test.txt"
    tokens_filename = "clean_original_"  # other types of tokens to experiment with are in /res/tokens/
    data_path = path + "/res/tokens/tokens_"

    # Load the data
    train_data = utils.load_file(data_path + tokens_filename + train_filename)
    test_data = utils.load_file(data_path + tokens_filename + test_filename)
    if shuffle:
        train_data = utils.shuffle_words(train_data)
        test_data = utils.shuffle_words(test_data)
        print("DATA IS SHUFFLED")

    # Load the labels
    train_labels = [int(l) for l in utils.load_file(path + "/res/datasets/ghosh/labels_" + train_filename)]
    test_labels = [int(l) for l in utils.load_file(path + "/res/datasets/ghosh/labels_" + test_filename)]

    # Get the maximum length of the train tweets
    max_tweet_length = utils.get_max_len_info(train_data)

    # Convert all tweets into sequences of word indices
    tokenizer, train_indices, test_indices = utils.encode_text_as_word_indexes(train_data, test_data, lower=True)
    vocab_size = len(tokenizer.word_counts) + 1
    word_to_index = tokenizer.word_index
    print("There are %s unique tokens." % len(word_to_index))

    # Pad sequences with 0s (padding can be done pre or post -- post works better here)
    x_train = pad_sequences(train_indices, maxlen=max_tweet_length, padding="post", truncating="post", value=0.)
    x_test = pad_sequences(test_indices, maxlen=max_tweet_length, padding="post", truncating="post", value=0.)

    # Transform the labels into categorical (one-hot) data or keep them as a plain numpy array
    if labels_to_categorical:
        train_labels = to_categorical(np.asarray(train_labels))
        test_labels = to_categorical(np.asarray(test_labels))
    else:
        train_labels = np.array(train_labels)
        test_labels = np.array(test_labels)
    return x_train, train_labels, x_test, test_labels, vocab_size, tokenizer, max_tweet_length
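
# Shape sketch (illustrative, not part of the original file): with labels_to_categorical=True,
# prepare_data() should return arrays shaped roughly as
#   x_train:      (num_train_tweets, max_tweet_length)  -- padded sequences of word indices
#   train_labels: (num_train_tweets, NO_OF_CLASSES)     -- one-hot binary labels
# e.g.: x_train, y_train, x_test, y_test, vocab_size, tokenizer, max_len = prepare_data()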

# Visualize the activations for a single tweet
def one_tweet_visualization(model, x_test, index_to_word, tweet_number=3473, plot=True, verbose=False):
    # Slice (rather than index) to keep a batch dimension of 1
    vis_input = x_test[tweet_number: tweet_number + 1]
    activations, names = get_activations(model, vis_input, layer_name=None)
    visualize_activations(activations, names, vis_input, index_to_word, plot, verbose)
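
# Example call (illustrative; get_activations and visualize_activations live in visualize_hidden_units.py):
#   one_tweet_visualization(model, x_test, index_to_word, tweet_number=0, plot=True, verbose=False)
# With layer_name=None, get_activations presumably collects the outputs of every layer for the one tweet.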

# Load the best previously trained model (saved in MODEL_PATH) if one exists, otherwise train a new model
def train_lstm_for_visualization():
    # Load the data
    x_train, y_train, x_test, y_test, vocab_size, tokenizer, max_tweet_length = prepare_data(SHUFFLE)

    # Get the word-to-index and index-to-word mappings
    word_index = tokenizer.word_index
    index_to_word = {index: word for word, index in word_index.items()}

    checkpoints = glob(MODEL_PATH + "*.h5")
    if len(checkpoints) > 0:
        # Load the most recent checkpoint
        checkpoints = natsorted(checkpoints)
        checkpoint_file = checkpoints[-1]
        print("Loading [{}]".format(checkpoint_file))
        model = load_model(checkpoint_file)
        model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy", utils.f1_score])
        model.summary()

        # Evaluate the previously trained model on the test data
        test_loss, test_acc, test_fscore = model.evaluate(x_test, y_test, verbose=1, batch_size=BATCH_SIZE)
        print("Loss: %.3f\nF-score: %.3f\n" % (test_loss, test_fscore))
    else:
        # Build, train, evaluate and save a new model
        model = Sequential()
        model.add(Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM, input_length=max_tweet_length,
                            embeddings_initializer="glorot_normal", name="embedding_layer"))
        # return_sequences=True keeps one hidden state per time step, so the activations can be visualized
        model.add(LSTM(units=HIDDEN_UNITS, name="recurrent_layer", activation="tanh", return_sequences=True))
        # Flatten feeds max_tweet_length * HIDDEN_UNITS features into the dense layers
        model.add(Flatten())
        model.add(Dense(DENSE_UNITS, activation="relu", name="dense_layer"))
        model.add(Dense(NO_OF_CLASSES, activation="softmax"))
        model.compile(loss=keras.losses.categorical_crossentropy, optimizer=keras.optimizers.Adadelta(),
                      metrics=["accuracy", utils.f1_score])
        model.summary()
        checkpoint = ModelCheckpoint(monitor="val_acc", filepath=MODEL_PATH + "model_{epoch:02d}_{val_acc:.3f}.h5",
                                     save_best_only=True, mode="max")
        model.fit(x_train, y_train, batch_size=BATCH_SIZE, epochs=EPOCHS,
                  validation_data=(x_test, y_test), callbacks=[checkpoint])
        test_loss, test_acc, test_fscore = model.evaluate(x_test, y_test)
        print("Loss: %.3f\nF-score: %.3f\n" % (test_loss, test_fscore))
    return model, index_to_word, x_test
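
# Note (an assumption based on the filepath pattern above): checkpoints are named like
# "model_07_0.812.h5" with the epoch and val_acc filled in by Keras, so natural sorting
# by epoch makes checkpoints[-1] the most recently saved best model.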


if __name__ == "__main__":
    model, index_to_word, x_test = train_lstm_for_visualization()
    # Select a tweet number to plot visualizations for (saved as html and just plotted, not saved, with matplotlib)
    one_tweet_visualization(model, x_test, index_to_word, tweet_number=3473, plot=False, verbose=True)
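    # Illustrative variation (not in the original script): visualize several tweets in one run
    # for tweet_number in (0, 100, 3473):
    #     one_tweet_visualization(model, x_test, index_to_word, tweet_number=tweet_number, plot=False, verbose=True)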