# -*- coding: utf-8 -*-
# file: utils.py
# author: songyouwei <youwei0314@gmail.com>
# Copyright (C) 2018. All Rights Reserved.
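"""Data utilities for aspect-based sentiment analysis: load pre-trained
GloVe word vectors and turn the raw Twitter / SemEval-14 datasets into
padded index sequences, one-hot polarity labels and an embedding matrix."""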
import os
import pickle
import numpy as np
import tensorflow as tf
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences


def load_word_vec(word_index=None, embedding_dim=100):
    """Load pre-trained GloVe vectors for the words in word_index."""
    # 25/50/100/200-d vectors come from glove.twitter.27B; 300-d from glove.42B.
    fname = './glove.twitter.27B/glove.twitter.27B.' + str(embedding_dim) + 'd.txt' \
        if embedding_dim != 300 else './glove.42B.300d.txt'
    word_vec = {}
    with open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        for line in fin:
            tokens = line.rstrip().split()
            # Keep only vectors for words that occur in the dataset vocabulary.
            if tokens[0] in word_index:
                word_vec[tokens[0]] = np.asarray(tokens[1:], dtype='float32')
    return word_vec


def read_dataset(type='twitter', mode='train', embedding_dim=100, max_seq_len=40, max_aspect_len=3, polarities_dim=3):
    """Read a raw ABSA dataset and return padded index sequences, one-hot
    polarity labels and (in train mode) an embedding matrix and tokenizer."""
    print("preparing data...")
    fname = {
        'twitter': {
            'train': './datasets/acl-14-short-data/train.raw',
            'test': './datasets/acl-14-short-data/test.raw'
        },
        'restaurant': {
            'train': './datasets/semeval14/Restaurants_Train.xml.seg',
            'test': './datasets/semeval14/Restaurants_Test_Gold.xml.seg'
        },
        'laptop': {
            'train': './datasets/semeval14/Laptops_Train.xml.seg',
            'test': './datasets/semeval14/Laptops_Test_Gold.xml.seg'
        }
    }
    with open(fname[type][mode], 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        lines = fin.readlines()
    # Each sample spans three lines: text with a $T$ placeholder, aspect, polarity.
    print("number of {0} {1} data: {2}".format(type, mode, len(lines) // 3))
    text = ''
    texts_raw = []
    texts_raw_without_aspects = []
    texts_left = []
    texts_left_with_aspects = []
    texts_right = []
    texts_right_with_aspects = []
    aspects = []
    polarities = []
    for i in range(0, len(lines), 3):
        # Split the sentence around the $T$ aspect placeholder.
        text_left, _, text_right = [s.lower().strip() for s in lines[i].partition("$T$")]
        aspect = lines[i + 1].lower().strip()
        polarity = lines[i + 2].strip()
        text_raw = text_left + " " + aspect + " " + text_right
        text += text_raw + " "  # trailing space avoids merging words across samples
        texts_raw.append(text_raw)
        texts_raw_without_aspects.append(text_left + " " + text_right)
        texts_left.append(text_left)
        texts_left_with_aspects.append(text_left + " " + aspect)
        texts_right.append(text_right)
        texts_right_with_aspects.append(aspect + " " + text_right)
        aspects.append(aspect)
        polarities.append(int(polarity))
    text_words = text.strip().split()
    print('tokenizing...')
    tokenizer = Tokenizer(lower=False)
    tokenizer.fit_on_texts(text_words)
    word_index = tokenizer.word_index
    texts_raw_indices = tokenizer.texts_to_sequences(texts_raw)
    texts_raw_indices = pad_sequences(texts_raw_indices, maxlen=max_seq_len)
    texts_raw_without_aspects_indices = tokenizer.texts_to_sequences(texts_raw_without_aspects)
    texts_raw_without_aspects_indices = pad_sequences(texts_raw_without_aspects_indices, maxlen=max_seq_len)
    texts_left_indices = tokenizer.texts_to_sequences(texts_left)
    texts_left_indices = pad_sequences(texts_left_indices, maxlen=max_seq_len)
    texts_left_with_aspects_indices = tokenizer.texts_to_sequences(texts_left_with_aspects)
    texts_left_with_aspects_indices = pad_sequences(texts_left_with_aspects_indices, maxlen=max_seq_len)
    # Right-hand contexts are padded/truncated at the end ('post') so they
    # stay aligned to the aspect term on their left.
    texts_right_indices = tokenizer.texts_to_sequences(texts_right)
    texts_right_indices = pad_sequences(texts_right_indices, maxlen=max_seq_len, padding='post', truncating='post')
    texts_right_with_aspects_indices = tokenizer.texts_to_sequences(texts_right_with_aspects)
    texts_right_with_aspects_indices = pad_sequences(texts_right_with_aspects_indices, maxlen=max_seq_len, padding='post', truncating='post')
    aspects_indices = tokenizer.texts_to_sequences(aspects)
    aspects_indices = pad_sequences(aspects_indices, maxlen=max_aspect_len)
    # Convert integer polarity labels to one-hot vectors.
    polarities_matrix = K.eval(tf.one_hot(indices=polarities, depth=polarities_dim))
    if mode == 'test':
        return texts_raw_indices, texts_raw_without_aspects_indices, texts_left_indices, texts_left_with_aspects_indices, \
            aspects_indices, texts_right_indices, texts_right_with_aspects_indices, polarities_matrix
    # Cache the embedding matrix on disk so GloVe is only parsed once.
    embedding_matrix_file_name = '{0}_{1}_embedding_matrix.dat'.format(str(embedding_dim), type)
    if os.path.exists(embedding_matrix_file_name):
        print('loading embedding_matrix:', embedding_matrix_file_name)
        with open(embedding_matrix_file_name, 'rb') as f:
            embedding_matrix = pickle.load(f)
    else:
        print('loading word vectors...')
        word_vec = load_word_vec(word_index, embedding_dim)
        # Row 0 is reserved for padding; words without a pre-trained vector
        # are left as all-zeros.
        embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
        print('building embedding_matrix:', embedding_matrix_file_name)
        for word, i in word_index.items():
            vec = word_vec.get(word)
            if vec is not None:
                embedding_matrix[i] = vec
        with open(embedding_matrix_file_name, 'wb') as f:
            pickle.dump(embedding_matrix, f)
    return texts_raw_indices, texts_raw_without_aspects_indices, texts_left_indices, texts_left_with_aspects_indices, \
        aspects_indices, texts_right_indices, texts_right_with_aspects_indices, \
        polarities_matrix, \
        embedding_matrix, \
        tokenizer


if __name__ == '__main__':
    read_dataset()
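    # A minimal sketch of how a training script might consume the outputs
    # (the names and dataset choice below are illustrative, not part of
    # this repository; the unpacking mirrors the return order above):
    #
    #   (texts_raw_indices, _, _, _, aspects_indices, _, _,
    #    polarities_matrix, embedding_matrix, tokenizer) = read_dataset(
    #       type='restaurant', mode='train', embedding_dim=100)
    #   # texts_raw_indices: (n_samples, max_seq_len) int32
    #   # polarities_matrix: (n_samples, polarities_dim) one-hot floats
    #   # embedding_matrix:  (len(word_index) + 1, embedding_dim) floats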