word2vec.py
"""
@author Victor I. Afolabi
A.I. Engineer & Software developer
javafolabi@gmail.com
Created on 28 October, 2017 @ 9:55 PM.
Copyright © 2017. Victor. All rights reserved.
"""
import datetime as dt
import multiprocessing
import numpy as np
import gensim.models.word2vec as w2v
from nltk import word_tokenize, sent_tokenize


class Word2Vec:
    def __init__(self, filename, window=2, **kwargs):
        self.window = window
        max_word = kwargs.get('max_word', None)
        self.logging = kwargs.get('logging', True)
        # Read corpus
        with open(filename, mode='r', encoding='utf-8') as f:
            corpus_text = f.read()
        if max_word:
            corpus_text = corpus_text[:max_word]
        corpus_text = corpus_text.lower()
        # word2id & id2word
        unique_words = set(word_tokenize(corpus_text))
        self._vocab_size = len(unique_words)
        self._word2id = {w: i for i, w in enumerate(unique_words)}
        self._id2word = {i: w for i, w in enumerate(unique_words)}
        # Sentences
        raw_sentences = sent_tokenize(corpus_text)
        self._sentences = [word_tokenize(sent) for sent in raw_sentences]
        # Free some memory
        del corpus_text, unique_words, raw_sentences
        # Creating features & labels
        self._X = np.zeros(shape=[len(self._sentences), self._vocab_size])
        self._y = np.zeros(shape=[len(self._sentences), self._vocab_size])
        start_time = dt.datetime.now()
        for s, sent in enumerate(self._sentences):
            for i, word in enumerate(sent):
                # Context window around the current word
                start = max(i - self.window, 0)
                end = min(i + self.window, len(sent)) + 1
                word_window = sent[start:end]
                for context in word_window:
                    if context != word:
                        # data.append([word, context])
                        self._X[s] = self.one_hot(self._word2id[word])
                        self._y[s] = self.one_hot(self._word2id[context])
            if self.logging:
                print(('\rProcessing {:,} of {:,} sentences. '
                       'Time taken: {}').format(s + 1, len(self._sentences),
                                                dt.datetime.now() - start_time),
                      end='')
        # Free memory
        del start_time
        self._num_examples = self._X.shape[0]
        self._epochs_completed = self._index_in_epoch = 0

    def one_hot(self, idx):
        temp = np.zeros(shape=[self._vocab_size])
        temp[idx] = 1.
        return temp

    @property
    def features(self):
        return self._X

    @property
    def labels(self):
        return self._y

    @property
    def num_examples(self):
        return self._num_examples

    @property
    def vocab_size(self):
        return self._vocab_size

    @property
    def word2id(self):
        return self._word2id

    @property
    def id2word(self):
        return self._id2word

    @property
    def sentences(self):
        return self._sentences

    @property
    def epochs_completed(self):
        return self._epochs_completed

    def next_batch(self, batch_size, shuffle=True):
        start = self._index_in_epoch
        # Shuffle for first epoch
        if self._epochs_completed == 0 and start == 0 and shuffle:
            permute = np.arange(self._num_examples)
            np.random.shuffle(permute)
            self._X = self._X[permute]
            self._y = self._y[permute]
        # Go to next batch
        if start + batch_size > self._num_examples:
            # Finished epoch
            self._epochs_completed += 1
            # Get the rest of the examples in this epoch
            rest_examples = self._num_examples - start
            rest_features = self._X[start:self._num_examples]
            rest_labels = self._y[start:self._num_examples]
            # Shuffle the data
            if shuffle:
                permute = np.arange(self._num_examples)
                np.random.shuffle(permute)
                self._X = self._X[permute]
                self._y = self._y[permute]
            # Start next epoch
            start = 0
            self._index_in_epoch = batch_size - rest_examples
            end = self._index_in_epoch
            features = np.concatenate((rest_features, self._X[start:end]), axis=0)
            labels = np.concatenate((rest_labels, self._y[start:end]), axis=0)
            return features, labels
        else:
            self._index_in_epoch += batch_size
            end = self._index_in_epoch
            return self._X[start:end], self._y[start:end]
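

# A minimal usage sketch for the one-hot skip-gram batcher above. The corpus
# path ('corpus.txt') and the hyper-parameter values here are assumptions made
# for illustration only; they are not part of the original module.
def demo_word2vec_batching(filename='corpus.txt', batch_size=64):
    dataset = Word2Vec(filename, window=2, max_word=10_000)
    while dataset.epochs_completed < 1:
        features, labels = dataset.next_batch(batch_size)
        # Each row is a one-hot vector of length `dataset.vocab_size`.
        assert features.shape[1] == dataset.vocab_size
    return dataset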


# !---------------------------------------------- gensim.models.word2vec ----------------------------------------------! #
class GensimWord2Vec:
    """
    | __init__(self, filename, sentences=None, size=100, alpha=0.025, window=5, min_count=5,
        max_vocab_size=None, sample=0.001, seed=1, workers=3, min_alpha=0.0001,
        sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=<built-in function hash>,
        iter=5, null_word=0, trim_rule=None, sorted_vocab=1, batch_words=10000,
        compute_loss=False)
    | Initialize the model from an iterable of `sentences`. Each sentence is a
    | list of words (unicode strings) that will be used for training.
    |
    | `filename` path to the file to be loaded
    |
    | The `sentences` iterable can be simply a list, but for larger corpora,
    | consider an iterable that streams the sentences directly from disk/network.
    | See :class:`BrownCorpus`, :class:`Text8Corpus` or :class:`LineSentence` in
    | this module for such examples.
    |
    | If you don't supply `sentences`, the model is left uninitialized -- use if
    | you plan to initialize it in some other way.
    |
    | `sg` defines the training algorithm. By default (`sg=0`), CBOW is used.
    | Otherwise (`sg=1`), skip-gram is employed.
    |
    | `size` is the dimensionality of the feature vectors.
    |
    | `window` is the maximum distance between the current and predicted word within a sentence.
    |
    | `alpha` is the initial learning rate (will linearly drop to `min_alpha` as training progresses).
    |
    | `seed` = for the random number generator. Initial vectors for each
    | word are seeded with a hash of the concatenation of word + str(seed).
    | Note that for a fully deterministically-reproducible run, you must also limit the model to
    | a single worker thread, to eliminate ordering jitter from OS thread scheduling. (In Python
    | 3, reproducibility between interpreter launches also requires use of the PYTHONHASHSEED
    | environment variable to control hash randomization.)
    |
    | `min_count` = ignore all words with total frequency lower than this.
    |
    | `max_vocab_size` = limit RAM during vocabulary building; if there are more unique
    | words than this, then prune the infrequent ones. Every 10 million word types
    | need about 1GB of RAM. Set to `None` for no limit (default).
    |
    | `sample` = threshold for configuring which higher-frequency words are randomly downsampled;
    | default is 1e-3, useful range is (0, 1e-5).
    |
    | `workers` = use this many worker threads to train the model (=faster training with multicore machines).
    |
    | `hs` = if 1, hierarchical softmax will be used for model training.
    | If set to 0 (default), and `negative` is non-zero, negative sampling will be used.
    |
    | `negative` = if > 0, negative sampling will be used, the int for negative
    | specifies how many "noise words" should be drawn (usually between 5-20).
    | Default is 5. If set to 0, no negative sampling is used.
    |
    | `cbow_mean` = if 0, use the sum of the context word vectors. If 1 (default), use the mean.
    | Only applies when cbow is used.
    |
    | `hashfxn` = hash function to use to randomly initialize weights, for increased
    | training reproducibility. Default is Python's rudimentary built-in hash function.
    |
    | `iter` = number of iterations (epochs) over the corpus. Default is 5.
    |
    | `trim_rule` = vocabulary trimming rule, specifies whether certain words should remain
    | in the vocabulary, be trimmed away, or handled using the default (discard if word count < min_count).
    | Can be None (min_count will be used), or a callable that accepts parameters (word, count, min_count) and
    | returns either `utils.RULE_DISCARD`, `utils.RULE_KEEP` or `utils.RULE_DEFAULT`.
    | Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part
    | of the model.
    |
    | `sorted_vocab` = if 1 (default), sort the vocabulary by descending frequency before
    | assigning word indexes.
    |
    | `batch_words` = target size (in words) for batches of examples passed to worker threads (and
    | thus cython routines). Default is 10000. (Larger batches will be passed if individual
    | texts are longer than 10000 words, but the standard cython code truncates to that maximum.)
    """

    def __init__(self, filename, **kwargs):
        with open(filename, mode='r', encoding='utf-8') as f:
            corpus = f.read()
        raw_sentences = sent_tokenize(corpus)
        self._sentences = [word_tokenize(sent) for sent in raw_sentences]
        # Default to all available cores and skip-gram (0 - CBOW, 1 - skip-gram),
        # unless the caller overrides these via **kwargs.
        kwargs.setdefault('workers', multiprocessing.cpu_count())
        kwargs.setdefault('sg', 1)
        self._model = w2v.Word2Vec(sentences=self._sentences, **kwargs)
        # Free memory
        del corpus, raw_sentences

    # !- Properties
    @property
    def model(self):
        return self._model

    @property
    def sentences(self):
        return self._sentences

    @property
    def word_count(self):
        return self._model.corpus_count
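

# A minimal usage sketch for the GensimWord2Vec wrapper above. The corpus path
# ('corpus.txt'), the query word, and the hyper-parameter values are assumptions
# for illustration; the keyword names (`size`, `window`, `min_count`) follow the
# older gensim signature reproduced in the class docstring.
if __name__ == '__main__':
    gw2v = GensimWord2Vec('corpus.txt', size=100, window=5, min_count=1)
    print('Training sentences seen:', gw2v.word_count)
    # Query the trained vectors for the nearest neighbours of a word.
    print(gw2v.model.wv.most_similar('the', topn=5))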