import random
import time

import numpy as np
import pandas as pd
import spacy
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

train_data = pd.read_csv("./corpus/train.csv")
test_data = pd.read_csv("./corpus/test.csv")
train_data_length = len(train_data['Description'][:15000])  # modify it later

nlp = spacy.load('en_core_web_sm')
tokenized_sentences = []
vocab = []
for i in range(train_data_length):
    sentence = train_data["Description"][i]
    doc = nlp(sentence)
    tokens = [token.text for token in doc if token.text.strip() != '']  # removing whitespace-only tokens
    tokens = ["<start>"] + tokens + ["<end>"]
    tokenized_sentences.append(tokens)
    vocab.extend(tokens)

vocab = list(set(vocab))
vocab_size = len(vocab)
print(vocab_size)

words_to_ind = {word: i for i, word in enumerate(vocab)}
class SkipGramModel_NegativeSampling(nn.Module):
    def __init__(self, tokenized_sentences, window_size, embedding_length, number_of_negative_samples):
        super().__init__()
        self.neighbours = self.get_neighbours(tokenized_sentences, window_size)
        self.data, self.target = self.get_data(number_of_negative_samples)
        # Use GPU if available
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Initialise the word and context matrices with values drawn uniformly from [-1, 1]
        self.embedding_matrix = nn.Embedding(vocab_size, embedding_length).to(self.device)
        self.embedding_matrix.weight.data.uniform_(-1, 1)
        self.context_matrix = nn.Embedding(vocab_size, embedding_length).to(self.device)
        self.context_matrix.weight.data.uniform_(-1, 1)
        # Binary Cross Entropy with Logits Loss: applies the sigmoid internally, so it expects raw logits
        self.criterion = nn.BCEWithLogitsLoss()
        self.optimizer = torch.optim.Adam(
            list(self.embedding_matrix.parameters()) + list(self.context_matrix.parameters())
        )
    # Get the context-window neighbours of every word, as vocabulary indices
    def get_neighbours(self, tokenized_sentences, window_size):
        neighbours = {}
        for sentence in tokenized_sentences:
            for i, word in enumerate(sentence):
                for j in range(max(0, i - window_size), min(len(sentence), i + window_size + 1)):
                    if i != j:
                        neighbours.setdefault(words_to_ind[word], []).append(words_to_ind[sentence[j]])
        return neighbours
    # Generate positive and negative samples
    def get_data(self, n):
        # One positive sample followed by n negative samples for each observed (word, context) pair
        generated_data = []
        target = []
        for idx, neighbours in self.neighbours.items():
            excluded = set([idx] + neighbours)  # never sample the word itself or its true contexts
            for value in neighbours:
                generated_data.append([idx, value])  # positive sample
                target.append(1)
                for _ in range(n):
                    rndm_num = idx
                    while rndm_num in excluded:
                        rndm_num = random.randrange(0, vocab_size)
                    generated_data.append([idx, rndm_num])  # negative sample
                    target.append(0)
        return np.array(generated_data), np.array(target)
    # Named fit() rather than train() so it does not shadow nn.Module.train(),
    # which PyTorch uses to toggle training mode
    def fit(self, epochs, batch_size=1024):
        # Convert the numpy arrays to PyTorch tensors
        data_tensor = torch.tensor(self.data, dtype=torch.long)
        target_tensor = torch.tensor(self.target, dtype=torch.float)
        # Create a DataLoader for batching
        dataset = TensorDataset(data_tensor, target_tensor)
        data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
        for epoch in range(epochs):
            total_loss = 0.0
            for batch_data, batch_target in data_loader:
                batch_data = batch_data.to(self.device)
                batch_target = batch_target.to(self.device)
                word1_index = batch_data[:, 0]
                word2_index = batch_data[:, 1]
                # Look up the word and context embeddings
                word1_embedding = self.embedding_matrix(word1_index)
                word2_context = self.context_matrix(word2_index)
                # The dot product is the logit; BCEWithLogitsLoss applies the sigmoid
                # itself, so calling torch.sigmoid here first would squash the scores twice
                dot_product = torch.sum(word1_embedding * word2_context, dim=1)
                loss = self.criterion(dot_product, batch_target)
                # Backpropagation and parameter update
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
                total_loss += loss.item()
            print(f"Epoch {epoch}: mean batch loss {total_loss / len(data_loader):.4f}")
model = SkipGramModel_NegativeSampling(tokenized_sentences, 5, 300, 5)

start_time = time.time()
model.fit(5)
end_time = time.time()
print(f"Time taken: {end_time - start_time:.2f} seconds")

# Move the weights to the CPU before converting to numpy;
# .numpy() raises an error on CUDA tensors when training ran on a GPU
word_embeddings = model.embedding_matrix.weight.data.cpu().numpy()
print(word_embeddings)
data_to_save = {
    'word_embeddings': word_embeddings,
    'words_to_ind': words_to_ind
}
torch.save(data_to_save, 'skip-gram-word-vectors.pt')
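
# --- Usage sketch (not part of the original script) ---
# A minimal example of reloading the saved vectors and querying nearest
# neighbours by cosine similarity, assuming the checkpoint above was written
# by this script. weights_only=False is needed on recent PyTorch versions
# because the file stores a plain dict with a numpy array, not just tensors.
saved = torch.load('skip-gram-word-vectors.pt', weights_only=False)
vectors = saved['word_embeddings']
w2i = saved['words_to_ind']
i2w = {i: w for w, i in w2i.items()}

def nearest_neighbours(word, k=5):
    # Cosine similarity between the query vector and every row of the matrix
    v = vectors[w2i[word]]
    sims = vectors @ v / (np.linalg.norm(vectors, axis=1) * np.linalg.norm(v) + 1e-8)
    best = np.argsort(-sims)[1:k + 1]  # index 0 is the query word itself
    return [(i2w[i], float(sims[i])) for i in best]

query = 'city'  # hypothetical query word; substitute any token in the vocabulary
if query in w2i:
    print(nearest_neighbours(query))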