-
Notifications
You must be signed in to change notification settings - Fork 1
/
SKIPgram_V1.py
171 lines (146 loc) · 5.95 KB
/
SKIPgram_V1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
import torch
import torch.nn as nn
import numpy as np
import re
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
def load_glove_embeddings(path, word_to_ix, embedding_dim):
    """Build an embedding matrix for the vocabulary from a GloVe-style text file.

    Every non-blank line of the file is read as a word followed by its
    whitespace-separated vector components. Rows belonging to words that
    never appear in the file keep their initial values, which are drawn
    uniformly from [0, 1).

    Args:
        path: Location of the UTF-8 encoded vector file.
        word_to_ix: Mapping from word to its row index in the matrix.
        embedding_dim: Number of components expected per vector.

    Returns:
        A float32 tensor of shape (len(word_to_ix), embedding_dim).

    Raises:
        ValueError: If a vocabulary word is found in the file with a number
            of components different from embedding_dim.
    """
    embeddings = np.random.rand(len(word_to_ix), embedding_dim)
    with open(path, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            values = line.split()
            if not values:
                # Blank (or whitespace-only) line, e.g. a trailing newline.
                continue
            word = values[0]
            if word not in word_to_ix:
                continue
            if len(values) - 1 != embedding_dim:
                # Fail with a readable message instead of an opaque NumPy
                # broadcasting error when the file does not match the model.
                raise ValueError(
                    f"{path}:{line_number}: expected {embedding_dim} values "
                    f"for word {word!r}, found {len(values) - 1}"
                )
            embeddings[word_to_ix[word]] = np.asarray(values[1:], dtype='float32')
    return torch.tensor(embeddings, dtype=torch.float)
def load_text(file_path):
    """Read a text file and return it as one flat list of tokens.

    The text is lower-cased and cut into chunks at runs of the characters
    . ! ? ; and ". Within each chunk everything except ASCII letters and
    whitespace is removed, and the remaining words are wrapped in
    "<start>" / "<end>" markers. Chunks that contain no words are dropped.

    Args:
        file_path: Location of the UTF-8 encoded text file.

    Returns:
        A list of strings: the marker-delimited words of every chunk, in
        file order.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        lowered = handle.read().lower()

    tokens = []
    for chunk in re.split(r'[.!?;"]+', lowered):
        letters_only = re.sub(r"[^a-zA-Z\s]", "", chunk)
        chunk_words = letters_only.split()
        if not chunk_words:
            continue
        tokens.append("<start>")
        tokens.extend(chunk_words)
        tokens.append("<end>")
    return tokens
def generate_skipgram_data(raw_text, context_size, word_to_ix):
    """Create (target, context) index pairs for skip-gram training.

    Only positions that have a full window of context_size tokens on both
    sides are used as targets, so the first and last context_size tokens
    of raw_text never act as a target. Each target is paired with every
    other token in its window, left to right.

    Args:
        raw_text: Sequence of tokens.
        context_size: Number of tokens taken on each side of the target.
        word_to_ix: Mapping from token to integer index.

    Returns:
        A list of (target, context) tuples; each element is a long tensor
        of shape (1,).
    """
    # Relative positions of the context tokens, skipping the target itself.
    offsets = [delta for delta in range(-context_size, context_size + 1) if delta != 0]

    pairs = []
    for center in range(context_size, len(raw_text) - context_size):
        center_id = word_to_ix[raw_text[center]]
        for delta in offsets:
            neighbour_id = word_to_ix[raw_text[center + delta]]
            pairs.append(
                (
                    torch.tensor([center_id], dtype=torch.long),
                    torch.tensor([neighbour_id], dtype=torch.long),
                )
            )
    return pairs
class SkipGram(nn.Module):
    """Skip-gram model: an embedding lookup followed by a linear layer.

    The embedding table starts from the supplied pretrained matrix and is
    left trainable; the linear layer maps an embedding to one score per
    vocabulary entry.
    """

    def __init__(self, vocab_size, embedding_dim, pretrained_embeddings):
        """Set up the layers.

        Args:
            vocab_size: Number of output scores produced by the linear layer.
            embedding_dim: Input width of the linear layer.
            pretrained_embeddings: Tensor used as the initial embedding table.
        """
        super().__init__()
        self.embeddings = nn.Embedding.from_pretrained(
            pretrained_embeddings,
            freeze=False,  # keep fine-tuning the pretrained vectors
        )
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, input_word_indices):
        """Return log-probabilities over the vocabulary for the given indices."""
        scores = self.linear(self.embeddings(input_word_indices))
        return scores.log_softmax(dim=-1)
# Hyperparameters and file locations
CONTEXT_SIZE = 2
EMBEDDING_DIM = 100
GLOVE_PATH = 'glove.6B.100d.txt'
TRAIN_TEXT_PATH = 'text_train.txt'
TEST_TEXT_PATH = 'text_test.txt'

# Load and tokenize both corpora
train_text = load_text(TRAIN_TEXT_PATH)
test_text = load_text(TEST_TEXT_PATH)

# The vocabulary covers the training and the test text, so every token seen
# during validation has an index.
vocab = set(train_text + test_text)
vocab_size = len(vocab)

# Iteration order of a set of strings changes between interpreter runs (hash
# randomisation); sorting makes the word -> index assignment reproducible.
word_to_ix = {word: ix for ix, word in enumerate(sorted(vocab))}
# Derived from word_to_ix so the two mappings cannot disagree.
ix_to_word = {ix: word for word, ix in word_to_ix.items()}

# Initial embedding matrix (GloVe vectors where available)
pretrained_embeddings = load_glove_embeddings(GLOVE_PATH, word_to_ix, EMBEDDING_DIM)

# Model, loss and optimizer
model = SkipGram(vocab_size, EMBEDDING_DIM, pretrained_embeddings)
loss_function = nn.NLLLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

# (target, context) pairs for training and validation
train_data = generate_skipgram_data(train_text, CONTEXT_SIZE, word_to_ix)
test_data = generate_skipgram_data(test_text, CONTEXT_SIZE, word_to_ix)
# Train the model
import time

train_losses = []
validation_losses = []
predictions_per_epoch = []

# Words whose top predicted context words are recorded after every epoch.
context_words = ["croissant", "cloud"]
# Fail before training starts rather than after a full epoch has been spent.
missing_probe_words = [word for word in context_words if word not in word_to_ix]
if missing_probe_words:
    raise KeyError(f'Probe words missing from the vocabulary: {missing_probe_words}')

# perf_counter is monotonic, so the measured interval is unaffected by
# system clock adjustments.
start_time = time.perf_counter()
for epoch in range(100):
    # One optimizer step per (target, context) pair.
    model.train()
    total_train_loss = 0
    for target, context in train_data:
        log_probs = model(target)
        loss = loss_function(log_probs, context)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_train_loss += loss.item()
    train_losses.append(total_train_loss / len(train_data))

    # Validation loss and probe predictions need no gradients.
    model.eval()
    total_val_loss = 0
    predictions_per_epoch_temp = {}
    with torch.no_grad():
        for target, context in test_data:
            log_probs = model(target)
            loss = loss_function(log_probs, context)
            total_val_loss += loss.item()

        for context_word in context_words:
            context_word_idx = torch.tensor([word_to_ix[context_word]], dtype=torch.long)
            log_probs = model(context_word_idx)
            # Four most likely context words for this probe word.
            top_context_indices = log_probs.topk(4).indices.tolist()
            predicted_context_words = [ix_to_word[idx] for idx in top_context_indices[0]]
            predictions_per_epoch_temp[context_word] = predicted_context_words
    validation_losses.append(total_val_loss / len(test_data))
    predictions_per_epoch.append(predictions_per_epoch_temp)

# Elapsed time covers training, validation and the probe predictions.
end_time = time.perf_counter()
total_time = end_time - start_time
print(f'Total training time: {total_time:.2f} seconds')
# Collect the per-epoch metrics and probe predictions into one table
# (one row per epoch) and write it to an Excel sheet.
probe_columns = {
    f'{word} Predictions': [per_epoch[word] for per_epoch in predictions_per_epoch]
    for word in ['croissant', 'cloud']
}
results = {
    'Epoch Number': range(1, len(train_losses) + 1),
    'Training Loss': train_losses,
    'Validation Loss': validation_losses,
    # Scalar: the same total duration is repeated on every row.
    'Timecost': total_time,
}
results.update(probe_columns)

results_df = pd.DataFrame(results)
with pd.ExcelWriter('Skipgram_training_results.xlsx', engine='openpyxl') as writer:
    results_df.to_excel(writer, sheet_name='Training Results', index=False)
# t-SNE visualization of the learned embeddings
embeddings = model.embeddings.weight.detach().cpu().numpy()

# scikit-learn requires perplexity < number of samples; clamp it so that a
# small vocabulary does not abort the run after training has finished.
tsne_options = dict(
    n_components=2,
    verbose=1,
    perplexity=min(40, embeddings.shape[0] - 1),
)
try:
    # Newer scikit-learn releases name the iteration limit ``max_iter``.
    tsne = TSNE(max_iter=300, **tsne_options)
except TypeError:
    # Older releases only accept the previous name ``n_iter``.
    tsne = TSNE(n_iter=300, **tsne_options)
embeds_tsne = tsne.fit_transform(embeddings)

plt.figure(figsize=(10, 10))
plt.title("t-SNE Visualization of Word Embeddings")
plt.xlabel("Component 1")
plt.ylabel("Component 2")
# Fixed axis range; points outside [-15, 15] are not visible.
plt.xlim(-15, 15)
plt.ylim(-15, 15)
# Look rows up by each word's own index instead of relying on dict order.
for label, i in word_to_ix.items():
    x, y = embeds_tsne[i, :]
    plt.scatter(x, y)
    plt.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points', ha='right', va='bottom')
plt.savefig('tsne_word_embeddings_Skipgram.png', dpi=300)
plt.show()