-
Notifications
You must be signed in to change notification settings - Fork 2
/
data.py
99 lines (83 loc) · 2.95 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import os
import torch
from torch.autograd import Variable
class Dictionary(object):
def __init__(self, vocab_path):
self.word2idx = {}
self.idx2word = []
with open(vocab_path) as f:
for line in f:
self.add_word(line.strip())
self.pad = self['<pad>']
self.unk = self['<unk>']
def add_word(self, word):
if word not in self.word2idx:
self.idx2word.append(word)
self.word2idx[word] = len(self.idx2word) - 1
return self.word2idx[word]
def __getitem__(self, key):
return self.word2idx.get(key, self.word2idx['<unk>'])
def __len__(self):
return len(self.idx2word)
class DataIter(object):
def __init__(self, corpus_path, batch_size, seq_len, dictionary, cuda=False):
self.corpus_path = corpus_path
self.batch_size = batch_size
self.seq_len = seq_len
self.dictionary = dictionary
self.cuda = cuda
self.pad = self.dictionary['<pad>']
self.build_data()
if self.cuda:
self.post = self.post.cuda()
self.cmnt = self.cmnt.cuda()
self.neg = self.neg.cuda()
def sent2idx(self, sent):
'''
the output will cut or pad to self.seq_len
'''
s = sent.split(' ')[0:self.seq_len]
s = map(lambda x: self.dictionary[x], s)
arr = s + [self.dictionary.pad] * (self.seq_len - len(s))
return torch.LongTensor(arr)
def build_data(self):
with open(self.corpus_path) as f:
lines = f.read().splitlines()
self.nsample = len(lines)
self.post = torch.LongTensor(self.nsample, self.seq_len)
self.cmnt = torch.LongTensor(self.nsample, self.seq_len)
self.neg = torch.LongTensor(self.nsample, self.seq_len)
for i, l in enumerate(lines):
post, cmnt, neg = map(self.sent2idx, l.split('\t=>\t'))
self.post[i] = post
self.cmnt[i] = cmnt
self.neg[i] = neg
def __iter__(self):
for i in range(0, self.nsample, self.batch_size):
post = Variable(self.post[i:i+self.batch_size])
cmnt = Variable(self.cmnt[i:i+self.batch_size])
neg = Variable(self.neg[i:i+self.batch_size])
yield (post, cmnt, neg)
def __len__(self):
return self.nsample
if __name__ == '__main__':
import time
def time_usage(func):
def wrapper(*args, **kwargs):
beg_ts = time.time()
retval = func(*args, **kwargs)
end_ts = time.time()
print("elapsed time: %f" % (end_ts - beg_ts))
return retval
return wrapper
dictionary = Dictionary('./full_dataset/train.vocab')
batch_size = 10
seq_len = 30
cuda = True
train_iter = DataIter(
corpus_path = './full_dataset/valid.txt',
batch_size = batch_size,
seq_len = seq_len,
dictionary = dictionary,
cuda = cuda,
)