-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
executable file
·198 lines (159 loc) · 5.68 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
# -*- coding: utf-8 -*-
from __future__ import print_function
import os
import numpy as np
import chainer
from chainer import cuda, Variable
from chainer import optimizers
import chainer.functions as F # F.ctc
from model import RNNASR
import utils
import scipy.io.wavfile as wav
import matplotlib.pyplot as plt
import matplotlib.font_manager as mfm
# Font properties handed to plt.legend() in plot_ctc so the Chinese legend
# labels are drawn with this font. The path is relative, so the font file is
# presumably expected in the working directory -- TODO confirm.
font_path = "SimHei.ttf"
prop = mfm.FontProperties(fname=font_path)
# https://github.com/jameslyons/python_speech_features
from python_speech_features import mfcc
from wer import wer
def compress_seq(y, blank):
    """Collapse a frame-wise label sequence into its CTC output sequence.

    Consecutive repeats of the same label are merged into one symbol and
    ``blank`` labels are dropped. A blank between two identical labels keeps
    them as two separate symbols.

    Args:
        y: Iterable of per-frame labels.
        blank: Label value that marks the blank symbol.

    Returns:
        list: The collapsed label sequence, without blanks.
    """
    seq = []
    run = []  # de-duplicated labels seen since the last blank
    for label in y:
        if label == blank:
            # A blank closes the current run; extending with an empty run
            # is a no-op, so no special case is needed.
            seq.extend(run)
            run = []
        elif not run or label != run[-1]:
            run.append(label)
    # Flush the labels that follow the final blank.
    seq.extend(run)
    return seq
def get_mean_std(train_set):
    """Compute the MFCC mean and standard deviation over a training set.

    The MFCC features of every file in ``train_set`` are pooled and the
    statistics are taken along axis 0 of the pooled array.

    Args:
        train_set: Iterable of items whose first element is a wav file path
            relative to the module-level ``data_root``.

    Returns:
        tuple: ``(mean, std)``, each reshaped to a single row ``(1, -1)``.

    Raises:
        ValueError: If ``train_set`` contains no items.
    """
    feats_per_file = []
    for item in train_set:
        samplerate, wavdata = wav.read(os.path.join(data_root, item[0]))
        feats_per_file.append(mfcc(wavdata, samplerate).astype(np.float32))
    if not feats_per_file:
        raise ValueError("train_set is empty; cannot compute mean and std")
    # Stack once after the loop instead of re-copying a growing array on
    # every iteration.
    total = np.vstack(feats_per_file)
    # Statistics must come from the pooled features, not from the features
    # of whichever file happened to be read last.
    mean = total.mean(axis=0)
    std = total.std(axis=0)
    return mean.reshape((1, -1)), std.reshape((1, -1))
## Forward pass and CTC loss for one utterance (used by training and evaluation)
def forward_one_sample(model, wavfile, label, SIL_idx, useGPU):
    """Run the model on one wav file and compute its CTC loss.

    Reads the module-level ``mean``, ``std`` and ``xp``.

    Args:
        model: Network that is called with a list of per-frame Variables and
            provides ``reset_state()``.
        wavfile: Path of the wav file to read.
        label: Target label sequence for the utterance.
        SIL_idx: Blank-symbol index passed to the CTC loss.
        useGPU: When true, the inputs and the label are moved to the GPU.

    Returns:
        tuple: ``(y, loss)``, or ``(None, None)`` when reading the wav file
        raises ``IOError``.
    """
    try:
        samplerate, wavdata = wav.read(wavfile)
    except IOError:
        # Let the caller skip samples whose file cannot be read.
        return None, None

    # Normalise the MFCC features with the module-level statistics.
    feats = mfcc(wavdata, samplerate).astype(np.float32)
    feats = (feats - mean) / std
    model.reset_state()

    # One single-row array per frame, and the target as one int32 row.
    frames = [feats[t, :].reshape((1, -1)) for t in range(feats.shape[0])]
    target = xp.array(label, dtype=xp.int32).reshape((1, -1))
    if useGPU:
        frames = [cuda.to_gpu(frame) for frame in frames]
        target = cuda.to_gpu(target)

    y = model([Variable(frame) for frame in frames])
    loss = F.connectionist_temporal_classification(
        y, Variable(target), SIL_idx)
    return y, loss
def plot_ctc(y):
    """Plot the per-frame model outputs and show the figure.

    Args:
        y: Sequence of per-frame outputs; it is converted to an array and
            squeezed before plotting.
    """
    curves = plt.plot(np.array(y).squeeze())
    # Fixed viewport: x from 0 to 250, y slightly wider than [0, 1].
    plt.axis([0, 250, -0.1, 1.1])
    symbol_names = ['blank', '可', '以', '不', '行']
    # The legend uses the module-level font properties for the labels.
    plt.legend(curves, symbol_names, prop=prop)
    plt.show()
# Evaluate on test dataset
def evaluate(testset):
    """Decode every test item and print the accumulated error rate.

    Uses the module-level ``model``, ``data_root``, ``SIL_idx`` and
    ``useGPU``. Items for which ``forward_one_sample`` returns no output
    (unreadable wav file) are skipped.

    Args:
        testset: Iterable of ``[wav_path, label_sequence]`` items, with
            ``wav_path`` relative to ``data_root``.
    """
    evaluator = model.copy()  # to use different state
    evaluator.train = False
    total_symbol = 0
    error_symbol = 0
    for item in testset:
        # Initialize state; resetting here or not makes little difference.
        evaluator.reset_state()
        x_data = os.path.join(data_root, item[0])
        y_data = item[1]
        y, _ = forward_one_sample(evaluator, x_data, y_data, SIL_idx, useGPU)
        if y is None:
            continue
        # Decoding: softmax per frame, arg-max per frame, then collapse the
        # frame labels with utils.compress_seq.
        y_prob = [F.softmax(frame).data for frame in y]
        # Plots the model output for every utterance; comment this call out
        # to evaluate without opening figures.
        plot_ctc(y_prob)
        y_dec = [prob.argmax() for prob in y_prob]
        num_seq = utils.compress_seq(y_dec, SIL_idx)
        print('decode sequence: ', num_seq)
        print('target sequence: ', y_data)
        total_symbol += len(y_data)
        error_symbol += wer(y_data, num_seq)
    if total_symbol == 0:
        # Empty test set, or no file could be read: there is nothing to
        # divide by, so report that instead of raising ZeroDivisionError.
        print('WER: ', 'n/a (no test samples were evaluated)')
        return
    print('WER: ', str(float(error_symbol) / total_symbol * 100) + '%')
if __name__ == '__main__':
    # NOTE: the names assigned here (data_root, mean, std, model, SIL_idx,
    # useGPU, xp) are read as module-level globals by the functions above,
    # so they must stay at module scope and be set before those functions run.

    ## Prepare dataset
    data_root = "yesno_cn"
    save_head = "yesno_cn"
    train_list = os.path.join(data_root, "train_list.txt")
    dataset = []
    with open(train_list, 'r') as fh:
        for line in fh:
            # Skip blank lines; they cannot be unpacked into
            # (wav file, transcription) below.
            if not line.strip():
                continue
            wavfile, transcribe = line.split(' ', 1)
            # '可以' is labelled with symbols 1, 2; any other transcription
            # with symbols 3, 4.
            if transcribe.strip() == '可以':
                label = [1, 2]
            else:
                label = [3, 4]
            dataset.append([wavfile, label])
    np.random.seed(0)
    np.random.shuffle(dataset)
    # The first 27 shuffled items are used for training, the rest for testing.
    trainset = dataset[:27]
    testset = dataset[27:]
    mean, std = get_mean_std(trainset)

    ## Prepare model
    n_feature = 13
    n_units = 300
    n_symbol = 5
    n_epoch = 2  # number of epochs
    SIL_idx = 0  # index of blank symbol
    # Gradient norm threshold to clip; with a larger value the model
    # converges faster.
    grad_clip = 10
    model = RNNASR(n_feature, n_units, n_symbol)

    ## use GPU or not
    useGPU = False
    xp = cuda.cupy if useGPU else np
    if useGPU:
        cuda.get_device(0).use()
        model.to_gpu()

    ## Setup optimizer
    optimizer = optimizers.NesterovAG()  # somewhat faster than RMSpropGraves
    # optimizer = optimizers.RMSpropGraves()
    optimizer.setup(model)
    # Gradient clipping is indispensable here.
    optimizer.add_hook(chainer.optimizer.GradientClipping(grad_clip))

    # train loop
    trainsize = len(trainset)
    for epoch in range(1, n_epoch + 1):
        indexes = np.random.permutation(trainsize)
        for i, sample_idx in enumerate(indexes):
            x_data = os.path.join(data_root, trainset[sample_idx][0])
            y_data = trainset[sample_idx][1]
            _, loss = forward_one_sample(model, x_data, y_data, SIL_idx, useGPU)
            if loss is None:  # for file missing problem
                continue
            print('epoch %d %d of %d loss: %.4f' % (epoch, i, trainsize, loss.data))
            model.cleargrads()
            loss.backward()
            optimizer.update()
        ## Save the model and the optimizer and evaluate model
        # print('save the model')
        # serializers.save_npz(save_head+str(epoch)+'.model', model)
        # print('save the optimizer')
        # serializers.save_npz(save_head+str(epoch)+'.state', optimizer)
        evaluate(testset)