-
Notifications
You must be signed in to change notification settings - Fork 0
/
rnn_utils.py
139 lines (113 loc) · 4.96 KB
/
rnn_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from six.moves.urllib.request import urlretrieve
from six.moves import xrange as range
import os
import pdb
import sys
import numpy as np
import os.path
import tensorflow as tf
import cPickle as pickle
#import pickle
def sparse_tuple_from(sequences, dtype=np.int32):
"""Create a sparse representention of x.
Args:
sequences: a list of lists of type dtype where each element is a sequence
Returns:
A tuple with (indices, values, shape)
"""
indices = []
values = []
for n, seq in enumerate(sequences):
indices.extend(zip([n]*len(seq), range(len(seq))))
values.extend(seq)
indices = np.asarray(indices, dtype=np.int64)
values = np.asarray(values, dtype=dtype)
shape = np.asarray([len(sequences), np.asarray(indices).max(0)[1]+1], dtype=np.int64)
return indices, values, shape
def pad_sequences(sequences, maxlen=None, dtype=np.float32,
padding='post', truncating='post', value=0.):
'''Pads each sequence to the same length: the length of the longest
sequence.
If maxlen is provided, any sequence longer than maxlen is truncated to
maxlen. Truncation happens off either the beginning or the end
(default) of the sequence. Supports post-padding (default) and
pre-padding.
Args:
sequences: list of lists where each element is a sequence
maxlen: int, maximum length
dtype: type to cast the resulting sequence.
padding: 'pre' or 'post', pad either before or after each sequence.
truncating: 'pre' or 'post', remove values from sequences larger
than maxlen either in the beginning or in the end of the sequence
value: float, value to pad the sequences to the desired value.
Returns
x: numpy array with dimensions (number_of_sequences, maxlen)
lengths: numpy array with the original sequence lengths
'''
lengths = np.asarray([len(s) for s in sequences], dtype=np.int64)
nb_samples = len(sequences)
if maxlen is None:
maxlen = np.max(lengths)
# take the sample shape from the first non empty sequence
# checking for consistency in the main loop below.
sample_shape = tuple()
for s in sequences:
if len(s) > 0:
sample_shape = np.asarray(s).shape[1:]
break
x = (np.ones((nb_samples, maxlen) + sample_shape) * value).astype(dtype)
for idx, s in enumerate(sequences):
if len(s) == 0:
continue # empty list was found
if truncating == 'pre':
trunc = s[-maxlen:]
elif truncating == 'post':
trunc = s[:maxlen]
else:
raise ValueError('Truncating type "%s" not understood' % truncating)
# check `trunc` has expected shape
trunc = np.asarray(trunc, dtype=dtype)
if trunc.shape[1:] != sample_shape:
raise ValueError('Shape of sample %s of sequence at position %s is different from expected shape %s' %
(trunc.shape[1:], idx, sample_shape))
if padding == 'post':
x[idx, :len(trunc)] = trunc
elif padding == 'pre':
x[idx, -len(trunc):] = trunc
else:
raise ValueError('Padding type "%s" not understood' % padding)
return x, lengths
def pad_all_batches(batch_feature_array):
for batch_num in range(len(batch_feature_array)):
batch_feature_array[batch_num] = pad_sequences(batch_feature_array[batch_num])[0]
return batch_feature_array
def get_tidigits_to_index_mapping():
return {"z": 0, "o": 10, "1": 1, "2": 2, "3": 3, "4": 4, "5": 5, "6": 6, "7": 7, "8": 8, "9": 9, "_": 11}
def compare_predicted_to_true(preds, trues_tup):
inv_index_mapping = {v: k for k, v in get_tidigits_to_index_mapping().items()}
preds = tf.sparse_tensor_to_dense(preds, default_value=-1).eval()
trues = tf.sparse_tensor_to_dense(tf.SparseTensor(indices=trues_tup[0], values=trues_tup[1], dense_shape=trues_tup[2]), default_value=-1).eval()
for true, pred in zip(trues, preds):
predicted_label = "".join([inv_index_mapping[ch] for ch in pred if ch != -1])
true_label = "".join([inv_index_mapping[ch] for ch in true if ch != -1])
print("Predicted: {}\n Actual: {}\n".format(predicted_label, true_label))
def load_dataset(dataset_path):
with open(dataset_path, 'rb') as f:
dataset = pickle.load(f)
return dataset
def make_batches(dataset, batch_size=16):
#print("dataset", dataset[0][:10])
#print (dataset[1][:10])
#print (dataset[2][:10])
examples = []
sequences = []
seqlens = []
l1, l2, l3 = dataset
for i in range(0, len(l1), batch_size):
examples.append(l1[i:i + batch_size])
sequences.append(l2[i:i + batch_size])
seqlens.append(l3[i:i + batch_size])
return examples, sequences, seqlens