-
Notifications
You must be signed in to change notification settings - Fork 8
/
voicing_model.py
263 lines (201 loc) · 16.9 KB
/
voicing_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
import tensorflow as tf
import numpy as np
import datasets
from model import NetworkMelody
from collections import namedtuple
import sys
import common
import librosa
def create_model(self, args):
context_size = int(self.context_width/self.spectrogram_hop_size)
with tf.name_scope('model_pitch'):
self.note_logits = None
self.note_probabilities = self.spectrogram[:, context_size:-context_size, :360, 0]
with tf.name_scope('model_voicing'):
# voicing_layer = tf.concat([tf.stop_gradient(layer), spectrogram], axis=-1)
if args.harmonic_stacking > 1:
spectrogram_windows = []
print("stacking the spectrogram")
for i in range(args.harmonic_stacking):
f_ref = 440 # arbitrary reference frequency
hz = f_ref*(i+1)
interval = librosa.core.hz_to_midi(hz) - librosa.core.hz_to_midi(f_ref)
int_bins = int(round(interval*self.bins_per_semitone))
spec_layer = self.spectrogram[:, :, int_bins:self.bin_count+int_bins, :]
print(i+1, "offset", int_bins, "end", self.bin_count+int_bins, "shape", spec_layer.shape)
spec_layer = tf.pad(spec_layer, ((0, 0), (0, 0), (0, self.bin_count-spec_layer.shape[2]), (0, 0)))
print("padded shape", spec_layer.shape)
spectrogram_windows.append(spec_layer)
voicing_layer = tf.concat(spectrogram_windows, axis=-1)
else:
voicing_layer = self.spectrogram[:, :, :360, :]
if args.first_pool_type == "avg":
voicing_layer = tf.layers.average_pooling2d(voicing_layer, args.first_pool_size, args.first_pool_stride, padding="same")
if args.first_pool_type == "max":
voicing_layer = tf.layers.max_pooling2d(voicing_layer, args.first_pool_size, args.first_pool_stride, padding="same")
print("after pooling", voicing_layer.shape)
octave = int(int(voicing_layer.shape[2])/6)
note = int(int(voicing_layer.shape[2])/6/12)
if args.activation is not None:
activation = getattr(tf.nn, args.activation)
if args.architecture == "full_1layer":
if args.conv_ctx:
voicing_layer = tf.pad(voicing_layer, ((0, 0), (args.conv_ctx, args.conv_ctx), (0, 0), (0, 0)))
voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, voicing_layer.shape[2]), (1, 1), "valid", activation=activation)
voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
if args.architecture == "octave_1layer":
if args.conv_ctx:
voicing_layer = tf.pad(voicing_layer, ((0, 0), (args.conv_ctx, args.conv_ctx), (0, 0), (0, 0)))
voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, octave), "valid", activation=activation)
voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
if args.architecture == "note_1layer":
if args.conv_ctx:
voicing_layer = tf.pad(voicing_layer, ((0, 0), (args.conv_ctx, args.conv_ctx), (0, 0), (0, 0)))
voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, note), "valid", activation=activation)
voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
if args.architecture == "octave_octave":
voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, octave), "same", activation=activation)
voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, octave), "same", activation=activation)
voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
if args.architecture == "note_note":
voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, note), "same", activation=activation)
voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, note), "same", activation=activation)
voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
if args.architecture == "note_dilated":
voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, note), "same", activation=activation)
voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (1, 6), (1, 1), "same", activation=activation, dilation_rate=(1, octave))
voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
if args.architecture == "dilated_note":
voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (1, 6), (1, 1), "same", activation=activation, dilation_rate=(1, octave))
voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, note), "same", activation=activation)
voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
if args.architecture == "note_octave":
voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, note), "same", activation=activation)
voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, octave), "same", activation=activation)
voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
if args.architecture == "octave_note":
voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, octave), "same", activation=activation)
voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, note), "same", activation=activation)
voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
if args.architecture == "note_octave_fix":
voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, note), "same", activation=activation)
voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
octave = int(int(voicing_layer.shape[2])/6)
voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, octave), "same", activation=activation)
voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
if args.architecture == "note_note_octave":
voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, 1), "same", activation=activation)
voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, note), "same", activation=activation)
voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
octave = int(int(voicing_layer.shape[2])/6)
voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, octave), "same", activation=activation)
voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
if args.architecture == "note_note_octave_octave":
voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, 1), "same", activation=activation)
voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, note), "same", activation=activation)
voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
octave = int(int(voicing_layer.shape[2])/6)
voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, 1), "same", activation=activation)
voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, octave), "same", activation=activation)
voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
if args.architecture == "note_note_note_octave":
voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, 1), "same", activation=activation)
voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, 1), "same", activation=activation)
voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, note), "same", activation=activation)
voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
octave = int(int(voicing_layer.shape[2])/6)
voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, octave), "same", activation=activation)
voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
if args.architecture == "note_note_note_octave_octave":
voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, 1), "same", activation=activation)
voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, 1), "same", activation=activation)
voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, note), "same", activation=activation)
voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
octave = int(int(voicing_layer.shape[2])/6)
voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, 1), "same", activation=activation)
voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, octave), "same", activation=activation)
voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
if args.architecture == "note_octave_octave_temporal":
voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, 1), "same", activation=activation)
voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
octave = int(int(voicing_layer.shape[2])/6)
voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, note), "same", activation=activation)
voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
octave = int(int(voicing_layer.shape[2])/6)
voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, octave), "same", activation=activation)
voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*7+1, 3), (1, 1), "same", activation=activation)
voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
if args.last_layer == "conv":
print("adding last conv valid layer")
print("model output", voicing_layer.shape)
if args.last_conv_ctx:
voicing_layer = tf.pad(voicing_layer, ((0, 0), (args.last_conv_ctx, args.last_conv_ctx), (0, 0), (0, 0)))
print("padded", voicing_layer.shape)
voicing_layer = tf.layers.conv2d(voicing_layer, 1, (args.last_conv_ctx*2+1, voicing_layer.shape[2]), (1, 1), "valid", activation=None, use_bias=True)
print("last conv output", voicing_layer.shape)
voicing_layer = voicing_layer[:, context_size:-context_size, :, :]
print("cut context", voicing_layer.shape)
self.voicing_logits = tf.squeeze(voicing_layer)
print("squeeze", voicing_layer.shape)
if args.last_layer == "dense":
voicing_layer = tf.layers.flatten(voicing_layer)
self.voicing_logits = tf.layers.dense(voicing_layer, args.annotations_per_window)
self.loss = common.loss(self, args)
self.est_notes = common.est_notes(self, args)
self.training = common.optimizer(self, args)
HOP_LENGTH = 512
def parse_args(argv):
parser = common.common_arguments({
"samplerate": 44100, "context_width": 5*HOP_LENGTH, "annotations_per_window": 10, "hop_size": 1, "frame_width": HOP_LENGTH,
"note_range": 72, "min_note": 24, "batch_size": 32,
"evaluate_every": 5000,
"evaluate_small_every": 1000,
})
# Model specific arguments
parser.add_argument("--spectrogram", default="cqt", type=str, help="Postprocessing layer")
parser.add_argument("--first_pool_type", default=None, type=str, help="First pooling type")
parser.add_argument("--first_pool_size", default=[1, 5], nargs="+", type=str, help="Input pooling size")
parser.add_argument("--first_pool_stride", default=[1, 5], nargs="+", type=str, help="Input pooling stride")
parser.add_argument("--capacity_multiplier", default=8, type=int)
parser.add_argument("--architecture", default="full_1layer", type=str)
parser.add_argument("--conv_ctx", default=0, type=int)
parser.add_argument("--batchnorm", default=0, type=int)
parser.add_argument("--dropout", default=0.0, type=float)
parser.add_argument("--last_layer", default="conv", type=str)
parser.add_argument("--last_conv_ctx", default=0, type=int)
parser.add_argument("--harmonic_stacking", default=1, type=int)
parser.add_argument("--activation", default="relu", type=str)
args = parser.parse_args(argv)
common.name(args, "voicing")
return args
def construct(args):
network = NetworkMelody(args)
with network.session.graph.as_default():
spectrogram_function, spectrogram_thumb, spectrogram_info = common.spectrograms(args)
def preload_fn(aa):
aa.annotation = datasets.Annotation.from_time_series(*aa.annotation, args.frame_width*args.samplerate/44100)
aa.audio.load_resampled_audio(args.samplerate).load_spectrogram(spectrogram_function, spectrogram_thumb, spectrogram_info[2])
def dataset_transform(tf_dataset, dataset):
return tf_dataset.map(dataset.prepare_example, num_parallel_calls=args.threads).batch(args.batch_size_evaluation).prefetch(10)
def dataset_transform_train(tf_dataset, dataset):
return tf_dataset.shuffle(10**5).map(dataset.prepare_example, num_parallel_calls=args.threads).batch(args.batch_size).prefetch(10)
train_dataset, test_datasets, validation_datasets = common.prepare_datasets(args.datasets, args, preload_fn, dataset_transform, dataset_transform_train)
network.construct(args, create_model, train_dataset.dataset.output_types, train_dataset.dataset.output_shapes, spectrogram_info=spectrogram_info)
return network, train_dataset, validation_datasets, test_datasets
if __name__ == "__main__":
common.main(sys.argv[1:], construct, parse_args)