-
Notifications
You must be signed in to change notification settings - Fork 21
/
mfcc_vtln.py
executable file
·115 lines (91 loc) · 3.84 KB
/
mfcc_vtln.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/env python
"""This script illustrates the use of VTLN within shennong.
The pipeline is as follows:
* Setup the Buckeye corpus (40 speakers, 38h of speech in 254 files)
* For faster computation, the corpus is reduced here to 10 speakers and 68 files
* Compute VTLN warps on 10 minutes of speech for each speaker
* Extract warped MFCCs on the whole corpus and save them to file
"""
import argparse
import pathlib
from shennong import Utterances
from shennong.processor import MfccProcessor, VtlnProcessor
from shennong.utils import list_files_with_extension, get_njobs
def prepare_buckeye(directory):
    """Generates a list of utterances from the Buckeye corpus

    Each utterance is a tuple (<utterance>, <wav>, <speaker>). The Buckeye
    directory is organized as `<speaker>/<utterance>/<utterance>.wav`, and
    the speaker identifier is taken as the first 3 characters of the
    utterance name.

    For this example the corpus is reduced to the speakers whose identifier
    starts with 's1'.

    Parameters
    ----------
    directory : path
        The root directory of the raw Buckeye corpus, searched for wav
        files.

    Returns
    -------
    utterances : Utterances
        The retained utterances, built from (name, wav, speaker) tuples.

    """
    print('preparing Buckeye corpus')
    wavs = list_files_with_extension(directory, '.wav', abspath=True)

    # for dev, use a reduced corpus
    print('WARNING: reducing to 10 speakers for this example...')

    utterances = []
    for wav in wavs:
        name = pathlib.Path(wav).stem
        spk = name[:3]

        # Select on the speaker id and not on the whole absolute path: a
        # parent directory containing 's1' (e.g. '/data/corpus1/') would
        # otherwise match every file and the corpus would not be reduced.
        if not spk.startswith('s1'):
            continue

        utterances.append((name, wav, spk))

    utterances = Utterances(utterances)
    print(
        f'found {len(utterances)} utterances from '
        f'{len(utterances.by_speaker())} speakers')
    return utterances
def main():
    """Run the VTLN example end to end on the Buckeye corpus

    Reads the command line, checks the output and corpus paths, trains VTLN
    on a duration-limited selection of utterances, prints the warp obtained
    for each speaker, then computes warped MFCCs for every utterance and
    saves them to the output file.

    Raises
    ------
    ValueError
        If the output file already exists, or if the corpus path is not a
        directory.

    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        'buckeye_corpus',
        type=pathlib.Path,
        help='path to the raw Buckeye Corpus')
    cli.add_argument(
        'output_file',
        type=pathlib.Path,
        help='where to save the computed MFCCs')
    cli.add_argument(
        '-j', '--njobs',
        type=int,
        default=get_njobs(),
        help='number of parallel jobs to use, default to %(default)s')
    cli.add_argument(
        '-d', '--duration',
        type=float,
        default=10*60,
        help=('speech duration per speaker for VTLN training, '
              'default to %(default)s'))

    # the three warp options only differ by flag, default value and label
    warp_options = (
        ('--warp-step', 0.01, 'VTLN warp step'),
        ('--warp-min', 0.85, 'VTLN min warp'),
        ('--warp-max', 1.25, 'VTLN max warp'))
    for flag, value, label in warp_options:
        cli.add_argument(
            flag,
            type=float,
            default=value,
            help=label + ', default to %(default)s')

    options = cli.parse_args()
    corpus_dir = options.buckeye_corpus
    mfcc_file = options.output_file

    # refuse to overwrite an existing output, and require a corpus directory
    if mfcc_file.exists():
        raise ValueError(f'{mfcc_file} already exists')
    if not corpus_dir.is_dir():
        raise ValueError(f'{corpus_dir} is not a directory')

    # the whole (reduced) corpus, and the subset used for VTLN training
    corpus = prepare_buckeye(corpus_dir)
    training_set = corpus.fit_to_duration(options.duration)

    # train VTLN, one warp per speaker
    print(
        f'training VTLN on {options.duration}s per speaker '
        f'({len(training_set)} utterances)')
    vtln = VtlnProcessor(
        warp_step=options.warp_step,
        min_warp=options.warp_min,
        max_warp=options.warp_max)
    vtln.set_logger('info')
    speaker_warps = vtln.process(
        training_set, njobs=options.njobs, group_by='speaker')

    print('VTLN warps per speaker are:')
    for speaker, coefficient in sorted(speaker_warps.items()):
        print(f'{speaker}: {coefficient}')

    # every utterance of the corpus gets the warp of its speaker
    utterance_warps = {}
    for utterance in corpus:
        utterance_warps[utterance.name] = speaker_warps[utterance.speaker]

    print(f'computing warped MFCCs for {len(corpus)} uttterances')
    mfcc = MfccProcessor()
    features = mfcc.process_all(
        corpus, vtln_warp=utterance_warps, njobs=options.njobs)

    print(f'writing MFCCs to {mfcc_file}')
    features.save(mfcc_file)
# run the example only when this file is executed as a script, not on import
if __name__ == '__main__':
    main()