-
Notifications
You must be signed in to change notification settings - Fork 7
/
newlangdir_make_graphs-cvteMandarin.sh
executable file
·85 lines (72 loc) · 3.47 KB
/
newlangdir_make_graphs-cvteMandarin.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#!/usr/bin/env bash
# For now, this is almost an exact copy of newlangdir_make_graphs.sh.
# Make an IL L.fst and G.fst. Compose them with an HC.fst, to make an HCLG.fst.
# Require exactly one argument: the directory that holds all inputs and outputs.
if [ $# -ne 1 ]; then
  echo "Usage: $0 <newlangdir>" # e.g., $0 tamil, or $0 russian.
  echo "Inputs and outputs are all in <newlangdir>."
  echo "Inputs: lang/clean.txt, local/dict/lexicon.txt, local/dict/words.txt."
  # Intermediate outputs: lang/topo == dict.topo (from prepare_lang.sh);
  # local/dict/lexiconp.txt; dict/L.fst == lang/L.fst; lang/G.fst; lang/lm.arpa.gz.
  echo "Output: graph/HCLG.fst"
  echo "SRILM must be in your path."
  exit 1
fi
# "$1" must be quoted.  Unquoted, an empty argument reduces the test to
# `[ ! -d ]` (which is false), and an argument containing whitespace makes `[`
# fail with a syntax error; in both cases the abort would be skipped and the
# script would carry on with an unusable directory.
if [ ! -d "$1" ]; then
  echo "$0: missing directory $1. Aborting."
  exit 1
fi
newlangdir=$1
# Set up environment variables.
# Source the settings from the current directory explicitly.  A bare
# `. cmd.sh` makes bash look through $PATH before the current directory, so an
# unrelated cmd.sh or path.sh found there would be sourced instead.
. ./cmd.sh || exit 1
. ./path.sh || exit 1
# ngram-count is part of SRILM; fail early if it is not reachable.
if ! command -v ngram-count >/dev/null 2>&1; then
  echo "$0: failed to find SRILM tools. Install SRILM and update path.sh."
  exit 1
fi
# If you get that error, do this: cd kaldi/tools; ./install_srilm.sh; follow its instructions.
# Where the inputs live: the acoustic model with its phone table, and the
# new language's dictionary sources and lang directory.
model="cvte/s5/exp/chain/tdnn"
phones_src="${model}/phones.txt"
dict_src="${newlangdir}/local/dict"
lang="${newlangdir}/lang"

# Where intermediate and final outputs are written.
dict="${newlangdir}/dict"         # Made by prepare_lang.sh.
dict_tmp="${newlangdir}/dict_tmp" # Made by prepare_lang.sh.
graph="${newlangdir}/graph"       # Made by mkgraph.sh.

# Intermediate file: the ARPA language model, kept inside the lang directory.
lm_src="${lang}/lm.arpa"
# Verify that every required input exists before doing any work.
# The expansions are quoted so that a path containing whitespace is tested as a
# single word; unquoted, `[` would fail with a syntax error and the
# `&& echo ... && exit 1` chain would be skipped, letting the script continue.
[ -d "$lang" ] || { echo "$0: missing directory $lang. Aborting."; exit 1; }
[ -f "$lang/clean.txt" ] || { echo "$0: missing file $lang/clean.txt. Aborting."; exit 1; }
[ -d "$dict_src" ] || { echo "$0: missing directory $dict_src. Aborting."; exit 1; }
[ -f "$dict_src/lexicon.txt" ] || { echo "$0: missing file $dict_src/lexicon.txt. Aborting."; exit 1; }
[ -f "$dict_src/words.txt" ] || { echo "$0: missing file $dict_src/words.txt. Aborting."; exit 1; }
[ -d "$model" ] || { echo "$0: missing directory $model. Aborting."; exit 1; }
[ -f "$phones_src" ] || { echo "$0: missing file $phones_src. Aborting."; exit 1; }
# nonsilence_phones.txt is looked up in the current working directory.
[ -f nonsilence_phones.txt ] || { echo "$0: missing file nonsilence_phones.txt. Aborting."; exit 1; }
echo "$0: prepare_lang"
# Make some files.
# Each write is checked: if one fails, prepare_lang.sh must not run on whatever
# stale copy an earlier run may have left in $dict_src.
echo "sil laughter noise oov" > "$dict_src/extra_questions.txt" || exit 1
echo "sil" > "$dict_src/optional_silence.txt" || exit 1
# One silence phone per line.
printf '%s\n' sil laughter noise oov > "$dict_src/silence_phones.txt" || exit 1
cp nonsilence_phones.txt "$dict_src" || exit 1
# Make lexiconp.txt from lexicon.txt.
# Also make the word lexicon, L.fst.
# Remove a lexiconp.txt that is older than lexicon.txt, so it is not reused as is.
if [ "$dict_src/lexiconp.txt" -ot "$dict_src/lexicon.txt" ]; then
  # It might not have been created yet.
  rm -f "$dict_src/lexiconp.txt"
fi
utils/prepare_lang.sh --phone-symbol-table "$phones_src" "$dict_src" "<unk>" "$dict_tmp" "$dict" || exit 1
# (Sometimes the tamil/dict/words.txt built by prepare_lang.sh lacks a line for <unk>,
# so I've hacked a few lines into prepare_lang.sh to fix that.)
# Make the grammar model, G.fst.
# First build an order-3 ARPA language model from the cleaned text, restricted to
# the vocabulary in words.txt (Kneser-Ney discounting, interpolated).
echo "$0: ngram-count"
ngram-count -text "$lang/clean.txt" -order 3 -limit-vocab -vocab "$dict_src/words.txt" \
  -kndiscount -interpolate -lm "$lm_src" || exit 1
# Abort if compression fails; otherwise format_lm.sh could silently pick up a
# stale $lm_src.gz left behind by an earlier run.
gzip -f "$lm_src" || exit 1
echo "$0: format_lm"
utils/format_lm.sh "$dict" "$lm_src.gz" "$dict_src/lexicon.txt" "$lang" || exit 1
# Make the HCLG graph, graph/HCLG.fst.
# Tamil's is 360 MB.
# Needs $lang/{L.fst, G.fst, phones.txt, words.txt, phones/silence.csl, phones/disambig.int}, $model/final.mdl, $model/tree
# Temporarily makes $lang/tmp/LG.fst, $lang/tmp/CLG_$N_$P.fst, graph/Ha.fst, graph/HCLGa.fst.
# Copies words.txt and phones/* from $lang to $graph.
echo "$0: mkgraph"
# Arguments are quoted so a language directory containing whitespace is passed
# to mkgraph.sh as single words instead of being split.
utils/mkgraph.sh --self-loop-scale 1.0 "$lang" "$model" "$graph" || exit 1