forked from gooofy/kaldi-adapt-lm
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathfilter_lex.py
44 lines (38 loc) · 1.33 KB
/
filter_lex.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import re
import argparse
# Words that are always kept, whatever the vocab file contains.
# Their lexicon entries look like:
#   !SIL SIL
#   nspc nC
#   UNK SPN
REQUIRED_WORDS = ("!SIL", "nspc", "UNK")


def load_vocab(path):
    """Read a vocab file and return its words as a set.

    The file is read one word per line; surrounding whitespace is stripped
    and blank lines are ignored.  The entries of ``REQUIRED_WORDS`` are
    always added to the result.
    """
    with open(path, "r") as vocab_file:
        vocab = {line.strip() for line in vocab_file}
    vocab.discard("")  # blank lines collapse to the empty string
    vocab.update(REQUIRED_WORDS)
    return vocab


def filter_lexicon(lex_lines, vocab, include_variants=False):
    """Yield the stripped lexicon lines whose head word is in *vocab*.

    lex_lines        -- iterable of lexicon lines; the first
                        whitespace-separated token of a line is its word.
    vocab            -- iterable of accepted words (a set is used as-is,
                        anything else is converted to a set once).
    include_variants -- when true, only the part of the word before the
                        first "_" is looked up, so "word_1" is kept
                        whenever "word" is in *vocab*.

    Blank lines are skipped.  Lines are yielded in input order.
    """
    # Membership is tested once per lexicon line: use a hash set so each
    # lookup is O(1) instead of a linear scan over the whole vocabulary.
    if not isinstance(vocab, (set, frozenset)):
        vocab = set(vocab)

    for raw_line in lex_lines:
        entry = raw_line.strip()
        if not entry:
            continue
        word = entry.split()[0]
        if include_variants:
            word = word.split("_")[0]
        # A word starting with "_" becomes empty after the variant split;
        # such entries are never kept.
        if word and word in vocab:
            yield entry


def build_parser():
    """Return the command-line argument parser of this script."""
    parser = argparse.ArgumentParser(
        description="Filter a lexicon file by the words given in a vocab file.")
    parser.add_argument("-l", "--lex", default=None, help="lexicon file", required=True)
    parser.add_argument("-v", "--vocab", default=None, help="vocab file", required=True)
    parser.add_argument("--include-variants", action='store_true', help="include pronunciation variants")
    parser.add_argument("-o", "--out", default=None, help="output file to store result")
    return parser


def main(argv=None):
    """Filter the lexicon named on the command line.

    argv -- argument list to parse; ``None`` means ``sys.argv[1:]``.

    Matching entries are printed to stdout, or written to the ``--out``
    file when one is given.  The output file is opened only after the
    lexicon has been completely read and closed.
    """
    args = build_parser().parse_args(argv)
    vocab = load_vocab(args.vocab)

    kept = None
    with open(args.lex, 'r') as lex_file:
        matches = filter_lexicon(lex_file, vocab, args.include_variants)
        if args.out is None:
            # Stream straight to stdout; nothing needs to be buffered.
            for entry in matches:
                print(entry)
        else:
            kept = list(matches)

    if kept is not None:
        with open(args.out, 'w') as out_file:
            out_file.writelines("%s\n" % entry for entry in kept)


if __name__ == "__main__":
    main()