# inferBaidu.py
"""Inferer for DeepSpeech2 model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import functools
import paddle.fluid as fluid
from data_utils.data import DataGenerator
from model_utils.model import DeepSpeech2Model
from model_utils.model_check import check_cuda, check_version
from utils.error_rate import wer, cer
from utils.utility import add_arguments, print_arguments
import create_manifest
import json
import codecs
import soundfile
import time
import numpy as np
# Module-level caches so repeated calls to main() reuse the loaded model.
ds2_model = None
data_generator = None
vocab_list = None
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('num_samples',      int,    128,    "# of samples to infer.")
add_arg('beam_size',        int,    500,    "Beam search width.")
add_arg('num_proc_bsearch', int,    8,      "# of CPUs for beam search.")
add_arg('num_conv_layers',  int,    2,      "# of convolution layers.")
add_arg('num_rnn_layers',   int,    3,      "# of recurrent layers.")
add_arg('rnn_layer_size',   int,    1024,   "# of recurrent cells per layer.")
add_arg('alpha',            float,  2.5,    "Coef of LM for beam search.")
add_arg('beta',             float,  0.3,    "Coef of WC for beam search.")
add_arg('cutoff_prob',      float,  1.0,    "Cutoff probability for pruning.")
add_arg('cutoff_top_n',     int,    40,     "Cutoff number for pruning.")
add_arg('use_gru',          bool,   True,   "Use GRUs instead of simple RNNs.")
add_arg('use_gpu',          bool,   True,   "Use GPU or not.")
add_arg('share_rnn_weights', bool,  False,  "Share input-hidden weights across "
                                            "bi-directional RNNs. Not for GRU.")
add_arg('target_dir', str,
        '/content/Baidu-Deepspeech2-For-Python3/dataset/librispeech/test-clean/LibriSpeech/test-clean/',
        "Filepath of voice sample testing folder.")
add_arg('infer_manifest', str,
        'data/manifest.test-clean',
        "Filepath of manifest to infer.")
add_arg('mean_std_path', str,
        'models/baidu_en8k/mean_std.npz',
        "Filepath of normalizer's mean & std.")
add_arg('vocab_path', str,
        'models/baidu_en8k/vocab.txt',
        "Filepath of vocabulary.")
add_arg('lang_model_path', str,
        'models/lm/common_crawl_00.prune01111.trie.klm',
        "Filepath for language model.")
add_arg('model_path', str,
        'models/baidu_en8k',
        "If None, the training starts from scratch; "
        "otherwise, it resumes from the pre-trained model.")
add_arg('decoding_method', str,
        'ctc_beam_search',
        "Decoding method. Options: ctc_beam_search, ctc_greedy",
        choices=['ctc_beam_search', 'ctc_greedy'])
add_arg('error_rate_type', str,
        'wer',
        "Error rate type for evaluation.",
        choices=['wer', 'cer'])
add_arg('specgram_type', str,
        'linear',
        "Audio feature type. Options: linear, mfcc.",
        choices=['linear', 'mfcc'])
add_arg('audio_path', str,
        '',
        "Filepath of a single audio file to transcribe. If empty, the whole "
        "manifest built from --target_dir is transcribed instead.")
# yapf: enable
args = parser.parse_args()
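
# A hypothetical invocation (a sketch; the flag values below are illustrative
# and the paths must exist in your setup):
#
#   python inferBaidu.py --use_gpu=True --audio_path=/path/to/sample.wav
#
# With an empty --audio_path, the script instead builds a manifest from
# --target_dir and transcribes every utterance listed in --infer_manifest.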


def prepare_manifest():
    print("Preparing Manifest")
    create_manifest.prepare_dataset(
        target_dir=args.target_dir, manifest_path=args.infer_manifest)
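
# Each manifest line is one JSON object describing a single utterance, e.g.
# (illustrative values):
#   {"audio_filepath": "/data/a.flac", "duration": 3.5, "text": "hello world"}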

def load_model():
    # Fail early if use_gpu=True is set on a CPU-only PaddlePaddle build.
    check_cuda(args.use_gpu)
    # Check that the installed PaddlePaddle version is supported.
    check_version()
    if args.use_gpu:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()
    # Load model
    data_generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
        keep_transcription_text=True,
        place=place,
        is_training=False)
    ds2_model = DeepSpeech2Model(
        vocab_size=data_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
        share_rnn_weights=args.share_rnn_weights,
        place=place,
        init_from_pretrained_model=args.model_path)
    # Decoders only accept strings encoded in UTF-8.
    vocab_list = data_generator.vocab_list
    ds2_model.init_ext_scorer(args.alpha, args.beta, args.lang_model_path,
                              vocab_list)
    return ds2_model, data_generator, vocab_list
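
# Minimal programmatic use (a sketch; assumes this module is importable as
# `inferBaidu` and that the default model paths above resolve):
#   import inferBaidu
#   inferBaidu.args.audio_path = '/path/to/sample.wav'  # hypothetical file
#   transcript = inferBaidu.main()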

def infer(ds2_model, data_generator, vocab_list):
    """Inference for DeepSpeech2."""
    # Prepare the manifest: either a one-line manifest for a single audio
    # file, or a full manifest generated from the target directory.
    if args.audio_path:
        json_lines = []
        audio_data, samplerate = soundfile.read(args.audio_path)
        duration = float(len(audio_data)) / samplerate
        json_lines.append(
            json.dumps({
                'audio_filepath': args.audio_path,
                'duration': duration,
                'text': 'NO TRANSCRIPT'
            }))
        with codecs.open(args.infer_manifest, 'w', 'utf-8') as out_file:
            for line in json_lines:
                out_file.write(line + '\n')
    else:
        prepare_manifest()
    # Load audio
    batch_reader = data_generator.batch_reader_creator(
        manifest_path=args.infer_manifest,
        batch_size=args.num_samples,
        sortagrad=False,
        shuffle_method=None)
    # Each batch is (padded_audios, texts, audio_lens, masks, audio_file_path).
    error_rate_func = cer if args.error_rate_type == 'cer' else wer
    error_arr = []
    wer_arr = []
    ds2_model.logger.info("\nEverything Prepared .. Starting inference ...\n")
    for infer_data in batch_reader():
        probs_split = ds2_model.infer_batch_probs(
            infer_data=infer_data,
            feeding_dict=data_generator.feeding)
        result_transcripts = ds2_model.decode_batch_beam_search(
            probs_split=probs_split,
            beam_alpha=args.alpha,
            beam_beta=args.beta,
            beam_size=args.beam_size,
            cutoff_prob=args.cutoff_prob,
            cutoff_top_n=args.cutoff_top_n,
            vocab_list=vocab_list,
            num_processes=args.num_proc_bsearch)
        target_transcripts = infer_data[1]
        audio_file_paths = infer_data[4]
        json_lines = []
        print("Writing Results on TRANSCRIPTION.json ...")
        for target, result, audio_file_path in zip(
                target_transcripts, result_transcripts, audio_file_paths):
            target = target.replace("’", "'")
            erroris = error_rate_func(target, result)
            json_lines.append(
                json.dumps({
                    'Audio file path': audio_file_path,
                    'Target Transcription': target,
                    'Output Transcription': result,
                    'The {}'.format(args.error_rate_type): erroris,
                }, indent=4, ensure_ascii=False, sort_keys=True))
            error_arr.append(erroris)
        wer_arr = np.array(error_arr)
        print("Current Error Rate is :", np.average(wer_arr))
        with codecs.open('TRANSCRIPTION.json', 'a+', 'utf-8') as out_file:
            for line in json_lines:
                out_file.write(line + '\n')
    with codecs.open('TRANSCRIPTION.json', 'a+', 'utf-8') as out_file:
        out_file.write(
            "Average Error Rate is : " + str(np.average(wer_arr)) + '\n')
    ds2_model.logger.info("Finished Inference.")
    if args.audio_path:
        return result_transcripts.pop()
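
# Note: TRANSCRIPTION.json is opened in append mode and accumulates one
# pretty-printed JSON object per utterance plus a trailing summary line, so
# it is a log of JSON fragments rather than a single parseable JSON document.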


def main():
    global ds2_model
    global data_generator
    global vocab_list
    print_arguments(args)
    # Load the model only on the first call; later calls reuse the cache.
    if not ds2_model:
        print("\nModel Loading Initiated ...")
        ds2_model, data_generator, vocab_list = load_model()
        print("\nModel Loaded Successfully ...\n")
    tic = time.time()
    result_transcripts = infer(ds2_model, data_generator, vocab_list)
    toc = time.time()
    # Parenthesized so the full elapsed time is converted to minutes.
    print("{} Mins Required For Transcription".format((toc - tic) / 60))
    return result_transcripts


if __name__ == '__main__':
    print(main())