-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgen_audio_features.py
147 lines (117 loc) · 5.88 KB
/
gen_audio_features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
"""Takes at the input audio files from dataset in .wav format
then applies voice activity detection (VAD) and finally extracts
features from cleared data and saves it in binary format .npy"""
#region Imports
import argparse
import datetime
from os import makedirs, sep, walk
from os.path import isdir, join

import numpy as np
import scipy.io.wavfile as wf

from feature_extractor import FeatureExtractor
from progress_bar import print_progress_bar
from voice_activity_detector import VoiceActivityDetector
#endregion
# Path separator for the running OS ('/' on Linux/macOS, '\\' on Windows).
# Using os.sep removes the need to hand-toggle this constant per platform;
# paths produced by os.walk/os.path.join already use this separator.
slash_sign = sep
def load_wave(path_to_file):
    """Read a .wav file and return (sample_rate, samples) as scipy does."""
    return wf.read(path_to_file)
def count_files_in_dataset_dir(dataset_dir, file_format):
    """Recursively count files under dataset_dir whose names end with file_format.

    Args:
        dataset_dir: root directory of the dataset tree to scan.
        file_format: filename suffix to match, e.g. '.wav'.

    Returns:
        int: number of matching files in the whole tree (0 for an empty
        or non-existent directory, since os.walk then yields nothing).
    """
    # Generator-based count replaces the index-loop anti-pattern
    # (for i in range(len(files))) with a single C-speed sum().
    return sum(
        1
        for _root, _dirs, files in walk(dataset_dir)
        for name in files
        if name.endswith(file_format)
    )
def gen_property_file(num_classes, num_coeffs, path_to_dataset):
    """Write '<path_to_dataset>_property.txt' containing 'num_classes,num_coeffs'.

    The property file records the number of speaker classes and the number
    of features per frame so downstream consumers can configure themselves.
    """
    property_path = path_to_dataset + "_property.txt"
    with open(property_path, "w") as prop_file:
        prop_file.write("{},{}".format(num_classes, num_coeffs))
    print('Created property file containing info about number of classes and features')
def parse_label_voxceleb(path_to_file):
    """Extract the integer speaker label from a VoxCeleb-style file path.

    The speaker directory is the third-from-last path component; its name
    is assumed to start with a 3-character prefix (e.g. 'id<digits>'),
    which is stripped before converting the remainder to int.
    """
    speaker_dir = path_to_file.split(slash_sign)[-3]
    return int(speaker_dir[3:])
def clear_audio_from_voice(wave_data, sample_rate, path_to_file):
    """Run voice activity detection and return the VAD-filtered audio.

    NOTE(review): despite the name, this appears to *keep* the detected
    voice activity (it returns detect_voice_activity()'s result) rather
    than remove it — confirm against VoiceActivityDetector.
    """
    vad = VoiceActivityDetector(wave_data=wave_data,
                                sample_rate=sample_rate,
                                save_visual_results=False,
                                file_name=path_to_file)
    return vad.detect_voice_activity()
def extract_mel_filterbank_energies(data, sample_rate, num_coeffs):
    """Compute log mel filterbank energies (num_coeffs per frame) for data."""
    extractor = FeatureExtractor(
        data_array=data,
        sample_rate=sample_rate,
        num_of_coeffs=num_coeffs,
    )
    return extractor.extract_log_mel_filterbank_energies()
def save_numpy_array(np_array, path_to_file, root_dir, postfix):
    """Save np_array as .npy, mirroring the .wav file's location inside a
    parallel directory tree named '<dataset>' + postfix.

    Args:
        np_array: array to persist (np.save appends '.npy' automatically).
        path_to_file: full path of the source file; the last 4 characters
            are stripped, so callers must pass a '.wav' path.
        root_dir: dataset root path; its last component is the directory
            name that gets the postfix appended.
        postfix: suffix for the mirrored tree, e.g. '_features'.
    """
    parts = path_to_file[:-4].split(slash_sign)  # drop the '.wav' extension
    dataset_name = root_dir.split(slash_sign)[-1]
    # Rename the dataset component so output lands in a sibling tree.
    name_idx = parts.index(dataset_name)
    parts[name_idx] = dataset_name + postfix
    # exist_ok=True replaces the isdir()+makedirs() pair, which was racy
    # (another process could create the dir between check and call).
    makedirs(slash_sign.join(parts[:-1]), exist_ok=True)
    np.save(slash_sign.join(parts), np_array)
def gen_audio_features(dataset_dir, num_coeffs, vad_only):
    """Process every .wav under dataset_dir: apply VAD, optionally extract
    log mel filterbank energies, and save results as .npy files.

    Takes at input the path of the root directory of the dataset (or a
    subset) — e.g. the train/test folder inside the main dataset directory.

    Args:
        dataset_dir: root directory of the (sub)dataset to process.
        num_coeffs: number of filterbank coefficients per frame.
        vad_only: if True, save VAD-cleaned audio without feature extraction.

    Side effects:
        * writes '<dataset_dir>_list.txt' mapping relative paths to labels
        * writes '<dataset_dir>_property.txt' via gen_property_file
        * saves one .npy per input file into a mirrored directory tree
    """
    print('Start loading and processing of dataset...')
    count_of_records = count_files_in_dataset_dir(dataset_dir, file_format='.wav')
    print('Number of files to process: ', count_of_records)
    labels = set()  # unique speaker ids; set gives O(1) membership vs np.append scan
    time_start = datetime.datetime.now()
    record_idx = 0
    print_progress_bar(iteration=record_idx, total=count_of_records,
                       prefix='{}/{}'.format(record_idx, count_of_records),
                       suffix='complete')
    root_parts = dataset_dir.split(slash_sign)  # hoisted loop invariant
    # 'with' guarantees the list file is closed even if processing raises.
    with open(dataset_dir + '_list.txt', "w") as dataset_list:
        for root, dirs, files in sorted(walk(dataset_dir)):
            for wave_file in files:
                if not wave_file.endswith('.wav'):
                    continue  # guard clause: skip non-audio files
                path_to_file = join(root, wave_file)
                sample_rate, wave_data = load_wave(path_to_file)
                cleared_audio = clear_audio_from_voice(wave_data, sample_rate, path_to_file)
                if vad_only:
                    save_numpy_array(cleared_audio, path_to_file,
                                     root_dir=dataset_dir, postfix='_silencecleared')
                else:
                    features = extract_mel_filterbank_energies(cleared_audio,
                                                               sample_rate, num_coeffs)
                    save_numpy_array(features, path_to_file,
                                     root_dir=dataset_dir, postfix='_features')
                label = parse_label_voxceleb(path_to_file)
                labels.add(label)
                # Relative path: drop every component that also appears in
                # dataset_dir (preserves the original filtering behaviour).
                relative_path_to_file = slash_sign.join(
                    [item for item in path_to_file.split(slash_sign)
                     if item not in root_parts])
                dataset_list.write('{} {}\n'.format(relative_path_to_file, label))
                record_idx += 1
                print_progress_bar(iteration=record_idx, total=count_of_records,
                                   prefix='{}/{}'.format(record_idx, count_of_records),
                                   suffix='complete')
    num_classes = len(labels)
    print('Dataset successfully processed and saved')
    gen_property_file(num_classes, num_coeffs, path_to_dataset=dataset_dir)
    time_end = datetime.datetime.now()
    print('Elapsed time: ', time_end-time_start)
def parse_arguments():
    """Parse command-line arguments for the feature-generation script.

    Returns:
        argparse.Namespace with dataset_dir (str, required),
        num_coeffs (int, default 40) and vad_only (bool flag).
    """
    parser = argparse.ArgumentParser(description="Performing speaker diarization")
    parser.add_argument('--dataset_dir', required=True, default='',
                        help="Path of directory containing audio dataset")
    parser.add_argument('--num_coeffs', type=int, default=40,
                        help="(Optional) Number of coefficients to extract from \
                        each audio frame (default is 40 in accordance with article)")
    parser.add_argument('--vad_only', action='store_true',
                        help="Whether to use voice activity detection only without \
                        feature extraction")
    args = parser.parse_args()
    return args
if __name__ == "__main__":
args = parse_arguments()
gen_audio_features(args.dataset_dir, int(args.num_coeffs), args.vad_only)