Skip to content

Commit

Permalink
Added training and small optimizations
Browse files Browse the repository at this point in the history
  • Loading branch information
sxdxfan committed Mar 15, 2021
1 parent 993170a commit 02f6427
Show file tree
Hide file tree
Showing 11 changed files with 401 additions and 108 deletions.
5 changes: 4 additions & 1 deletion .dockerignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
Data/
Records/
__pycache__/
Checkpoints/
__pycache__/
.idea/
.DS_Store
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
Data/
Records/
__pycache__/
Checkpoints/
__pycache__/
.idea/
.DS_Store
137 changes: 122 additions & 15 deletions DataLoader.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,27 +21,134 @@ def load_audio(path, sample_rate):
return np.array(sound.get_array_of_samples()).astype(float)


def preprocess(audio_path, sample_rate=16000, window_size=0.02, window_stride=0.01, window='hamming'):
    """Convert an audio file into a mean/variance-normalized PCEN matrix.

    The samples returned by ``load_audio`` are passed through ``stft`` with a
    window of ``window_size`` seconds and a hop of ``window_stride`` seconds,
    reduced to magnitudes with ``magphase`` and then fed to ``pcen2``.  The
    result is shifted by its global mean and divided by its global standard
    deviation.

    Args:
        audio_path: path handed to ``load_audio``.
        sample_rate: sampling rate used for loading and for sizing the window.
        window_size: analysis window length in seconds.
        window_stride: hop between windows in seconds.
        window: window specification forwarded to ``stft``.

    Returns:
        The normalized output of ``pcen2``.
    """
    samples = load_audio(audio_path, sample_rate)
    n_fft = int(sample_rate * window_size)
    hop = int(sample_rate * window_stride)

    # The window length equals the FFT size, so frames are not zero-padded.
    spectrum = stft(samples, n_fft=n_fft, hop_length=hop, win_length=n_fft, window=window)
    magnitude, _ = magphase(spectrum)

    features = pcen2(e=magnitude, sr=sample_rate, hop_length=hop)
    mean = features.mean()
    # NOTE(review): a constant feature matrix has zero std and would divide
    # by zero here -- confirm inputs are never fully silent.
    std = features.std()

    return np.subtract(features, mean) / std


def get_batch(batch):
    """Collate dataset samples into padded arrays for one mini-batch.

    Args:
        batch: non-empty sequence of samples.  Each sample is indexable as
            ``(features, target, path, transcription)`` where ``features`` is
            a 2-D array whose second axis is the sequence length and
            ``target`` is a sequence of label indices.

    Returns:
        A tuple ``(inputs, input_percentages, targets, target_sizes, meta)``:

        * ``inputs`` -- float array ``(len(batch), freq, max_seq_length)``;
          shorter samples are extended along the last axis with numpy's
          ``'wrap'`` padding (the sample repeats from its start).
        * ``input_percentages`` -- each sample's length divided by the
          longest length in the batch.
        * ``targets`` -- all targets concatenated into one 1-D array.
        * ``target_sizes`` -- length of each sample's target.
        * ``meta`` -- ``[path, transcription]`` for each sample.

    Raises:
        ValueError: if ``batch`` is empty.
    """
    if len(batch) == 0:
        # Previously surfaced as an opaque "max() arg is an empty sequence".
        raise ValueError("get_batch() requires at least one sample")

    longest_sample = max(batch, key=lambda sample: sample[0].shape[1])[0]
    freq_size = longest_sample.shape[0]
    max_seq_length = longest_sample.shape[1]
    mini_batch_size = len(batch)

    inputs = np.zeros((mini_batch_size, freq_size, max_seq_length))
    target_sizes = np.zeros(shape=(mini_batch_size,), dtype=int)
    input_percentages = np.zeros(shape=(mini_batch_size,), dtype=float)
    targets = []
    input_file_path_and_transcription = []

    for row, sample in enumerate(batch):
        tensor, target = sample[0], sample[1]
        tensor_path, original_transcription = sample[2], sample[3]
        seq_length = tensor.shape[1]
        # max_seq_length is the batch maximum, so the pad width is never negative.
        pad_width = ((0, 0), (0, max_seq_length - seq_length))
        inputs[row] = np.pad(tensor, pad_width, 'wrap')
        input_percentages[row] = seq_length / float(max_seq_length)
        target_sizes[row] = len(target)
        targets.extend(target)
        input_file_path_and_transcription.append([tensor_path, original_transcription])

    # np.array([]) would silently become float64; keep an integer dtype when
    # every transcript in the batch is empty.
    targets = np.array(targets) if targets else np.zeros((0,), dtype=int)

    return inputs, input_percentages, targets, target_sizes, input_file_path_and_transcription


class DataLoader(object):
def __init__(self, sample_rate=16000, window_size=0.02, window_stride=0.01):
def __init__(self, dataset, batch_sampler):
self.dataset = dataset
self.batch_sampler = batch_sampler
self.sample_iter = iter(self.batch_sampler)

def __next__(self):
try:
indices = next(self.sample_iter)
indices = [i for i in indices][0]
batch = get_batch([self.dataset[i] for i in indices])
return batch
except Exception as e:
print("Encountered exception {}".format(e))
raise StopIteration()

def __iter__(self):
    """Return this loader itself; batches are produced by ``__next__``."""
    return self

def __len__(self):
    """Return ``len(self.batch_sampler)``."""
    return len(self.batch_sampler)

def reset(self):
    """Delegate to ``self.batch_sampler.reset()``."""
    # NOTE(review): ``self.sample_iter`` is not re-created here, so rewinding
    # relies on the stored iterator observing the sampler's reset -- confirm
    # for any sampler other than one whose ``__iter__`` returns itself.
    self.batch_sampler.reset()


class SpectrogramDataset(object):
def __init__(self, labels, sample_rate, window_size, window_stride, manifest_file_path):
self.manifest_file_path = manifest_file_path
with open(self.manifest_file_path) as f:
lines = f.readlines()
self.ids = [x.strip().split(',') for x in lines]
self.size = len(lines)
self.labels_map = dict([(labels[i], i) for i in range(len(labels))])
self.sample_rate = sample_rate
self.window_size = window_size
self.window_stride = window_stride

def preprocess(self, audio_path, window='hamming'):
audio = load_audio(audio_path, self.sample_rate)
nfft = int(self.sample_rate * self.window_size)
win_length = nfft
hop_length = int(self.sample_rate * self.window_stride)
def __getitem__(self, index):
    """Build the training sample for manifest record ``index``.

    Returns:
        ``(spectrogram, transcript, audio_path, transcript_loaded)`` where
        ``spectrogram`` is the result of ``preprocess`` for the record's
        audio path, ``transcript`` is the list of label indices for the
        record's text and ``transcript_loaded`` is that text unchanged.
    """
    record = self.ids[index]
    audio_path = record[0]
    transcript_loaded = record[1]
    spectrogram = preprocess(audio_path, self.sample_rate, self.window_size, self.window_stride)
    # Only truthy lookups survive: characters missing from ``labels_map``
    # (``None``) are dropped, and so is label index 0.
    transcript = [code for code in map(self.labels_map.get, transcript_loaded) if code]
    return spectrogram, transcript, audio_path, transcript_loaded

def __len__(self):
    """Return the record count stored in ``self.size``."""
    return self.size


class BucketingSampler(object):
def __init__(self, data_source, batch_size=1, shuffle=False):
self.data_source = data_source
self.batch_size = batch_size
self.ids = list(range(0, len(data_source)))
self.batch_id = 0
self.bins = []
self.shuffle = shuffle
self.reset()

def __iter__(self):
    """Return the sampler itself as its own iterator."""
    return self

d = stft(audio, n_fft=nfft, hop_length=hop_length,
win_length=win_length, window=window)
def __next__(self):
    """Produce the index list of the next bin and advance the cursor.

    NOTE(review): because the body contains ``yield`` this is a generator
    function: calling it returns a generator object, and the code below
    runs only when that generator is iterated.  ``DataLoader.__next__``
    unwraps the result with ``[i for i in indices][0]``.  Confirm this
    indirection is intended before changing either side.
    """
    if self.batch_id < len(self):
        ids = self.bins[self.batch_id]
        self.batch_id += 1
        yield ids
    else:
        # Raised inside the generator body, not by the ``__next__`` call itself.
        raise StopIteration()

spect, phase = magphase(d)
pcen_result = pcen2(e=spect, sr=self.sample_rate, hop_length=hop_length)
mean_pcen = pcen_result.mean()
std_pcen = pcen_result.std()
def __len__(self):
    """Return the number of bins currently held in ``self.bins``."""
    return len(self.bins)

pcen_result = np.add(pcen_result, -mean_pcen)
pcen_result = pcen_result / std_pcen
def get_bins(self):
if self.shuffle:
np.random.shuffle(self.ids)
self.bins = [self.ids[i:i + self.batch_size] for i in range(0, len(self.ids), self.batch_size)]

return pcen_result
def reset(self):
    """Rebuild the bins via ``get_bins`` and rewind the cursor to the first bin."""
    self.get_bins()
    self.batch_id = 0
4 changes: 3 additions & 1 deletion Decoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,11 @@ def __init__(self, labels, blank=0):
self.labels = labels
self.blank = blank

def decode(self, output):
def decode(self, output, max_len=None):
    """Greedily decode a score matrix into a string.

    The most likely label is picked for every row of ``output``, runs of
    identical consecutive labels are collapsed to one, and occurrences of
    ``self.blank`` are removed before the labels are mapped through
    ``self.labels``.

    Args:
        output: 2-D array of scores, one row per step and one column per label.
        max_len: when given, only the first ``max_len`` steps are decoded.

    Returns:
        The decoded text.
    """
    probabilities = softmax(output.astype(np.float32), axis=-1)
    frame_labels = np.argmax(probabilities, axis=1)
    if max_len is not None:
        frame_labels = frame_labels[:max_len]
    collapsed = (label for label, _ in itertools.groupby(frame_labels))
    return "".join(self.labels[label] for label in collapsed if label != self.blank)


Expand Down
34 changes: 4 additions & 30 deletions Dockerfile.cpu
Original file line number Diff line number Diff line change
Expand Up @@ -3,45 +3,19 @@ FROM ubuntu:18.04
ARG DEBIAN_FRONTEND=noninteractive

RUN apt-get update && apt-get upgrade -y && apt-get autoremove && apt-get autoclean
RUN apt-get install -y \
vim \
libreadline-gplv2-dev \
libncursesw5-dev \
libssl-dev \
libsqlite3-dev \
tk-dev \
libgdbm-dev \
libc6-dev \
libbz2-dev \
python3-setuptools \
libboost-all-dev \
build-essential \
checkinstall \
libc-dev \
python3-pkg-resources \
python-setuptools-doc \
python3-pip \
libglib2.0-0 \
libsm6 \
libxext6 \
libxrender-dev \
git \
ffmpeg \
apt-utils
RUN apt-get install -y python3-dev python3-pip ffmpeg

ARG PROJECT=sova-asr
ARG PROJECT_DIR=/${PROJECT}
ARG PROJECT_DIR=/$PROJECT
RUN mkdir -p $PROJECT_DIR
WORKDIR $PROJECT_DIR

COPY requirements.txt .
RUN pip3 install --upgrade pip
RUN pip3 install -r requirements.txt
RUN pip3 install PuzzleLib
RUN rm -rf $PROJECT_DIR/*

RUN apt-get install -y locales && locale-gen en_US.UTF-8
ENV LANG en_US.UTF-8
ENV LANGUAGE en_US:en
ENV LC_ALL en_US.UTF-8

CMD gunicorn --access-logfile - -w 1 --bind 0.0.0.0:8888 app:app --timeout 15000
ENV LC_ALL en_US.UTF-8
36 changes: 5 additions & 31 deletions Dockerfile.gpu
Original file line number Diff line number Diff line change
Expand Up @@ -3,47 +3,21 @@ FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04
ARG DEBIAN_FRONTEND=noninteractive

RUN apt-get update && apt-get upgrade -y && apt-get autoremove && apt-get autoclean
RUN apt-get install -y \
vim \
libreadline-gplv2-dev \
libncursesw5-dev \
libssl-dev \
libsqlite3-dev \
tk-dev \
libgdbm-dev \
libc6-dev \
libbz2-dev \
python3-setuptools \
libboost-all-dev \
build-essential \
checkinstall \
libc-dev \
python3-pkg-resources \
python-setuptools-doc \
python3-pip \
libglib2.0-0 \
libsm6 \
libxext6 \
libxrender-dev \
git \
ffmpeg \
apt-utils
RUN apt-get install -y python3-dev python3-pip ffmpeg

ARG PROJECT=sova-asr-gpu
ARG PROJECT_DIR=/sova-asr
ARG PROJECT=sova-asr
ARG PROJECT_DIR=/$PROJECT
RUN mkdir -p $PROJECT_DIR
WORKDIR $PROJECT_DIR

COPY requirements.txt .
RUN pip3 install --upgrade pip
RUN pip3 install -r requirements.txt
RUN ln -s /usr/local/cuda/targets/x86_64-linux/lib/ /usr/local/cuda/lib64/
RUN pip3 install colorama
RUN pip3 install PuzzleLib --install-option="--backend=cuda"
RUN rm -rf $PROJECT_DIR/*

RUN apt-get install -y locales && locale-gen en_US.UTF-8
ENV LANG en_US.UTF-8
ENV LANGUAGE en_US:en
ENV LC_ALL en_US.UTF-8

CMD gunicorn --access-logfile - -w 1 --bind 0.0.0.0:8888 app:app --timeout 15000
ENV LC_ALL en_US.UTF-8
10 changes: 2 additions & 8 deletions SpeechRecognizer.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import numpy as np
import argparse
import configparser
from DataLoader import DataLoader
from DataLoader import preprocess
from Decoder import GreedyDecoder


Expand All @@ -25,12 +25,6 @@ def __init__(self, config_path='config.ini'):
from PuzzleLib.Models.Nets.WaveToLetter import loadW2L
from PuzzleLib.Modules import MoveAxis

self.data_loader = DataLoader(
sample_rate=self.sample_rate,
window_size=self.window_size,
window_stride=self.window_stride
)

nfft = int(self.sample_rate * self.window_size)
self.w2l = loadW2L(modelpath=self.config['Wav2Letter']['model_path'], inmaps=(1 + nfft // 2),
nlabels=len(self.labels))
Expand All @@ -52,7 +46,7 @@ def __init__(self, config_path='config.ini'):
self.decoder = GreedyDecoder(self.labels)

def recognize(self, audio_path):
preprocessed_audio = self.data_loader.preprocess(audio_path)
preprocessed_audio = preprocess(audio_path, self.sample_rate, self.window_size, self.window_stride)
if self.cpu:
from PuzzleLib.CPU.CPUArray import CPUArray
inputs = CPUArray.toDevice(np.array([preprocessed_audio]).astype(np.float32))
Expand Down
Loading

0 comments on commit 02f6427

Please sign in to comment.