Added training and small optimizations

sovaai · Mar 15, 2021 · 02f6427 · 02f6427
1 parent 993170a
commit 02f6427
Show file tree

Hide file tree

Showing 11 changed files with 401 additions and 108 deletions.
diff --git a/.dockerignore b/.dockerignore
@@ -1,3 +1,6 @@
 Data/
 Records/
-__pycache__/
+Checkpoints/
+__pycache__/
+.idea/
+.DS_Store
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,6 @@
 Data/
 Records/
-__pycache__/
+Checkpoints/
+__pycache__/
+.idea/
+.DS_Store
diff --git a/DataLoader.py b/DataLoader.py
@@ -21,27 +21,134 @@ def load_audio(path, sample_rate):
     return np.array(sound.get_array_of_samples()).astype(float)
 
 
+def preprocess(audio_path, sample_rate=16000, window_size=0.02, window_stride=0.01, window='hamming'):
+    """Load an audio file and return its standardized PCEN features.
+
+    The samples returned by ``load_audio`` are passed to ``stft`` with a frame
+    of ``int(sample_rate * window_size)`` samples (used both as FFT size and
+    window length) and a hop of ``int(sample_rate * window_stride)`` samples.
+    The first output of ``magphase`` is fed to ``pcen2``; the result is then
+    shifted by its global mean and divided by its global standard deviation.
+
+    NOTE(review): if the PCEN output is constant its std is 0 and the final
+    division is by zero - confirm whether such inputs can occur.
+    """
+    samples = load_audio(audio_path, sample_rate)
+    frame_length = int(sample_rate * window_size)
+    hop_length = int(sample_rate * window_stride)
+
+    spectrum = stft(samples, n_fft=frame_length, hop_length=hop_length,
+                    win_length=frame_length, window=window)
+    spect, _ = magphase(spectrum)
+
+    features = pcen2(e=spect, sr=sample_rate, hop_length=hop_length)
+    mean_pcen = features.mean()
+    std_pcen = features.std()
+
+    # Standardize with statistics computed over the whole feature array.
+    centered = np.add(features, -mean_pcen)
+    return centered / std_pcen
+
+
+def get_batch(batch):
+    """Collate a list of samples into the arrays used for one batch.
+
+    Each sample is indexed as ``(features, target, path, transcription)``,
+    where ``features`` is a 2-D array. Axis 1 of every sample is padded on the
+    right, in numpy's ``'wrap'`` mode, up to the longest sample in the batch.
+
+    Returns a 5-tuple:
+        inputs: float array of shape (len(batch), features.shape[0], longest).
+        input_percentages: each sample's axis-1 length divided by the longest.
+        targets: 1-D array with all targets concatenated in batch order.
+        target_sizes: int array with the length of each sample's target.
+        input_file_path_and_transcription: list of ``[path, transcription]``.
+    """
+    widest = max(batch, key=lambda item: item[0].shape[1])[0]
+    freq_size, max_seq_length = widest.shape[0], widest.shape[1]
+    batch_len = len(batch)
+
+    inputs = np.zeros((batch_len, freq_size, max_seq_length))
+    input_percentages = np.zeros(shape=(batch_len,), dtype=float)
+    target_sizes = np.zeros(shape=(batch_len,), dtype=int)
+    targets = []
+    input_file_path_and_transcription = []
+
+    for row, sample in enumerate(batch):
+        features, target = sample[0], sample[1]
+        path, transcription = sample[2], sample[3]
+        seq_length = features.shape[1]
+        # Shorter samples are filled by wrapping around to their own start.
+        missing = abs(seq_length - max_seq_length)
+        inputs[row] = np.pad(features, ((0, 0), (0, missing)), 'wrap')
+        input_percentages[row] = seq_length / float(max_seq_length)
+        target_sizes[row] = len(target)
+        targets.extend(target)
+        input_file_path_and_transcription.append([path, transcription])
+
+    return inputs, input_percentages, np.array(targets), target_sizes, input_file_path_and_transcription
+
+
 class DataLoader(object):
-    def __init__(self, sample_rate=16000, window_size=0.02, window_stride=0.01):
+    def __init__(self, dataset, batch_sampler):
+        """Bind a dataset to a batch sampler.
+
+        dataset: indexable object; ``dataset[i]`` is passed to ``get_batch``.
+        batch_sampler: iterable of index batches; ``__len__`` and ``reset``
+            delegate to it.
+        """
+        self.dataset = dataset
+        self.batch_sampler = batch_sampler
+        # One iterator is created here and reused for every __next__ call.
+        self.sample_iter = iter(self.batch_sampler)
+
+    def __next__(self):
+        """Return the next collated batch produced by ``get_batch``.
+
+        Raises:
+            StopIteration: when the batch sampler has no more index batches.
+
+        Errors raised while loading or collating samples propagate to the
+        caller instead of being reported as end-of-data.
+        """
+        pending = next(self.sample_iter)
+        try:
+            # The sampler returns an iterable whose first item is the list of
+            # dataset indices (BucketingSampler.__next__ is a generator function).
+            indices = next(iter(pending))
+        except (StopIteration, RuntimeError):
+            # Exhaustion surfaces here: a StopIteration raised inside a generator
+            # arrives as StopIteration, or as RuntimeError on Python 3.7+ (PEP 479).
+            raise StopIteration() from None
+        return get_batch([self.dataset[i] for i in indices])
+
+    def __iter__(self):
+        """Return self; the loader is its own iterator."""
+        return self
+
+    def __len__(self):
+        """Return the number of batches reported by the batch sampler."""
+        return len(self.batch_sampler)
+
+    def reset(self):
+        """Reset the batch sampler by delegating to ``batch_sampler.reset()``.
+
+        ``self.sample_iter`` is not re-created here, so this only restarts
+        iteration when ``iter(batch_sampler)`` is the sampler itself, as it is
+        for ``BucketingSampler`` in this file.
+        """
+        self.batch_sampler.reset()
+
+
+class SpectrogramDataset(object):
+    def __init__(self, labels, sample_rate, window_size, window_stride, manifest_file_path):
+        """Read the manifest and store the feature-extraction settings.
+
+        labels: sequence of label symbols; ``labels_map`` maps each symbol to
+            its position in this sequence.
+        sample_rate, window_size, window_stride: stored and later passed to
+            ``preprocess`` by ``__getitem__``.
+        manifest_file_path: text file with one comma-separated sample per line;
+            ``__getitem__`` reads field 0 as the audio path and field 1 as the
+            transcript.
+        """
+        self.manifest_file_path = manifest_file_path
+        with open(self.manifest_file_path) as f:
+            lines = f.readlines()
+        # NOTE(review): a plain split(',') also splits transcripts that contain
+        # commas, and a blank manifest line yields a one-field entry - confirm
+        # the manifest format rules these out.
+        self.ids = [x.strip().split(',') for x in lines]
+        self.size = len(lines)
+        self.labels_map = dict([(labels[i], i) for i in range(len(labels))])
         self.sample_rate = sample_rate
         self.window_size = window_size
         self.window_stride = window_stride
 
-    def preprocess(self, audio_path, window='hamming'):
-        audio = load_audio(audio_path, self.sample_rate)
-        nfft = int(self.sample_rate * self.window_size)
-        win_length = nfft
-        hop_length = int(self.sample_rate * self.window_stride)
+    def __getitem__(self, index):
+        """Return one sample as ``(spectrogram, transcript, audio_path, transcript_loaded)``.
+
+        ``spectrogram`` is the output of ``preprocess`` for the manifest row's
+        audio path, ``transcript`` is the list of label ids for the row's text,
+        and ``transcript_loaded`` is that text as read from the manifest.
+        """
+        entry = self.ids[index]
+        audio_path = entry[0]
+        transcript_loaded = entry[1]
+        spectrogram = preprocess(audio_path, self.sample_rate, self.window_size, self.window_stride)
+        # Only truthy ids are kept: characters missing from labels_map (None)
+        # and label id 0 are both dropped.
+        label_ids = (self.labels_map.get(char) for char in transcript_loaded)
+        transcript = [label_id for label_id in label_ids if label_id]
+        return spectrogram, transcript, audio_path, transcript_loaded
+
+    def __len__(self):
+        """Return the number of lines read from the manifest."""
+        return self.size
+
+
+class BucketingSampler(object):
+    def __init__(self, data_source, batch_size=1, shuffle=False):
+        """Set up the sampler and build the first set of bins.
+
+        data_source: sized object; indices ``0 .. len(data_source) - 1`` are sampled.
+        batch_size: number of indices per bin (the last bin may be shorter).
+        shuffle: when true, the indices are shuffled each time bins are rebuilt.
+        """
+        self.data_source = data_source
+        self.batch_size = batch_size
+        self.ids = list(range(0, len(data_source)))
+        self.batch_id = 0
+        self.bins = []
+        self.shuffle = shuffle
+        # reset() fills self.bins and rewinds batch_id.
+        self.reset()
+
+    def __iter__(self):
+        """Return self; the sampler is its own iterator."""
+        return self
 
-        d = stft(audio, n_fft=nfft, hop_length=hop_length,
-                 win_length=win_length, window=window)
+    def __next__(self):
+        """Produce the next bin of dataset indices, wrapped in a generator."""
+        # NOTE(review): the `yield` below makes this a generator function, so
+        # next(sampler) returns a fresh generator object instead of the id list
+        # and never raises StopIteration itself. The ids, the batch_id increment
+        # and the StopIteration of the else branch (a RuntimeError on Python
+        # 3.7+, PEP 479) only happen when that generator is advanced.
+        # DataLoader.__next__ takes the first item of the returned generator, so
+        # changing this to `return ids` needs a matching change there.
+        if self.batch_id < len(self):
+            ids = self.bins[self.batch_id]
+            self.batch_id += 1
+            yield ids
+        else:
+            raise StopIteration()
 
-        spect, phase = magphase(d)
-        pcen_result = pcen2(e=spect, sr=self.sample_rate, hop_length=hop_length)
-        mean_pcen = pcen_result.mean()
-        std_pcen = pcen_result.std()
+    def __len__(self):
+        """Return the number of bins (batches) currently built."""
+        return len(self.bins)
 
-        pcen_result = np.add(pcen_result, -mean_pcen)
-        pcen_result = pcen_result / std_pcen
+    def get_bins(self):
+        """Rebuild ``self.bins`` as consecutive slices of ``self.ids``.
+
+        When ``self.shuffle`` is set, ``self.ids`` is shuffled in place first.
+        Every bin holds ``self.batch_size`` ids except possibly the last one.
+        """
+        if self.shuffle:
+            np.random.shuffle(self.ids)
+        step = self.batch_size
+        starts = range(0, len(self.ids), step)
+        self.bins = [self.ids[start:start + step] for start in starts]
 
-        return pcen_result
+    def reset(self):
+        """Rebuild the bins (reshuffling if enabled) and rewind to the first one."""
+        self.get_bins()
+        self.batch_id = 0
diff --git a/Decoder.py b/Decoder.py
@@ -10,9 +10,11 @@ def __init__(self, labels, blank=0):
         self.labels = labels
         self.blank = blank
 
-    def decode(self, output):
+    def decode(self, output, max_len=None):
+        """Greedy-decode ``output`` into a string.
+
+        output: array of per-step label scores; softmax is applied over the
+            last axis and the best label is taken along axis 1 (assumes a 2-D
+            (steps, labels) array - TODO confirm against callers).
+        max_len: if given, only the first ``max_len`` steps of the best path
+            are decoded.
+
+        Consecutive repeats of a label are collapsed and ``self.blank`` is
+        dropped before the remaining labels are joined.
+        """
         output = softmax(output.astype(np.float32), axis=-1)
         best_path = np.argmax(output, axis=1)
+        if max_len is not None:
+            best_path = best_path[:max_len]
         return "".join(self.labels[k] for k, _ in itertools.groupby(best_path) if k != self.blank)
 
 

diff --git a/Dockerfile.cpu b/Dockerfile.cpu
@@ -3,45 +3,19 @@ FROM ubuntu:18.04
 ARG DEBIAN_FRONTEND=noninteractive
 
 RUN apt-get update && apt-get upgrade -y && apt-get autoremove && apt-get autoclean
-RUN apt-get install -y \
-    vim \
-    libreadline-gplv2-dev \
-    libncursesw5-dev \
-    libssl-dev \
-    libsqlite3-dev \
-    tk-dev \ 
-    libgdbm-dev \
-    libc6-dev \
-    libbz2-dev \
-    python3-setuptools \
-    libboost-all-dev \
-    build-essential \
-    checkinstall \
-    libc-dev \
-    python3-pkg-resources \ 
-    python-setuptools-doc \
-    python3-pip \ 
-    libglib2.0-0 \
-    libsm6 \
-    libxext6 \
-    libxrender-dev \
-    git \
-    ffmpeg \
-    apt-utils
+RUN apt-get install -y python3-dev python3-pip ffmpeg
 
 ARG PROJECT=sova-asr
-ARG PROJECT_DIR=/${PROJECT}
+ARG PROJECT_DIR=/$PROJECT
 RUN mkdir -p $PROJECT_DIR
 WORKDIR $PROJECT_DIR
 
 COPY requirements.txt .
 RUN pip3 install --upgrade pip
 RUN pip3 install -r requirements.txt
-RUN pip3 install PuzzleLib
 RUN rm -rf $PROJECT_DIR/*
+
 RUN apt-get install -y locales && locale-gen en_US.UTF-8
 ENV LANG en_US.UTF-8
 ENV LANGUAGE en_US:en
-ENV LC_ALL en_US.UTF-8
-
-CMD gunicorn --access-logfile - -w 1 --bind 0.0.0.0:8888 app:app --timeout 15000 
+ENV LC_ALL en_US.UTF-8
diff --git a/Dockerfile.gpu b/Dockerfile.gpu
@@ -3,47 +3,21 @@ FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04
 ARG DEBIAN_FRONTEND=noninteractive
 
 RUN apt-get update && apt-get upgrade -y && apt-get autoremove && apt-get autoclean
-RUN apt-get install -y \
-    vim \
-    libreadline-gplv2-dev \
-    libncursesw5-dev \
-    libssl-dev \
-    libsqlite3-dev \
-    tk-dev \ 
-    libgdbm-dev \
-    libc6-dev \
-    libbz2-dev \
-    python3-setuptools \
-    libboost-all-dev \
-    build-essential \
-    checkinstall \
-    libc-dev \
-    python3-pkg-resources \ 
-    python-setuptools-doc \
-    python3-pip \ 
-    libglib2.0-0 \
-    libsm6 \
-    libxext6 \
-    libxrender-dev \
-    git \
-    ffmpeg \
-    apt-utils
+RUN apt-get install -y python3-dev python3-pip ffmpeg
 
-ARG PROJECT=sova-asr-gpu
-ARG PROJECT_DIR=/sova-asr
+ARG PROJECT=sova-asr
+ARG PROJECT_DIR=/$PROJECT
 RUN mkdir -p $PROJECT_DIR
 WORKDIR $PROJECT_DIR
 
 COPY requirements.txt .
 RUN pip3 install --upgrade pip
 RUN pip3 install -r requirements.txt
 RUN ln -s /usr/local/cuda/targets/x86_64-linux/lib/ /usr/local/cuda/lib64/
-RUN pip3 install colorama
 RUN pip3 install PuzzleLib --install-option="--backend=cuda"
 RUN rm -rf $PROJECT_DIR/*
+
 RUN apt-get install -y locales && locale-gen en_US.UTF-8
 ENV LANG en_US.UTF-8
 ENV LANGUAGE en_US:en
-ENV LC_ALL en_US.UTF-8
-
-CMD gunicorn --access-logfile - -w 1 --bind 0.0.0.0:8888 app:app --timeout 15000 
+ENV LC_ALL en_US.UTF-8
diff --git a/SpeechRecognizer.py b/SpeechRecognizer.py
@@ -1,7 +1,7 @@
 import numpy as np
 import argparse
 import configparser
-from DataLoader import DataLoader
+from DataLoader import preprocess
 from Decoder import GreedyDecoder
 
 
@@ -25,12 +25,6 @@ def __init__(self, config_path='config.ini'):
         from PuzzleLib.Models.Nets.WaveToLetter import loadW2L
         from PuzzleLib.Modules import MoveAxis
 
-        self.data_loader = DataLoader(
-            sample_rate=self.sample_rate,
-            window_size=self.window_size,
-            window_stride=self.window_stride
-        )
-
         nfft = int(self.sample_rate * self.window_size)
         self.w2l = loadW2L(modelpath=self.config['Wav2Letter']['model_path'], inmaps=(1 + nfft // 2),
                            nlabels=len(self.labels))
@@ -52,7 +46,7 @@ def __init__(self, config_path='config.ini'):
             self.decoder = GreedyDecoder(self.labels)
 
     def recognize(self, audio_path):
-        preprocessed_audio = self.data_loader.preprocess(audio_path)
+        preprocessed_audio = preprocess(audio_path, self.sample_rate, self.window_size, self.window_stride)
         if self.cpu:
             from PuzzleLib.CPU.CPUArray import CPUArray
             inputs = CPUArray.toDevice(np.array([preprocessed_audio]).astype(np.float32))