Preprocessing_breakdown.Rmd

---
jupyter:
  jupytext:
    text_representation:
      extension: .Rmd
      format_name: rmarkdown
      format_version: '1.2'
      jupytext_version: 1.11.2
  kernelspec:
    display_name: Python [conda env:deepal]
    language: python
    name: conda-env-deepal-py
---

### Preprocessing used in detection and partially in unsupervised feature learning

```{python}
import math
from multiprocessing import Lock
from typing import List

import matplotlib.pyplot as plt
# %matplotlib inline
import numpy as np
import pandas as pd
import soundfile as sf # load audio files
import torch
import torch.nn.functional as F
```

```{python}
""" 
Load audio file
called by class Dataset(AudioDataset) in audiodataset.py  
"""
def load_audio_file(file_name, sr=None, mono=True):
    """Read an audio file and return it as a ``(channels, frames)`` float tensor.

    Args:
        file_name: Audio file to read; forwarded unchanged to ``sf.read``.
        sr: Target sampling rate in Hz. ``None`` keeps the rate stored in the file.
        mono: If true, multi-channel audio is averaged down to a single channel.

    Returns:
        A float32 ``torch.Tensor`` of shape ``(channels, frames)``, i.e. the
        ``(c, n)`` layout the pre-emphasis and STFT steps expect.
    """
    # always_2d=True -> (frames, channels), even for mono files
    y, sr_orig = sf.read(file_name, always_2d=True, dtype="float32")
    if mono and y.ndim == 2 and y.shape[1] > 1:
        # stereo -> mono; keepdims keeps the (frames, 1) layout for consistent indexing
        y = np.mean(y, axis=1, keepdims=True)
    if sr is not None and sr != sr_orig:
        # resampy was used here without ever being imported (NameError). It is
        # imported lazily so the package is only required when resampling happens.
        import resampy

        y = resampy.resample(y, sr_orig, sr, axis=0, filter="kaiser_best")
    # transpose (frames, channels) -> (channels, frames)
    return torch.from_numpy(y).float().t()
```

```{python}
# Load the demo recording as a (1, frames) tensor; it is resampled only if the file is not already at 44.1 kHz.
chimp = load_audio_file('/home/am_deepal/Desktop/AugDemo/call-chimp-uta-phsm_00000_2012_1240_4000_6000.wav', sr=44100)
```

```{python}
"""Pre-Emphasize in order to raise higher frequencies and lower low frequencies."""
# high frequencies are more likely to be dampened during transmission
# thus have less intensity, compared to the lower frequency parts, e.g. F0, 1st harmonic, 2nd harmonic, etc..
class PreEmphasize(object):
    """First-order pre-emphasis filter: ``out[n] = y[n] - factor * y[n - 1]``.

    The first sample of every channel is passed through unchanged. Subtracting
    the scaled previous sample attenuates slowly varying (low-frequency)
    content relative to fast-changing (high-frequency) content.
    """

    def __init__(self, factor=0.97):
        # weight of the previous sample that is subtracted from the current one
        self.factor = factor

    def __call__(self, y):
        """Filter a ``(c, n)`` signal; raises ``ValueError`` for any other rank."""
        if y.dim() == 2:
            first_sample = y[:, 0].unsqueeze(dim=-1)  # (c, 1), has no predecessor
            current, previous = y[:, 1:], y[:, :-1]
            filtered = current - self.factor * previous
            return torch.cat((first_sample, filtered), dim=-1)
        raise ValueError(
            "PreEmphasize expects a 2 dimensional signal of size (c, n), "
            "but got size: {}.".format(y.size())
        )
```

```{python}
# Apply pre-emphasis with the default factor (0.97) to the loaded waveform.
transform = PreEmphasize()
pre_emph = transform(chimp)
```

```{python}
# STFT parameters shared by the spectrogram and plotting cells
n_fft = 4096  # FFT size in samples -> n_fft/2 + 1 = 2049 one-sided frequency bins
hop_length = 441  # samples between successive frames (441 / 44100 Hz = 10 ms)
sr = 44100  # sampling rate in Hz
```

```{python}
# spectrogram generation related starts here
"""Converts a given audio to a (power) spectrogram (amplitude not log scaled)."""
class Spectrogram(object):
    """Convert a waveform into a power spectrogram (linear, not log-scaled).

    Args:
        n_fft: FFT size; sets the frequency resolution (``n_fft/2 + 1`` bins).
        hop_length: Samples between successive frames; sets the time resolution.
        center: Forwarded to ``torch.stft``. When true the signal is padded on
            both sides so frame ``t`` is centred at sample ``t * hop_length``.
    """

    def __init__(self, n_fft, hop_length, center=True):
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.center = center
        # A Hann window tapers each frame, limiting leakage between frequency bins.
        self.window = torch.hann_window(self.n_fft)

    def __call__(self, y):
        """Return the power spectrogram of ``y``.

        Args:
            y: Signal of shape ``(c, n)``.

        Returns:
            Real tensor of shape ``(c, T, n_fft/2 + 1)`` (channels, frames, bins).

        Raises:
            ValueError: If ``y`` is not 2-dimensional.
        """
        if y.dim() != 2:
            raise ValueError(
                "Spectrogram expects a 2 dimensional signal of size (c, n), "
                "but got size: {}.".format(y.size())
            )
        # Keep the window on the same device as the signal.
        window = self.window.to(y.device)
        # return_complex must be given explicitly for real input on current
        # PyTorch; the old (..., 2) real/imag layout is deprecated.
        S = torch.stft(
            input=y,
            n_fft=self.n_fft,
            hop_length=self.hop_length,
            window=window,
            center=self.center,
            onesided=True,
            return_complex=True,
        ).transpose(1, 2)  # (c, F, T) -> (c, T, F)
        # |S|^2 = re^2 + im^2 is the power in each bin.
        power = S.real.pow(2) + S.imag.pow(2)
        # Windowing removes energy; dividing by sum(window^2) compensates. This
        # equals dividing the complex STFT by sqrt(sum(window^2)) before squaring.
        return power / window.pow(2).sum()
```

```{python}
# center=False: no padding is added, so frame i starts at sample i * hop_length
spec = Spectrogram(n_fft, hop_length, center=False)
stft = spec(pre_emph)  # power spectrogram laid out as (channels, time frames, frequency bins)
```

```{python}
# Inspect the spectrogram dimensions: (channels, time frames, frequency bins)
print(stft.shape)
```

```{python}
# Quick look at the raw power values: rows are time frames, columns are frequency bins
plt.imshow(stft.squeeze())
```

```{python}
# prepare axis: one coordinate per spectrogram cell on BOTH axes.
# The time axis previously had T + 1 entries while the frequency axis had F,
# a mix that pcolormesh rejects (it needs either cell centres on both axes or
# cell edges on both axes).
f = np.linspace(0, sr/2, num=stft.shape[-1])
t = np.arange(stft.shape[-2]) * hop_length / sr

plt.figure(1)
plt.pcolormesh(f, t, stft.squeeze(), shading='auto')
plt.xlabel("freq (Hz)")
plt.ylabel("time(s)")
plt.title("Power Spectrogram")
plt.savefig("/home/am_deepal/Desktop/AugDemo/PowerSpectrogram02.png")
plt.show()
```

### Augmentation


#### Random Pitch Shift
#### Random Time Stretch
#### Random Amplitude
#### Add Noises


##### helper

```{python}
"""
Scaling spectrogram dimension (time/frequency) by a given factor.
Called by TimeStretch and PitchShift
"""
def _scale(spectrogram: torch.Tensor, shift_factor: float, dim: int):
    in_dim = spectrogram.dim()
    if in_dim < 3:
        raise ValueError(
            "Expected spectrogram with size (c t f) or (n c t f)"
            ", but got {}".format(spectrogram.size())
        )
    if in_dim == 3:
        spectrogram.unsqueeze_(dim=0) # F.interpolate(): needs (n c t f)
    size = list(spectrogram.shape)[2:]
    dim -= 1
    size[dim] = int(round(size[dim] * shift_factor)) # increase or decrease the amount of bins
    spectrogram = F.interpolate(spectrogram, size=size, mode="nearest") # scale with interpolation (next neighbor) to the new shape
    if in_dim == 3:
        spectrogram.squeeze_(dim=0) # reshaped to (c t f)
    return spectrogram
```

#### 1. Random Pitch Shift

```{python}
"""
Randomly shifts the pitch of a spectrogram by a factor of 2**Uniform(log2(from), log2(to)) (small pitch shift, not drastic).
Human perceive pitch logarithmically, in log2 (https://www.reddit.com/r/musictheory/comments/360ner/why_do_humans_perceive_sound_in_log_base_2/)
PitchShift: change how fast the original cycle is being played back, without changing duration 
            the harmonic structure is still preserved (i.e. freq peaks are multiples of fundamental freq)
"""
class RandomPitchShift(object):
    """Shift the pitch of a spectrogram by a random factor.

    The factor is ``2 ** Uniform(log2(from_), log2(to_))``. The frequency axis
    is rescaled by that factor and then cropped, or padded with the input's
    median value, so the output keeps the input's shape.

    Args:
        from_: Smallest scaling factor.
        to_: Largest scaling factor.
    """

    def __init__(self, from_=0.5, to_=1.5):
        self.from_ = math.log2(from_)  # log2(0.5) = -1
        self.to_ = math.log2(to_)  # log2(1.5) ~= 0.58

    def __call__(self, spectrogram: torch.Tensor):
        """Return a pitch-shifted spectrogram with the same shape as the input.

        Accepts ``(c, t, f)`` or ``(n, c, t, f)``; the last axis is frequency.
        """
        # .item() turns the one-element tensor into a plain Python float
        factor = 2 ** torch.empty((1,)).uniform_(self.from_, self.to_).item()
        median = spectrogram.median().item()
        size = list(spectrogram.shape)
        n_freq = size[-1]
        scaled = _scale(spectrogram, factor, dim=2)
        # Index the last axis with an ellipsis; a fixed [:, :, ...] would hit
        # the time axis for (n, c, t, f) input.
        if factor > 1:
            # more bins than before: keep only the original frequency range
            return scaled[..., :n_freq]
        # fewer bins than before: fill the missing upper band with the median
        out = torch.full(
            size, fill_value=median, dtype=spectrogram.dtype, device=spectrogram.device
        )
        out[..., : scaled.size(-1)] = scaled
        return out

```

```{python}
# Pitch-shift the spectrogram with the default factor range (0.5 to 1.5)
trans_pitch = RandomPitchShift()
shifted_pitch = trans_pitch(stft)
```

```{python}
# Shape of the pitch-shifted spectrogram
print(shifted_pitch.shape)
```

```{python}
# Quick look at the pitch-shifted power values
plt.imshow(shifted_pitch.squeeze())
```

```{python}
# prepare axis: one coordinate per spectrogram cell on BOTH axes.
# The time axis previously had T + 1 entries while the frequency axis had F,
# a mix that pcolormesh rejects.
f = np.linspace(0, sr/2, num=stft.shape[-1])
t = np.arange(stft.shape[-2]) * hop_length / sr

plt.figure(2)
plt.pcolormesh(f, t, shifted_pitch.squeeze(), shading='auto')
plt.xlabel("freq (Hz)")
plt.ylabel("time(s)")
plt.title("Random Pitch Shift on Power Spectrogram")
plt.savefig("/home/am_deepal/Desktop/AugDemo/Random_Pitch_Shift_PowerSpectrogram02.png")
plt.show()
```

#### 2. Random Time Stretch

```{python}
"""Randomly stretches the time of a spectrogram by a factor of 2**Uniform(log2(from), log2(to))."""
class RandomTimeStretch(object):
    """Stretch or compress the time axis of a spectrogram by a random factor.

    The factor is ``2 ** Uniform(log2(from_), log2(to_))``; the bounds are
    stored as base-2 exponents.
    """

    def __init__(self, from_=0.5, to_=2):
        self.from_, self.to_ = math.log2(from_), math.log2(to_)

    def __call__(self, spectrogram: torch.Tensor):
        """Return the spectrogram with its time axis rescaled by the drawn factor."""
        exponent = torch.empty((1,)).uniform_(self.from_, self.to_).item()
        stretch = 2 ** exponent
        return _scale(spectrogram, stretch, dim=1)  # dim=1 is time in (c t f)
```

```{python}
# Time-stretch the spectrogram with the default factor range (0.5 to 2)
trans_time = RandomTimeStretch()
shifted_time = trans_time(stft)
```

```{python}
# Shape after stretching; the time axis length changes with the drawn factor
shifted_time.shape
```

```{python}
# prepare axis: one coordinate per spectrogram cell on BOTH axes.
# The time axis previously had T + 1 entries while the frequency axis had F,
# a mix that pcolormesh rejects.
f = np.linspace(0, sr/2, num=stft.shape[-1])
t = np.arange(shifted_time.shape[-2]) * hop_length / sr

plt.figure(3)
plt.pcolormesh(f, t, shifted_time.squeeze(), shading='auto')
plt.xlabel("freq (Hz)")
plt.ylabel("time(s)")
plt.title("Random Time Stretch on Power Spectrogram")
plt.savefig("/home/am_deepal/Desktop/AugDemo/Random_Time_Stretch_PowerSpectrogram02.png")
plt.show()
```

#### 3. Random Amplitude

```{python}
"""
Randomly scaling (uniform distributed) the amplitude based on a given input spectrogram (intensity augmenation).
(color in spectrogram)
"""
class RandomAmplitude(object):
    """Scale a power spectrogram by a random whole number of decibels.

    Args:
        increase_db: Largest gain in dB.
        decrease_db: Largest attenuation in dB. ``None`` mirrors ``increase_db``;
            a positive value is negated, so ``3`` and ``-3`` mean the same.
    """

    def __init__(self, increase_db=3, decrease_db=None):
        self.inc_db = increase_db
        if decrease_db is None:
            decrease_db = -increase_db
        elif decrease_db > 0:
            decrease_db *= -1
        self.dec_db = decrease_db

    def __call__(self, spec):
        """Return ``spec`` multiplied by ``10 ** (dB / 10)`` for a random integer dB."""
        # torch.randint excludes its upper bound; + 1 makes increase_db reachable
        # and keeps equal bounds (e.g. 0 and 0) from raising.
        db_change = torch.randint(
            self.dec_db, self.inc_db + 1, size=(1,), dtype=torch.float
        ).item()
        # a plain Python float works whatever device spec lives on
        return spec.mul(10 ** (db_change / 10))
```

```{python}
# Apply a random gain with the default bound of 3 dB in either direction
trans_amp = RandomAmplitude()
rand_amp = trans_amp(stft)
```

```{python}
# Amplitude scaling is element-wise, so the shape matches the input spectrogram
rand_amp.shape
```

```{python}
# prepare axis: one coordinate per spectrogram cell on BOTH axes.
# The time axis previously had T + 1 entries while the frequency axis had F,
# a mix that pcolormesh rejects.
f = np.linspace(0, sr/2, num=stft.shape[-1])
t = np.arange(stft.shape[-2]) * hop_length / sr

plt.figure(4)
plt.pcolormesh(f, t, rand_amp.squeeze(), shading='auto')
plt.xlabel("freq (Hz)")
plt.ylabel("time(s)")
plt.title("Random Amplitude on Power Spectrogram")
plt.savefig("/home/am_deepal/Desktop/AugDemo/Random_Amp_PowerSpectrogram03.png")
plt.show()
```

#### 4. Add Noises

```{python}
""" 
Randomly adds a given noise file to the given spectrogram by considering a randomly selected
(uniform distributed) SNR of min = -3 dB and max = 12 dB. The noise file could also be intensity, pitch, and/or time
augmented. If a noise file is longer/shorter than the given spectrogram it will be subsampled/self-concatenated. 
The spectrogram is expected to be a power spectrogram, which is **not** logarithmically compressed (for calc SNR).
Perform also histogram equalizing so that it looks more like a spectrogram
"""
class RandomAddNoise(object):
    """Mix a randomly chosen noise spectrogram into a power spectrogram.

    The noise is scaled so that the mixture has a randomly drawn integer SNR
    (in dB) between the two configured bounds. Noise that is shorter than the
    input is tiled along the time axis; longer noise is randomly cropped.

    Args:
        noise_files: Non-empty list of noise file paths.
        spectrogram_transform: Callable mapping a noise file path to a spectrogram.
        transform: Callable applied to the noise spectrogram before mixing.
        min_length: If > 0, the input is padded via the sampler's ``_maybe_pad``.
        min_snr, max_snr: SNR bounds in dB; either order is accepted.
        return_original: If true, return ``(augmented, original)``.

    Raises:
        ValueError: If ``noise_files`` is empty.
    """

    def __init__(
        self,
        noise_files: List[str],
        spectrogram_transform,
        transform,
        min_length=0,
        min_snr=12,
        max_snr=-3,
        return_original=False,
    ):
        if not noise_files:
            raise ValueError("No noise files found")
        self.noise_files = noise_files
        self.t_spectrogram = spectrogram_transform
        # one lock per noise file, held while that file is being loaded
        self.noise_file_locks = {file: Lock() for file in noise_files}
        self.transform = transform
        self.min_length = min_length
        # NOTE(review): PaddedSubsequenceSampler is not defined or imported in
        # this notebook - it presumably comes from the project's transforms
        # module and must be in scope before instantiating this class.
        self.t_pad = PaddedSubsequenceSampler(sequence_length=min_length, dim=1)
        # Historical naming: min_snr ends up holding the LARGER bound and
        # max_snr the SMALLER one, whichever order the caller used.
        self.min_snr = min_snr if min_snr > max_snr else max_snr
        self.max_snr = max_snr if min_snr > max_snr else min_snr
        self.return_original = return_original

    def __call__(self, spectrogram):
        """Return the noise-augmented spectrogram.

        If the noise file cannot be locked within 10 s or fails to load, the
        input is returned unchanged.
        NOTE(review): that fallback returns a single tensor even when
        ``return_original`` is true - confirm callers can handle it.
        """
        if len(self.noise_files) == 1:
            idx = 0
        else:
            # randint's upper bound is exclusive, so use len(), not len() - 1;
            # otherwise the last noise file is never selected.
            idx = torch.randint(
                0, len(self.noise_files), size=(1,), dtype=torch.long
            ).item()
        noise_file = self.noise_files[idx]
        lock = self.noise_file_locks[noise_file]

        # Acquire outside the try block: releasing in `finally` a lock that was
        # never acquired (timeout case) raises instead of falling back.
        if not lock.acquire(timeout=10):
            print("Warning: Could not acquire lock for {}".format(noise_file))
            return spectrogram
        try:
            noise_spec = self.t_spectrogram(noise_file)
        except Exception:
            # best effort: report the failure and skip the augmentation
            import traceback

            print(traceback.format_exc())
            return spectrogram
        finally:
            lock.release()

        noise_spec = self.t_pad._maybe_sample_subsequence(
            noise_spec, spectrogram.size(1) * 2
        )
        noise_spec = self.transform(noise_spec)

        if self.min_length > 0:
            spectrogram = self.t_pad._maybe_pad(spectrogram)

        n_frames = spectrogram.size(1)
        if n_frames > noise_spec.size(1):
            # noise too short: tile it along the time axis
            n_repeat = int(math.ceil(n_frames / noise_spec.size(1)))
            noise_spec = noise_spec.repeat(1, n_repeat, 1)
        if n_frames < noise_spec.size(1):
            # noise too long: take a random window of matching length
            high = noise_spec.size(1) - n_frames
            start = torch.randint(0, high, size=(1,), dtype=torch.long).item()
            noise_spec_part = noise_spec[:, start : start + n_frames]
        else:
            noise_spec_part = noise_spec

        # + 1 makes the larger bound reachable and allows equal bounds
        snr = torch.randint(
            self.max_snr, self.min_snr + 1, size=(1,), dtype=torch.float
        )
        signal_power = spectrogram.sum()
        noise_power = noise_spec_part.sum()

        # scale the noise so that signal_power / (K * noise_power) == 10^(snr/10)
        K = (signal_power / noise_power) * 10 ** (-snr / 10)
        spectrogram_aug = spectrogram + noise_spec_part * K

        if self.return_original:
            return spectrogram_aug, spectrogram
        return spectrogram_aug
```

### Freq. compression


<img src="call_types_freqs.png">

```{python}
# .ods is an OpenDocument spreadsheet (a zipped XML container), which the CSV
# parser cannot read; load it through the Excel reader with the "odf" engine
# (needs the optional odfpy package).
df = pd.read_excel('/home/am_deepal/Desktop/AugDemo/call_type_time_freq_lookup.ods', engine="odf")
```

```{python}
"""
Frequency compression of a given frequency range into a chosen number of frequency bins (orcaspot: 256).
used for frequency compression - linear
"""
class Interpolate(object):
    """Linear frequency compression of a spectrogram into ``n_freqs`` bins.

    Optionally restricts the spectrogram to the band ``[f_min, f_max]`` first,
    then resamples the frequency axis with nearest-neighbour interpolation.

    Args:
        n_freqs: Number of output frequency bins (size of the last dimension).
        sr: Sampling rate in Hz. ``None`` disables the band restriction.
        f_min: Lower band edge in Hz.
        f_max: Upper band edge in Hz; ``None`` means the Nyquist frequency ``sr / 2``.
    """

    def __init__(self, n_freqs, sr=None, f_min=0, f_max=None):
        self.n_freqs = n_freqs
        self.sr = sr
        self.f_min = f_min
        self.f_max = f_max

    def __call__(self, spec):
        """Compress ``spec`` of shape ``(c, t, f)`` or ``(n, c, t, f)``.

        Returns a tensor whose last dimension is ``n_freqs``. A leading
        dimension of size 1 is squeezed away, so ``(c, t, f)`` and
        ``(1, c, t, f)`` inputs both yield ``(c, t, n_freqs)``. The input
        tensor is not modified.
        """
        # a one-sided STFT has n_fft/2 + 1 bins, so recover n_fft from the bin count
        n_fft = (spec.size(-1) - 1) * 2

        if self.sr is not None:
            # f_max=None used to end up in `n_fft * None` (TypeError)
            f_max = self.sr / 2 if self.f_max is None else self.f_max
            # bin width = sr / n_fft, hence bin index = frequency * n_fft / sr
            min_bin = int(max(0, math.floor(n_fft * self.f_min / self.sr)))
            max_bin = int(min(n_fft - 1, math.ceil(n_fft * f_max / self.sr)))
            # Slice the LAST axis; a fixed [:, :, a:b] cuts the time axis when a
            # 4-D (n, c, t, f) tensor is passed.
            spec = spec[..., min_bin:max_bin]

        # F.interpolate with a 2-element size needs a 4-D (n, c, t, f) input;
        # add the batch axis out-of-place so the caller's tensor keeps its shape.
        if spec.dim() == 3:
            spec = spec.unsqueeze(dim=0)
        spec = F.interpolate(spec, size=(spec.size(-2), self.n_freqs), mode="nearest")
        return spec.squeeze(dim=0)
```

```{python}
# Keep the band between f_min and f_max (in Hz) and compress it into 256 frequency bins
f_min = 100
f_max = 2500
freq_comp = Interpolate(256, sr, f_min, f_max)
lin_freq = freq_comp(stft)
```

```{python}
# Compare the spectrogram shape before and after frequency compression
print("Required shape for F.interpolate is ", stft.shape)
print("After freq_compression, the new shape is ", lin_freq.shape)
```

```{python}
# prepare axis: one coordinate per spectrogram cell on BOTH axes.
# The time axis previously had T + 1 entries while the frequency axis had F,
# a mix that pcolormesh rejects.
f = np.linspace(f_min, f_max, num=lin_freq.shape[-1])
t = np.arange(lin_freq.shape[-2]) * hop_length / sr

plt.figure(5)
plt.pcolormesh(f, t, lin_freq.squeeze(), shading='auto')
plt.xlabel("freq (Hz)")
plt.ylabel("time(s)")
plt.title("Freq Compression on Power Spectrogram")
plt.savefig("/home/am_deepal/Desktop/AugDemo/FreqCompression_PowerSpectrogram02.png")
plt.show()
```

### Amp. to log-scale

```{python}
"""Turns a spectrogram from the power/amplitude scale to the decibel scale (log scale of amplitudes)."""
class Amp2Db(object):
    """Convert a linear spectrogram to decibels.

    Args:
        min_level_db: Optional floor in dB. Its sign is ignored (``20`` and
            ``-20`` both mean -20 dB); input values below the floor are raised
            to it before the logarithm. ``None`` disables the floor.
        stype: ``"power"`` uses ``10 * log10``; any other value uses ``20 * log10``.
    """

    def __init__(self, min_level_db=None, stype="power"):
        self.stype = stype
        self.multiplier = 20. if stype != "power" else 10.
        self.min_level = None
        if min_level_db is not None:
            # force the floor to be non-positive, then convert dB -> linear scale
            floor_db = min_level_db if min_level_db <= 0 else -min_level_db
            self.min_level = torch.tensor(
                np.exp(floor_db / self.multiplier * np.log(10))
            )

    def __call__(self, spec):
        """Return ``multiplier * log10(spec)``, applying the floor first when one is set."""
        floored = spec if self.min_level is None else torch.max(spec, self.min_level)
        return self.multiplier * torch.log10(floored)
```

### Normalization (db or mean-std)

```{python}

```

```{python}

```

### Classification - Transfer Learning

```{python}
import librosa
import matplotlib.pyplot as plt
import librosa.display
from pathlib import Path
import matplotlib.pyplot as plot
from scipy.io import wavfile
import numpy as np
import pylab
import imageio
import cv2
```

```{python}
audiofolders = ['ph', 'sm', 'phtb']


def _save_mel_png(wav_path, save_path, ref=1.0):
    """Render one wav file as an 8-bit grayscale mel-spectrogram PNG.

    Args:
        wav_path: Path of the wav file to convert.
        save_path: Output prefix; the lower-cased file stem plus ``.png`` is appended.
        ref: Reference value forwarded to ``librosa.amplitude_to_db``.
    """
    y, sr = librosa.load(wav_path, sr=None)  # sr=None keeps the file's native rate

    # `y` must be passed by keyword: librosa >= 0.10 rejects it positionally.
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=2048, hop_length=512)
    # NOTE(review): melspectrogram returns a power spectrogram by default, so
    # power_to_db may be the intended conversion. amplitude_to_db is kept so
    # the generated images match the ones produced so far - confirm.
    log_S = librosa.amplitude_to_db(S, ref=ref)

    # standardise, then min-max scale into the 0..255 range of an 8-bit image
    S_norm = (log_S - log_S.mean()) / (log_S.std() + 1e-6)
    S_min, S_max = S_norm.min(), S_norm.max()
    value_range = S_max - S_min
    if value_range > 0:
        S_scaled = 255 * (S_norm - S_min) / value_range
    else:
        # constant spectrogram: avoid 0/0 and write an all-black image
        S_scaled = np.zeros_like(S_norm)
    S_scaled = S_scaled.astype(np.uint8)

    file_id = Path(wav_path).stem.lower()
    # flip vertically so low frequencies end up at the bottom of the image
    imageio.imwrite(str(save_path) + "{}.png".format(file_id), S_scaled[::-1])


def _convert_folders(root, folders, ref=1.0):
    """Convert every wav file found (recursively) under ``root + folder``."""
    for folder in folders:
        wavs = sorted(
            str(wavf) for wavf in Path(root + folder).glob('**/*.wav') if wavf.is_file()
        )
        save_path = root + folder + '/'
        for wav in wavs:
            _save_mel_png(wav, save_path, ref=ref)


# 'PATH' is a placeholder for the dataset root directory.
_convert_folders('PATH', audiofolders)

# For validation (dB values are referenced to the spectrogram maximum)
_convert_folders('PATH', audiofolders, ref=np.max)

```