# config/self_supervised/vqapc_example.yml

# All hyper-parameters taken from the official implementation at
# https://github.com/iamyuanchung/VQ-APC/tree/96230cc358b174b736b4c0e7664b3e72b304d9b0
data:
  # Dataset-related settings
  dataset:
    name: 'LibriSpeech'               # Corpus name; must match dataset/<corpus>.py
    path: '/path/to/LibriSpeech'      # Path to the unzipped LibriSpeech dataset
    train_split: ['train-clean-360']  # Splits used as the training set
    dev_split: ['dev-clean']          # Splits used as the validation set
    batch_size: 32                    # Batch size
    audio_max_frames: 1500            # Max spectrogram length to ensure batch size
  # Attributes of the audio features
  audio:
    feat_type: 'fbank'  # Feature type
    feat_dim: 80        # Feature dimension
    frame_length: 25    # Window size in ms
    frame_shift: 10     # Hop size in ms
    cmvn: true          # Apply utterance-wise CMVN to the Mel spectrogram

model:
  method: 'vqapc'  # Accepts npc/apc/vqapc
  n_future: 5      # Target the n-th future frame for learning
  paras:
    num_layers: 3     # Number of GRU layers stacked in the APC model
    hidden_size: 512  # Feature dimension of all layers
    dropout: 0.1      # Dropout between GRU layers
    residual: true    # Residual connection between GRU layers
    vq:               # Remove this section to train APC
      codebook_size: [512]     # Codebook size of each group in the VQ layer
      code_dim: [512]          # Dim of each group; dims sum up to hidden_size
      gumbel_temperature: 0.5  # Gumbel-Softmax temperature in the VQ layer

# Training hyper-parameters.
# NOTE(review): key meanings below are inferred from their names — confirm
# against the code that consumes this section.
hparas:
  optimizer: 'Adam'  # Optimizer name
  lr: 0.0001         # Presumably the learning rate
  epoch: 100         # Presumably the number of training epochs