# training/finetune_joint_tacotron2_hifigan.yaml

# This EXPERIMENTAL configuration is for ESPnet2 to finetune
# Tacotron2 + HiFiGAN vocoder jointly. To run this config,
# you need to specify at least the "--tts_task gan_tts"
# option for tts.sh and use 22050 Hz audio as the
# training data (mainly tested on LJSpeech).
# This configuration was tested on 4 GPUs with 12GB GPU memory.
# Training takes a little less than 1 week to finish, but the
# model at 100k iterations should already generate reasonable results.

# YOU NEED TO MODIFY THE "*_params" AND "init_param" SECTIONS
# IF YOU WANT TO USE YOUR OWN PRETRAINED MODELS.

##########################################################
#                  TTS MODEL SETTING                     #
##########################################################
# Joint text-to-waveform model: tts_conf below configures a text2mel model,
# a vocoder, and a discriminator together.
tts: joint_text2wav
tts_conf:
    # Text-to-mel model (Tacotron2) hyperparameters.
    # Keep these values in sync with the checkpoint listed under init_param
    # (see the note at the top of this file).
    # copied from pretrained model's config.yaml
    text2mel_type: tacotron2
    text2mel_params:
        embed_dim: 512               # char or phn embedding dimension
        elayers: 1                   # number of blstm layers in encoder
        eunits: 512                  # number of blstm units
        econv_layers: 3              # number of convolutional layers in encoder
        econv_chans: 512             # number of channels in convolutional layer
        econv_filts: 5               # filter size of convolutional layer
        atype: location              # attention function type
        adim: 512                    # attention dimension
        aconv_chans: 32              # number of channels in convolutional layer of attention
        aconv_filts: 15              # filter size of convolutional layer of attention
        cumulate_att_w: true         # whether to cumulate attention weight
        dlayers: 2                   # number of lstm layers in decoder
        dunits: 1024                 # number of lstm units in decoder
        prenet_layers: 2             # number of layers in prenet
        prenet_units: 256            # number of units in prenet
        postnet_layers: 5            # number of layers in postnet
        postnet_chans: 512           # number of channels in postnet
        postnet_filts: 5             # filter size of postnet layer
        output_activation: null      # activation function for the final output
        use_batch_norm: true         # whether to use batch normalization in encoder
        use_concate: true            # whether to concatenate encoder embedding with decoder outputs
        use_residual: false          # whether to use residual connection in encoder
        # NOTE(review): presumably has to equal the dimension of the speaker
        # embeddings used by the pretrained model - confirm before changing.
        spk_embed_dim: 192           # speaker embedding dimension
        spk_embed_integration_type: add # how to integrate speaker embedding
        dropout_rate: 0.5            # dropout rate
        zoneout_rate: 0.1            # zoneout rate
        reduction_factor: 1          # reduction factor
        use_masking: true            # whether to apply masking for padded part in loss calculation
        bce_pos_weight: 10.0         # weight of positive sample in binary cross entropy calculation
        use_guided_attn_loss: true   # whether to use guided attention loss
        guided_attn_loss_sigma: 0.4  # sigma of guided attention loss
        guided_attn_loss_lambda: 1.0 # strength of guided attention loss

    # Vocoder (HiFiGAN generator) settings.
    # copied from pretrained vocoder's config.yaml
    vocoder_type: hifigan_generator
    vocoder_params:
        bias: true
        channels: 512
        in_channels: 80
        kernel_size: 7
        nonlinear_activation: LeakyReLU
        nonlinear_activation_params: {negative_slope: 0.1}
        out_channels: 1
        # one dilation list per entry of resblock_kernel_sizes
        resblock_dilations: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
        resblock_kernel_sizes: [3, 7, 11]
        upsample_kernel_sizes: [16, 16, 4, 4]
        # 8 * 8 * 2 * 2 = 256, the same value as mel_loss_params.hop_length
        upsample_scales: [8, 8, 2, 2]
        use_additional_convs: true
        use_weight_norm: true

    # Discriminator (multi-scale + multi-period) settings.
    # copied from pretrained vocoder's config.yaml
    discriminator_type: hifigan_multi_scale_multi_period_discriminator
    discriminator_params:
        follow_official_norm: true
        # settings for the period discriminators
        period_discriminator_params:
            bias: true
            channels: 32
            downsample_scales: [3, 3, 3, 3, 1]
            in_channels: 1
            kernel_sizes: [5, 3]
            max_downsample_channels: 1024
            nonlinear_activation: LeakyReLU
            nonlinear_activation_params: {negative_slope: 0.1}
            out_channels: 1
            use_spectral_norm: false
            use_weight_norm: true
        periods: [2, 3, 5, 7, 11]
        # settings for the scale discriminators
        scale_discriminator_params:
            bias: true
            channels: 128
            downsample_scales: [4, 4, 4, 4, 1]
            in_channels: 1
            kernel_sizes: [15, 41, 5, 3]
            max_downsample_channels: 1024
            max_groups: 16
            nonlinear_activation: LeakyReLU
            nonlinear_activation_params: {negative_slope: 0.1}
            out_channels: 1
        scale_downsample_pooling: AvgPool1d
        scale_downsample_pooling_params:
            kernel_size: 4
            padding: 2
            stride: 2
        scales: 3

    # loss function related
    # generator and discriminator adversarial losses use the same settings
    generator_adv_loss_params:
        average_by_discriminators: false # whether to average loss value by #discriminators
        loss_type: mse                   # loss type, "mse" or "hinge"
    discriminator_adv_loss_params:
        average_by_discriminators: false # whether to average loss value by #discriminators
        loss_type: mse                   # loss type, "mse" or "hinge"
    use_feat_match_loss: true            # whether to use feat match loss
    feat_match_loss_params:
        average_by_discriminators: false # whether to average loss value by #discriminators
        average_by_layers: false         # whether to average loss value by #layers of each discriminator
        include_final_outputs: true      # whether to include final outputs for loss calculation
    use_mel_loss: true     # whether to use mel-spectrogram loss
    mel_loss_params:
        fs: 22050          # must be the same as the training data
        n_fft: 1024        # fft points
        hop_length: 256    # hop size
        win_length: null   # window length
        window: hann       # window type
        n_mels: 80         # number of Mel basis
        fmin: 0            # minimum frequency for Mel basis
        fmax: null         # maximum frequency for Mel basis
        log_base: null     # null represents natural log
    # scaling coefficients applied to the individual loss terms
    lambda_text2mel: 1.0   # loss scaling coefficient for text2mel loss
    lambda_adv: 1.0        # loss scaling coefficient for adversarial loss
    lambda_mel: 45.0       # loss scaling coefficient for Mel loss
    lambda_feat_match: 2.0 # loss scaling coefficient for feat match loss

    # others
    # sampling_rate is set to the same value as mel_loss_params.fs
    sampling_rate: 22050          # needed in the inference for saving wav
    segment_size: 32              # segment size for random windowed discriminator
    cache_generator_outputs: true # whether to cache generator outputs in the training

# extra module for additional inputs
#pitch_extract: dio           # pitch extractor type
#pitch_extract_conf:
#    reduction_factor: 1
#pitch_normalize: global_mvn  # normalizer for the pitch feature
#energy_extract: energy       # energy extractor type
#energy_extract_conf:
#    reduction_factor: 1
#energy_normalize: global_mvn # normalizer for the energy feature

# initialization (might need to modify for your own pretrained model)
# NOTE(review): each entry looks like <checkpoint path>:<source key>:<target key>,
# with the source key left empty for the vocoder checkpoints - confirm
# against the ESPnet2 documentation of init_param.
init_param:
- 'exp/22k/tts_train_tacotron2_raw_char/train.loss.ave_5best.pth:tts:tts.generator.text2mel'
- 'exp/22k/ljspeech_hifigan.v1/generator.pth::tts.generator.vocoder'
- 'exp/22k/ljspeech_hifigan.v1/discriminator.pth::tts.discriminator'

##########################################################
#            OPTIMIZER & SCHEDULER SETTING               #
##########################################################
# optimizer setting for generator
# (the discriminator settings further down use identical values)
optim: adam
optim_conf:
    lr: 1.25e-5
    betas: [0.5, 0.9]
    weight_decay: 0.0
# learning-rate scheduler for the generator optimizer
scheduler: exponentiallr
scheduler_conf:
    gamma: 0.999875
# optimizer setting for discriminator
optim2: adam
optim2_conf:
    lr: 1.25e-5
    betas: [0.5, 0.9]
    weight_decay: 0.0
# learning-rate scheduler for the discriminator optimizer
scheduler2: exponentiallr
scheduler2_conf:
    gamma: 0.999875
generator_first: true # whether to start updating generator first

##########################################################
#                OTHER TRAINING SETTING                  #
##########################################################
#num_iters_per_epoch: 1000 # number of iterations per epoch
max_epoch: 500            # total number of training epochs
accum_grad: 1             # gradient accumulation steps
batch_bins: 1600000       # batch bins (feats_type=raw)
batch_type: numel         # batch construction method
grad_clip: -1             # norm used for gradient clipping
grad_noise: false         # enable gradient noise injection or not
sort_in_batch: descending # sorting order of the data inside a batch
sort_batch: descending    # sorting order of the created batches
num_workers: 4            # data loader worker count
use_amp: false            # enable pytorch amp or not
log_interval: 50          # logging interval, in iterations
keep_nbest_models: 5      # how many models are kept
num_att_plot: 3           # attention figures saved in every check
seed: 777                 # random seed
patience: null            # early stopping patience
unused_parameters: true   # required in the multi gpu case
# criteria used to save the best models, one triple per criterion
best_model_criterion:
- [valid, text2mel_loss, min]
- [train, text2mel_loss, min]
- [train, total_count, max]
# false accelerates the training but makes it non-deterministic;
# for GAN-TTS training, setting this to false is strongly recommended
cudnn_deterministic: false
# true might accelerate the training but sometimes slows it down,
# therefore false is the default here (trying both cases is recommended)
cudnn_benchmark: false