hparams.py
import tensorflow as tf
import numpy as np
# Default hyperparameters
hparams = tf.contrib.training.HParams(
    wavernn_data_random_state=12345,
    wavernn_random_seed=5678,
    cell_type="GRU_STD",  # or "GRU_FC"
    num_units=896,
    num_layers=3,
    kernel_size=9,
    # Padding
    padding=12,  # kernel_size // 2 * num_layers
    input_type="dual_channels_quantize",
    num_mels=80,  # actually 25
    num_channels=256,
    multiples=400,  # = hop_size
    sample_rate=22050,
    scaling=0.185,
    hop_size=400,
    win_size=1600,
    n_fft=2048,  # must not be less than win_size
    fmin=0,
    fmax=8000,
    # The STFT/mel parameters above combine as shown in the melspectrogram sketch after this block.
    cin_channels=80,  # Set this to -1 to disable local conditioning; otherwise it must equal num_mels!
    gin_channels=-1,  # Set this to -1 to disable global conditioning; only used for multi-speaker datasets
    max_time_sec=None,
    max_time_steps=13000,  # Max time steps in audio used to train WaveRNN (decrease to save memory)
    swap_with_cpu=False,
    encoder_conditional_features=True,
    dropout=0,
    quantize_channels=256,
    mel_bias=5.0,
    mel_scale=10.0,
    training_batch_size=48,
    testing_batch_size=12,
    # Audio data preprocessing
    rescale=True,  # Whether to rescale audio prior to preprocessing
    rescaling_max=0.999,  # Rescaling value
    trim_silence=True,  # Whether to trim silence in audio (at the beginning and end only, not the middle)
    clip_mels_length=True,  # For cases of OOM (not really recommended; working on a workaround)
    max_mel_frames=900,  # Only relevant when clip_mels_length=True
    # M-AILABS (and other datasets) trim params (see the trimming sketch after this block)
    trim_fft_size=512,
    trim_hop_size=128,
    trim_top_db=60,
    # Use LWS (https://github.com/Jonathan-LeRoux/lws) for STFT and phase reconstruction.
    # It's preferred to set this to True for use with https://github.com/r9y9/wavernn_vocoder
    # Does not work if n_fft is not a multiple of hop_size!!
    use_lws=False,
    silence_threshold=2,  # Silence threshold used for sound trimming in WaveRNN preprocessing
    # Limits
    min_level_db=-100,
    ref_level_db=20,
    # Mel and linear spectrogram normalization/scaling and clipping (see the normalization sketch after this block)
    signal_normalization=True,
    allow_clipping_in_normalization=True,  # Only relevant if signal_normalization=True
    symmetric_mels=True,  # Whether to scale the data to be symmetric around 0
    max_abs_value=4.,  # Max absolute value of the data. If symmetric, data will be in [-max, max]; else [0, max]
    wavernn_batch_size=2,  # Batch size used to train WaveRNN
    wavernn_test_size=0.0441,  # Fraction of data to keep as test data; if None, wavernn_test_batches must not be None
    wavernn_test_batches=None,  # Number of test batches
    wavernn_learning_rate=1e-3,
    wavernn_adam_beta1=0.9,
    wavernn_adam_beta2=0.999,
    wavernn_adam_epsilon=1e-6,
    wavernn_ema_decay=0.9999,  # Decay rate of the exponential moving average
    wavernn_dropout=0.05,  # Dropout rate of WaveRNN layers
)
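
# The sketch below shows how the STFT/mel parameters above are typically
# combined. It is a minimal, hedged example assuming librosa-style feature
# extraction (librosa is an assumption here; this repo's actual preprocessing
# may differ).
import librosa

def melspectrogram_sketch(wav, hp=hparams):
    # Magnitude STFT with the configured FFT size, hop, and window length.
    D = np.abs(librosa.stft(wav, n_fft=hp.n_fft, hop_length=hp.hop_size,
                            win_length=hp.win_size))
    # num_mels-band mel filterbank restricted to [fmin, fmax].
    mel_basis = librosa.filters.mel(sr=hp.sample_rate, n_fft=hp.n_fft,
                                    n_mels=hp.num_mels, fmin=hp.fmin, fmax=hp.fmax)
    S = np.dot(mel_basis, D)
    # Amplitude -> dB relative to ref_level_db, floored at min_level_db.
    min_level = np.exp(hp.min_level_db / 20 * np.log(10))
    return 20 * np.log10(np.maximum(min_level, S)) - hp.ref_level_db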
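
# The sketch below shows the mel/linear normalization implied by
# signal_normalization, symmetric_mels, max_abs_value, min_level_db, and
# allow_clipping_in_normalization. It assumes the usual Tacotron-2-style
# convention; the repo's own normalization code may differ.
def normalize_sketch(S, hp=hparams):
    if not hp.signal_normalization:
        return S
    # Shift/scale a dB spectrogram from [min_level_db, 0] into [0, 1].
    norm = (S - hp.min_level_db) / -hp.min_level_db
    if hp.symmetric_mels:
        out = 2 * hp.max_abs_value * norm - hp.max_abs_value  # [-max_abs_value, max_abs_value]
    else:
        out = hp.max_abs_value * norm  # [0, max_abs_value]
    if hp.allow_clipping_in_normalization:
        lower = -hp.max_abs_value if hp.symmetric_mels else 0.
        out = np.clip(out, lower, hp.max_abs_value)
    return out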
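
# The sketch below shows the silence trimming controlled by trim_silence and
# the trim_* parameters, using librosa.effects.trim (an assumption; the repo
# may trim differently). It removes leading/trailing audio quieter than
# trim_top_db dB below the peak.
def trim_silence_sketch(wav, hp=hparams):
    trimmed, _ = librosa.effects.trim(wav, top_db=hp.trim_top_db,
                                      frame_length=hp.trim_fft_size,
                                      hop_length=hp.trim_hop_size)
    return trimmed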

def hparams_debug_string():
    values = hparams.values()
    hp = [' %s: %s' % (name, values[name]) for name in sorted(values) if name != 'sentences']
    return 'Hyperparameters:\n' + '\n'.join(hp)
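
if __name__ == '__main__':
    # Small usage sketch: training scripts typically override defaults via
    # HParams.parse with a comma-separated "name=value" string, then log the
    # result. These particular overrides are illustrative, not recommendations.
    hparams.parse('wavernn_batch_size=4,wavernn_learning_rate=5e-4')
    print(hparams_debug_string())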