diff --git a/tacotron/model.py b/tacotron/model.py
index 57fd559..a2492f9 100644
--- a/tacotron/model.py
+++ b/tacotron/model.py
@@ -552,9 +552,14 @@ def summary(self, mode):
 
         # Training only ============================================================================
         if mode == tf.estimator.ModeKeys.TRAIN:
-            with tf.name_scope('loss'):
-                # Note, the estimator will write the loss_op as 'loss/loss'.
-                # tf.summary.scalar('loss', self.loss_op)
+            # Note: the estimator searches for an existing summary named 'loss'. If the loss
+            # summary is named differently, the estimator creates an additional summary with
+            # the name 'loss'. To prevent this duplication the final loss summary has to be
+            # called 'loss'.
+            tf.summary.scalar('loss', self.loss_op)
+
+            with tf.name_scope('losses'):
+                tf.summary.scalar('loss_total', self.loss_op)
                 tf.summary.scalar('loss_decoder', self.loss_op_decoder)
                 tf.summary.scalar('loss_post_processing', self.loss_op_post_processing)
 
diff --git a/tacotron/params/dataset.py b/tacotron/params/dataset.py
index 52649f6..d45d5bc 100644
--- a/tacotron/params/dataset.py
+++ b/tacotron/params/dataset.py
@@ -8,7 +8,7 @@
 # Default hyper-parameters:
 dataset_params = tf.contrib.training.HParams(
     # Folder containing the dataset.
-    dataset_folder='/thesis/datasets/ljspeech',
+    dataset_folder='/home/yves-noel/documents/master/thesis/datasets/LJSpeech-1.1',
 
     # Dataset load helper.
     dataset_loader=LJSpeechDatasetHelper,
diff --git a/tacotron/params/evaluation.py b/tacotron/params/evaluation.py
index 6cf82e3..3fe4ad3 100644
--- a/tacotron/params/evaluation.py
+++ b/tacotron/params/evaluation.py
@@ -3,7 +3,7 @@
 # Default hyper-parameters:
 evaluation_params = tf.contrib.training.HParams(
     # Batch size used for evaluation.
-    batch_size=32,
+    batch_size=2,
 
     # Number of threads used to load data during evaluation.
     n_threads=4,
@@ -22,14 +22,14 @@
 
     # The number of buckets to create. Note that this is the number of buckets that are actually
     # created. If less buckets are needed for proper sorting of the data, less buckets are used.
-    n_buckets=20,
+    n_buckets=5,
 
     # Flag enabling the bucketing mechanism to output batches of smaller size than
     # `batch_size` if not enough samples are available.
     allow_smaller_batches=True,
 
     # Checkpoint folder used for loading the latest checkpoint.
-    checkpoint_dir='/tmp/tacotron/ljspeech/LJSpeech',
+    checkpoint_dir='/tmp/checkpoints/ljspeech',
 
     # Run folder to load a checkpoint from the checkpoint folder.
     checkpoint_load_run='train',
@@ -38,11 +38,11 @@
     checkpoint_save_run='evaluate',
 
     # Flag to control if all checkpoints or only the latest one should be evaluated.
-    evaluate_all_checkpoints=False,
+    evaluate_all_checkpoints=True,
 
     # Number of global steps after which to save the model summary.
     summary_save_steps=50,
 
     # Number of global steps after which to log the global steps per second.
-    performance_log_steps=50
+    performance_log_steps=1
 )
diff --git a/tacotron/params/model.py b/tacotron/params/model.py
index ee5c7b9..ea22df1 100644
--- a/tacotron/params/model.py
+++ b/tacotron/params/model.py
@@ -48,7 +48,7 @@
     reconstruction_iterations=50,
 
     # Flag allowing to force the use accelerated RNN implementation from CUDNN.
-    force_cudnn=True,
+    force_cudnn=False,
 
     # Encoder network parameters.
     encoder=tf.contrib.training.HParams(
diff --git a/tacotron/params/training.py b/tacotron/params/training.py
index 9349a4a..b859040 100644
--- a/tacotron/params/training.py
+++ b/tacotron/params/training.py
@@ -5,42 +5,42 @@
 # Default hyper-parameters:
 training_params = tf.contrib.training.HParams(
     # Number of training epochs.
-    n_epochs=5000000,
+    n_epochs=2,
 
     # Batch size used for training.
-    batch_size=40,
+    batch_size=2,
 
     # Number of threads used to load data during training.
     n_threads=4,
 
     # Maximal number of samples to load from the train dataset.
-    max_samples=None,
+    max_samples=100,
 
     # Flag that enables/disables sample shuffle at the beginning of each epoch.
     shuffle_samples=True,
 
     # Flag telling the training code to load pre-processed features or calculate them on the fly.
-    load_preprocessed=True,
+    load_preprocessed=False,
 
     # Cache preprocessed features in RAM entirely.
     cache_preprocessed=True,
 
     # Number of batches to pre-calculate for feeding to the GPU.
-    n_pre_calc_batches=16,
+    n_pre_calc_batches=4,
 
     # Number of samples each bucket can pre-fetch.
-    n_samples_per_bucket=16,
+    n_samples_per_bucket=4,
 
     # The number of buckets to create. Note that this is the number of buckets that are actually
     # created. If less buckets are needed for proper sorting of the data, less buckets are used.
-    n_buckets=20,
+    n_buckets=5,
 
     # Flag enabling the bucketing mechanism to output batches of smaller size than
     # `batch_size` if not enough samples are available.
     allow_smaller_batches=False,
 
     # Checkpoint folder used for training.
-    checkpoint_dir='/thesis/checkpoints/ljspeech',
+    checkpoint_dir='/tmp/checkpoints/ljspeech',
 
     # Run folder to load data from and save data in to the checkpoint folder.
     checkpoint_run='train',
@@ -49,10 +49,10 @@
     # checkpoint_save_secs=60 * 30,
 
     # Number of batches after which to save a checkpoint.
-    checkpoint_save_steps=5000,
+    checkpoint_save_steps=10,
 
     # Number of global steps after which to save the model summary.
-    summary_save_steps=50,
+    summary_save_steps=10,
 
     # Flag controlling whether to actually write a write a summary during training.
     # The only exceptions to this are the attention alignment plots and the train losses.
@@ -62,7 +62,7 @@
     checkpoints_to_keep=3000,
 
     # Number of global steps after which to log the global steps per second.
-    performance_log_steps=50,
+    performance_log_steps=1,
 
     # The clipping ratio used for gradient clipping by global norm.
     gradient_clip_norm=1.0,
diff --git a/tacotron/train.py b/tacotron/train.py
index 566de0d..a76ca1a 100644
--- a/tacotron/train.py
+++ b/tacotron/train.py
@@ -28,6 +28,13 @@ def main(_):
         )
     )
 
+    # NOTE: During training an estimator may add the following hooks on its own:
+    # (NanTensorHook, LoggingTensorHook, CheckpointSaverHook).
+    # A `NanTensorHook` is always created.
+    # A `LoggingTensorHook` is created if `log_step_count_steps` is set.
+    # A `CheckpointSaverHook` is not created if an existing hook is found in `training_hooks`.
+    # If multiple `CheckpointSaverHook` objects are found, only the first one is used (this
+    # behaviour is easy to miss because no warning is logged).
     config = tf.estimator.RunConfig(
         model_dir=checkpoint_dir,
         session_config=session_config,
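
The two notes introduced by this patch (the 'loss' summary naming in tacotron/model.py and the checkpoint-hook behaviour in tacotron/train.py) can be tried out in isolation. The sketch below is illustrative only and not code from this repository: `dummy_input_fn`, the toy `model_fn` and the '/tmp/checkpoints/sketch' directory are assumptions made up for the example.

import numpy as np
import tensorflow as tf


def dummy_input_fn():
    # Toy in-memory input pipeline standing in for the project's bucketing pipeline.
    features = np.random.rand(8, 4).astype(np.float32)
    labels = np.random.rand(8, 1).astype(np.float32)
    return tf.data.Dataset.from_tensor_slices((features, labels)).batch(2).repeat()


def model_fn(features, labels, mode):
    # Toy regression model; only the summary and hook mechanics matter here.
    predictions = tf.layers.dense(features, units=1)
    loss = tf.losses.mean_squared_error(labels=labels, predictions=predictions)

    # Register the final loss under the summary name 'loss'. If no summary with this
    # name exists, the estimator adds its own 'loss' scalar, which leads to the
    # duplication described in tacotron/model.py.
    tf.summary.scalar('loss', loss)

    train_op = tf.train.AdamOptimizer(1e-3).minimize(
        loss, global_step=tf.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)


checkpoint_dir = '/tmp/checkpoints/sketch'

config = tf.estimator.RunConfig(
    model_dir=checkpoint_dir,
    save_summary_steps=10,
    log_step_count_steps=1,  # triggers the LoggingTensorHook mentioned in the NOTE
)

estimator = tf.estimator.Estimator(model_fn=model_fn, config=config)

# Supplying a CheckpointSaverHook keeps the estimator from creating its own; per the
# NOTE in tacotron/train.py, additional CheckpointSaverHooks would be ignored silently.
saver_hook = tf.train.CheckpointSaverHook(checkpoint_dir=checkpoint_dir, save_steps=10)
estimator.train(input_fn=dummy_input_fn, hooks=[saver_hook], steps=20)

Passing the saver hook explicitly stands in for what the estimator would otherwise derive from `save_checkpoints_steps`/`save_checkpoints_secs` on the `RunConfig`; the point of the sketch is that a user-supplied hook takes precedence.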