Fix the estimator creating a duplicate 'loss_1' summary
WIP (#3)

* The estimator has a hardcoded 'loss' summary. If there is no loss summary with exactly this name, or if a scope with this name already exists, the estimator will create a duplicate summary or alter the scope name.
yweweler committed Nov 28, 2018
1 parent df9e88b commit ce8aa70
Showing 6 changed files with 33 additions and 21 deletions.
11 changes: 8 additions & 3 deletions tacotron/model.py
@@ -552,9 +552,14 @@ def summary(self, mode):

# Training only ============================================================================
if mode == tf.estimator.ModeKeys.TRAIN:
with tf.name_scope('loss'):
# Note, the estimator will write the loss_op as 'loss/loss'.
# tf.summary.scalar('loss', self.loss_op)
# Note: the estimator searches for an existing summary named exactly 'loss'. If the
# loss summary is named differently, the estimator creates an additional summary
# called 'loss' (which then shows up as 'loss_1'). To prevent this duplication, the
# final loss summary has to be named 'loss'.
tf.summary.scalar('loss', self.loss_op)

with tf.name_scope('losses'):
tf.summary.scalar('loss_total', self.loss_op)
tf.summary.scalar('loss_decoder', self.loss_op_decoder)
tf.summary.scalar('loss_post_processing', self.loss_op_post_processing)

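For reference, the naming pattern applied above, extracted into a minimal, self-contained model_fn sketch. The two dense layers, the feature name 'x', and the loss terms are hypothetical placeholders rather than the actual Tacotron model; only the summary naming mirrors the change in this commit.

import tensorflow as tf

def model_fn(features, labels, mode):
    # Hypothetical stand-ins for the decoder and post-processing branches.
    decoder_out = tf.layers.dense(features['x'], units=1, name='decoder')
    post_out = tf.layers.dense(decoder_out, units=1, name='post_processing')

    loss_decoder = tf.reduce_mean(tf.square(decoder_out - labels))
    loss_post_processing = tf.reduce_mean(tf.square(post_out - labels))
    loss = loss_decoder + loss_post_processing

    train_op = None
    if mode == tf.estimator.ModeKeys.TRAIN:
        # Write the final loss as a top-level scalar named exactly 'loss' so the
        # estimator finds it and does not create a duplicate 'loss_1' summary.
        tf.summary.scalar('loss', loss)

        # Keep the individual losses in a scope that is NOT called 'loss', so the
        # summary above keeps its name.
        with tf.name_scope('losses'):
            tf.summary.scalar('loss_total', loss)
            tf.summary.scalar('loss_decoder', loss_decoder)
            tf.summary.scalar('loss_post_processing', loss_post_processing)

        optimizer = tf.train.AdamOptimizer(learning_rate=1e-3)
        train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())

    # Sketch covers TRAIN and EVAL only.
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
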
2 changes: 1 addition & 1 deletion tacotron/params/dataset.py
@@ -8,7 +8,7 @@
# Default hyper-parameters:
dataset_params = tf.contrib.training.HParams(
# Folder containing the dataset.
dataset_folder='/thesis/datasets/ljspeech',
dataset_folder='/home/yves-noel/documents/master/thesis/datasets/LJSpeech-1.1',

# Dataset load helper.
dataset_loader=LJSpeechDatasetHelper,
10 changes: 5 additions & 5 deletions tacotron/params/evaluation.py
@@ -3,7 +3,7 @@
# Default hyper-parameters:
evaluation_params = tf.contrib.training.HParams(
# Batch size used for evaluation.
batch_size=32,
batch_size=2,

# Number of threads used to load data during evaluation.
n_threads=4,
@@ -22,14 +22,14 @@

# The number of buckets to create. Note that this is an upper bound: if fewer buckets are
# needed for proper sorting of the data, fewer buckets are created.
n_buckets=20,
n_buckets=5,

# Flag enabling the bucketing mechanism to output batches of smaller size than
# `batch_size` if not enough samples are available.
allow_smaller_batches=True,

# Checkpoint folder used for loading the latest checkpoint.
checkpoint_dir='/tmp/tacotron/ljspeech/LJSpeech',
checkpoint_dir='/tmp/checkpoints/ljspeech',

# Run folder to load a checkpoint from the checkpoint folder.
checkpoint_load_run='train',
@@ -38,11 +38,11 @@
checkpoint_save_run='evaluate',

# Flag to control if all checkpoints or only the latest one should be evaluated.
evaluate_all_checkpoints=False,
evaluate_all_checkpoints=True,

# Number of global steps after which to save the model summary.
summary_save_steps=50,

# Number of global steps after which to log the global steps per second.
performance_log_steps=50
performance_log_steps=1
)
2 changes: 1 addition & 1 deletion tacotron/params/model.py
@@ -48,7 +48,7 @@
reconstruction_iterations=50,

# Flag allowing to force the use of the accelerated RNN implementation from cuDNN.
force_cudnn=True,
force_cudnn=False,

# Encoder network parameters.
encoder=tf.contrib.training.HParams(
22 changes: 11 additions & 11 deletions tacotron/params/training.py
@@ -5,42 +5,42 @@
# Default hyper-parameters:
training_params = tf.contrib.training.HParams(
# Number of training epochs.
n_epochs=5000000,
n_epochs=2,

# Batch size used for training.
batch_size=40,
batch_size=2,

# Number of threads used to load data during training.
n_threads=4,

# Maximum number of samples to load from the training dataset.
max_samples=None,
max_samples=100,

# Flag that enables/disables sample shuffle at the beginning of each epoch.
shuffle_samples=True,

# Flag telling the training code to load pre-processed features or calculate them on the fly.
load_preprocessed=True,
load_preprocessed=False,

# Cache preprocessed features in RAM entirely.
cache_preprocessed=True,

# Number of batches to pre-calculate for feeding to the GPU.
n_pre_calc_batches=16,
n_pre_calc_batches=4,

# Number of samples each bucket can pre-fetch.
n_samples_per_bucket=16,
n_samples_per_bucket=4,

# The number of buckets to create. Note that this is an upper bound: if fewer buckets are
# needed for proper sorting of the data, fewer buckets are created.
n_buckets=20,
n_buckets=5,

# Flag enabling the bucketing mechanism to output batches of smaller size than
# `batch_size` if not enough samples are available.
allow_smaller_batches=False,

# Checkpoint folder used for training.
checkpoint_dir='/thesis/checkpoints/ljspeech',
checkpoint_dir='/tmp/checkpoints/ljspeech',

# Run folder inside the checkpoint folder to load data from and save data to.
checkpoint_run='train',
@@ -49,10 +49,10 @@
# checkpoint_save_secs=60 * 30,

# Number of batches after which to save a checkpoint.
checkpoint_save_steps=5000,
checkpoint_save_steps=10,

# Number of global steps after which to save the model summary.
summary_save_steps=50,
summary_save_steps=10,

# Flag controlling whether to actually write a summary during training.
# The only exceptions to this are the attention alignment plots and the train losses.
@@ -62,7 +62,7 @@
checkpoints_to_keep=3000,

# Number of global steps after which to log the global steps per second.
performance_log_steps=50,
performance_log_steps=1,

# The clipping ratio used for gradient clipping by global norm.
gradient_clip_norm=1.0,
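As a side note, a tf.contrib.training.HParams container like the one above does not have to be edited in place to change values; it can be overridden programmatically or via a parse string. A small sketch under the assumption of TF 1.x tf.contrib.training.HParams; the container below only mirrors a few of the defaults above and is not the project's actual configuration code.

import tensorflow as tf

# Hypothetical defaults mirroring a subset of the hyper-parameters above.
training_params = tf.contrib.training.HParams(
    n_epochs=2,
    batch_size=2,
    checkpoint_dir='/tmp/checkpoints/ljspeech',
)

# Override a single value programmatically ...
training_params.set_hparam('batch_size', 40)

# ... or parse a comma-separated override string, e.g. taken from a CLI flag.
training_params.parse('n_epochs=5000000,checkpoint_dir=/thesis/checkpoints/ljspeech')

print(training_params.values())
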
7 changes: 7 additions & 0 deletions tacotron/train.py
@@ -28,6 +28,13 @@ def main(_):
)
)

# NOTE: During training an estimator may add the following hooks on its own:
# (NanTensorHook, LoggingTensorHook, CheckpointSaverHook).
# A `NanTensorHook` is always created.
# A `LoggingTensorHook` is created if `log_step_count_steps` is set.
# A `CheckpointSaverHook` is not created if an existing hook is found in `training_hooks`.
# If multiple `CheckpointSaverHook` objects are found, only the first one is used (this
# behaviour is not very obvious, as no warning is output).
config = tf.estimator.RunConfig(
model_dir=checkpoint_dir,
session_config=session_config,
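To illustrate the note above, a rough sketch that supplies an explicit tf.train.CheckpointSaverHook when calling Estimator.train; under the behaviour described in the note, the estimator should then not add a second saver hook of its own. The model_fn, input_fn, and directory below are hypothetical placeholders, not the project's actual training setup.

import tensorflow as tf

checkpoint_dir = '/tmp/checkpoints/ljspeech'  # assumed scratch directory

def input_fn():
    # Tiny synthetic dataset, just enough to drive a few training steps.
    features = {'x': [[0.0], [1.0]]}
    labels = [[0.0], [1.0]]
    return tf.data.Dataset.from_tensor_slices((features, labels)).repeat().batch(2)

def model_fn(features, labels, mode):
    predictions = tf.layers.dense(features['x'], units=1)
    loss = tf.reduce_mean(tf.square(predictions - labels))
    train_op = tf.train.GradientDescentOptimizer(0.1).minimize(
        loss, global_step=tf.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

config = tf.estimator.RunConfig(model_dir=checkpoint_dir)
estimator = tf.estimator.Estimator(model_fn=model_fn, config=config)

# Supply a CheckpointSaverHook explicitly; per the note above, the estimator
# should then skip creating its own saver hook, and if several are supplied
# only the first one takes effect.
saver_hook = tf.train.CheckpointSaverHook(checkpoint_dir=checkpoint_dir, save_steps=10)
estimator.train(input_fn=input_fn, hooks=[saver_hook], steps=20)
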
