Fix the estimator creating a duplicate 'loss_1' summary
WIP (#3)

* The estimator has a hardcoded 'loss' summary. If there is no loss summary with exactly this name, or if a scope with this name already exists, the estimator will create a duplicate summary or alter the scope name.
yweweler committed Nov 28, 2018
1 parent df9e88b commit ce8aa70
Showing 6 changed files with 33 additions and 21 deletions.
11 changes: 8 additions & 3 deletions tacotron/model.py
@@ -552,9 +552,14 @@ def summary(self, mode):

# Training only ============================================================================
if mode == tf.estimator.ModeKeys.TRAIN:
with tf.name_scope('loss'):
# Note, the estimator will write the loss_op as 'loss/loss'.
# tf.summary.scalar('loss', self.loss_op)
# Note: the estimator searches for an existing summary named exactly 'loss'. If the
# loss summary is named differently, the estimator creates an additional summary
# called 'loss' (which then shows up as 'loss_1'). To prevent this duplication, the
# final loss summary has to be named 'loss'.
tf.summary.scalar('loss', self.loss_op)

with tf.name_scope('losses'):
tf.summary.scalar('loss_total', self.loss_op)
tf.summary.scalar('loss_decoder', self.loss_op_decoder)
tf.summary.scalar('loss_post_processing', self.loss_op_post_processing)

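For reference, the naming pattern applied above, extracted into a minimal, self-contained model_fn sketch. The two dense layers, the feature name 'x', and the loss terms are hypothetical placeholders rather than the actual Tacotron model; only the summary naming mirrors the change in this commit.

import tensorflow as tf

def model_fn(features, labels, mode):
    # Hypothetical stand-ins for the decoder and post-processing branches.
    decoder_out = tf.layers.dense(features['x'], units=1, name='decoder')
    post_out = tf.layers.dense(decoder_out, units=1, name='post_processing')

    loss_decoder = tf.reduce_mean(tf.square(decoder_out - labels))
    loss_post_processing = tf.reduce_mean(tf.square(post_out - labels))
    loss = loss_decoder + loss_post_processing

    train_op = None
    if mode == tf.estimator.ModeKeys.TRAIN:
        # Write the final loss as a top-level scalar named exactly 'loss' so the
        # estimator finds it and does not create a duplicate 'loss_1' summary.
        tf.summary.scalar('loss', loss)

        # Keep the individual losses in a scope that is NOT called 'loss', so the
        # summary above keeps its name.
        with tf.name_scope('losses'):
            tf.summary.scalar('loss_total', loss)
            tf.summary.scalar('loss_decoder', loss_decoder)
            tf.summary.scalar('loss_post_processing', loss_post_processing)

        optimizer = tf.train.AdamOptimizer(learning_rate=1e-3)
        train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())

    # Sketch covers TRAIN and EVAL only.
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
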
2 changes: 1 addition & 1 deletion tacotron/params/dataset.py
@@ -8,7 +8,7 @@
# Default hyper-parameters:
dataset_params = tf.contrib.training.HParams(
# Folder containing the dataset.
dataset_folder='/thesis/datasets/ljspeech',
dataset_folder='/home/yves-noel/documents/master/thesis/datasets/LJSpeech-1.1',

# Dataset load helper.
dataset_loader=LJSpeechDatasetHelper,
10 changes: 5 additions & 5 deletions tacotron/params/evaluation.py
@@ -3,7 +3,7 @@
# Default hyper-parameters:
evaluation_params = tf.contrib.training.HParams(
# Batch size used for evaluation.
batch_size=32,
batch_size=2,

# Number of threads used to load data during evaluation.
n_threads=4,
@@ -22,14 +22,14 @@

# The number of buckets to create. Note that this is an upper bound: if fewer buckets are
# needed for proper sorting of the data, fewer buckets are created.
n_buckets=20,
n_buckets=5,

# Flag enabling the bucketing mechanism to output batches of smaller size than
# `batch_size` if not enough samples are available.
allow_smaller_batches=True,

# Checkpoint folder used for loading the latest checkpoint.
checkpoint_dir='/tmp/tacotron/ljspeech/LJSpeech',
checkpoint_dir='/tmp/checkpoints/ljspeech',

# Run folder to load a checkpoint from the checkpoint folder.
checkpoint_load_run='train',
@@ -38,11 +38,11 @@
checkpoint_save_run='evaluate',

# Flag to control if all checkpoints or only the latest one should be evaluated.
evaluate_all_checkpoints=False,
evaluate_all_checkpoints=True,

# Number of global steps after which to save the model summary.
summary_save_steps=50,

# Number of global steps after which to log the global steps per second.
performance_log_steps=50
performance_log_steps=1
)
2 changes: 1 addition & 1 deletion tacotron/params/model.py
@@ -48,7 +48,7 @@
reconstruction_iterations=50,

# Flag allowing to force the use of the accelerated RNN implementation from cuDNN.
force_cudnn=True,
force_cudnn=False,

# Encoder network parameters.
encoder=tf.contrib.training.HParams(
22 changes: 11 additions & 11 deletions tacotron/params/training.py
@@ -5,42 +5,42 @@
# Default hyper-parameters:
training_params = tf.contrib.training.HParams(
# Number of training epochs.
n_epochs=5000000,
n_epochs=2,

# Batch size used for training.
batch_size=40,
batch_size=2,

# Number of threads used to load data during training.
n_threads=4,

# Maximum number of samples to load from the training dataset.
max_samples=None,
max_samples=100,

# Flag that enables/disables sample shuffle at the beginning of each epoch.
shuffle_samples=True,

# Flag telling the training code to load pre-processed features or calculate them on the fly.
load_preprocessed=True,
load_preprocessed=False,

# Cache preprocessed features in RAM entirely.
cache_preprocessed=True,

# Number of batches to pre-calculate for feeding to the GPU.
n_pre_calc_batches=16,
n_pre_calc_batches=4,

# Number of samples each bucket can pre-fetch.
n_samples_per_bucket=16,
n_samples_per_bucket=4,

# The number of buckets to create. Note that this is an upper bound: if fewer buckets are
# needed for proper sorting of the data, fewer buckets are created.
n_buckets=20,
n_buckets=5,

# Flag enabling the bucketing mechanism to output batches of smaller size than
# `batch_size` if not enough samples are available.
allow_smaller_batches=False,

# Checkpoint folder used for training.
checkpoint_dir='/thesis/checkpoints/ljspeech',
checkpoint_dir='/tmp/checkpoints/ljspeech',

# Run folder inside the checkpoint folder to load data from and save data to.
checkpoint_run='train',
@@ -49,10 +49,10 @@
# checkpoint_save_secs=60 * 30,

# Number of batches after which to save a checkpoint.
checkpoint_save_steps=5000,
checkpoint_save_steps=10,

# Number of global steps after which to save the model summary.
summary_save_steps=50,
summary_save_steps=10,

# Flag controlling whether to actually write a summary during training.
# The only exceptions to this are the attention alignment plots and the train losses.
@@ -62,7 +62,7 @@
checkpoints_to_keep=3000,

# Number of global steps after which to log the global steps per second.
performance_log_steps=50,
performance_log_steps=1,

# The clipping ratio used for gradient clipping by global norm.
gradient_clip_norm=1.0,
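As a side note, a tf.contrib.training.HParams container like the one above does not have to be edited in place to change values; it can be overridden programmatically or via a parse string. A small sketch under the assumption of TF 1.x tf.contrib.training.HParams; the container below only mirrors a few of the defaults above and is not the project's actual configuration code.

import tensorflow as tf

# Hypothetical defaults mirroring a subset of the hyper-parameters above.
training_params = tf.contrib.training.HParams(
    n_epochs=2,
    batch_size=2,
    checkpoint_dir='/tmp/checkpoints/ljspeech',
)

# Override a single value programmatically ...
training_params.set_hparam('batch_size', 40)

# ... or parse a comma-separated override string, e.g. taken from a CLI flag.
training_params.parse('n_epochs=5000000,checkpoint_dir=/thesis/checkpoints/ljspeech')

print(training_params.values())
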
7 changes: 7 additions & 0 deletions tacotron/train.py
@@ -28,6 +28,13 @@ def main(_):
)
)

# NOTE: During training an estimator may add the following hooks on its own:
# (NanTensorHook, LoggingTensorHook, CheckpointSaverHook).
# A `NanTensorHook` is always created.
# A `LoggingTensorHook` is created if `log_step_count_steps` is set.
# A `CheckpointSaverHook` is not created if an existing hook is found in `training_hooks`.
# If multiple `CheckpointSaverHook` objects are found, only the first one is used (this
# behaviour is not very obvious, as no warning is output).
config = tf.estimator.RunConfig(
model_dir=checkpoint_dir,
session_config=session_config,
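To illustrate the note above, a rough sketch that supplies an explicit tf.train.CheckpointSaverHook when calling Estimator.train; under the behaviour described in the note, the estimator should then not add a second saver hook of its own. The model_fn, input_fn, and directory below are hypothetical placeholders, not the project's actual training setup.

import tensorflow as tf

checkpoint_dir = '/tmp/checkpoints/ljspeech'  # assumed scratch directory

def input_fn():
    # Tiny synthetic dataset, just enough to drive a few training steps.
    features = {'x': [[0.0], [1.0]]}
    labels = [[0.0], [1.0]]
    return tf.data.Dataset.from_tensor_slices((features, labels)).repeat().batch(2)

def model_fn(features, labels, mode):
    predictions = tf.layers.dense(features['x'], units=1)
    loss = tf.reduce_mean(tf.square(predictions - labels))
    train_op = tf.train.GradientDescentOptimizer(0.1).minimize(
        loss, global_step=tf.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

config = tf.estimator.RunConfig(model_dir=checkpoint_dir)
estimator = tf.estimator.Estimator(model_fn=model_fn, config=config)

# Supply a CheckpointSaverHook explicitly; per the note above, the estimator
# should then skip creating its own saver hook, and if several are supplied
# only the first one takes effect.
saver_hook = tf.train.CheckpointSaverHook(checkpoint_dir=checkpoint_dir, save_steps=10)
estimator.train(input_fn=input_fn, hooks=[saver_hook], steps=20)
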
