import tensorflow as tf
from tensorflow.contrib import seq2seq
from tensorflow.python.framework import tensor_shape


class TacotronInferenceHelper(seq2seq.Helper):
    """
    Custom sequence-to-sequence inference helper for the Tacotron model.

    This helper handles proper initialization of the decoder RNN's initial state and is
    responsible for feeding the last frame of the decoder output as input to the next step.

    See: https://github.com/tensorflow/tensorflow/issues/12065
    """

    def __init__(self, batch_size, input_size, max_iterations=None):
        """
        Creates a TacotronInferenceHelper instance.

        Arguments:
            batch_size (tf.Dimension):
                Size of the current batch.

            input_size (int):
                RNN input size.

            max_iterations (tf.Dimension):
                The maximum number of frames to generate. Defaults to None.
                If None, generation continues until the decoder reaches its own limit.
        """
        self._batch_size = batch_size
        self._input_size = input_size

        # Set the sequence length to be generated according to max_iterations.
        if max_iterations is None:
            # Do not stop generating.
            self._sequence_length = None
        else:
            # Create a tensor of length batch_size with each field containing max_iterations.
            # Generates max_iterations frames for each batch entry.
            self._sequence_length = tf.tile([max_iterations], [self._batch_size])

    @property
    def batch_size(self):
        """
        Get the batch size of the current batch.

        Returns:
            tf.Dimension:
                Batch size.
        """
        return self._batch_size

    @property
    def sample_ids_shape(self):
        """
        Shape of the tensor returned by `sample`, excluding the batch dimension.

        Note:
            - Since the decoder does not output embeddings, this function is basically
              irrelevant.
            - However, it has to be implemented since the seq2seq decoding framework
              calls it.

        Returns:
            tf.TensorShape
        """
        # Copied from the abstract seq2seq.CustomHelper class.
        return tensor_shape.TensorShape([])

    @property
    def sample_ids_dtype(self):
        """
        DType of the tensor returned by `sample`.

        Note:
            - Since the decoder does not output embeddings, this function is basically
              irrelevant.
            - However, it has to be implemented since the seq2seq decoding framework
              calls it.

        Returns:
            tf.DType
        """
        # Copied from the abstract seq2seq.CustomHelper class.
        return tf.int32

    def initialize(self, name=None):
        """
        Query information used to initialize the decoder RNN.

        This includes whether the decoding process is finished yet as well as the initial
        inputs to the RNN. The initial state of the decoding process has to be that
        decoding is not finished. As for the initial input, we use a zero vector as the
        <GO> frame.

        Arguments:
            name: Unused.

        Returns:
            (initial_finished, initial_inputs):
                initial_finished (tf.Tensor):
                    A tensor indicating for each sequence in the batch that decoding is
                    not finished. The shape is shape=(B), with B being the batch size.

                initial_inputs:
                    An all-zero tensor representing the <GO> frame used as the first
                    decoder input.
        """
        # When the decoder starts, there is no sequence in the batch that is finished.
        initial_finished = tf.tile([False], [self._batch_size])

        # The initial input for the decoder is considered to be a <GO> frame.
        # We will input a zero vector as the <GO> frame.
        initial_inputs = tf.zeros([self._batch_size, self._input_size], dtype=tf.float32)

        return initial_finished, initial_inputs

    def sample(self, time, outputs, state, name=None):
        """
        Takes outputs and emits sample ids.

        Note:
            - Since the decoder does not use embeddings, this function is basically
              irrelevant.
            - However, it has to be implemented since the seq2seq decoding framework
              calls it.

        Arguments:
            time: Unused.
            outputs: Unused.
            state: Unused.
            name: Unused.

        Returns:
            tf.Tensor
        """
        # Returning None raises "ValueError: x and y must both be non-None or both be None".
        # Returning some tensor of dtype=tf.int32 and arbitrary shape seems to be enough.
        return tf.zeros(1, dtype=tf.int32)

    def __is_decoding_finished(self, next_time, outputs):
        """
        Determine for each sequence in a batch if decoding is finished or not.

        Arguments:
            next_time:
                The time count of the following decoding step.

            outputs (tf.Tensor):
                Outputs of the last decoder step. The shape is expected to be
                shape=(B, O), with B being the batch size and O being the RNN's output
                size. Currently unused.

        Returns:
            tf.Tensor:
                A tensor indicating for each sequence in the batch whether decoding is
                finished. The shape is shape=(B), with B being the batch size.
        """
        if self._sequence_length is None:
            # Do not stop generating frames.
            finished = tf.tile([False], [self._batch_size])
        else:
            # Stop if the desired sequence length was reached.
            finished = (next_time >= self._sequence_length)

        return finished

    def next_inputs(self, time, outputs, state, sample_ids, name=None):
        """
        Query the next RNN inputs and RNN state as well as whether decoding is finished.

        Arguments:
            time:
                The time count of the previous decoding step.

            outputs (tf.Tensor):
                RNN outputs from the last decoding step. The shape is expected to be
                shape=(B, O), with B being the batch size and O being the RNN's output
                size.

            state:
                RNN state from the last decoding step.

            sample_ids: Unused.
            name: Unused.

        Returns:
            (finished, next_inputs, next_state):
                finished (tf.Tensor):
                    A tensor indicating for each sequence in the batch if decoding is
                    finished. The shape is shape=(B), with B being the batch size.

                next_inputs (tf.Tensor):
                    Tensor containing the inputs for the next step. The shape is
                    shape=(B, input_size), with B being the batch size.

                next_state:
                    RNN state.
        """
        del sample_ids  # Unused by next_inputs.

        # Check if decoding is finished.
        finished = self.__is_decoding_finished(next_time=time + 1, outputs=outputs)

        # Use the last step's outputs as the next step's inputs.
        # When using the Tacotron reduction factor r, the RNN produces an output of size
        # r * `input_size` but only takes inputs of size `input_size`.
        # We therefore pass only the last of the r generated frames (every r'th frame)
        # to the next decoding step.
        next_inputs = outputs[:, -self._input_size:]

        # Use the resulting state from the last step as the next state.
        next_state = state

        return finished, next_inputs, next_state
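

# Hypothetical wiring sketch (added for illustration; `decoder_cell`, `n_mels` and
# `max_frames` are assumptions standing in for the model's actual decoder cell,
# feature size and frame limit). During inference the helper plugs into the
# tf.contrib.seq2seq decoding loop roughly like this:
#
#   helper = TacotronInferenceHelper(batch_size=batch_size,
#                                    input_size=n_mels,
#                                    max_iterations=max_frames)
#   decoder = seq2seq.BasicDecoder(cell=decoder_cell,
#                                  helper=helper,
#                                  initial_state=decoder_cell.zero_state(
#                                      batch_size, dtype=tf.float32))
#   outputs, _, _ = seq2seq.dynamic_decode(decoder)
#   mel_frames = outputs.rnn_output  # shape=(B, T, O), with O = r * n_mels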


class TacotronTrainingHelper(seq2seq.Helper):
    """
    Custom sequence-to-sequence training helper for the Tacotron model.

    This helper handles proper initialization of the decoder RNN's initial state and is
    responsible for feeding ground truth frames as inputs to the decoder during training.
    It feeds every r'th ground truth frame as input to the next decoding step, as
    described in [1], section "3.3 Decoder".

    See: "Tacotron: Towards End-to-End Speech Synthesis"
        * Source: [1] https://arxiv.org/abs/1703.10135
    """

    def __init__(self, batch_size, outputs, input_size, reduction_factor):
        """
        Creates a TacotronTrainingHelper instance.

        Arguments:
            batch_size (tf.Dimension):
                Size of the current batch.

            outputs (tf.Tensor):
                Ground truth Mel spectrogram data used for feeding ground truth frames
                during training. The shape is expected to be shape=(B, T_spec, n_mels),
                with B being the batch size and T_spec being the number of frames in
                the spectrogram.

            input_size (int):
                The size of the features in the last dimension of `outputs`.
                This has to be equal to n_mels.

            reduction_factor (int):
                The Tacotron reduction factor to use. Used to feed every r'th ground
                truth frame.
        """
        with tf.name_scope("TacotronTrainingHelper"):
            # Copy every r'th frame from the ground truth spectrogram.
            # => shape=(B, T_spec // reduction_factor, n_mels)
            self.outputs = outputs[:, reduction_factor - 1::reduction_factor, :]
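
            # Added illustration (not part of the original code): with
            # reduction_factor r = 2 and T_spec = 6, the slicing above keeps the
            # frames at indices 1, 3 and 5, i.e. the last frame of each group of
            # r consecutive frames, so the decoder runs for T_spec // r = 3 steps.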

            self._input_size = input_size
            self._reduction_factor = reduction_factor
            self._batch_size = batch_size

            # Get the number of time frames the decoder has to produce.
            # Note that we will produce sequences over the entire (padded) length of the
            # batch. Maybe this way the network will learn to generate silence after
            # producing the actual sentence.
            n_target_steps = tf.shape(self.outputs)[1]

            # Create a tensor of length batch_size with each field containing n_target_steps.
            self._sequence_length = tf.tile([n_target_steps], [self._batch_size])

    @property
    def sequence_length(self):
        """
        Get the sequence lengths.

        Returns:
            tf.Tensor:
                Tensor containing the sequence lengths of each entry in the batch. The
                shape is shape=(B), with B being the batch size.
        """
        return self._sequence_length

    @property
    def batch_size(self):
        """
        Get the batch size of the current batch.

        Returns:
            tf.Dimension:
                Batch size.
        """
        return self._batch_size

    @property
    def sample_ids_shape(self):
        """
        Shape of the tensor returned by `sample`, excluding the batch dimension.

        Note:
            - Since the decoder does not output embeddings, this function is basically
              irrelevant.
            - However, it has to be implemented since the seq2seq decoding framework
              calls it.

        Returns:
            tf.TensorShape
        """
        # Copied from the seq2seq.TrainingHelper class.
        return tensor_shape.TensorShape([])

    @property
    def sample_ids_dtype(self):
        """
        DType of the tensor returned by `sample`.

        Note:
            - Since the decoder does not output embeddings, this function is basically
              irrelevant.
            - However, it has to be implemented since the seq2seq decoding framework
              calls it.

        Returns:
            tf.DType
        """
        # Copied from the seq2seq.TrainingHelper class.
        return tf.int32

    def initialize(self, name=None):
        """
        Query information used to initialize the decoder RNN.

        This includes whether the decoding process is finished yet as well as the initial
        inputs to the RNN. The initial state of the decoding process has to be that
        decoding is not finished. As for the initial input, we use a zero vector as the
        <GO> frame.

        Arguments:
            name: Unused.

        Returns:
            (initial_finished, initial_inputs):
                initial_finished (tf.Tensor):
                    A tensor indicating for each sequence in the batch that decoding is
                    not finished. The shape is shape=(B), with B being the batch size.

                initial_inputs:
                    An all-zero tensor representing the <GO> frame used as the first
                    decoder input.
        """
        with tf.name_scope(name, "TacotronTrainingHelperInitialize"):
            # When the decoder starts, there is no sequence in the batch that is finished.
            initial_finished = tf.tile([False], [self._batch_size])

            # The initial input for the decoder is considered to be a <GO> frame.
            # We will input a zero vector as the <GO> frame.
            initial_inputs = tf.zeros([self._batch_size, self._input_size], dtype=tf.float32)

        return initial_finished, initial_inputs

    def sample(self, time, outputs, name=None, **unused_kwargs):
        """
        Takes outputs and emits sample ids.

        Note:
            - Since the decoder does not use embeddings, this function is basically
              irrelevant.
            - However, it has to be implemented since the seq2seq decoding framework
              calls it.

        Arguments:
            time: Unused.
            outputs: Unused.
            name: Unused.
            **unused_kwargs: Unused.

        Returns:
            tf.Tensor
        """
        # Returning some tensor of dtype=tf.int32 and arbitrary shape seems to be enough.
        return tf.zeros(1, dtype=tf.int32)

    def next_inputs(self, time, outputs, state, name=None, **unused_kwargs):
        """
        Query the next RNN inputs and RNN state as well as whether decoding is finished.

        Arguments:
            time:
                Index in the time axis from the last decoding step.

            outputs (tf.Tensor):
                RNN outputs from the last decoding step. The shape is expected to be
                shape=(B, O), with B being the batch size and O being the RNN's output
                size.

            state:
                RNN state from the last decoding step.

            name: Unused.
            **unused_kwargs: Unused.

        Returns:
            (finished, next_inputs, next_state):
                finished (tf.Tensor):
                    A tensor indicating for each sequence in the batch if decoding is
                    finished. The shape is shape=(B), with B being the batch size.

                next_inputs (tf.Tensor):
                    Tensor containing the inputs for the next step. The shape is
                    shape=(B, input_size), with B being the batch size.

                next_state:
                    RNN state.
        """
        with tf.name_scope("TacotronTrainingHelperNextInputs"):
            # Increment the time index.
            next_time = time + 1

            # Query the finished state for each sequence in the batch.
            finished = (next_time >= self._sequence_length)

            # During training we do not use the last step's outputs (step t) as the next
            # step's inputs. Instead we feed the ground truth frame from the Mel
            # spectrogram (every r'th frame) that corresponds to the ground truth output
            # at step t.
            next_inputs = self.outputs[:, time, :]

            # Use the resulting state from the last step as the next state.
            next_state = state

        return finished, next_inputs, next_state
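

if __name__ == "__main__":
    # Minimal smoke test (added for illustration, not part of the original module).
    # A plain GRUCell stands in for the full attention-wrapped Tacotron decoder
    # cell, and all sizes below are assumptions chosen only to exercise the helper.
    import numpy as np

    n_mels = 80
    batch_size = 2
    reduction_factor = 2
    n_frames = 20  # T_spec of the fake ground truth spectrogram.

    # Random stand-in for a ground truth Mel spectrogram batch.
    mel_targets = tf.constant(np.random.rand(batch_size, n_frames, n_mels),
                              dtype=tf.float32)

    cell = tf.nn.rnn_cell.GRUCell(num_units=n_mels)
    helper = TacotronTrainingHelper(batch_size=batch_size,
                                    outputs=mel_targets,
                                    input_size=n_mels,
                                    reduction_factor=reduction_factor)
    decoder = seq2seq.BasicDecoder(cell=cell,
                                   helper=helper,
                                   initial_state=cell.zero_state(batch_size, tf.float32))
    decoder_outputs, _, _ = seq2seq.dynamic_decode(decoder)

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        frames = session.run(decoder_outputs.rnn_output)
        # One decoder step per group of `reduction_factor` target frames.
        print(frames.shape)  # (2, 10, 80)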