attention.py
import tensorflow as tf


class attention_mechanism():

    @staticmethod
    def attention(inputs, attention_size, time_major=False, return_alphas=False):
        """
        Attention mechanism layer which reduces RNN/Bi-RNN outputs with an attention vector.
        The idea was proposed in the article by Z. Yang et al., "Hierarchical Attention Networks
        for Document Classification", 2016: http://www.aclweb.org/anthology/N16-1174.
        Args:
            inputs: The Attention inputs.
                Matches the outputs of the RNN/Bi-RNN layer (not the final state):
                    In case of RNN, this must be the RNN outputs `Tensor`:
                        If time_major == False (default), this must be a tensor of shape:
                            `[batch_size, max_time, cell.output_size]`.
                        If time_major == True, this must be a tensor of shape:
                            `[max_time, batch_size, cell.output_size]`.
                    In case of Bidirectional RNN, this must be a tuple (outputs_fw, outputs_bw)
                    containing the forward and the backward RNN outputs `Tensor`.
                        If time_major == False (default),
                            outputs_fw is a `Tensor` shaped:
                            `[batch_size, max_time, cell_fw.output_size]`
                            and outputs_bw is a `Tensor` shaped:
                            `[batch_size, max_time, cell_bw.output_size]`.
                        If time_major == True,
                            outputs_fw is a `Tensor` shaped:
                            `[max_time, batch_size, cell_fw.output_size]`
                            and outputs_bw is a `Tensor` shaped:
                            `[max_time, batch_size, cell_bw.output_size]`.
            attention_size: Linear size of the attention weights.
            time_major: The shape format of the `inputs` Tensors.
                If true, these `Tensors` must be shaped `[max_time, batch_size, depth]`.
                If false, these `Tensors` must be shaped `[batch_size, max_time, depth]`.
                Using `time_major = True` is a bit more efficient because it avoids
                transposes at the beginning and end of the RNN calculation. However,
                most TensorFlow data is batch-major, so by default this function
                accepts input and emits output in batch-major form.
            return_alphas: Whether to return the attention coefficients along with the layer's output.
                Used for visualization purposes.
        Returns:
            The attention output `Tensor`.
            In case of RNN, this will be a `Tensor` shaped:
                `[batch_size, cell.output_size]`.
            In case of Bidirectional RNN, this will be a `Tensor` shaped:
                `[batch_size, cell_fw.output_size + cell_bw.output_size]`.
        """
        if isinstance(inputs, tuple):
            # In case of Bi-RNN, concatenate the forward and the backward RNN outputs.
            inputs = tf.concat(inputs, 2)

        if time_major:
            # (T, B, D) => (B, T, D)
            inputs = tf.transpose(inputs, [1, 0, 2])

        inputs_shape = inputs.shape
        sequence_length = inputs_shape[1].value  # length of sequences processed in the antecedent RNN layer
        hidden_size = inputs_shape[2].value  # hidden size of the RNN layer

        # Attention mechanism: a one-layer MLP scores every time step against a trainable context vector u_omega.
        W_omega = tf.Variable(tf.random_normal([hidden_size, attention_size], stddev=0.1))
        b_omega = tf.Variable(tf.random_normal([attention_size], stddev=0.1))
        u_omega = tf.Variable(tf.random_normal([attention_size], stddev=0.1))

        # v: [batch_size * sequence_length, attention_size]
        v = tf.tanh(tf.matmul(tf.reshape(inputs, [-1, hidden_size]), W_omega) + tf.reshape(b_omega, [1, -1]))
        # vu: raw attention scores, one scalar per time step
        vu = tf.matmul(v, tf.reshape(u_omega, [-1, 1]))
        # Softmax over the time axis yields the attention weights alphas: [batch_size, sequence_length]
        exps = tf.reshape(tf.exp(vu), [-1, sequence_length])
        alphas = exps / tf.reshape(tf.reduce_sum(exps, 1), [-1, 1])

        # The (Bi-)RNN output is reduced with the attention vector: a weighted sum over time.
        output = tf.reduce_sum(inputs * tf.reshape(alphas, [-1, sequence_length, 1]), 1)

        if not return_alphas:
            return output
        else:
            return output, alphas
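

# Minimal usage sketch: one way to wire this layer to a bidirectional GRU in
# TensorFlow 1.x graph mode. SEQ_LEN, EMBED_DIM, HIDDEN and ATTENTION_SIZE are
# illustrative values, and GRUCell is an arbitrary cell choice; swap in whatever
# the surrounding model uses.
if __name__ == "__main__":
    import numpy as np

    SEQ_LEN, EMBED_DIM, HIDDEN, ATTENTION_SIZE = 50, 128, 64, 32

    # Batch-major inputs; max_time must be static because attention() reads inputs.shape[1].value.
    x = tf.placeholder(tf.float32, [None, SEQ_LEN, EMBED_DIM])

    cell_fw = tf.nn.rnn_cell.GRUCell(HIDDEN)
    cell_bw = tf.nn.rnn_cell.GRUCell(HIDDEN)
    # outputs is a (forward, backward) tuple, which is exactly what attention() accepts for a Bi-RNN.
    outputs, _ = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, x, dtype=tf.float32)

    context, alphas = attention_mechanism.attention(outputs, ATTENTION_SIZE, return_alphas=True)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        batch = np.random.rand(4, SEQ_LEN, EMBED_DIM).astype(np.float32)
        ctx, a = sess.run([context, alphas], feed_dict={x: batch})
        print(ctx.shape, a.shape)  # expected: (4, 128) and (4, 50)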