"""
Custom modification of TF losses to enable generation of DAML synthetic negatives
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from tensorflow.python.framework import dtypes
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import math_ops
def triplet_loss(anchor_embedding, positive_embedding, negative_embedding,
                 margin=1.0, scope="triplet_loss"):
  with tf.variable_scope(scope):
    # Squared Euclidean distances between the anchor and the positive /
    # negative embeddings, summed over all elements.
    positive_distance = tf.reduce_sum(
        tf.squared_difference(positive_embedding, anchor_embedding))
    negative_distance = tf.reduce_sum(
        tf.squared_difference(negative_embedding, anchor_embedding))
    # Hinge: penalize unless the negative is at least `margin` farther from
    # the anchor than the positive.
    loss = tf.maximum(0., margin + positive_distance - negative_distance)
    return tf.reduce_sum(loss), tf.reduce_sum(positive_distance), tf.reduce_sum(negative_distance)
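# --- Added usage sketch (not part of the original file): how `triplet_loss`
# might be wired into a TF1 graph. The placeholder names and the embedding
# dimension of 128 are assumptions for illustration.
def _triplet_loss_usage_sketch():
  anchor = tf.placeholder(tf.float32, [None, 128], name="anchor")
  positive = tf.placeholder(tf.float32, [None, 128], name="positive")
  negative = tf.placeholder(tf.float32, [None, 128], name="negative")
  # Returns the hinge loss plus the summed positive/negative distances.
  return triplet_loss(anchor, positive, negative, margin=1.0)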
def pairwise_distance(feature, squared=False):
  """Computes the pairwise distance matrix with numerical stability.

  output[i, j] = || feature[i, :] - feature[j, :] ||_2

  Args:
    feature: 2-D Tensor of size [number of data, feature dimension].
    squared: Boolean, whether or not to square the pairwise distances.

  Returns:
    pairwise_distances: 2-D Tensor of size [number of data, number of data].
  """
  pairwise_distances_squared = math_ops.add(
      math_ops.reduce_sum(math_ops.square(feature), axis=[1], keepdims=True),
      math_ops.reduce_sum(
          math_ops.square(array_ops.transpose(feature)),
          axis=[0],
          keepdims=True)) - 2.0 * math_ops.matmul(feature,
                                                  array_ops.transpose(feature))

  # Deal with numerical inaccuracies. Set small negatives to zero.
  pairwise_distances_squared = math_ops.maximum(pairwise_distances_squared, 0.0)
  # Get the mask where the zero distances are at.
  error_mask = math_ops.less_equal(pairwise_distances_squared, 0.0)

  # Optionally take the sqrt.
  if squared:
    pairwise_distances = pairwise_distances_squared
  else:
    pairwise_distances = math_ops.sqrt(
        pairwise_distances_squared + math_ops.to_float(error_mask) * 1e-16)

  # Undo conditionally adding 1e-16.
  pairwise_distances = math_ops.multiply(
      pairwise_distances, math_ops.to_float(math_ops.logical_not(error_mask)))

  num_data = array_ops.shape(feature)[0]
  # Explicitly set diagonals to zero.
  mask_offdiagonals = array_ops.ones_like(pairwise_distances) - array_ops.diag(
      array_ops.ones([num_data]))
  pairwise_distances = math_ops.multiply(pairwise_distances, mask_offdiagonals)
  return pairwise_distances
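# --- Added self-check sketch (illustrative; the constant batch below is an
# assumption): `pairwise_distance` computes the squared distances for all
# pairs at once via ||a||^2 + ||b||^2 - 2 a.b.
def _pairwise_distance_check_sketch():
  feature = tf.constant([[0.0, 0.0], [3.0, 4.0], [6.0, 8.0]])
  # Expected (squared): [[0, 25, 100], [25, 0, 25], [100, 25, 0]], e.g.
  # dist[0, 1] = 3^2 + 4^2 = 25; the diagonal is explicitly zeroed.
  return pairwise_distance(feature, squared=True)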
def masked_maximum(data, mask, dim=1):
  """Computes the axis wise maximum over chosen elements.

  Args:
    data: 2-D float `Tensor` of size [n, m].
    mask: 2-D Boolean `Tensor` of size [n, m].
    dim: The dimension over which to compute the maximum.

  Returns:
    masked_maximums: N-D `Tensor`.
      The maximized dimension is of size 1 after the operation.
  """
  # Shift by the axis minimum so that masked-out entries (multiplied by 0)
  # can never exceed a genuine candidate, then shift back.
  axis_minimums = math_ops.reduce_min(data, dim, keepdims=True)
  masked_maximums = math_ops.reduce_max(
      math_ops.multiply(data - axis_minimums, mask), dim,
      keepdims=True) + axis_minimums
  return masked_maximums
def masked_minimum(data, mask, dim=1):
  """Computes the axis wise minimum over chosen elements.

  Args:
    data: 2-D float `Tensor` of size [n, m].
    mask: 2-D Boolean `Tensor` of size [n, m].
    dim: The dimension over which to compute the minimum.

  Returns:
    masked_minimums: N-D `Tensor`.
      The minimized dimension is of size 1 after the operation.
  """
  # Mirror image of masked_maximum: shift by the axis maximum so masked-out
  # entries can never undercut a genuine candidate.
  axis_maximums = math_ops.reduce_max(data, dim, keepdims=True)
  masked_minimums = math_ops.reduce_min(
      math_ops.multiply(data - axis_maximums, mask), dim,
      keepdims=True) + axis_maximums
  return masked_minimums
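# --- Added illustration (the sample tensors are assumptions): with the
# middle entry masked out, the reductions pick the extremum among the
# remaining entries only.
def _masked_reduction_sketch():
  data = tf.constant([[1.0, 5.0, 3.0]])
  mask = tf.constant([[1.0, 0.0, 1.0]])  # ignore the middle entry
  row_max = masked_maximum(data, mask)   # -> [[3.0]], not [[5.0]]
  row_min = masked_minimum(data, mask)   # -> [[1.0]]
  return row_max, row_min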
def triplet_semihard_loss(labels, embeddings, margin=1.0):
  """Computes the triplet loss with semi-hard negative mining.

  The loss encourages the positive distances (between a pair of embeddings
  with the same labels) to be smaller than the minimum negative distance
  among those negatives that are at least the positive distance plus the
  margin constant (the semi-hard negatives) in the mini-batch. If no such
  negative exists, the largest negative distance is used instead.

  See: https://arxiv.org/abs/1503.03832.

  Args:
    labels: 1-D tf.int32 `Tensor` with shape [batch_size] of
      multiclass integer labels.
    embeddings: 2-D float `Tensor` of embedding vectors. Embeddings should
      be l2 normalized.
    margin: Float, margin term in the loss definition.

  Returns:
    triplet_loss: tf.float32 scalar.
    end_points: Dict of intermediate tensors, useful for debugging.
  """
  # Reshape [batch_size] label tensor to a [batch_size, 1] label tensor.
  lshape = array_ops.shape(labels)
  assert lshape.shape == 1
  labels = array_ops.reshape(labels, [lshape[0], 1])

  # Build pairwise squared distance matrix.
  pdist_matrix = pairwise_distance(embeddings, squared=True)
  # Build pairwise binary adjacency matrix.
  adjacency = math_ops.equal(labels, array_ops.transpose(labels))
  # Invert so we can select negatives only.
  adjacency_not = math_ops.logical_not(adjacency)

  batch_size = array_ops.size(labels)

  # Compute the mask: for each anchor-positive pair (row of the tiled
  # matrix), mark negatives whose distance exceeds the positive distance,
  # i.e. the semi-hard condition. For a batch size of 128 the tiled matrix
  # is [16384, 128].
  pdist_matrix_tile = array_ops.tile(pdist_matrix, [batch_size, 1])
  mask = math_ops.logical_and(
      array_ops.tile(adjacency_not, [batch_size, 1]),
      math_ops.greater(
          pdist_matrix_tile, array_ops.reshape(
              array_ops.transpose(pdist_matrix), [-1, 1])))
  # mask_final[i, j] is True iff pair (i, j) has at least one semi-hard
  # negative available.
  mask_final = array_ops.reshape(
      math_ops.greater(
          math_ops.reduce_sum(
              math_ops.cast(mask, dtype=dtypes.float32), 1, keepdims=True),
          0.0), [batch_size, batch_size])
  mask_final = array_ops.transpose(mask_final)

  adjacency_not = math_ops.cast(adjacency_not, dtype=dtypes.float32)
  mask = math_ops.cast(mask, dtype=dtypes.float32)

  # negatives_outside: smallest D_an where D_an > D_ap.
  negatives_outside = array_ops.reshape(
      masked_minimum(pdist_matrix_tile, mask), [batch_size, batch_size])
  negatives_outside = array_ops.transpose(negatives_outside)

  # negatives_inside: largest D_an.
  negatives_inside = array_ops.tile(
      masked_maximum(pdist_matrix, adjacency_not), [1, batch_size])
  semi_hard_negatives = array_ops.where(
      mask_final, negatives_outside, negatives_inside)

  loss_mat = math_ops.add(margin, pdist_matrix - semi_hard_negatives)

  mask_positives = math_ops.cast(
      adjacency, dtype=dtypes.float32) - array_ops.diag(
          array_ops.ones([batch_size]))

  # In lifted-struct, the authors multiply by 0.5 for the upper triangular;
  # in semi-hard, all positive pairs except the diagonal are taken.
  num_positives = math_ops.reduce_sum(mask_positives)

  end_points = {}
  end_points['num_positives'] = num_positives
  end_points['semi_hard_negatives'] = semi_hard_negatives
  end_points['negatives_outside'] = negatives_outside
  end_points['negatives_inside'] = negatives_inside
  end_points['mask'] = mask
  end_points['pdist_matrix_tile'] = pdist_matrix_tile
  end_points['adjacency_not'] = adjacency_not
  end_points['mask_positives'] = mask_positives
  end_points['pdist_matrix'] = pdist_matrix
  end_points['mask_final'] = mask_final
  end_points['loss_mat'] = loss_mat

  triplet_loss = math_ops.truediv(
      math_ops.reduce_sum(
          math_ops.maximum(
              math_ops.multiply(loss_mat, mask_positives), 0.0)),
      num_positives,
      name='triplet_semihard_loss')
  return triplet_loss, end_points
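# --- Added usage sketch (the placeholders and the embedding size are
# assumptions): `triplet_semihard_loss` expects integer class labels and,
# per its docstring, l2-normalized embeddings.
def _triplet_semihard_usage_sketch():
  labels = tf.placeholder(tf.int32, [None], name="labels")
  raw = tf.placeholder(tf.float32, [None, 128], name="embeddings")
  embeddings = tf.nn.l2_normalize(raw, axis=1)
  return triplet_semihard_loss(labels, embeddings, margin=1.0)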
def lifted_struct_loss(adjacency, embeddings, margin=1.0):
  """Computes the lifted structured loss.

  The loss encourages the positive distances (between a pair of embeddings
  with the same labels) to be smaller than any negative distances (between a
  pair of embeddings with different labels) in the mini-batch in a way
  that is differentiable with respect to the embedding vectors.

  See: https://arxiv.org/abs/1511.06452.

  Args:
    adjacency: 2-D Boolean `Tensor` of shape [batch_size, batch_size] where
      adjacency[i, j] is True iff examples i and j share a label. (DAML
      modification: passed in directly instead of the original `labels`
      argument; see the commented-out lines below.)
    embeddings: 2-D float `Tensor` of embedding vectors. Embeddings should not
      be l2 normalized.
    margin: Float, margin term in the loss definition.

  Returns:
    lifted_loss: tf.float32 scalar.
    end_points: Dict of intermediate tensors, useful for debugging.
  """
  # The original TF code built the adjacency matrix from a [batch_size]
  # label tensor; here the caller supplies it directly:
  # lshape = array_ops.shape(labels)
  # assert lshape.shape == 1
  # labels = array_ops.reshape(labels, [lshape[0], 1])
  # adjacency = math_ops.equal(labels, array_ops.transpose(labels))

  # Build pairwise (non-squared) distance matrix.
  pairwise_distances = pairwise_distance(embeddings)
  # Invert so we can select negatives only.
  adjacency_not = math_ops.logical_not(adjacency)
  # The static batch size is needed below, so the batch dimension of
  # `adjacency` must be known at graph-construction time.
  batch_size = adjacency.get_shape().as_list()[0]

  # Calculate alpha - D(i, k).
  diff = margin - pairwise_distances
  mask = math_ops.cast(adjacency_not, dtype=dtypes.float32)

  # Safe maximum: temporarily shift negative distances above zero before
  # taking the max, so the max is taken only among negatives.
  # Compute the row minimum for each element in the batch.
  row_minimums = math_ops.reduce_min(diff, 1, keepdims=True)
  # Get the row negative maximum distances for each image.
  row_negative_maximums = math_ops.reduce_max(
      math_ops.multiply(diff - row_minimums, mask), 1,
      keepdims=True) + row_minimums

  # Compute the loss.
  # Keep track of matrix of maximums where M_ij = max(m_i, m_j)
  # where m_i is the max of alpha - negative D_i's.
  # This matches the Caffe loss layer implementation at:
  # https://github.com/rksltnl/Caffe-Deep-Metric-Learning-CVPR16/blob/0efd7544a9846f58df923c8b992198ba5c355454/src/caffe/layers/lifted_struct_similarity_softmax_layer.cpp  # pylint: disable=line-too-long
  max_elements = math_ops.maximum(
      row_negative_maximums, array_ops.transpose(row_negative_maximums))
  diff_tiled = array_ops.tile(diff, [batch_size, 1])
  mask_tiled = array_ops.tile(mask, [batch_size, 1])
  max_elements_vect = array_ops.reshape(
      array_ops.transpose(max_elements), [-1, 1])

  loss_exp_left = array_ops.reshape(
      math_ops.reduce_sum(
          math_ops.multiply(
              math_ops.exp(diff_tiled - max_elements_vect), mask_tiled),
          1,
          keepdims=True), [batch_size, batch_size])

  loss_mat = max_elements + math_ops.log(
      loss_exp_left + array_ops.transpose(loss_exp_left))
  # Add the positive distance.
  loss_mat += pairwise_distances

  mask_positives = math_ops.cast(
      adjacency, dtype=dtypes.float32) - array_ops.diag(
          array_ops.ones([batch_size]))

  # *0.5 for upper triangular, and another *0.5 for the 1/2 factor of loss^2.
  num_positives = math_ops.reduce_sum(mask_positives) / 2.0

  end_points = {}
  end_points['num_positives'] = num_positives
  end_points['pairwise_distances'] = pairwise_distances
  end_points['mask'] = mask
  end_points['max_elements'] = max_elements
  end_points['adjacency_not'] = adjacency_not
  end_points['mask_positives'] = mask_positives
  end_points['row_minimums'] = row_minimums
  end_points['loss_exp_left'] = loss_exp_left
  end_points['max_elements_vect'] = max_elements_vect
  end_points['loss_mat'] = loss_mat
  end_points['adjacency'] = adjacency
  end_points['row_negative_maximums'] = row_negative_maximums
  end_points['embeddings'] = embeddings
  end_points['diff'] = diff

  lifted_loss = math_ops.truediv(
      0.25 * math_ops.reduce_sum(
          math_ops.square(
              math_ops.maximum(
                  math_ops.multiply(loss_mat, mask_positives), 0.0))),
      num_positives,
      name='liftedstruct_loss')
  return lifted_loss, end_points
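# --- Added usage sketch (names and sizes are assumptions): this modified
# `lifted_struct_loss` takes an adjacency matrix rather than labels, so a
# caller can build it from labels (or splice in entries for DAML synthetic
# negatives) before the call. Because the function reads the static batch
# size via get_shape(), the batch dimension must be known at graph time.
def _lifted_struct_usage_sketch(batch_size=32, embed_dim=128):
  labels = tf.placeholder(tf.int32, [batch_size], name="labels")
  embeddings = tf.placeholder(tf.float32, [batch_size, embed_dim],
                              name="embeddings")
  # Label-based adjacency, mirroring the commented-out original code above.
  labels_col = tf.reshape(labels, [batch_size, 1])
  adjacency = tf.equal(labels_col, tf.transpose(labels_col))
  return lifted_struct_loss(adjacency, embeddings, margin=1.0)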