# Step 4: Build the skip-gram graph, restoring the pre-trained embeddings and NCE parameters.
# ***************************************************************************************** #
batch_size = 128
embedding_size = 200 # Dimension of the embedding vector.
skip_window = 5 # How many words to consider left and right.
num_skips = 8 # How many times to reuse an input to generate a label.
num_sampled = 64 # Number of negative examples to sample.
num_steps = 200501
learning_rate = 0.05
# ***************************************************************************************** #
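# Note: each unique (input0, input1) pair appears num_skips times in a batch, so a
# validation batch of 128 covers batch_size / num_skips = 16 unique pairs (the
# accuracy loop below strides through the batch accordingly).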
###############
#os.environ["CUDA_VISIBLE_DEVICES"] = '0, 1, 2, 3, 4, 5, 6, 7'  # restrict which GPUs are visible
config1 = tf.ConfigProto()
config1.allow_soft_placement = True
config1.gpu_options.allocator_type = 'BFC'
config1.gpu_options.per_process_gpu_memory_fraction = 0.95  # allocate at most 95% of GPU memory
config1.gpu_options.allow_growth = True  # allocate GPU memory dynamically
################
# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent. These 3 variables are used only for
# displaying model accuracy; they don't affect the calculation.
valid_size = 16 # Random set of words to evaluate similarity on.
valid_window = 100 # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
graph = tf.Graph()
with graph.as_default():
  # Input data.
  with tf.name_scope('inputs'):
    #train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_inputs0 = tf.placeholder(tf.int32, shape=[batch_size])
    train_inputs1 = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

  loaded_embeddings = np.load('w2v_embeddings.npy')
  loaded_nce_weights = np.load('w2v_nce_weights.npy')
  loaded_nce_bias = np.load('w2v_nce_bias.npy')
  print("loaded embeddings of shape: ", np.shape(loaded_embeddings))
  print("loaded nce weights of shape: ", np.shape(loaded_nce_weights))
  print("loaded nce bias of shape: ", np.shape(loaded_nce_bias))
  # Ops and variables pinned to the CPU because of missing GPU implementation
  #with tf.device('/cpu:0'):
  # Look up embeddings for inputs.
  with tf.name_scope('embeddings'):
    #embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    embeddings = tf.constant(loaded_embeddings)
    #embed = tf.nn.embedding_lookup(embeddings, train_inputs) # TODO: match to poc input
    embed0 = tf.nn.embedding_lookup(embeddings, train_inputs0)
    embed1 = tf.nn.embedding_lookup(embeddings, train_inputs1)
    embed = embed0 + embed1
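    # The two input-word embeddings are summed into a single vector, so the pair
    # (input0, input1) is scored jointly against every candidate context word.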
  # Load the pre-trained NCE weights and biases as constants.
  with tf.name_scope('weights'):
    nce_weights = tf.constant(loaded_nce_weights)
  with tf.name_scope('biases'):
    nce_biases = tf.constant(loaded_nce_bias)
  # Compute the average NCE loss for the batch.
  # tf.nce_loss automatically draws a new sample of the negative labels each
  # time we evaluate the loss.
  # Explanation of the meaning of NCE loss:
  # http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
  with tf.name_scope('loss'):
    loss = tf.reduce_mean(
        tf.nn.nce_loss(
            weights=nce_weights,
            biases=nce_biases,
            labels=train_labels,
            inputs=embed,  # TODO: check proper input
            num_sampled=num_sampled,
            num_classes=vocabulary_size))
  # Add the loss value as a scalar to summary.
  tf.summary.scalar('loss', loss)
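  # Unlike the sampled NCE loss above, the validation loss below scores every word
  # in the vocabulary: full logits = embed * nce_weights^T + nce_biases, compared
  # against the one-hot target label with a sigmoid cross-entropy.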
  with tf.name_scope('validation_loss'):
    #logits = tf.matmul(validation_embed, tf.transpose(nce_weights))
    logits = tf.matmul(embed, tf.transpose(nce_weights))
    logits = tf.nn.bias_add(logits, nce_biases)
    labels_one_hot = tf.one_hot(train_labels, vocabulary_size, axis=1)
    labels_one_hot = tf.reshape(labels_one_hot, [batch_size, vocabulary_size])
    validation_loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels_one_hot, logits=logits)
    validation_loss = tf.reduce_sum(validation_loss, axis=1)
    print('validation loss: ', validation_loss)
    #TODO: check if reduce sum or reduce mean
    validation_loss = tf.reduce_mean(validation_loss)
  # Add the validation loss value as a scalar to summary.
  tf.summary.scalar('validation_loss', validation_loss)
  with tf.name_scope('predictions'):
    predictions = tf.nn.sigmoid(logits)
    print(predictions)
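    # predictions has shape [batch_size, vocabulary_size]; its argmax over axis 1
    # (taken in the evaluation loop below) is the model's best guess for a context
    # word of the input pair.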
  norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
  normalized_embeddings = embeddings / norm
  valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings,
                                            valid_dataset)
  similarity = tf.matmul(
      valid_embeddings, normalized_embeddings, transpose_b=True)
  # Merge all summaries.
  merged = tf.summary.merge_all()
  # Add variable initializer.
  init = tf.global_variables_initializer()
# Step 5: Evaluate the restored model on the validation set.
with tf.Session(config=config1, graph=graph) as session:
  # Open a writer to write summaries.
  writer = tf.summary.FileWriter(FLAGS.log_dir, session.graph)
  # We must initialize all variables before we use them.
  init.run()
  print('Initialized')
  # Define metadata variable.
  run_metadata = tf.RunMetadata()
  # Calculate the loss and prediction accuracy on the validation set.
  average_validation_loss = 0
  average_accuracy = 0
  for i in range(int(validation_size / batch_size)):
    validation_batch_inputs0, validation_batch_inputs1, validation_batch_labels, validation_batch_context_dict = generate_validation_batch(
        batch_size, num_skips, skip_window)
    #validation_batch_inputs0 = [i[0] for i in validation_batch_inputs]
    #validation_batch_inputs1 = [i[1] for i in validation_batch_inputs]
    #validation_feed_dict = {validation_inputs0: validation_batch_inputs0, validation_inputs1: validation_batch_inputs1, validation_labels: validation_batch_labels}
    validation_feed_dict = {train_inputs0: validation_batch_inputs0,
                            train_inputs1: validation_batch_inputs1,
                            train_labels: validation_batch_labels}
    # separate dict into two lists of input words
    # get the embeddings for each list
    # multiply the embeddings
    # flatten the result
    # multiply by nce weights
    # add nce bias
    # run tf.nn.sigmoid
    # get argmax (id of the word)
    # check if the id is in the context list
    # calculate accuracy
    # add to avg accuracy
    validation_loss_value, context_predictions = session.run(
        [validation_loss, predictions], feed_dict=validation_feed_dict, run_metadata=run_metadata)
    average_validation_loss = average_validation_loss + validation_loss_value
    # Each unique (input0, input1) pair is repeated num_skips times in the batch,
    # so evaluate the prediction only once per unique pair.
    unique_tuple_indices = np.arange(0, batch_size, num_skips)
    unique_predictions = context_predictions[unique_tuple_indices]
    unique_tuples = np.array(list(zip(validation_batch_inputs0, validation_batch_inputs1)))[unique_tuple_indices]
    unique_predictions_argmax = np.argmax(unique_predictions, axis=1)
    correct_prediction_count = 0
    # Use a separate loop variable so the outer batch counter i is not shadowed.
    for j in range(len(unique_tuples)):
      unique_tuple = tuple(unique_tuples[j])
      prediction = unique_predictions_argmax[j]
      context_words = validation_batch_context_dict[unique_tuple]
      if prediction in context_words:
        correct_prediction_count += 1
    average_accuracy += correct_prediction_count / len(unique_tuples)
  average_validation_loss = average_validation_loss / (validation_size / batch_size)
  average_accuracy /= (validation_size / batch_size)
  print("Average Validation Loss over %d batches: %.5f" % (int(validation_size / batch_size), average_validation_loss))
  print("Average Accuracy over %d batches: %.5f" % (int(validation_size / batch_size), average_accuracy))
  final_embeddings = normalized_embeddings.eval()
  # Write corresponding labels for the embeddings.
  with open(FLAGS.log_dir + '/metadata.tsv', 'w') as f:
    for i in xrange(vocabulary_size):
      f.write(reverse_dictionary[i] + '\n')
  # Create a configuration for visualizing embeddings with the labels in TensorBoard.
  config = projector.ProjectorConfig()
  embedding_conf = config.embeddings.add()
  embedding_conf.tensor_name = embeddings.name
  embedding_conf.metadata_path = os.path.join(FLAGS.log_dir, 'metadata.tsv')
  projector.visualize_embeddings(writer, config)
  writer.close()
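# The written summaries and embedding metadata can then be inspected in TensorBoard,
# e.g. `tensorboard --logdir <FLAGS.log_dir>` (substitute the actual log directory).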
# Step 6: Visualize the embeddings.
# pylint: disable=missing-docstring
# Function to draw visualization of distance between embeddings.
def plot_with_labels(low_dim_embs, labels, filename):
  assert low_dim_embs.shape[0] >= len(labels), 'More labels than embeddings'
  plt.figure(figsize=(18, 18))  # in inches
  for i, label in enumerate(labels):
    x, y = low_dim_embs[i, :]
    plt.scatter(x, y)
    plt.annotate(
        label,
        xy=(x, y),
        xytext=(5, 2),
        textcoords='offset points',
        ha='right',
        va='bottom')
  plt.savefig(filename)
try:
  # pylint: disable=g-import-not-at-top
  from sklearn.manifold import TSNE
  import matplotlib.pyplot as plt
  tsne = TSNE(
      perplexity=30, n_components=2, init='pca', n_iter=5000, method='exact')
  plot_only = 500
  low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
  labels = [reverse_dictionary[i] for i in xrange(plot_only)]
  plot_with_labels(low_dim_embs, labels, os.path.join(gettempdir(), 'tsne.png'))
except ImportError as ex:
  print('Please install sklearn, matplotlib, and scipy to show embeddings.')
  print(ex)