create_random_embeddings.py
import torch
import numpy as np
from docopt import docopt

import model as m
import mydataloader


def main():
    args = docopt("""Create new random embeddings for a dataset

    Usage:
        create_random_embeddings.py create <dataset_path> <name> <dimensions>
    """)

    dataset_path = args['<dataset_path>']
    name = args['<name>']
    dimensions = int(args['<dimensions>'])

    data = mydataloader.load_snli(dataset_path)
    create_embeddings(data, name, dimensions)


def create_embeddings(data, name, dimensions):
    # First collect the vocabulary: every distinct token
    # appearing in a premise or hypothesis.
    vocab = set()
    for p, h, _ in data:
        vocab |= set(p + h)
    print('Found', len(vocab), 'distinct words.')
    vocab = list(vocab)

    # Now create a random matrix based on the vocab: <#words> x <#dimensions>.
    embedding_matrix = m.cuda_wrap(torch.FloatTensor(len(vocab), dimensions))
    # In-place Xavier init; xavier_uniform (without the underscore) is deprecated.
    torch.nn.init.xavier_uniform_(embedding_matrix)

    # Store in files: the vocabulary as <name>.vocab (one word per line),
    # the matrix as <name>.npy.
    vocab_name = name + '.vocab'
    with open(vocab_name, 'w') as vocab_out:
        vocab_out.write('\n'.join(vocab))
    np.save(name, embedding_matrix.cpu().numpy())
    print('Done.')


if __name__ == '__main__':
    main()
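
The script leaves two artifacts on disk: <name>.vocab (one word per line) and <name>.npy (the saved matrix, with rows in the same order as the vocab file). Below is a minimal sketch of how a consumer might reload them into a PyTorch embedding layer; the load_embeddings helper and the file names in the usage comment are illustrative, not part of this repo:

import numpy as np
import torch
import torch.nn as nn


def load_embeddings(name):
    # Read the vocabulary back; '\n'.join() wrote no trailing newline,
    # so split('\n') recovers the exact word list.
    with open(name + '.vocab') as vocab_in:
        vocab = vocab_in.read().split('\n')
    matrix = np.load(name + '.npy')

    # Map each word to its row index and wrap the matrix in nn.Embedding.
    word_to_idx = {word: idx for idx, word in enumerate(vocab)}
    embedding = nn.Embedding(len(vocab), matrix.shape[1])
    embedding.weight.data.copy_(torch.from_numpy(matrix))
    return word_to_idx, embedding


# Usage (file names are hypothetical):
#   python create_random_embeddings.py create snli_train.jsonl my_embeddings 300
#   word_to_idx, embedding = load_embeddings('my_embeddings')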