pretrained_embeddings.py

import argparse
import hashlib
import os
import sys

import numpy as np
import torch
from tqdm import tqdm

from standalone_elmo import batch_to_ids, ElmoCharacterEncoder, remove_sentence_boundaries

# Special tokens (previously imported from vocab_definitions).
PADDING_TOK = '<PAD>'
BOS_TOK = '<S>'
EOS_TOK = '</S>'
special_tokens = [PADDING_TOK, BOS_TOK, EOS_TOK]

# Weight and option files for the original ELMo model.
weights_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5'
options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json'


def maybe_download(remote_url, cache_dir):
    """Download remote_url into cache_dir (via curl) unless already cached."""
    path = os.path.join(cache_dir, os.path.basename(remote_url))
    if not os.path.exists(path):
        os.system(f'curl {remote_url} -o {path} -L')
    return path
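
# Note: maybe_download shells out to the curl binary, so curl must be on PATH;
# a failed download can leave a partial file in cache_dir that needs to be
# deleted by hand before retrying.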


def hash_string_list(string_list):
    """Return an 8-hex-char fingerprint of a list of strings."""
    m = hashlib.sha256()
    for s in string_list:
        m.update(s.encode('utf-8'))
    return m.hexdigest()[:8]
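
# The digest is equivalent to hashing the concatenation of all strings, so
# the fingerprint is sensitive to token order (but not to list boundaries).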


def read_text_vocab_file(path):
    with open(path) as f:
        return f.read().splitlines()


def read_amr_vocab_file(path):
    with open(path) as f:
        return f.read().splitlines()
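
# Both readers expect one token per line (e.g. a file containing "the\ncat\n"
# yields ['the', 'cat']). They are currently identical, presumably kept as
# separate entry points for the text and AMR vocab files.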


def build_text_vocab(sentences, vocab_path=None):
    tokens = set()
    # (Optional) Add vocab from file.
    if vocab_path is not None:
        with open(vocab_path) as f:
            for line in f:
                w = line.strip().split()[0]
                tokens.add(w)
    # Add vocab from sentences.
    for s in sentences:
        assert isinstance(s, list)
        tokens.update(s)
    # Add special tokens and convert to list.
    for w in special_tokens:
        if w in tokens:
            tokens.remove(w)
    tokens = special_tokens + sorted(tokens)
    # Build mapping.
    word2idx = {k: i for i, k in enumerate(tokens)}
    return word2idx
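
# Example (illustrative): build_text_vocab([['the', 'cat']]) returns
# {'<PAD>': 0, '<S>': 1, '</S>': 2, 'cat': 3, 'the': 4}: special tokens
# first, then the remaining vocabulary in sorted order.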


def get_character_embeddings_from_elmo(tokens, cache_dir, cuda=False):
    assert len(special_tokens) == 3
    assert tokens[1] == BOS_TOK and tokens[2] == EOS_TOK

    # Remove special tokens. Row 0 of the output is reserved for PADDING_TOK.
    vocab_to_cache = tokens[3:]

    size = 512
    batch_size = 1024

    char_embedder = ElmoCharacterEncoder(
        options_file=maybe_download(options_file, cache_dir=cache_dir),
        weight_file=maybe_download(weights_file, cache_dir=cache_dir),
        requires_grad=False)
    if cuda:
        char_embedder.cuda()

    all_vocab_to_cache = [BOS_TOK, EOS_TOK] + vocab_to_cache
    shape = (1 + len(all_vocab_to_cache), size)
    embeddings = np.zeros(shape, dtype=np.float32)

    for start in tqdm(range(0, len(all_vocab_to_cache), batch_size), desc='embed'):
        end = min(start + batch_size, len(all_vocab_to_cache))
        batch = all_vocab_to_cache[start:end]

        # Embed each token as a single-word "sentence" through the char-CNN,
        # then strip the sentence-boundary positions from the output.
        batch_ids = batch_to_ids([[x] for x in batch])
        if cuda:
            batch_ids = batch_ids.cuda()
        output = char_embedder(batch_ids)
        vec = remove_sentence_boundaries(output['token_embedding'], output['mask'])[0].squeeze(1)

        # Offset by 1 so row 0 (PADDING_TOK) stays all zeros.
        embeddings[1 + start:1 + end] = vec.cpu()

    # Note: Duplicating the 512-d char-CNN vectors to 1024-d is a hack to
    # match legacy behavior.
    embeddings = np.concatenate([embeddings, embeddings], axis=1)

    return embeddings
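
# The returned array has shape (len(tokens), 1024): row 0 is the all-zero
# PAD vector, rows 1 and 2 are BOS/EOS, and the remaining rows follow the
# vocab order. The 1024 width comes from the duplication above; only the
# character encoder is run, never the biLM layers.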


def get_embeddings_path(tokens, cache_dir=None):
    token_hash = hash_string_list(tokens)
    if cache_dir is not None:
        return '{}/elmo.{}.npy'.format(cache_dir, token_hash)
    return 'elmo.{}.npy'.format(token_hash)
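
# Example (hash value illustrative): a cache_dir of './cache' yields a path
# like './cache/elmo.0a1b2c3d.npy'.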


def read_embeddings(tokens, path=None, cache_dir=None):
    if path is None:
        path = get_embeddings_path(tokens, cache_dir)
    assert os.path.exists(path), path
    print('reading embeddings from {} for {} tokens'.format(path, len(tokens)))
    embeddings = np.load(path)
    assert embeddings.shape[0] == len(tokens)
    return embeddings


def write_embeddings(path, embeddings):
    np.save(path, embeddings)


def main(args):
    tokens = read_text_vocab_file(args.vocab)
    token_hash = hash_string_list(tokens)
    print('found {} tokens with hash = {}'.format(len(tokens), token_hash))

    path = get_embeddings_path(tokens, args.cache_dir)
    if os.path.exists(path):
        print('embeddings found at {}, exiting'.format(path))
        sys.exit()

    embeddings = get_character_embeddings_from_elmo(tokens, args.cache_dir, args.cuda)
    print(f'writing to {path}')
    write_embeddings(path, embeddings)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--vocab', type=str, required=True,
                        help='Vocab file.')
    parser.add_argument('--cuda', action='store_true',
                        help='If true, then use GPU.')
    parser.add_argument('--allow-cpu', action='store_true',
                        help='If true, then allow CPU if GPU not available.')
    parser.add_argument('--cache-dir', type=str, required=True,
                        help='Folder to save elmo weights and embeddings.')
    args = parser.parse_args()

    if args.allow_cpu and args.cuda and not torch.cuda.is_available():
        print('WARNING: CUDA not available. Falling back to CPU.')
        args.cuda = False

    main(args)
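
# Example invocation (paths are hypothetical):
#   python pretrained_embeddings.py --vocab vocab.txt --cache-dir ./elmo-cache \
#       --cuda --allow-cpu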