-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextract.py
50 lines (33 loc) · 1.5 KB
/
extract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import gensim
import gensim.downloader as api
import numpy as np
from tqdm.notebook import tqdm
from nltk.corpus import brown
from nltk.stem import WordNetLemmatizer
from collections import Counter
def save_word2vec_format(fname, vocab, vector_size):
    """Write word vectors in the binary layout of the original C word2vec tool.

    The file starts with a text header line ``"<count> <vector_size>\\n"``.
    Each entry then follows as the UTF-8 encoded word, a single space, and the
    vector's raw float32 bytes.  Entries are written in the iteration order of
    ``vocab``; no sorting is performed here.

    Args:
        fname: Path of the output file, opened through ``gensim.utils.open``.
        vocab: Mapping of word (str) to its vector (array-like of numbers).
        vector_size: Number of components every vector must have; this value
            is written to the header.

    Raises:
        ValueError: If any vector does not hold exactly ``vector_size`` values.
    """
    # Validate before opening the file: a row of the wrong length would
    # contradict the header and make the whole file unreadable, and failing
    # early avoids leaving a half-written file behind.
    for word, row in vocab.items():
        if np.size(row) != vector_size:
            raise ValueError(
                "vector for %r has %d values, expected %d"
                % (word, np.size(row), vector_size)
            )

    with gensim.utils.open(fname, 'wb') as fout:
        fout.write(gensim.utils.to_utf8(f"{len(vocab)} {vector_size}\n"))
        for word, row in tqdm(vocab.items()):
            # asarray accepts plain sequences and skips the copy when the row
            # is already a float32 array.
            vec = np.asarray(row, dtype=np.float32)
            fout.write(gensim.utils.to_utf8(word) + b" " + vec.tobytes())
def get_words():
    """Build a word list from the Brown corpus and export the matching vectors.

    Brown tokens are kept when they are purely alphabetic, all lowercase,
    longer than two characters, and occur more than once but fewer than 300
    times.  The surviving tokens are lemmatized with ``WordNetLemmatizer`` and
    each distinct lemma is looked up in the ``word2vec-google-news-300``
    vectors loaded through ``gensim.downloader``.

    Lemmas that have a vector are written to ``wordlist.txt`` (one per line)
    and their vectors are saved to ``vectors.bin`` with
    ``save_word2vec_format``; lemmas without a vector are skipped.  Both
    outputs list the words in sorted order.
    """
    lemmatizer = WordNetLemmatizer()
    wv = api.load('word2vec-google-news-300')

    token_counts = Counter(
        token for token in brown.words()
        if token.isalpha() and token.islower()
    )
    # The frequency and length filters apply to the raw token, not the lemma.
    lemmas = {
        lemmatizer.lemmatize(token)
        for token, count in token_counts.items()
        if 1 < count < 300 and len(token) > 2
    }

    vectors = {}
    with open('wordlist.txt', 'w', encoding='utf-8') as f:
        # Iterate in sorted order: a set of strings has no stable order
        # between runs, which would make both output files irreproducible.
        for word in sorted(lemmas):
            try:
                vector = wv[word]
            except KeyError:
                # No vector for this lemma; leave it out of both outputs.
                continue
            vectors[word] = vector
            f.write(word + '\n')

    save_word2vec_format('vectors.bin', vectors, 300)
# Runs the whole extraction whenever this module is executed or imported
# (the call is unconditional; there is no __main__ guard).
get_words()