get_vocabulary.py
# -*- coding: utf-8 -*-
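# Build a word vocabulary and a PMI-scored word-pair vocabulary from a tokenized corpus.
# The script counts word frequencies, collects distance-weighted co-occurrences within a
# sliding window, ranks word pairs by smoothed PMI, and writes the frequency list, the
# pair vocabulary and the final word vocabulary to the output folder.
# Example invocation (hypothetical paths):
#   python get_vocabulary.py -corpus ./corpus_tokenized.txt -wordsize 100000 -output ./vocab/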
from argparse import ArgumentParser
import operator
import math
import os
def pmi_smoothed(joint_cooc,total_cooc,freq_word1,freq_word2,alpha_smoothing=1):
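    # Smoothed pointwise mutual information: log( p(w1,w2) / (p(w1) * p(w2)^alpha) ),
    # where the second word's probability is raised to the smoothing exponent alpha.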
    pmi_score=math.log((joint_cooc/total_cooc) / ((freq_word1/total_cooc) * ((freq_word2**alpha_smoothing)/(total_cooc**alpha_smoothing))))
    return pmi_score
def get_dict_weights(corpus_path,set_wordvocab,window_size,word2index):
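    # Scan the corpus and accumulate, for every vocabulary word, the co-occurrence weight
    # (1/distance) and raw count of each neighbour found within the sliding window.
    # Returns the per-pair weights/counts, the marginal weights of each word as first and
    # as second element of a pair, and the total accumulated weight.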
    corpus_file=open(corpus_path,'r',encoding='utf-8')
    dict_weight={}
    dict_weight_word1={}
    dict_weight_word2={}
    total_cooc=0
    for word in set_wordvocab:
        index=word2index[word]
        dict_weight[index]={}
        dict_weight_word1[index]=0
        dict_weight_word2[index]=0
    for line in corpus_file:
        linesplit=line.strip().split(" ")
        for i in range(len(linesplit)):
            tokeni=linesplit[i]
            if tokeni in set_wordvocab:
                indexi=word2index[tokeni]
                for j in range(i+2,min(i+1+window_size,len(linesplit))):
                    tokenj=linesplit[j]
                    if tokenj in set_wordvocab and tokenj!=tokeni:
                        indexj=word2index[tokenj]
                        weight=1/(j-i)
                        if indexj in dict_weight[indexi]:
                            dict_weight[indexi][indexj][0]+=weight
                            dict_weight[indexi][indexj][1]+=1
                        else:
                            dict_weight[indexi][indexj]=[weight,1]
                        total_cooc+=weight
                        dict_weight_word1[indexi]+=weight
                        dict_weight_word2[indexj]+=weight
    return dict_weight,dict_weight_word1,dict_weight_word2,total_cooc
def get_pair_vocab(corpus_path,set_wordvocab,window_size,min_occ,max_pairsize,alpha_smoothing,word2index,index2word,output_path_pairvocab,symmetry="false"):
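    # Score every co-occurring pair with smoothed PMI (combining both directions when
    # symmetry is "true"), keep only pairs seen at least min_occ times, and retain the
    # top max_pairsize/len(set_wordvocab) neighbours per word. Optionally writes
    # "word1<TAB>word2<TAB>pmi" lines to output_path_pairvocab.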
    max_neighbours_per_word=int(max_pairsize/len(set_wordvocab))
    dict_weight,dict_weight_word1,dict_weight_word2,total_cooc=get_dict_weights(corpus_path,set_wordvocab,window_size,word2index)
    dict_pmi={}
    set_pairs=set()
    if output_path_pairvocab.lower()!="false": output_file=open(output_path_pairvocab,'w',encoding='utf-8')
    for index1 in dict_weight:
        total_cooc_word1=0
        dict_pmi.clear()
        for index2 in dict_weight[index1]:
            if symmetry=="true":
                if index2 in dict_weight and index1 in dict_weight[index2]:
                    if dict_weight[index1][index2][1]+dict_weight[index2][index1][1]>=min_occ:
                        pmi_score=pmi_smoothed(dict_weight[index1][index2][0]+dict_weight[index2][index1][0],total_cooc,dict_weight_word1[index1]+dict_weight_word2[index1],dict_weight_word1[index2]+dict_weight_word2[index2],alpha_smoothing)
                        dict_pmi[index2]=pmi_score
                else:
                    if dict_weight[index1][index2][1]>=min_occ:
                        pmi_score=pmi_smoothed(dict_weight[index1][index2][0],total_cooc,dict_weight_word1[index1],dict_weight_word2[index2],alpha_smoothing)
                        dict_pmi[index2]=pmi_score
            else:
                if dict_weight[index1][index2][1]>=min_occ:
                    pmi_score=pmi_smoothed(dict_weight[index1][index2][0],total_cooc,dict_weight_word1[index1],dict_weight_word2[index2],alpha_smoothing)
                    dict_pmi[index2]=pmi_score
        list_pmi_sorted=sorted(dict_pmi.items(), key=operator.itemgetter(1), reverse=True)[:max_neighbours_per_word]
        word1=index2word[index1]
        for index2,pmi_score in list_pmi_sorted:
            word2=index2word[index2]
            set_pairs.add((word1,word2))
            if symmetry=="true" and (word2,word1) in set_pairs: continue
            if output_path_pairvocab.lower()!="false": output_file.write(word1+"\t"+word2+"\t"+str(round(pmi_score,3))+"\n")
    if output_path_pairvocab.lower()!="false": output_file.close()
    return set_pairs
def get_dict_pairvocab_fromset(set_pairs,word2index):
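    # Convert the set of (word1,word2) pairs into a dictionary mapping each first word
    # to the set of its paired second words (keeping only in-vocabulary words).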
    dict_pairvocab={}
    for word1,word2 in set_pairs:
        if word1 in word2index and word2 in word2index:
            if word1 not in dict_pairvocab: dict_pairvocab[word1]=set()
            dict_pairvocab[word1].add(word2)
    return dict_pairvocab
#Store the pair vocabulary in a txt file
def print_pairs(set_pairs,output_path):
    output_file=open(output_path,'w',encoding='utf-8')
    for pair in set_pairs:
        output_file.write(pair[0]+"\t"+pair[1]+"\n")
    output_file.close()
def print_wordvocab(set_wordvocab,output_path):
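    # Store the word vocabulary in a txt file, one word per line.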
    output_file=open(output_path,'w',encoding='utf-8')
    for word in set_wordvocab:
        output_file.write(word+"\n")
    output_file.close()
def get_stopwords(stopwords_path):
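    # Load the stopword list (one stopword per line) into a set.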
    set_stopwords=set()
    stopwords_file=open(stopwords_path,encoding='utf-8').readlines()
    for line in stopwords_file:
        set_stopwords.add(line.strip())
    return set_stopwords
def get_word_vocab(corpus_path):
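    # Count the frequency of every token in the corpus.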
    corpus_file=open(corpus_path,'r',encoding='utf-8')
    dict_freq={}
    for line in corpus_file:
        linesplit=line.strip().split(" ")
        for token in linesplit:
            if token in dict_freq: dict_freq[token]+=1
            else: dict_freq[token]=1
    return dict_freq
if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('-corpus', '--corpus_file', help='Input corpus path (tokenized)', required=True)
    parser.add_argument('-wordsize', '--wordvocabulary_size', help='Maximum number of words considered (sorted by frequency)', required=True)
    parser.add_argument('-output', '--output_folder', help='Output directory to store vocabulary files', required=False, default="./vocab/")
    parser.add_argument('-stopwords', '--stopwords_path', help='Path to stopwords file. Write "false" if no stopwords are to be used', required=False, default="./stopwords_en.txt")
    parser.add_argument('-window', '--window_size', help='Co-occurrence window size', required=False, default=10)
    parser.add_argument('-min_occ', '--min_occurrences_pairs', help='Minimum number of occurrences required for word pairs', required=False, default=20)
    parser.add_argument('-max_pairsize', '--max_pairvocabulary_size', help='Maximum number of word pairs', required=False, default=3000000)
    parser.add_argument('-smoothing', '--alpha_smoothing_factor', help='Alpha smoothing factor in the PMI calculation (default=0.75)', required=False, default=0.75)
    parser.add_argument('-min_freq', '--minimum_frequency', help='Minimum frequency of words', required=False, default=5)
    parser.add_argument('-symmetry', '--symmetry', help='Indicates whether pairs are symmetric (true) or not (false)', required=False, default="false")
    args = vars(parser.parse_args())

    corpus_path=args['corpus_file']
    num_words=int(args['wordvocabulary_size'])
    output_path=args['output_folder']
    stopwords_path=args['stopwords_path']
    window_size=int(args['window_size'])
    min_occ=int(args['min_occurrences_pairs'])
    max_pairsize=int(args['max_pairvocabulary_size'])
    alpha_smoothing=float(args['alpha_smoothing_factor'])
    min_freq=int(args['minimum_frequency'])
    symmetry=args['symmetry'].lower()

    print ("Loading word frequency dictionary...")
    #Get frequency dictionary from corpus
    dict_freq=get_word_vocab(corpus_path)
    #Output file for the word frequency dictionary
    if not os.path.exists(output_path): os.mkdir(output_path)
    output_path_dictfreq=output_path+"word_frequency_all.txt"
    dictfreq_file=open(output_path_dictfreq,'w',encoding='utf-8')
    if stopwords_path.lower()=="false": set_stopwords=set()
    else: set_stopwords=get_stopwords(stopwords_path)
    #Print word frequency dictionary. Extract word vocabulary set.
    word2index={}
    index2word={}
    list_freq_sorted=sorted(dict_freq.items(), key=operator.itemgetter(1), reverse=True)
    cont_wordvocab=0
    set_wordvocab=set()
    for word,freq in list_freq_sorted:
        if freq<min_freq: break
        if cont_wordvocab<num_words and word not in set_stopwords and "__" not in word and not word.isdigit():
            set_wordvocab.add(word)
            word2index[word]=cont_wordvocab
            index2word[cont_wordvocab]=word
            cont_wordvocab+=1
        dictfreq_file.write(word+"\t"+str(freq)+"\n")
    dictfreq_file.close()
    list_freq_sorted.clear()
    dict_freq.clear()
    print ("Done loading word frequency dictionary. Stored in "+output_path_dictfreq+"\n")
    print ("Now computing pair vocabulary... (this can take a couple of hours depending on the size of the corpus)")
    #Get and print pair vocabulary
    output_path_pairvocab=output_path+"pair_vocab_pmi.txt"
    set_pairvocab=get_pair_vocab(corpus_path,set_wordvocab,window_size,min_occ,max_pairsize,alpha_smoothing,word2index,index2word,output_path_pairvocab,symmetry)
    print ("The pair vocabulary has been printed in "+output_path_pairvocab)
    #Print final word vocabulary
    final_set_wordvocab=set()
    for word1,word2 in set_pairvocab:
        final_set_wordvocab.add(word1)
        final_set_wordvocab.add(word2)
    output_path_wordvocab=output_path+"word_vocab.txt"
    print_wordvocab(final_set_wordvocab,output_path_wordvocab)
    print ("Finished. Word vocabulary printed in "+output_path_wordvocab)