-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse.py
30 lines (28 loc) · 1.72 KB
/
parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import operator
import os
import re
# Function that generates a vocab file for ELISE preprocessing code given some input directory with
# .txt files. It extracts all the tokens in these files to build the vocab file. The tokens in the
# vocab file are ordered by frequency. Name of the vocab file outputted is "elmo-base-vocab.txt".
def create_vocab_file(input_folder):
vocab_freq_dict = {} #where we will write the words
directory_list = list(os.listdir(input_folder))
for file_name in directory_list:
if file_name.endswith(".txt"): #we are pulling ALL the words from the actual files
with open(input_folder+file_name, "r") as txt_file:
lines = txt_file.read().split("\n")
for line in lines:
tokens = line.split(" ")
for token in tokens:
clean_token = re.sub(r'[^\w\s]', '', token) #remove punctuation from tokens
if len(clean_token)>1: #tokens should be at least length 2, rest we ignore
if clean_token not in vocab_freq_dict: #add it to the dict,
vocab_freq_dict[clean_token] = 1
else: #increment if it's already there
vocab_freq_dict[clean_token] += 1
#Sort by frequency
sorted_vocab_freq_tuples = sorted(vocab_freq_dict.items(), key=operator.itemgetter(1), reverse=True)
with open("elmo-base-vocab.txt", "w") as output_file:
for token_tuple in sorted_vocab_freq_tuples: #finally, output the tokens to file
output_file.write(token_tuple[0])
output_file.write("\n")