"""Utility functions for reading parallel data, building vocabularies,
converting words to indexes, and plotting training curves.

Author: Dorin Keshales
"""

import os
from collections import Counter

import matplotlib.pyplot as plt

def read_data(src_file, trg_file, start_token='<s>', end_token='</s>'):
    # Read all the source sentences
    with open(src_file, 'r', encoding='utf-8') as f:
        f_data = f.readlines()

    # Read all the target sentences
    with open(trg_file, 'r', encoding='utf-8') as e:
        e_data = e.readlines()

    # Build a list of tuples, where each tuple holds the tokenized source and
    # target sentences wrapped with the start and end tokens
    d = [tuple([start_token] + sentence.strip().split() + [end_token] for sentence in pair)
         for pair in zip(f_data, e_data)]
    src = [elem[0] for elem in d]
    trg = [elem[1] for elem in d]
    return d, src, trg
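
# Usage sketch (illustrative, not part of the original file): assuming parallel
# files 'train.src' and 'train.trg' with one sentence per line (hypothetical
# file names), read_data returns the pairs plus each side separately:
#
#     pairs, src, trg = read_data('train.src', 'train.trg')
#     # src[0] -> ['<s>', 'je', 'suis', 'ici', '</s>']
#     # trg[0] -> ['<s>', 'i', 'am', 'here', '</s>']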

# Treat rare words as if they were unknown words, so that the embedding vector
# of the unknown token gets trained
def convert_rare_words_to_unknown_token(data, num_occurrences=3, unknown_token='<unk>'):
    count = Counter()
    convert_to_unk = set()

    # Count the number of occurrences of each word in the training set
    for sentence in data:
        count.update(sentence)

    # Collect the words that appear at most num_occurrences times
    for word, amount in count.items():
        if amount <= num_occurrences:
            convert_to_unk.add(word)

    # Go over each sentence in the training set
    for sentence in data:
        # Replace each rare word with the unknown token, in place
        for i in range(len(sentence)):
            if sentence[i] in convert_to_unk:
                sentence[i] = unknown_token

    # Return the updated training set data
    return data
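
# Illustrative example (not from the original file): with num_occurrences=1,
# a word seen only once is rewritten in place:
#
#     data = [['<s>', 'hello', 'world', '</s>'], ['<s>', 'hello', '</s>']]
#     convert_rare_words_to_unknown_token(data, num_occurrences=1)
#     # -> [['<s>', 'hello', '<unk>', '</s>'], ['<s>', 'hello', '</s>']]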

# Build a word vocabulary in which each word in the training set has a unique index
def create_words_vocabulary(data, start_token='<s>', end_token='</s>', unknown_token='<unk>'):
    vocab_words = set()

    # Collect every word that appears in the data set
    for sentence in data:
        for word in sentence:
            vocab_words.add(word)

    # Remove the unknown_token, start_token and end_token from the set so they
    # can be assigned fixed, meaningful indexes (discard() avoids a KeyError if
    # one of them never appeared in the data)
    vocab_words.discard(unknown_token)
    vocab_words.discard(start_token)
    vocab_words.discard(end_token)

    # Sort the remaining words for a deterministic ordering
    vocab_words = sorted(vocab_words)

    # Add the unknown_token, start_token and end_token with indexes 0, 1 and 2 respectively
    vocab_words = [unknown_token, start_token, end_token] + vocab_words

    # Map each word to a unique index, and each index back to its word
    word_to_ix = {word: i for i, word in enumerate(vocab_words)}
    ix_to_word = {i: word for i, word in enumerate(vocab_words)}
    return word_to_ix, ix_to_word
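
# Illustrative example (not from the original file): the special tokens always
# receive indexes 0-2, and the remaining words are indexed in sorted order:
#
#     word_to_ix, ix_to_word = create_words_vocabulary([['<s>', 'b', 'a', '<unk>', '</s>']])
#     # word_to_ix -> {'<unk>': 0, '<s>': 1, '</s>': 2, 'a': 3, 'b': 4}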

# Replace each source word and target word with its unique index
def src_and_trg_to_indexes(data, src_to_idx, trg_to_idx, unknown_token='<unk>'):
    # Out-of-vocabulary words fall back to the index of the unknown token
    d = [([src_to_idx.get(word, src_to_idx[unknown_token]) for word in src],
          [trg_to_idx.get(word, trg_to_idx[unknown_token]) for word in trg])
         for src, trg in data]
    src = [elem[0] for elem in d]
    trg = [elem[1] for elem in d]
    return src, trg
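
# Illustrative example (not from the original file), reusing word_to_ix from
# the sketch above for both sides; unseen words map to the '<unk>' index 0:
#
#     src, trg = src_and_trg_to_indexes([(['<s>', 'a', '</s>'], ['<s>', 'zzz', '</s>'])],
#                                       word_to_ix, word_to_ix)
#     # src -> [[1, 3, 2]]   trg -> [[1, 0, 2]]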

# Rewrite the textual target data so that words replaced by the unknown token
# during vocabulary building also appear as the unknown token in the text
def update_with_unk(trg_data, trg_to_idx, unknown_token='<unk>'):
    return [[word if word in trg_to_idx else unknown_token for word in seq] for seq in trg_data]
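
# Illustrative example (not from the original file), again reusing word_to_ix:
#
#     update_with_unk([['<s>', 'zzz', '</s>']], word_to_ix)
#     # -> [['<s>', '<unk>', '</s>']]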

# Replace each word in the data set with its corresponding index
def convert_data_to_indexes(data, vocab, unknown_token='<unk>'):
    sentences, words_indexes = [], []

    # Go over each sentence in the data set
    for sentence in data:
        # For each word, look up its index; if it is missing from the
        # vocabulary, fall back to the index of the unknown_token
        for word in sentence:
            ix = vocab.get(word, vocab[unknown_token])
            words_indexes.append(ix)

        # Keep the indexes grouped by sentence
        sentences.append(words_indexes)
        words_indexes = []

    # Return the updated data
    return sentences
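
# Illustrative example (not from the original file): the same fallback lookup,
# but over a flat list of sentences rather than (source, target) pairs:
#
#     convert_data_to_indexes([['<s>', 'b', '</s>']], word_to_ix)
#     # -> [[1, 4, 2]]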

def plot_graph(title, to_plot, color):
    plt.title(title + " vs Epochs")

    # Plot one value per epoch
    ticks = list(range(1, len(to_plot) + 1))
    plt.plot(ticks, to_plot, color=color)

    # Axis labels
    plt.ylabel(title)
    plt.xlabel("Epochs")
    plt.xticks(ticks)
    plt.show()
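
# Usage sketch (illustrative, not part of the original file): one loss value
# per epoch, drawn in a named matplotlib color:
#
#     plot_graph('Loss', [0.9, 0.5, 0.3], 'blue')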

def test_predictions(predictions, file_name, start_token='<s>', end_token='</s>'):
    # Opening in write mode truncates the file if it already exists and
    # creates it otherwise; the context manager closes it when done
    with open(file_name, 'w', encoding='utf-8') as f:
        # Write each prediction to the predictions file, stripping the leading
        # start token and the space that follows it
        for prediction in predictions:
            f.write("%s\n" % prediction[len(start_token) + 1:])
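
# Minimal end-to-end sketch (not part of the original file). The file names
# 'train.src' and 'train.trg' are hypothetical placeholders; the __main__ guard
# keeps the example from running when this module is imported.
if __name__ == '__main__':
    pairs, src_sentences, trg_sentences = read_data('train.src', 'train.trg')

    # Replace rare words so the '<unk>' embedding gets trained
    src_sentences = convert_rare_words_to_unknown_token(src_sentences)
    trg_sentences = convert_rare_words_to_unknown_token(trg_sentences)

    # Build one vocabulary per side
    src_to_idx, idx_to_src = create_words_vocabulary(src_sentences)
    trg_to_idx, idx_to_trg = create_words_vocabulary(trg_sentences)

    # Convert the token pairs to index pairs
    src_idx, trg_idx = src_and_trg_to_indexes(
        list(zip(src_sentences, trg_sentences)), src_to_idx, trg_to_idx)
    print(src_idx[0], trg_idx[0])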