# perform_task.py
import os
import configuration as config
import json
langs = config.langs
## Returns the number of lines in the source and target files, or -1 if they differ
def get_length(path_source, path_target):
    with open(path_source, 'r') as f1:
        lines_source = f1.readlines()
    with open(path_target, 'r') as f2:
        lines_target = f2.readlines()
    total_source = len(lines_source)
    total_target = len(lines_target)
    # The files are parallel only if they have the same number of lines
    if total_source == total_target:
        return total_source
    else:
        return -1
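# A quick illustration on hypothetical paths (not part of this repository):
#   get_length('data/train.hi', 'data/train.ta')
#   -> 1000 if both files hold 1000 lines, -1 if the line counts differ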
## Removes the pairs where either side of the parallel data is empty
def remove_empty(lines_source, lines_target):
    out_source = []
    out_target = []
    for (source, target) in zip(lines_source, lines_target):
        if (not source.strip()) or (not target.strip()):
            continue
        else:
            ## Strip the sentences as well, to normalize any surrounding whitespace
            out_source.append(source.strip() + '\n')
            out_target.append(target.strip() + '\n')
    # Defensive check: fall back to the original lines if the outputs ever diverge in length
    if len(out_source) == len(out_target):
        return out_source, out_target
    else:
        return lines_source, lines_target
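# Example on hypothetical in-memory lines (each ending in '\n', as from readlines()):
#   remove_empty(['Hello\n', '\n', 'Bye \n'], ['Namaste\n', 'kept?\n', 'Alvida\n'])
#   -> (['Hello\n', 'Bye\n'], ['Namaste\n', 'Alvida\n'])
# The second pair is dropped because its source side is empty after stripping.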
## Removes the pairs where the source and target are identical, which usually means
## an untranslated English sentence (or the same special characters) on both sides
def remove_english(lines_source, lines_target):
    out_source = []
    out_target = []
    for (source, target) in zip(lines_source, lines_target):
        if source == target:
            continue
        else:
            ## Strip the sentences as well, to normalize any surrounding whitespace
            out_source.append(source.strip() + '\n')
            out_target.append(target.strip() + '\n')
    # Defensive check: fall back to the original lines if the outputs ever diverge in length
    if len(out_source) == len(out_target):
        return out_source, out_target
    else:
        return lines_source, lines_target
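# Example with hypothetical lines: the identical pair is treated as untranslated and dropped:
#   remove_english(['same line\n', 'cat\n'], ['same line\n', 'billi\n'])
#   -> (['cat\n'], ['billi\n'])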
## Removes duplicate sentence pairs within a dataset
## Note that the use of a set means the original line order is not preserved
def remove_repetitions(lines_source, lines_target):
    # Adding tuples of sentences to a set, which keeps only one copy of each pair
    tup_set = set()
    for (source, target) in zip(lines_source, lines_target):
        tup_set.add((source, target))
    # Converting it to a list and splitting the pairs back into the two sides
    tup_list = list(tup_set)
    out_source = []
    out_target = []
    for (source, target) in tup_list:
        ## Strip the sentences as well, to normalize any surrounding whitespace
        out_source.append(source.strip() + '\n')
        out_target.append(target.strip() + '\n')
    # Defensive check: fall back to the original lines if the outputs ever diverge in length
    if len(out_source) == len(out_target):
        return out_source, out_target
    else:
        return lines_source, lines_target
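# Example with hypothetical lines; only one copy of the repeated pair survives,
# and the order of the surviving pairs is arbitrary because of the set:
#   remove_repetitions(['a\n', 'a\n', 'b\n'], ['x\n', 'x\n', 'y\n'])
#   -> (['a\n', 'b\n'], ['x\n', 'y\n'])  # possibly in a different order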
### Cleans the base dataset at different levels
def clean_base(path_source, path_target, dest_source, dest_target):
    ## Note that readlines() keeps the trailing '\n' on every extracted line
    with open(path_source, 'r') as f1:
        lines_source = f1.readlines()
    with open(path_target, 'r') as f2:
        lines_target = f2.readlines()
    ## If the lengths of the two files are not equal, discard the entire dataset
    if len(lines_source) != len(lines_target):
        lines_source, lines_target = [], []
    else:
        ## Removes the pairs where either side of the parallel data is empty
        lines_source, lines_target = remove_empty(lines_source, lines_target)
        ## Removes the pairs where the source and target are the same sentence (usually English);
        ## they might also share the same special characters on both sides
        lines_source, lines_target = remove_english(lines_source, lines_target)
        ## Removes duplicate pairs within the dataset
        lines_source, lines_target = remove_repetitions(lines_source, lines_target)
    ## Now writing the obtained lines to the destinations
    with open(dest_source, 'w') as f1:
        for source in lines_source:
            f1.write(source)
    with open(dest_target, 'w') as f2:
        for target in lines_target:
            f2.write(target)
    return
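# Sketch of the cleaning pipeline on hypothetical paths (the names are illustrative only):
#   clean_base('cvit/hi-ta/train.hi', 'cvit/hi-ta/train.ta',
#              'cvit/hi-ta/train_clean.hi', 'cvit/hi-ta/train_clean.ta')
# reads the base files, applies remove_empty -> remove_english -> remove_repetitions
# in that order, and writes the surviving pairs to the two destination files.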
## Adds every (source, target) tuple from the given files to the merge set
def add_to_set(path_source, path_target, merge_set):
    # Note that readlines() keeps the trailing '\n' on every extracted line
    with open(path_source, 'r') as f1:
        lines_source = f1.readlines()
    with open(path_target, 'r') as f2:
        lines_target = f2.readlines()
    # Do not modify the set if the number of lines is not the same for the dataset
    if len(lines_source) != len(lines_target):
        return
    for (source, target) in zip(lines_source, lines_target):
        merge_set.add((source, target))
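# Because the set is shared across datasets, pairs that occur in several corpora end up
# as a single copy. A hypothetical illustration (paths are illustrative only):
#   merged = set()
#   add_to_set('cvit/hi-ta/train_clean.hi', 'cvit/hi-ta/train_clean.ta', merged)
#   add_to_set('ufal/hi-ta/train_clean.hi', 'ufal/hi-ta/train_clean.ta', merged)
#   # merged now holds the deduplicated union of both corpora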
### Dispatches all the required tasks for the dataset files
### path_source can either be the path of a source-language file or the path of the json file
def task(path_source, path_target, str_task, path_pair, merge_set):
    ## Calculates the length of the files
    if str_task == config.get_base_stats or str_task == config.get_clean_stats:
        return get_length(path_source, path_target)
    ## Cleans the base dataset
    elif str_task == config.clean_base_data:
        ## Assumes that the files are passed properly; the langid is the file extension
        src = path_source.split('.')[-1]
        tgt = path_target.split('.')[-1]
        ## Getting the destination files
        dest_source = os.path.join(path_pair, config.clean_train_name + '.' + src)
        dest_target = os.path.join(path_pair, config.clean_train_name + '.' + tgt)
        ## Abort if the destination would overwrite the input files
        if dest_source == path_source or dest_target == path_target:
            print('Aborted: Attempted to overwrite the stored base data')
            return -1
        ## Call the function that cleans the dataset files
        clean_base(path_source, path_target, dest_source, dest_target)
        return 0
    ## Merges the training data into one file
    elif str_task == config.merge_clean_data:
        add_to_set(path_source, path_target, merge_set)
        return 0
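# Hypothetical dispatch example: with str_task == config.clean_base_data and
# path_source = '.../hi-ta/train.hi', the cleaned file is written alongside the input
# under whatever name config.clean_train_name actually holds, and the call returns
# 0 on success or -1 if it would have overwritten the inputs.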
## Converts a list of (name, count) tuples into a dictionary and adds a 'total' entry
def tuplist2json(tup_list):
    temp = dict()
    sumVal = 0
    for tup in tup_list:
        temp[tup[0]] = tup[1]
        sumVal = sumVal + int(tup[1])
    temp['total'] = sumVal
    return temp
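# Example with hypothetical dataset names:
#   tuplist2json([('cvit', 100), ('ufal', 50)])
#   -> {'cvit': 100, 'ufal': 50, 'total': 150}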
## Parses all the available directories for a particular pair of langids
## Assumes the directory structure that we proposed earlier
def parsePair(src, tgt, str_task):
    datapath_list = config.datapath_list
    str_pair = src + '-' + tgt
    tup_list = []  # Used to collect the stats of the dataset
    merge_set = set()  # Used to merge the training data
    for path in datapath_list:
        dataset = path.split('/')[-1]
        path_pair = os.path.join(path, str_pair)
        if os.path.exists(path_pair):
            # Getting the absolute paths of the input files to be read
            if str_task == config.get_base_stats or str_task == config.clean_base_data:
                path_source = os.path.join(path_pair, config.base_train_name + '.' + src)
                path_target = os.path.join(path_pair, config.base_train_name + '.' + tgt)
            elif str_task == config.get_clean_stats or str_task == config.merge_clean_data:
                path_source = os.path.join(path_pair, config.clean_train_name + '.' + src)
                path_target = os.path.join(path_pair, config.clean_train_name + '.' + tgt)
            else:
                # Unknown task: nothing to read in this directory
                continue
            # Proceed further only if all the specified files and directories exist
            if os.path.exists(path_source) and os.path.exists(path_target):
                val = task(path_source, path_target, str_task, path_pair, merge_set)
                if val != -1 and (str_task == config.get_base_stats or str_task == config.get_clean_stats):
                    tup_list.append((dataset, val))
    # Writing the pairs stored in the merge set to the final files
    if str_task == config.merge_clean_data:
        path_base = config.path_indic_dataset
        path_pair = os.path.join(path_base, str_pair)
        # Creating the folder for this pair inside the Indic Parallel folder
        os.makedirs(path_pair, exist_ok=True)
        # Getting the destinations of the files
        dest_source = os.path.join(path_pair, config.merge_train_name + '.' + src)
        dest_target = os.path.join(path_pair, config.merge_train_name + '.' + tgt)
        # Converting the set into a list
        merge_set = list(merge_set)
        # Loading the test and dev data so they can be removed from the training data
        path_indic_test = config.path_indic_test
        path_indic_dev = config.path_indic_dev
        with open(os.path.join(path_indic_dev, config.wat_dev_name + '.' + src)) as f:
            src_dev = f.readlines()
        with open(os.path.join(path_indic_dev, config.wat_dev_name + '.' + tgt)) as f:
            tgt_dev = f.readlines()
        with open(os.path.join(path_indic_test, config.wat_test_name + '.' + src)) as f:
            src_test = f.readlines()
        with open(os.path.join(path_indic_test, config.wat_test_name + '.' + tgt)) as f:
            tgt_test = f.readlines()
        # Creating sets of (source, target) tuples for fast membership checks
        pairs_dev = set(zip(src_dev, tgt_dev))
        pairs_test = set(zip(src_test, tgt_test))
        # Removing the test + dev pairs from the merged data while writing it out
        count = 0
        with open(dest_source, 'w') as f_source, open(dest_target, 'w') as f_target:
            for (source, target) in merge_set:
                if (source, target) in pairs_dev or (source, target) in pairs_test:
                    continue
                f_source.write(source)
                f_target.write(target)
                count += 1
        # Appending the total train size (after removing dev + test) to the stats
        tup_list.append(('total', count))
    return tup_list
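# The directory layout this assumes (actual names come from configuration.py; the
# dataset and language names shown here are illustrative only):
#   <datapath>/<src>-<tgt>/<base_train_name>.<src>    e.g. cvit/hi-ta/train.hi
#   <datapath>/<src>-<tgt>/<base_train_name>.<tgt>    e.g. cvit/hi-ta/train.ta
# For the merge task, the deduplicated output goes to:
#   <path_indic_dataset>/<src>-<tgt>/<merge_train_name>.<src|tgt>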
## Defining the main function to run the given task across all the language pairs
def main(str_task):
    stat_dict = dict()
    for i in range(len(langs) - 1):
        for j in range(i + 1, len(langs)):
            # Print the language pair currently being processed
            print(langs[i], langs[j])
            # The task corresponding to getting the file stats
            if str_task == config.get_base_stats or str_task == config.get_clean_stats:
                stat_dict[langs[i] + '-' + langs[j]] = tuplist2json(parsePair(langs[i], langs[j], str_task))
            # The task corresponding to cleaning the training data
            elif str_task == config.clean_base_data:
                unusedTemp = parsePair(langs[i], langs[j], str_task)
            # The task corresponding to merging the cleaned training data; also collects the overall stats
            elif str_task == config.merge_clean_data:
                stat_dict[langs[i] + '-' + langs[j]] = tuplist2json(parsePair(langs[i], langs[j], str_task))
    # Writing the computed stats to the corresponding files
    if str_task == config.get_base_stats:
        with open(config.saved_base_stats, 'w') as fp:
            json.dump(stat_dict, fp, indent=4)
    elif str_task == config.get_clean_stats:
        with open(config.saved_clean_stats, 'w') as fp:
            json.dump(stat_dict, fp, indent=4)
    elif str_task == config.merge_clean_data:
        with open(config.saved_merge_stats, 'w') as fp:
            json.dump(stat_dict, fp, indent=4)
    return
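# Shape of the stats JSON that main() writes, with hypothetical pairs and counts:
#   {
#       "hi-ta": {"cvit": 1000, "ufal": 500, "total": 1500},
#       "hi-te": {"cvit": 800, "total": 800}
#   }
# For the merge task, parsePair only reports the final count, so each pair maps to
# just {"total": <merged size after removing dev + test>}.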
if __name__ == '__main__':
    ## 0 -> Call this function to get the stats of the base dataset
    # print('Getting the stats of the base data')
    # main(config.get_base_stats)
    ## 1 -> Call this function to get the cleaned data
    # print('Cleaning the base data obtained by us')
    # main(config.clean_base_data)
    ## 2 -> Call this function to get the stats of the cleaned data (to be called only after the above step)
    # print('Getting the stats of the cleaned data')
    # main(config.get_clean_stats)
    ## 3 -> Call this function to merge all the dataset files while accounting for repetitions.
    ##      Also reports the overall stats, and removes the WAT test + dev pairs
    print('Merging the overall data and getting the stats for that')
    main(config.merge_clean_data)