-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfilter_parallel.py
executable file
·200 lines (176 loc) · 8.53 KB
/
filter_parallel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
#!/usr/bin/env python3
import argparse
import sys
import codecs
# Python 2 keeps its lazy zip in itertools.izip; on Python 3 the builtin zip is used under the same name.
if sys.version_info[0] == 2:
    from itertools import izip
else:
    izip = zip
from collections import defaultdict as dd, Counter
import re
import os.path
import gzip
import os
import glob
import numpy as np
from lputil import mkdir_p
# Absolute path of the directory that contains this script.
scriptdir = os.path.dirname(os.path.abspath(__file__))
# UTF-8 stream reader/writer factories; prepfile() applies them to handles when running on Python 2.
reader = codecs.getreader('utf8')
writer = codecs.getwriter('utf8')
def prepfile(fh, code):
    """Return a text-mode handle for ``fh``, transparently handling gzip.

    If ``fh.name`` ends in ``.gz`` the same path is reopened through
    ``gzip.open`` in text mode (``code`` gets a ``"t"`` suffix unless it
    already ends in one) and the handle passed in is closed, since it is
    no longer needed.  Otherwise ``fh`` itself is returned.

    On Python 2 the resulting stream is additionally wrapped in the
    module-level UTF-8 ``reader`` or ``writer``, chosen by the first
    letter of ``code``; any other ``code`` prints a message to stderr and
    exits with status 1.

    Args:
        fh: an open file object with a string ``name`` attribute.
        code: the mode ``fh`` was opened with, e.g. ``'r'`` or ``'w'``.

    Returns:
        The handle callers should read from / write to.
    """
    ret = fh
    if fh.name.endswith(".gz"):
        mode = code if code.endswith("t") else code + "t"
        ret = gzip.open(fh.name, mode)
        # The plain handle is superseded by the gzip one; do not leak it.
        fh.close()
    if sys.version_info[0] == 2:
        # Wrap `ret`, not `fh`: wrapping `fh` would throw away the gzip stream.
        if code.startswith('r'):
            ret = reader(ret)
        elif code.startswith('w'):
            ret = writer(ret)
        else:
            sys.stderr.write("I didn't understand code "+code+"\n")
            sys.exit(1)
    return ret
def filterlines(ifh, blackballs, seqs, lows, highs, keepfh, rejectfh):
    """Route each line of ``ifh`` to ``keepfh`` or ``rejectfh``.

    Lines of ``ifh`` are paired positionally with ``blackballs`` and with
    one value from every sequence in ``seqs``.  A line is kept when it is
    not blackballed and AT LEAST ONE of its values lies within the
    matching ``[low, high]`` range; it is rejected when it is blackballed
    or when every value is out of range.  Bounds are inclusive, so a
    value sitting exactly on ``low`` or ``high`` counts as in range.

    Pairing uses ``zip``, so processing stops at the shortest of ``ifh``,
    ``blackballs`` and the sequences in ``seqs``; surplus lines are
    written to neither handle.

    Args:
        ifh: iterable of lines (typically an open text file).
        blackballs: iterable of truthy/falsy flags, one per line.
        seqs: tuple of per-line value sequences.
        lows: lower bounds, one per sequence in ``seqs``.
        highs: upper bounds, one per sequence in ``seqs``.
        keepfh: object with ``write`` receiving kept lines.
        rejectfh: object with ``write`` receiving rejected lines.
    """
    bounds = list(zip(lows, highs))
    for line, bb, *vals in zip(ifh, blackballs, *seqs):
        keep = (not bb) and any(low <= val <= high
                                for val, (low, high) in zip(vals, bounds))
        (keepfh if keep else rejectfh).write(line)
def countfiles(dir):
    """Return the number of files found by walking ``dir`` recursively.

    Every directory that ``os.walk`` visits contributes its file entries;
    directories themselves are not counted.
    """
    return sum(len(filenames) for _root, _subdirs, filenames in os.walk(dir))
def blackball(fline, eline):
    """Return True when the sentence pair must be dropped unconditionally.

    A pair is eliminated if either side is empty or whitespace-only, or
    if either side begins with ``#untranslated`` / ``#Untranslated``.
    """
    markers = ("#untranslated", "#Untranslated")
    return any(not side.strip().split() or side.startswith(markers)
               for side in (fline, eline))
def _outside(values, low, high):
    """Boolean mask of the entries of ``values`` strictly below ``low`` or strictly above ``high``."""
    arr = np.asarray(values, dtype=float)
    return (arr < low) | (arr > high)


def _split_file(inpath, keeppath, rejectpath, bbs, seqs, lows, highs):
    """Run filterlines from ``inpath`` into ``keeppath``/``rejectpath``, closing all three handles afterwards."""
    with prepfile(open(inpath, 'r'), 'r') as infile, \
            prepfile(open(keeppath, 'w'), 'w') as keepfile, \
            prepfile(open(rejectpath, 'w'), 'w') as rejectfile:
        filterlines(infile, bbs, seqs, lows, highs, keepfile, rejectfile)


def main():
    """Split an extracted parallel-data directory into filtered and remainder copies.

    Steps:
      1. For every ``*.eng.manifest`` in the input directory, read the
         paired ``<genre>/…eng.flat`` and ``<genre>/…<lang>.flat`` files
         of the chosen genre and record, per line, the English/foreign
         word-count ratio, the absolute word-count difference (delta) and
         the blackball flag.
      2. Compute mean +/- ``stds`` standard deviations over all ratios and
         over all deltas and report how many lines fall outside them.
      3. Pass every manifest and every ``<genre>.<subdir>.<lang>.flat``
         file through ``filterlines``, writing kept lines under the filter
         directory and rejected lines under the remainder directory.
      4. Report the file count of the three directories.

    Exits with status 1 when an expected file is missing, when no
    manifests are found, or when the flat files contain no lines.
    """
    parser = argparse.ArgumentParser(description="filter extracted parallel data directory",
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--indir", "-i", default="./extracted", help="input directory")
    # Required: without it every manifest lookup targets "<genre>.None.manifest" and the run fails.
    parser.add_argument("--lang", "-l", required=True,
                        help="language code of the non-English side, as it appears in file names")
    # float accepts everything int did, plus fractional thresholds such as 1.5
    parser.add_argument("--stds", "-s", type=float, default=1,
                        help="number of standard deviations from mean to filter out")
    parser.add_argument("--filterdir", "-f", default="./filtered", help="output filter directory")
    parser.add_argument("--genre", "-g", default="original",
                        help="genre to use when filtering (could try tokenized but not available for twitter)")
    parser.add_argument("--remaindir", "-r", default="./remainder", help="output remainder directory")

    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))

    indir = args.indir
    filterdir = args.filterdir
    remaindir = args.remaindir
    mkdir_p(filterdir)
    mkdir_p(remaindir)

    # assumption: there are a number of *.eng.manifest files, each paired with *.<lang>.manifest,
    # and for each i, there is original/i.eng.flat and original/i.<lang>.flat
    engmanifests = glob.glob(os.path.join(indir, "*.eng.manifest"))
    if not engmanifests:
        sys.stderr.write("ERROR: no *.eng.manifest files found in %s\n" % indir)
        sys.exit(1)

    fmanifests = []
    ratios = dd(list)
    deltas = dd(list)
    blackballs = dd(list)
    genres = set()

    # Pass 1: gather per-line statistics for every genre.
    for eman in engmanifests:
        ebase = os.path.basename(eman)
        genre = '.'.join(ebase.split('.')[:-2])
        genres.add(genre)
        fman = os.path.join(os.path.dirname(eman), "%s.%s.manifest" % (genre, args.lang))
        fmanifests.append(fman)
        eorig_path = os.path.join(args.indir, args.genre, "%s.%s.eng.flat" % (genre, args.genre))
        forig_path = os.path.join(args.indir, args.genre, "%s.%s.%s.flat" % (genre, args.genre, args.lang))
        # test existence
        for f in (eman, fman, eorig_path, forig_path):
            if not os.path.exists(f):
                sys.stderr.write("ERROR: %s does not exist\n" % f)
                sys.exit(1)
        # slurp files, calculate ratios, store ratios
        with prepfile(open(eorig_path, 'r'), 'r') as eorig, \
                prepfile(open(forig_path, 'r'), 'r') as forig:
            for ln, (eline, fline) in enumerate(izip(eorig, forig)):
                elen = len(eline.strip().split())
                flen = len(fline.strip().split())
                blackballs[genre].append(blackball(eline, fline))
                deltas[genre].append(abs(elen - flen))
                if flen == 0:
                    sys.stderr.write("0-length foreign sentence at line {} of {}\n".format(ln+1, forig_path))
                    ratios[genre].append(0.)
                else:
                    ratios[genre].append(float(elen) / float(flen))

    if not ratios:
        # Nothing was read; mean/std and the percentages below would be undefined.
        sys.stderr.write("ERROR: no sentence pairs found under %s\n" % os.path.join(indir, args.genre))
        sys.exit(1)

    allratios = np.concatenate([np.array(v) for v in ratios.values()], 0)
    alldeltas = np.concatenate([np.array(v) for v in deltas.values()], 0)
    allblackballs = np.concatenate([np.array(v) for v in blackballs.values()], 0)
    bbrejectsize = int(np.count_nonzero(allblackballs))

    ratiomean = np.mean(allratios)
    ratiostd = np.std(allratios)
    lowratio = ratiomean-(args.stds*ratiostd)
    highratio = ratiomean+(args.stds*ratiostd)
    ratio_out = _outside(allratios, lowratio, highratio)
    rejectratiosize = int(np.count_nonzero(ratio_out))

    deltamean = np.mean(alldeltas)
    deltastd = np.std(alldeltas)
    lowdelta = deltamean-(args.stds*deltastd)
    highdelta = deltamean+(args.stds*deltastd)
    delta_out = _outside(alldeltas, lowdelta, highdelta)
    rejectdeltasize = int(np.count_nonzero(delta_out))

    sys.stderr.write("Could be rejecting %d of %d lines (%f %%) with ratio below %f or above %f\n" % (rejectratiosize, len(allratios), 100.0*rejectratiosize/len(allratios), lowratio, highratio))
    sys.stderr.write("Could be rejecting %d of %d lines (%f %%) with delta below %f or above %f\n" % (rejectdeltasize, len(alldeltas), 100.0*rejectdeltasize/len(alldeltas), lowdelta, highdelta))
    # A line is only dropped for length reasons when BOTH measures are out of range.
    reject_ratio_delta_size = int(np.count_nonzero(ratio_out & delta_out))
    sys.stderr.write("Actually rejecting %d of %d lines (%f %%) meeting both delta and ratio criteria\n" % (reject_ratio_delta_size, len(alldeltas), 100.0*reject_ratio_delta_size/len(alldeltas)))
    sys.stderr.write("Also rejecting %d of %d lines (%f %%) for blackball criteria\n" % (bbrejectsize, len(allblackballs), 100.0*bbrejectsize/len(allblackballs)))

    lows = (lowratio, lowdelta)
    highs = (highratio, highdelta)

    # Pass 2a: iterate through manifests and filter per ratio and delta
    for manset in (engmanifests, fmanifests):
        for man in manset:
            sys.stderr.write("filtering %s\n" % man)
            base = os.path.basename(man)
            genre = '.'.join(base.split('.')[:-2])
            sys.stderr.write("genre %s\n" % genre)
            rats = ratios[genre]
            delts = deltas[genre]
            bbs = blackballs[genre]
            genre_rejects = int(np.count_nonzero(
                _outside(rats, lowratio, highratio) & _outside(delts, lowdelta, highdelta)))
            sys.stderr.write("rejecting %d of %d\n" % (genre_rejects, len(rats)))
            _split_file(man,
                        os.path.join(filterdir, base),
                        os.path.join(remaindir, base),
                        bbs, (rats, delts), lows, highs)

    # Pass 2b: mirror every immediate subdirectory of indir and filter its flat files
    # http://stackoverflow.com/questions/973473/getting-a-list-of-all-subdirectories-in-the-current-directory
    for subdir in next(os.walk(indir))[1]:
        insubdir = os.path.join(indir, subdir)
        filtersubdir = os.path.join(filterdir, subdir)
        mkdir_p(filtersubdir)
        remainsubdir = os.path.join(remaindir, subdir)
        mkdir_p(remainsubdir)
        for genre in genres:
            for lang in (args.lang, 'eng'):
                base = "%s.%s.%s.flat" % (genre, subdir, lang)
                infilename = os.path.join(insubdir, base)
                if not os.path.exists(infilename):
                    sys.stderr.write("%s does not exist\n" % infilename)
                    continue
                _split_file(infilename,
                            os.path.join(filtersubdir, base),
                            os.path.join(remainsubdir, base),
                            blackballs[genre], (ratios[genre], deltas[genre]), lows, highs)

    # count files in each of the directories; should be the same
    for directory in (indir, filterdir, remaindir):
        sys.stderr.write("%d files in %s\n" % (countfiles(directory), directory))
# Run the filter only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()