import json
import linecache
from datetime import datetime
from random import randint

import numpy as np

import attributes_map
import discrete_values_counts
import discrete_values_map as m
def merge_tsvs(outpath, *paths):
    """
    Merge multiple tsv files into one.
    :param outpath: path of the merged output file.
    :param paths: paths of the input tsv files.
    :return:
    """
    with open(outpath, 'w') as fout:
        first = True
        for inpath in paths:
            with open(inpath, 'r') as fin:
                for line in fin:
                    # Header lines start with a letter.
                    if line[0].isalpha():
                        # Only copy the header from the first file.
                        if first:
                            fout.write(line)
                    else:
                        fout.write(line)
                    first = False
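# Usage sketch (hypothetical file names): merging two tsvs that each start with
# a header row keeps the header only once:
#   merge_tsvs('all.tsv', 'a.tsv', 'b.tsv')
#   # a.tsv = HEADER,r1   b.tsv = HEADER,r2   ->   all.tsv = HEADER,r1,r2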
def shuffle_file(in_path, out_path, n_lines, header=True):
    """
    Out-of-memory shuffle function.
    :param in_path: file that should be shuffled.
    :param out_path: output where the shuffled file should be written.
    :param n_lines: number of lines of the file.
    :param header: boolean, if True the first row gets copied.
    :return:
    """
    with open(out_path, 'w') as out:
        numbers = list(range(n_lines))
        # Copy header.
        if header:
            out.write(linecache.getline(in_path, 1))
            n_lines -= 1
        for i in range(n_lines):
            idx = randint(0, n_lines - i - 1)
            # Skip index 0, which belongs to the header line.
            if header:
                idx += 1
            n = numbers.pop(idx)
            if header:
                assert n != 0
            # Write line n (linecache is 1-based) to the output.
            out.write(linecache.getline(in_path, n + 1))
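# Usage sketch (hypothetical paths): shuffle a 1000-line file whose first line
# is a header:
#   shuffle_file('train.tsv', 'train-shuffled.tsv', n_lines=1000, header=True)
# Note: linecache caches the file's lines internally, so the memory savings
# over a plain in-memory shuffle are limited.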
def get_percentile(path, idx=0, percentile=95):
    """
    :param path: path of the training file.
    :param idx: target idx from which the percentile should be calculated.
    :param percentile: which percentile to get.
    :return: the value of the percentile.
    """
    x = []
    with open(path, 'r') as f:
        for line in f:
            # Skip the header if it exists.
            if line.startswith("ADLOADINGTIME"):
                continue
            x.append(float(line.strip().split('\t')[idx]))
    return np.percentile(x, percentile)
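# Worked example (toy data): for target values 1..100 in column 0,
#   get_percentile(path, idx=0, percentile=95)
# returns numpy's linearly interpolated 95th percentile, i.e. ~95.05.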
def remove_outliers(inpath, outpath, idx=0):
    """
    Removes outliers above the 95th percentile of the target variable.
    :param inpath: path of the file to remove outliers from.
    :param outpath: path of the output file without outliers.
    :param idx: index of the target variable.
    :return: the number of remaining samples.
    """
    print("Removing outliers")
    percentile = get_percentile(inpath, idx, 95)
    print("95th percentile %f" % percentile)
    first = True
    n_lines = 0
    with open(outpath, 'w') as fout:
        with open(inpath, 'r') as fin:
            for line in fin:
                # The first line could be a header.
                if first:
                    first = False
                    if line.startswith("ADLOADINGTIME"):
                        fout.write(line)
                        continue
                row = line.strip().split('\t')
                if float(row[idx]) > percentile:
                    continue
                fout.write(line)
                n_lines += 1
    print("Lines remaining: %d" % n_lines)
    return n_lines
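# Example: if the 95th percentile of ADLOADINGTIME were, say, 12000, every data
# row whose column-idx value exceeds 12000 is dropped, while the header row
# (when present) is copied through unchanged.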
# Helper array for the new header, which consists of the discretized attributes.
discrete_headers = ['UA_DEVICETYPE_' + x for x in m.UA_DEVICETYPE] + \
    ['DEVICEORIENTATION_' + x for x in m.DEVICEORIENTATION] + \
    ['UA_BROWSERRENDERINGENGINE_' + x for x in m.UA_BROWSERRENDERINGENGINE] + \
    ['ACTUALDEVICETYPE_' + x for x in m.ACTUALDEVICETYPE] + \
    ['PLATFORM_' + x for x in m.PLATFORM] + \
    ['INTENDEDDEVICETYPE_' + x for x in m.INTENDEDDEVICETYPE] + \
    ['CDNNAME_' + x for x in m.CDNNAME] + \
    ['EXTERNALADSERVER_' + x for x in m.EXTERNALADSERVER] + \
    ['NETWORKTYPE_' + x for x in m.NETWORKTYPE] + \
    ['ACCOUNTID_' + x for x in m.ACCOUNTID] + \
    ['CREATIVETYPE_' + x for x in m.CREATIVETYPE] + \
    ['UA_OS_' + x for x in m.UA_OS] + \
    ['SDK_' + x for x in m.SDK]
def discretasize_line(line, header_old):
    """
    Helper function for binarizing discrete attributes with a small number of categories.
    :param line: one sample.
    :param header_old: header of that sample.
    :return: new line, with the discrete attributes binarized.
    """
    new_line = np.zeros(len(discrete_headers))
    for T in attributes_map.TEST_SET_ALL:
        idx = header_old.index(T)
        val = line[idx]
        new_idx = discrete_headers.index(T + '_' + val)
        new_line[new_idx] = 1
    assert len(discrete_headers) == len(new_line)
    return new_line.tolist()
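# Example (assumed category value): if a sample has UA_DEVICETYPE == 'PHONE'
# and 'PHONE' is listed in m.UA_DEVICETYPE, the returned vector carries a 1 at
# discrete_headers.index('UA_DEVICETYPE_PHONE'), one 1 per attribute in
# attributes_map.TEST_SET_ALL, and 0 everywhere else.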
def binaries(line, header_old):
    """
    Helper function for transforming binary attributes.
    :param line: one sample.
    :param header_old: header of that sample.
    :return: new line, with binary attributes preprocessed.
    """
    new_line = []
    for b in attributes_map.BINARY:
        idx = header_old.index(b)
        val = line[idx]
        if val != 'null':
            new_line.append(val)
        else:
            new_line.append('0')
    assert len(attributes_map.BINARY) == len(new_line)
    return new_line
def continious(line, header_old):
    """
    Helper function for transforming continuous attributes.
    :param line: one sample.
    :param header_old: header of that sample.
    :return: new line, with continuous attributes preprocessed.
    """
    new_line = []
    for b in attributes_map.ALL_CONTINIOUS:
        idx = header_old.index(b)
        val = line[idx]
        if val != 'null':
            new_line.append(val)
        else:
            # Impute missing window sizes with default dimensions. Note that a
            # null in any other continuous attribute appends nothing and would
            # trip the length assert below.
            if b == 'TOPMOSTREACHABLEWINDOWHEIGHT' or b == 'HOSTWINDOWHEIGHT':
                new_line.append('50')
            elif b == 'TOPMOSTREACHABLEWINDOWWIDTH' or b == 'HOSTWINDOWWIDTH':
                new_line.append('320')
    assert len(attributes_map.ALL_CONTINIOUS) == len(new_line)
    return new_line
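# Example: a null HOSTWINDOWWIDTH is imputed as '320' and a null
# HOSTWINDOWHEIGHT as '50'; all non-null values pass through unchanged.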
def json_parse(line, header_old):
    """
    Helper function for transforming JSON attributes.
    :param line: one sample.
    :param header_old: header of that sample.
    :return: new line, with JSON attributes preprocessed.
    """
    new_line = []
    # FILESJSON: total size and number of files.
    idx = header_old.index('FILESJSON')
    val = line[idx]
    d = json.loads(val)
    s = 0
    for entry in d:
        s += entry['size']
    new_line.append(s)
    new_line.append(len(d))
    # ERRORSJSON: number of errors.
    idx = header_old.index('ERRORSJSON')
    val = line[idx]
    d = json.loads(val)
    new_line.append(len(d))
    assert len(json_headers) == len(new_line)
    return new_line
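# Example (assumed payload shape -- a JSON array of file objects): a FILESJSON
# value of '[{"size": 10}, {"size": 25}]' yields FILESJSON_size = 35 and
# FILESJSON_len = 2, and an ERRORSJSON value of '[]' yields ERRORJSON_len = 0.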
def timestamps(line, header_old):
    """
    Helper function for preprocessing timestamps.
    :param line: one sample.
    :param header_old: header of that sample.
    :return: new line, with the timestamp preprocessed.
    """
    idx = header_old.index('TIMESTAMP')
    val = line[idx]
    d = datetime.fromtimestamp(float(val))
    # Microseconds elapsed within the month, computed from the components.
    micros_in_month = (((((d.day * 24 + d.hour) * 60) + d.minute) * 60 + d.second) * 1000000) + d.microsecond
    new_line = [val, d.day, d.weekday(), d.hour, d.minute, d.second, d.microsecond, micros_in_month]
    assert len(attributes_map.TIMESTAMPS) == len(new_line)
    return new_line
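# Example (assuming a UTC-local machine): TIMESTAMP 0.0 maps to
# 1970-01-01 00:00:00, i.e. day=1, weekday=3 (Thursday), hour=minute=second=0,
# and a within-month offset of 1 * 24 * 3600 * 1000000 microseconds.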
def missings(line, header_old):
    """
    Helper function for adding the number-of-missing-fields attribute.
    :param line: one sample.
    :param header_old: header of that sample.
    :return: new line, with the missing-count attribute.
    """
    cntr = 0
    for v in line:
        if v == 'null':
            cntr += 1
    new_line = [cntr]
    return new_line
def geo(line, header_old):
    """
    Helper function for preprocessing GEO attributes.
    :param line: one sample.
    :param header_old: header of that sample.
    :return: new line, with GEO attributes preprocessed.
    """
    new_line = []
    latLng = []
    GEO_lat_lng = ["GEOIP_LNG", "GEOIP_LAT"]
    for b in GEO_lat_lng:
        idx = header_old.index(b)
        val = line[idx]
        if val != 'null':
            latLng.append(val)
            new_line.append(val)
        else:
            new_line.append('0')
    # Interaction features of latitude and longitude.
    if len(latLng) == 2:
        [lat, lng] = latLng
        new_line.append(float(lat) + float(lng))
        new_line.append(float(lat) * float(lng))
        new_line.append(float(lat) - float(lng))
    else:
        new_line.append(0)
        new_line.append(0)
        new_line.append(1000)  # A large value that others don't have. Only happens in ~20 cases anyway.
    # One-hot encode the country; unseen countries map to the last ('OTHER') slot.
    idx = header_old.index('GEOIP_COUNTRY')
    val = line[idx]
    cntries = geoip_countries
    country_vec = np.zeros(len(cntries))
    if val in cntries:
        new_idx = cntries.index(val)
        country_vec[new_idx] = 1
    else:
        country_vec[-1] = 1
    new_line = new_line + country_vec.tolist()
    assert len(attributes_map.GEO) + len(cntries) == len(new_line)
    return new_line
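# Example (assumed coordinates): lat=46.0, lng=14.5 yields the interaction
# features 60.5 (sum), 667.0 (product) and 31.5 (difference); a row with a
# missing coordinate instead gets (0, 0, 1000) so it stays separable.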
def discretasize_large(line, header_old):
    """
    Helper function for binarizing discrete attributes with a large number of categories.
    :param line: one sample.
    :param header_old: header of that sample.
    :return: new line, with the discrete attributes preprocessed.
    """
    fields = attributes_map.CATEGORIES_LARGE
    new_line = []
    for f in fields:
        idx = header_old.index(f)
        val = line[idx]
        field_values = field_values_dict[f]
        zeros = np.zeros(len(field_values))
        # One-hot encode; rare or unseen values map to the trailing 'OTHER' slot.
        if val in field_values:
            new_idx = field_values.index(val)
            zeros[new_idx] = 1
        else:
            zeros[-1] = 1
        new_line = new_line + zeros.tolist()
    return new_line
def discrete_large_headers(fields):
    """Returns the one-hot header names for the given large-category fields."""
    new_header = []
    for field in fields:
        new_header = new_header + [field + '_' + val for val in field_values_dict[field]]
    return new_header
atleast = 10000
def get_field_values(field, atleast=10000):
    """Return the categories of attribute `field` that occur in at least `atleast` samples, plus a catch-all 'OTHER'."""
    return [key for key, c in getattr(discrete_values_counts, field).items() if c >= atleast] + ['OTHER']
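# Example (hypothetical counts): if discrete_values_counts.SDK were
# {'4.1': 15000, 'beta': 12}, get_field_values('SDK') would return
# ['4.1', 'OTHER'], collapsing the rare category into the trailing slot.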
"""values for attribute f"""
field_values_dict = {}
for f in attributes_map.CATEGORIES_LARGE:
field_values_dict[f] = get_field_values(f)
"""geoip_countries"""
geoip_countries = [key for key,c in m.GEOIP_COUNTRY_vals.items() if c >= atleast] + ['OTHER']
def geoip_heades(geoip_countries):
"""geoip_country_headers"""
return ['GEOIP_COUNTRY_' + x for x in geoip_countries]
'''json headers'''
json_headers = ['FILESJSON_size', 'FILESJSON_len', 'ERRORJSON_len']
def preprocess(inpath, outpath):
    """
    Preprocesses the input file according to the rules implemented in the helper functions above.
    :param inpath: file that should be preprocessed.
    :param outpath: output file.
    """
    # Set the new header.
    header_new = [attributes_map.TARGET] + discrete_headers + attributes_map.BINARY + \
        attributes_map.ALL_CONTINIOUS + json_headers + attributes_map.TIMESTAMPS + \
        attributes_map.GEO + geoip_heades(geoip_countries) + \
        discrete_large_headers(attributes_map.CATEGORIES_LARGE) + ['missing_count']
    header_old = []
    first = True
    with open(outpath, 'w') as out:
        with open(inpath, 'r') as fin:
            for line in fin:
                line = np.array(line.strip().split('\t'))
                # The first line is the header.
                if first:
                    first = False
                    out.write('\t'.join(header_new) + '\n')
                    header_old = line.tolist()
                    target_idx = header_old.index(attributes_map.TARGET)
                    continue
                # Preprocess this line.
                # TARGET
                new_line = [line[target_idx]]
                # Discrete attributes with a small number of categories.
                new_line += discretasize_line(line, header_old)
                # Binary attributes.
                new_line += binaries(line, header_old)
                # Continuous attributes.
                new_line += continious(line, header_old)
                # JSON attributes.
                new_line += json_parse(line, header_old)
                # Timestamps.
                new_line += timestamps(line, header_old)
                # GEO attributes.
                new_line += geo(line, header_old)
                # Discrete attributes with a large number of categories.
                new_line += discretasize_large(line, header_old)
                # Number of nulls.
                new_line += missings(line, header_old)
                # Check that the line size matches the new header size.
                assert len(new_line) == len(header_new)
                # Write out this line.
                out.write('\t'.join([str(x) for x in new_line]) + '\n')
if __name__ == '__main__':
    # Base directory containing the train and test files.
    base = "D:\\mfrik_data\\"
    fileTrain = "ccdm_all.tsv"
    fileTest = "ccdm_test.tsv"
    # Merge the medium, large and sample tsv files into "ccdm_all".
    merge_tsvs(base + "ccdm_all.tsv", base + "ccdm_medium.tsv",
               base + "ccdm_large.tsv",
               base + "ccdm_sample.tsv")
    # Prepare the train paths.
    base_base_train = base + fileTrain
    without_outliers_train = base + fileTrain + "-without-outliers.tsv"
    shuffled_path_train = base + fileTrain + "-without-outliers-shuffled.tsv"
    preprocessed_train = base + fileTrain + "-preprocessed.tsv"
    # Preprocess the train file: remove outliers, shuffle, preprocess.
    # Remove outliers from the train file.
    new_len = remove_outliers(base_base_train, without_outliers_train, 0)
    # Shuffle the train file.
    shuffle_file(without_outliers_train, shuffled_path_train, new_len)
    # Preprocess the train file.
    preprocess(shuffled_path_train, preprocessed_train)
    # Prepare the test paths.
    base_base_test = base + fileTest
    preprocessed_test = base + fileTest + "-preprocessed.tsv"
    # Preprocess the test file: only preprocess the attributes.
    preprocess(base_base_test, preprocessed_test)