This repository has been archived by the owner on Nov 24, 2017. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess.py
161 lines (131 loc) · 5.86 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
# Author: Hongwei Zhang
# Email: hw_zhang@outlook.com
import os
# import bz2
import errno
import argparse
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.datasets import dump_svmlight_file
from encoder import FFMHashEncoder, SVMHashEncoder, OneHotEncoder
def read_train(filename):
    """Read labeled training data.

    Each line of *filename* has the form ``<label>\t<f1>,<f2>,...``.

    Returns a pair ``(X, y)`` where ``X`` is a numpy array of per-row
    feature-string lists and ``y`` is a numpy array of int labels.
    """
    train_X = []
    train_y = []
    with open(filename, "r") as in_f:
        for line in in_f:
            # rstrip the newline first: otherwise the last feature token
            # of every row would carry a trailing "\n" and hash/encode
            # differently from the same value elsewhere.
            label, features = line.rstrip("\n").split("\t")
            train_X.append(features.split(","))
            # Labels may be written as "1" or "1.0"; go through float.
            train_y.append(int(float(label)))
    return np.array(train_X), np.array(train_y)
def read_test(filename):
    """Read unlabeled test data: one comma-separated feature list per line.

    Returns a numpy array of per-row feature-string lists.
    """
    test_X = []
    with open(filename, "r") as in_f:
        for line in in_f:
            # Strip the newline so the last feature token is clean,
            # matching how training features are read.
            test_X.append(line.rstrip("\n").split(","))
    return np.array(test_X)
def save_to_disk(filename, X, y):
    """Write encoded rows to *filename*, one row per line.

    When *y* is provided each line is ``<label> <tok1> <tok2> ...``;
    when *y* is None only the space-joined tokens are written.
    """
    with open(filename, "w") as out_f:
        if y is None:
            out_f.writelines(" ".join(row) + "\n" for row in X)
        else:
            for label, row in zip(y, X):
                out_f.write("{} {}\n".format(label, " ".join(row)))
def encode_and_save(args,
                    train_filename, original_train_X, train_y,
                    test_filename, original_test_X, test_y):
    """Fit an encoder on the training split, transform both splits, save.

    For ``encoder_type == "hash"`` the transformed token rows are written
    with :func:`save_to_disk`; for ``"onehot"`` the result is dumped in
    svmlight format.  *test_y* may be None (unlabeled test set).

    Raises NotImplementedError for an unsupported encoder type or, for the
    hash encoder, an unsupported output format.
    """
    # Select the encoder.  This chain raises for anything unsupported,
    # so no re-validation of encoder_type is needed after this point
    # (the original had an unreachable duplicate raise at the bottom).
    if args.encoder_type == "hash":
        if args.output_format == "ffm":
            enc = FFMHashEncoder(args.hash_base, args.hash_offset)
        elif args.output_format == "svm":
            enc = SVMHashEncoder(args.hash_base, args.hash_offset)
        else:
            raise NotImplementedError(
                "{} output format is not supported".format(
                    args.output_format))
    elif args.encoder_type == "onehot":
        enc = OneHotEncoder()
    else:
        raise NotImplementedError(
            "{} encoder type is not supported".format(
                args.encoder_type))

    # Fit on train only, then transform both splits with the same mapping.
    enc.fit(original_train_X)
    train_X = enc.transform(original_train_X)
    test_X = enc.transform(original_test_X)

    if args.encoder_type == "hash":
        save_to_disk(train_filename, train_X, train_y)
        save_to_disk(test_filename, test_X, test_y)
    else:  # onehot — guaranteed by the selection chain above
        dump_svmlight_file(
            train_X, train_y, train_filename, zero_based=False)
        if test_y is not None:
            dump_svmlight_file(
                test_X, test_y, test_filename, zero_based=False)
        else:
            # dump_svmlight_file requires a label vector; use zeros as
            # placeholders for the unlabeled test set.
            dump_svmlight_file(
                test_X, np.zeros(test_X.shape[0]),
                f=test_filename, zero_based=False)
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="create cv")
    parser.add_argument("-k", "--kfold", type=int, default=5, help="kfold")
    parser.add_argument("-r", "--random_state", type=int, default=0,
                        help="random state for creating cv")
    parser.add_argument("-b", "--hash_base", type=int, default=100000,
                        help="hash base")
    parser.add_argument("-m", "--hash_offset", type=int, default=100,
                        help="hash offset")
    parser.add_argument("-e", "--encoder_type", type=str, default="hash",
                        choices=["hash", "onehot"],
                        help="encoder type, onehot only supports svm format")
    parser.add_argument("-f", "--output_format", type=str, default="ffm",
                        choices=["ffm", "svm"],
                        help="output format when encoder_type is hash")
    parser.add_argument("-p", "--prefix", type=str, required=True,
                        help="prefix of cv filename")
    parser.add_argument("-o", "--output", type=str, required=True,
                        help="output dir")
    parser.add_argument("-t", "--train_filename", type=str, required=True,
                        help="filename of training data")
    parser.add_argument("-v", "--test_filename", type=str,
                        help="filename of test data")
    args = parser.parse_args()

    try:
        os.makedirs(args.output)
    except OSError as e:
        # Ignore the error only when the directory already exists; report
        # everything else.  (The original used `and`, which inverted the
        # check and silently swallowed genuine failures such as EACCES.)
        if e.errno != errno.EEXIST or not os.path.isdir(args.output):
            print("cannot create output dir, because {}".format(str(e)))

    original_train_X, train_y = read_train(args.train_filename)

    # Stratified folds preserve the label distribution in each split.
    skf = StratifiedKFold(n_splits=args.kfold, shuffle=True,
                          random_state=args.random_state)
    for curr_fold, (train_index, valid_index) in enumerate(
            skf.split(original_train_X, train_y), start=1):
        cv_train_filename = os.path.join(args.output, "_".join(
            [args.prefix, "cv", "train", "fold", str(curr_fold)]) + ".csv")
        cv_valid_filename = os.path.join(args.output, "_".join(
            [args.prefix, "cv", "valid", "fold", str(curr_fold)]) + ".csv")
        print("encode and save {} and {}".format(
            cv_train_filename, cv_valid_filename))
        encode_and_save(args,
                        cv_train_filename, original_train_X[train_index],
                        train_y[train_index],
                        cv_valid_filename, original_train_X[valid_index],
                        train_y[valid_index])

    # Optionally encode the full training set together with the
    # unlabeled test set (test labels are None).
    if args.test_filename is not None:
        original_test_X = read_test(args.test_filename)
        train_filename = os.path.join(
            args.output, "_".join([args.prefix, "train"]) + ".csv")
        test_filename = os.path.join(
            args.output, "_".join([args.prefix, "test"]) + ".csv")
        encode_and_save(args,
                        train_filename, original_train_X, train_y,
                        test_filename, original_test_X, None)