-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocess_dataset.py
121 lines (95 loc) · 4.27 KB
/
preprocess_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import argparse
from preprocessing.acl_dataset import get_acl_dataset
from preprocessing.subj_dataset import get_subj_dataset
def create_argument_parser():
    """Build the CLI argument parser for the dataset preprocessing script.

    Returns:
        argparse.ArgumentParser: parser with all preprocessing options
        (dataset selection, input/output paths, embedding configuration,
        and sentence size).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-d',
                        '--dataset',
                        type=str,
                        help='Which dataset to pre-process')
    parser.add_argument('-dd',
                        '--data-dir',
                        type=str,
                        help='The location of the Large Movie Review Dataset')
    parser.add_argument('-dod',
                        '--data-output-dir',
                        type=str,
                        help='The output dir for the formatted dataset')
    parser.add_argument('-dt',
                        '--dataset-type',
                        type=str,
                        help='The type of dataset to be created (Full, Debug, Active Learning)')
    parser.add_argument('-trsp',
                        '--train-save-path',
                        type=str,
                        help='The location to save the formatted train dataset')
    parser.add_argument('-vsp',
                        '--validation-save-path',
                        type=str,
                        help='The location to save the formatted validation dataset')
    parser.add_argument('-tsp',
                        '--test-save-path',
                        type=str,
                        help='The location to save the formatted test dataset')
    parser.add_argument('-ef',
                        '--embedding-file',
                        type=str,
                        help='The location of the embedding file')
    parser.add_argument('-ep',
                        '--embedding-path',
                        type=str,
                        help='Location of the embedding file (Testing Dataset Only)')
    parser.add_argument('-ewi',
                        '--embedding-wordindex-path',
                        type=str,
                        help='Location of the embedding word index file (Testing Dataset Only)')
    parser.add_argument('-es',
                        '--embed-size',
                        type=int,
                        help='The embedding size of the embedding file')
    # NOTE: the original concatenated these fragments without separating
    # spaces, producing garbled --help output ("model.If a sentence ...
    # variableIt will ... paddedwith an special character"); fixed here
    # using implicit string concatenation with proper spacing.
    parser.add_argument('-s',
                        '--sentence-size',
                        type=int,
                        help=('The sentence size that will be used in the model. '
                              'If a sentence in our dataset is larger than this variable, '
                              'it will be cropped to this size. Otherwise, it will be padded '
                              'with a special character'))
    parser.add_argument('-o',
                        '--output-dir',
                        type=str,
                        help='The path of the new formatted dataset (TFRecord)')
    return parser
def main():
    """Parse command-line options and run the selected dataset preprocessor.

    Selects a dataset factory based on ``--dataset`` ('acl' or 'subj'),
    instantiates it with the user-supplied paths/embedding options, and
    triggers dataset creation.

    Raises:
        ValueError: if ``--dataset`` is not 'acl' or 'subj'.
    """
    parser = create_argument_parser()
    user_args = vars(parser.parse_args())

    dataset = user_args['dataset']
    # Fail fast with a clear message on an unknown dataset name; the
    # original code left dataset_processor unbound here and crashed
    # later with an opaque NameError.
    if dataset == 'acl':
        dataset_processor = get_acl_dataset(user_args['dataset_type'])
    elif dataset == 'subj':
        dataset_processor = get_subj_dataset(user_args['dataset_type'])
    else:
        raise ValueError(
            "Unknown dataset '{}': expected 'acl' or 'subj'".format(dataset))

    movie_review_dataset = dataset_processor(
        train_save_path=user_args['train_save_path'],
        validation_save_path=user_args['validation_save_path'],
        test_save_path=user_args['test_save_path'],
        data_dir=user_args['data_dir'],
        data_output_dir=user_args['data_output_dir'],
        output_dir=user_args['output_dir'],
        embedding_file=user_args['embedding_file'],
        embed_size=user_args['embed_size'],
        embedding_path=user_args['embedding_path'],
        embedding_wordindex_path=user_args['embedding_wordindex_path'],
        sentence_size=user_args['sentence_size'])
    movie_review_dataset.create_dataset()
# Script entry point: run preprocessing only when executed directly,
# not when this module is imported.
if __name__ == '__main__':
    main()