-
Notifications
You must be signed in to change notification settings - Fork 1
/
train_model.py
128 lines (116 loc) · 3.4 KB
/
train_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#########################################################################
# Train a hierarchical classifier on an experiment list and pickle the
# model.
#########################################################################
import sys
import os
from os.path import join, basename
from optparse import OptionParser
import json
import collections
from collections import defaultdict, Counter
import numpy as np
import dill
from models import model
from common import load_dataset
def main():
    """Command-line entry point.

    Parses options, reads the JSON configuration file, trains a
    hierarchical classifier via ``train_model``, and serializes the
    resulting model with dill to ``<out_dir>/model.dill``.
    """
    usage = "usage: %prog <configuration_file> <dataset_directory>"
    parser = OptionParser(usage)
    parser.add_option(
        "-m",
        "--pretrained_ensemble",
        help="Path to a dilled pre-trained ensemble of classifiers"
    )
    parser.add_option(
        "-o",
        "--out_dir",
        help="Directory in which to write the model"
    )
    (options, args) = parser.parse_args()
    # Fail with a usage message instead of an IndexError when the two
    # required positional arguments are missing.
    if len(args) < 2:
        parser.error("expected <configuration_file> and <dataset_directory>")
    config_f = args[0]
    dataset_dir = args[1]
    out_dir = options.out_dir
    # Without -o the script would later crash in join(None, 'tmp');
    # report the problem up front instead.
    if out_dir is None:
        parser.error("an output directory must be given with -o/--out_dir")
    # optparse defaults unset options to None, so the option value can be
    # used directly — no extra branching is needed.
    pretrained_ensemble_f = options.pretrained_ensemble

    # Load the configuration
    print("Reading configuration from {}.".format(config_f))
    with open(config_f, 'r') as f:
        config = json.load(f)
    features = config['features']
    algorithm = config['algorithm']
    params = config['params']
    preprocessors = None
    preprocessor_params = None
    if 'preprocessors' in config:
        # Preprocessor names and their parameters must be supplied together.
        assert 'preprocessor_params' in config
        preprocessors = config['preprocessors']
        preprocessor_params = config['preprocessor_params']

    # Make sure the output directory (and the tmp subdirectory handed to
    # training) exists before anything is written into it.
    tmp_dir = join(out_dir, 'tmp')
    os.makedirs(tmp_dir, exist_ok=True)

    # Train model
    mod = train_model(
        dataset_dir,
        features,
        algorithm,
        params,
        tmp_dir,
        preprocessor_names=preprocessors,
        preprocessor_params=preprocessor_params,
        model_dependency=pretrained_ensemble_f
    )
    print("Dumping the model with dill...")
    out_f = join(out_dir, 'model.dill')
    with open(out_f, 'wb') as f:
        dill.dump(mod, f)
    print("done.")
def train_model(
        dataset_dir,
        features,
        algorithm,
        params,
        tmp_dir,
        model_dependency=None,
        preprocessor_names=None,
        preprocessor_params=None
    ):
    """Load a dataset and train a hierarchical classifier on it.

    Parameters
    ----------
    dataset_dir : str
        Directory containing the dataset, as understood by
        ``load_dataset.load_dataset``.
    features : str
        Name of the feature set to load from the dataset.
    algorithm : str
        Name of the training algorithm passed to ``model.train_model``.
    params : dict
        Algorithm hyperparameters.
    tmp_dir : str
        Scratch directory available to the trainer.
    model_dependency : str, optional
        Path to a dilled pre-trained ensemble the model depends on.
    preprocessor_names : list, optional
        Names of data preprocessors to apply before training.
    preprocessor_params : list, optional
        Parameters for each preprocessor in ``preprocessor_names``.

    Returns
    -------
    The trained model object returned by ``model.train_model``.
    """
    # Load the data. load_dataset returns a 12-tuple; unpack it by name
    # rather than by index so each element's role is visible. Several
    # elements (og, label_to_name, exp_to_index, exp_to_tags,
    # study_to_exps, exp_to_ms_labels) are not used by this function.
    (
        og,
        label_graph,
        label_to_name,
        the_exps,
        exp_to_index,
        exp_to_labels,
        exp_to_tags,
        exp_to_study,
        study_to_exps,
        exp_to_ms_labels,
        data_matrix,
        gene_names
    ) = load_dataset.load_dataset(dataset_dir, features)

    # Train the classifier
    print('Training model: {}'.format(algorithm))
    print('Parameters:\n{}'.format(json.dumps(params, indent=4)))
    if preprocessor_names is not None:
        print('Preprocessing data with: {}'.format(preprocessor_names))
        print('Parameters:\n{}'.format(json.dumps(preprocessor_params, indent=4)))
    mod = model.train_model(
        algorithm,
        params,
        data_matrix,
        the_exps,
        exp_to_labels,
        label_graph,
        # Group experiments by study so train/validation splits do not
        # leak samples from the same study.
        item_to_group=exp_to_study,
        tmp_dir=tmp_dir,
        features=gene_names,
        model_dependency=model_dependency,
        preprocessor_names=preprocessor_names,
        preprocessor_params=preprocessor_params
    )
    print('done.')
    return mod
# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()