forked from sergeyf/ki_43B
-
Notifications
You must be signed in to change notification settings - Fork 0
/
evaluate_utils.py
212 lines (189 loc) · 7.74 KB
/
evaluate_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
# -*- coding: utf-8 -*-
"""
Created on Mon Jan 11 15:31:42 2021
@author: sergey feldman
"""
from time import time
from copy import deepcopy
import numpy as np
from constants import N_JOBS, RANDOM_STATE, N_SPLITS, N_BAYESSEARCH_ITER
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import get_scorer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GroupShuffleSplit, StratifiedShuffleSplit, ShuffleSplit
from mlxtend.feature_selection import ColumnSelector
from sklearn.model_selection import GridSearchCV
from ml_models import meta_pipeline, meta_grid
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
from sklearn.linear_model import LogisticRegression
from skopt import BayesSearchCV
from skopt.callbacks import DeltaYStopper, DeltaXStopper
def get_cv(
    learning_task,
    groups=False,
    n_splits=N_SPLITS,
    shuffle_split=False,
    shuffle_split_train_size=0.75,
    shuffle_split_test_size=0.25,
    random_state=RANDOM_STATE
):
    """Create the cross-validation splitter that matches the learning task.

    Selection rules:
      * if `groups` is truthy, a group-aware splitter is returned;
      * otherwise, for classification ("binary" / "multiclass") a stratified
        splitter is returned, so the classes are used as the strata;
      * otherwise (e.g. regression) a plain, unstratified splitter is returned.

    Parameters
    ----------
    learning_task : str
        "binary" and "multiclass" are treated as classification; any other
        value falls through to the unstratified splitters.
    groups : bool
        Flag saying whether group labels will be supplied to `split()` later.
        This is only a switch; the labels themselves are not passed here.
    n_splits : int
        Number of splits; must be >= 1.
    shuffle_split : bool
        If True, use the *ShuffleSplit family instead of the *KFold family.
    shuffle_split_train_size, shuffle_split_test_size : float
        Train / test sizes, used only by the *ShuffleSplit family.
    random_state :
        Seed forwarded to every splitter except GroupKFold, which is built
        without one.

    Returns
    -------
    An unfitted scikit-learn splitter object; call `.split(...)` on it to
    obtain the (train, test) index pairs.
    """
    assert n_splits >= 1
    is_classification = learning_task in {"binary", "multiclass"}

    # there is some wonkiness when you have exactly one split - that's a
    # scikit learn issue - so a single split always uses the shuffle-split family
    if shuffle_split or n_splits == 1:
        shuffle_kwargs = dict(
            n_splits=n_splits,
            train_size=shuffle_split_train_size,
            # fixed: the group branch used to pass the *train* size as the
            # test size, requesting 0.75 + 0.75 of the data with the defaults
            test_size=shuffle_split_test_size,
            random_state=random_state,
        )
        if groups:
            cv = GroupShuffleSplit(**shuffle_kwargs)
        elif is_classification:
            cv = StratifiedShuffleSplit(**shuffle_kwargs)
        else:
            cv = ShuffleSplit(**shuffle_kwargs)
    else:
        if groups:
            cv = GroupKFold(n_splits)
        elif is_classification:
            cv = StratifiedKFold(n_splits, shuffle=True, random_state=random_state)
        else:
            cv = KFold(n_splits, shuffle=True, random_state=random_state)
    return cv
def prepend_column_subset(pipeline, covariate_names):
    """Return a new Pipeline that first selects `covariate_names`, then runs `pipeline`.

    The steps of `pipeline` are deep-copied, so the pipeline passed in is left
    untouched and the result does not share step objects with it.

    Note: `pipeline` must expose a `.steps` attribute, so this will fall apart
    if the input is a naked classifier/regressor and not a pipeline - but that
    shouldn't happen anywhere in this repo.
    """
    selector_step = ('column_subset', ColumnSelector(cols=covariate_names))
    copied_steps = deepcopy(pipeline.steps)
    return Pipeline([selector_step] + copied_steps)
def get_default_scoring(learning_task):
    """Return the name of the "default" scoring used to evaluate ML models.

    "binary" maps to "roc_auc", "multiclass" to "roc_auc_ovr_weighted", and
    every other value of `learning_task` to "neg_mean_absolute_error".
    """
    classification_scorings = (
        ("binary", "roc_auc"),
        ("multiclass", "roc_auc_ovr_weighted"),
    )
    for task_name, scoring_name in classification_scorings:
        if learning_task == task_name:
            return scoring_name
    # anything that is not a known classification task
    return "neg_mean_absolute_error"
def stacked_model(df, y, pipeline, grid, input_covariates_list, names_of_covariate_groups, inner_cv, scoring, n_jobs=N_JOBS, verbose=False):
    """Fit one tuned model per covariate group, then a stacker on top of them.

    For every (name, covariate list) pair, `pipeline` is prefixed with a column
    selector restricted to that group's covariates and its hyper-parameters are
    searched over `grid` using `inner_cv` and `scoring`. The best estimator of
    each search becomes a base estimator of a StackingClassifier whose final
    estimator is a logistic regression.

    Parameters
    ----------
    df : training data; handed as-is to every `fit` call.
    y : training target. Its `.name` attribute is read when reporting an
        all-zero coefficient vector.
    pipeline : pipeline with a `.steps` attribute (see `prepend_column_subset`).
    grid : dict mapping parameter names to lists of candidate values.
        It is NOT modified; each covariate group works on its own copy.
    input_covariates_list : one list of covariate (column) names per group.
    names_of_covariate_groups : names of the groups, aligned with
        `input_covariates_list`.
    inner_cv : cross-validation spec shared by every search and by the stacker.
    scoring : scoring passed to the hyper-parameter searches.
    n_jobs : parallelism passed to the searches and to the stacker.
    verbose : if True, print progress and timing for each group.

    Returns
    -------
    list of (name, fitted estimator) tuples: one per covariate group, in input
    order, followed by a final ('stacked', StackingClassifier) entry.
    """
    # individual models to be stacked
    # have to fit them all first and figure out best hyper-params
    clfs = []
    for feat_name, feat_list in zip(names_of_covariate_groups, input_covariates_list):
        # Work on a per-group shallow copy. The caller's grid used to be
        # modified in place, which leaked the override to the caller and made
        # the "in grid" test below depend on earlier groups and earlier calls.
        group_grid = dict(grid)
        # if there's too few features, we can't take only 1% of them
        # NOTE(review): when "selector__percentile" is absent from `grid` the
        # else branch still adds it - confirm that this is intended.
        if len(feat_list) < 100 and "selector__percentile" in group_grid:
            group_grid["selector__percentile"] = [5, 10, 100]
        else:
            group_grid["selector__percentile"] = [1, 10, 100]

        if verbose:
            print(f"Fitting individual model for '{feat_name}' with {len(feat_list)} features...")
        start = time()

        # use bayesian search if there are too many hyperparameters settings in the grid
        if np.prod([len(i) for i in group_grid.values()]) > N_BAYESSEARCH_ITER:
            clf = BayesSearchCV(
                prepend_column_subset(pipeline, feat_list),
                group_grid,
                cv=inner_cv,
                scoring=scoring,
                n_jobs=n_jobs,
                n_iter=N_BAYESSEARCH_ITER,
                n_points=2,  # this works with the default n_jobs=8 and n_cv=4
                refit=True
            )
            # stop early once the search stops moving in parameter space or
            # the best few scores are within 0.001 of each other
            clf.fit(df, y, callback=[DeltaXStopper(1e-8), DeltaYStopper(0.001, n_best=3)])
        else:
            clf = GridSearchCV(
                prepend_column_subset(pipeline, feat_list),
                group_grid,
                cv=inner_cv,
                scoring=scoring,
                n_jobs=n_jobs,
                refit=True
            )
            clf.fit(df, y)

        clfs.append((feat_name, clf.best_estimator_))
        model_name, model = clf.best_estimator_.steps[-1]
        if verbose:
            print(f'Done in {time() - start} seconds.')
        # report linear models whose coefficients all collapsed to zero;
        # this is printed regardless of `verbose`
        if model_name in {'svc', 'logistic'}:
            if np.all(model.coef_ == 0):
                print(f'All zeros for {model}, {feat_name}, {y.name}')

    # the stacker itself: a plain logistic regression
    # (a GridSearchCV over meta_pipeline / meta_grid is the alternative that
    # was left disabled here)
    meta_clf = LogisticRegression(max_iter=10000, solver='liblinear', random_state=RANDOM_STATE)

    # the stacked classifier
    clf_stacked = StackingClassifier(
        estimators=clfs,
        final_estimator=meta_clf,
        cv=inner_cv,
        n_jobs=n_jobs,
        verbose=0
    )

    # train the entire model hierarchy
    clf_stacked.fit(df, y)
    clfs.append(('stacked', clf_stacked))
    return clfs
def sklearn_pipeline_evaluator(
    df,
    output_covariate,
    input_covariates_list,
    names_of_covariate_groups,
    pipeline,
    grid,
    groups=None,  # TODO: support groups later if we need to
    outer_cv=None,
    learning_task="binary",
    scoring=None,
    random_state=RANDOM_STATE,
    n_splits=N_SPLITS,
    n_jobs=N_JOBS,
    verbose=False
):
    """Nested cross-validation of the per-group models and their stacked model.

    For every outer fold, `stacked_model` is fit on the training rows (with an
    inner cross-validation built by `get_cv`) and each returned estimator is
    scored on the held-out rows.

    Parameters
    ----------
    df : data frame holding both the input covariates and the output column;
        rows are selected positionally with `.iloc`.
    output_covariate : name of the target column in `df`.
    input_covariates_list, names_of_covariate_groups, pipeline, grid :
        forwarded to `stacked_model`.
    groups : optional group labels, one per row of `df`. When given,
        group-aware splitters are used.
    outer_cv : optional outer splits. Either an object with a `.split` method
        (a scikit-learn splitter) or an iterable of (train_inds, test_inds)
        pairs. Built with `get_cv` when None.
    learning_task : forwarded to `get_cv` and `get_default_scoring`.
    scoring : scorer name; defaults to `get_default_scoring(learning_task)`.
        See https://scikit-learn.org/stable/modules/model_evaluation.html
    random_state, n_splits : forwarded to `get_cv`.
    n_jobs : forwarded to `stacked_model`.
    verbose : if True, print per-fold progress and timing.

    Returns
    -------
    dict with
      "test_score": array with one row per outer fold and one entry per
          estimator returned by `stacked_model`;
      "estimator": list with, per outer fold, the list of (name, estimator)
          tuples returned by `stacked_model`.
    """
    if scoring is None:
        scoring = get_default_scoring(learning_task)
    scorer = get_scorer(scoring)

    # index groups positionally, consistent with the .iloc row selection below
    groups_arr = None if groups is None else np.asarray(groups)

    if outer_cv is None:
        outer_cv = get_cv(learning_task, groups is not None, n_splits=n_splits, random_state=random_state)
    # A splitter object is not iterable by itself: it has to be asked for its
    # splits. Anything without `.split` is assumed to already be an iterable
    # of (train_inds, test_inds) pairs.
    if hasattr(outer_cv, "split"):
        outer_splits = outer_cv.split(df, df[output_covariate], groups=groups_arr)
    else:
        outer_splits = outer_cv

    nested_scores = []
    estimators = []
    for i, (train_inds, test_inds) in enumerate(outer_splits):
        start = time()
        if verbose:
            print(f"Working on the fold {i}...")
        df_train = df.iloc[train_inds, :]
        df_test = df.iloc[test_inds, :]

        # have to reuse the same inner split in multiple spots
        # and this is what you have to do in sklearn to actually reuse it:
        # materialize the splits into a list of index pairs
        inner_splitter = get_cv(learning_task, groups is not None, n_splits=n_splits, random_state=random_state)
        if groups_arr is None:
            inner_cv = inner_splitter.split(df_train, df_train[output_covariate])
        else:
            inner_cv = inner_splitter.split(df_train, df_train[output_covariate], groups=groups_arr[train_inds])
        inner_cv = list(inner_cv)

        # per-model progress output of stacked_model is kept switched off here
        clfs = stacked_model(
            df_train,
            df_train[output_covariate],
            pipeline,
            grid,
            input_covariates_list,
            names_of_covariate_groups,
            inner_cv,
            scoring,
            n_jobs,
            verbose=False
        )
        estimators.append(clfs)

        # score every model (each group model and the stacked one) on the held-out fold
        individual_scores = [scorer(clf[1], df_test, df_test[output_covariate]) for clf in clfs]
        nested_scores.append(individual_scores)
        if verbose:
            print(f"Done in {time() - start} seconds.\n")
    return {"test_score": np.array(nested_scores), "estimator": estimators}