QuickML_Stacking.py
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import time
from collections import Counter
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import BaggingRegressor
from sklearn.linear_model import LassoCV
from sklearn.model_selection import cross_val_predict, KFold, StratifiedKFold
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
#############################################################################
def accu(results, y_cv):
    #### Fraction of predictions that exactly match the target labels ####
    return (results == y_cv).astype(int).sum(axis=0) / (y_cv.shape[0])
def rmse(results, y_cv):
    #### Root mean squared error between predictions and targets ####
    return np.sqrt(np.mean((results - y_cv)**2, axis=0))
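#### Illustrative only: both helpers expect numpy arrays of matching length, e.g. ####
####   accu(np.array([1, 0, 1]), np.array([1, 1, 1]))  ->  0.666...  ####
####   rmse(np.array([1., 2.]), np.array([1., 4.]))    ->  1.414...  ####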
################################################################################
def QuickML_Stacking(X_train, y_train, X_test='', modeltype='Regression', Boosting_Flag=False,
                     scoring='', verbose=0):
    """
    Quickly build stacks of multiple model results.
    Input must be a clean data set (only numeric variables, no categorical or string variables).
    If X_test is left as an empty string, out-of-fold predictions on X_train are produced via
    cross_val_predict; otherwise the stacking model is fit on X_train and predicts on X_test.
    Boosting_Flag describes the base model already chosen elsewhere (True = boosting,
    None = linear, False = bagging/forest) so that the stacking model comes from a
    different model family.
    Returns a tuple (column_names, results): one predict_proba column per class for
    classification, or a single column of predictions for regression.
    """
    start_time = time.time()
    seed = 99
    if len(X_train) <= 100000 or X_train.shape[1] < 50:
        NUMS = 100
        FOLDS = 5
    else:
        NUMS = 200
        FOLDS = 10
    ## create Stacking models
    estimators = []
    ### This keeps track of the number of predict_proba columns generated by each model ####
    estimator_length = []
    #### An empty-string X_test signals that only out-of-fold predictions are needed ####
    no_fit = isinstance(X_test, str)
    if no_fit:
        #### This is where you don't fit the model but just do cross_val_predict ####
        if modeltype == 'Regression':
            if scoring == '':
                scoring = 'neg_mean_squared_error'
            scv = KFold(n_splits=FOLDS, random_state=seed, shuffle=True)
            if Boosting_Flag:
                ###### Bagging models if Boosting was chosen for the base model ####
                model4 = BaggingRegressor(DecisionTreeRegressor(random_state=seed),
                                          n_estimators=NUMS, random_state=seed)
                results = cross_val_predict(model4, X_train, y_train, cv=scv, n_jobs=-1)
                estimators.append(('Bagging1', model4))
                estimator_length.append(1)
            elif Boosting_Flag is None:
                #### Tree models if Linear was chosen for the base model #####
                model5 = DecisionTreeRegressor(random_state=seed, min_samples_leaf=2)
                results = cross_val_predict(model5, X_train, y_train, cv=scv, n_jobs=-1)
                estimators.append(('Decision Trees', model5))
                estimator_length.append(1)
            else:
                #### Linear models if Bagging was chosen for the base model #####
                model6 = LassoCV(alphas=np.logspace(-10, -1, 50), cv=scv, random_state=seed)
                results = cross_val_predict(model6, X_train, y_train, cv=scv, n_jobs=-1)
                estimators.append(('LassoCV Regularization', model6))
                estimator_length.append(1)
        else:
            n_classes = len(Counter(y_train))
            if scoring == '':
                scoring = 'accuracy'
            scv = StratifiedKFold(n_splits=FOLDS, random_state=seed, shuffle=True)
            if Boosting_Flag:
                #### Linear models if Boosting was chosen for the base model #####
                model4 = LinearDiscriminantAnalysis()
                results = cross_val_predict(model4, X_train, y_train, cv=scv, n_jobs=-1,
                                            method='predict_proba')
                estimators.append(('Linear Discriminant', model4))
                estimator_length.append(results.shape[1])
            elif Boosting_Flag is None:
                #### Tree models if Linear was chosen for the base model #####
                model6 = DecisionTreeClassifier(min_samples_leaf=2)
                results = cross_val_predict(model6, X_train, y_train, cv=scv, n_jobs=-1,
                                            method='predict_proba')
                estimators.append(('Decision Tree', model6))
                estimator_length.append(results.shape[1])
            else:
                ###### Naive Bayes models if Bagging was chosen for the base model ####
                if n_classes <= 2:
                    model7 = GaussianNB()
                else:
                    #### MultinomialNB needs non-negative features; fall back to a tree if it fails ####
                    model7 = MultinomialNB()
                try:
                    results = cross_val_predict(model7, X_train, y_train, cv=scv, n_jobs=-1,
                                                method='predict_proba')
                except Exception:
                    model7 = DecisionTreeClassifier(min_samples_leaf=2)
                    results = cross_val_predict(model7, X_train, y_train, cv=scv, n_jobs=-1,
                                                method='predict_proba')
                estimators.append(('Naive Bayes', model7))
                estimator_length.append(results.shape[1])
    else:
        #### This is where you fit the model and then predict ########
        if modeltype == 'Regression':
            if scoring == '':
                scoring = 'neg_mean_squared_error'
            scv = KFold(n_splits=FOLDS, random_state=seed, shuffle=True)
            if Boosting_Flag:
                ###### Bagging models if Boosting was chosen for the base model ####
                model4 = BaggingRegressor(DecisionTreeRegressor(random_state=seed),
                                          n_estimators=NUMS, random_state=seed)
                results = model4.fit(X_train, y_train).predict(X_test)
                estimators.append(('Bagging1', model4))
                estimator_length.append(1)
            elif Boosting_Flag is None:
                #### Tree models if Linear was chosen for the base model #####
                model5 = DecisionTreeRegressor(random_state=seed, min_samples_leaf=2)
                results = model5.fit(X_train, y_train).predict(X_test)
                estimators.append(('Decision Trees', model5))
                estimator_length.append(1)
            else:
                #### Linear models if Bagging was chosen for the base model #####
                model6 = LassoCV(alphas=np.logspace(-10, -1, 50), cv=scv, random_state=seed)
                results = model6.fit(X_train, y_train).predict(X_test)
                estimators.append(('LassoCV Regularization', model6))
                estimator_length.append(1)
        else:
            n_classes = len(Counter(y_train))
            if scoring == '':
                scoring = 'accuracy'
            scv = StratifiedKFold(n_splits=FOLDS, random_state=seed, shuffle=True)
            if Boosting_Flag:
                #### Linear models if Boosting was chosen for the base model #####
                model4 = LinearDiscriminantAnalysis()
                results = model4.fit(X_train, y_train).predict_proba(X_test)
                estimators.append(('Linear Discriminant', model4))
                estimator_length.append(results.shape[1])
            elif Boosting_Flag is None:
                #### Tree models if Linear was chosen for the base model #####
                model6 = DecisionTreeClassifier(min_samples_leaf=2)
                results = model6.fit(X_train, y_train).predict_proba(X_test)
                estimators.append(('Decision Tree', model6))
                estimator_length.append(results.shape[1])
            else:
                ###### Naive Bayes models if Bagging was chosen for the base model ####
                if n_classes <= 2:
                    model7 = GaussianNB()
                else:
                    #### MultinomialNB needs non-negative features; fall back to a tree if it fails ####
                    model7 = MultinomialNB()
                try:
                    results = model7.fit(X_train, y_train).predict_proba(X_test)
                except Exception:
                    model7 = DecisionTreeClassifier(min_samples_leaf=2)
                    results = model7.fit(X_train, y_train).predict_proba(X_test)
                estimators.append(('Naive Bayes', model7))
                estimator_length.append(results.shape[1])
    #### Here is where we consolidate the estimator names and column counts into column names ###
    estimator_names = [tuples[0] for tuples in estimators]
    ls = []
    for name, n_cols in dict(zip(estimator_names, estimator_length)).items():
        ls += [name + str(i) for i in range(n_cols)]
    if verbose == 1:
        print('    Time taken for Stacking: %0.1f seconds' % (time.time() - start_time))
    return ls, results
#########################################################
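if __name__ == '__main__':
    #### Minimal usage sketch (illustrative, not part of the original module): builds ####
    #### out-of-fold stacking features on synthetic data. Passing an empty string for ####
    #### X_test triggers the cross_val_predict path above. ####
    from sklearn.datasets import make_classification, make_regression
    Xc, yc = make_classification(n_samples=500, n_features=10, random_state=99)
    #### Boosting_Flag=True selects the Linear Discriminant stacking model for classification ####
    names, stacks = QuickML_Stacking(pd.DataFrame(Xc), yc, X_test='',
                                     modeltype='Classification', Boosting_Flag=True, verbose=1)
    print(names, stacks.shape)  # ['Linear Discriminant0', 'Linear Discriminant1'] (500, 2)
    Xr, yr = make_regression(n_samples=500, n_features=10, random_state=99)
    #### Boosting_Flag=None selects the Decision Tree stacking model for regression ####
    names, stacks = QuickML_Stacking(pd.DataFrame(Xr), yr, X_test='',
                                     modeltype='Regression', Boosting_Flag=None, verbose=1)
    print(names, stacks.shape)  # ['Decision Trees0'] (500,)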