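"""Tests for vf_portalytics.multi_model.MultiModel.

Covers per-group fitting and prediction, the zero-valued fallback
prediction for samples from an unseen group, usage with a single group,
and fitting a two-column ("double") target.
"""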
import pandas as pd
from sklearn.model_selection import train_test_split

from tests.helpers import make_dataset, make_regression_dataset
from vf_portalytics.multi_model import MultiModel


def test_multi_model():
    total_x, total_y = make_dataset()

    # Declare basic parameters
    cat_feature = 'category'
    feature_col_list = total_x.columns.drop(cat_feature)
    clusters = total_x[cat_feature].unique()

    # Split into train and test
    train_index, test_index = train_test_split(total_x.index, test_size=0.33, random_state=5)
    train_x, train_y = total_x.loc[train_index, :], total_y.loc[train_index]
    test_x, test_y = total_x.loc[test_index, :], total_y.loc[test_index]

    # Keep all the features for every cluster
    selected_features = {}
    for gp_key in clusters:
        selected_features[gp_key] = feature_col_list
    nominal_features = ['feature_0']
    ordinal_features = ['feature_1']

    # Imitate params given by hyperparameter tuning
    params = {
        'A': {
            'model_name': 'XGBRegressor',
            'transformer_nominal': 'TargetEncoder',
            'n_estimators': 3,
            'transformer_ordinal': 'OrdinalEncoder'
        },
        'B': {
            'model_name': 'XGBRegressor',
            'n_estimators': 3,
            'transformer_nominal': 'TargetEncoder',
            'transformer_ordinal': 'OrdinalEncoder'
        }
    }

    # Initialize model
    model = MultiModel(group_col=cat_feature, clusters=clusters, params=params,
                       selected_features=selected_features, nominals=nominal_features,
                       ordinals=ordinal_features)
    model.fit(train_x, train_y)
    pred_test_y = model.predict(test_x)

    # Check that predictions are aligned one-to-one with the test index
    assert (test_y.index == pred_test_y.index).all()

    # Check the prediction for a sample from an unseen category: it should fall back to zero
    test_sample_0 = pd.DataFrame(test_x.iloc[0].copy()).T
    test_sample_0['category'] = 'New_Category'
    pred_test_sample_0 = model.predict(test_sample_0)
    assert (pred_test_sample_0.values == 0).all()


def test_multi_model_to_single_model():
    n_features = 5
    total_x, total_y = make_regression_dataset(n_samples=100, n_features=n_features, n_informative=3)

    # Split into train and test
    train_index, test_index = train_test_split(total_x.index, test_size=0.33, random_state=5)
    train_x, train_y = total_x.loc[train_index, :], total_y.loc[train_index]
    test_x, test_y = total_x.loc[test_index, :], total_y.loc[test_index]

    # Assign every training sample to the same single group
    cat_feature = 'category'
    train_x[cat_feature] = 'group_0'
    clusters = train_x[cat_feature].unique()
    feature_col_list = train_x.columns.drop(cat_feature)

    # Keep all the features for the single cluster
    selected_features = {}
    for gp_key in clusters:
        selected_features[gp_key] = feature_col_list
    nominal_features = ['feature_0']
    ordinal_features = ['feature_1']

    # Imitate params given by hyperparameter tuning
    params = {
        'group_0': {
            'model_name': 'XGBRegressor',
            'n_estimators': 5,
            'transformer_nominal': 'TargetEncoder',
            'transformer_ordinal': 'OrdinalEncoder'
        },
    }

    # Initialize model
    model = MultiModel(group_col=cat_feature, clusters=clusters, params=params,
                       selected_features=selected_features, nominals=nominal_features,
                       ordinals=ordinal_features)
    model.fit(train_x, train_y)
    pred_test_y = model.predict(test_x)

    # Check that we predicted with a model other than DummyClassifier (which would predict all zeros)
    assert not (pred_test_y == 0).all()[0]


def test_multi_model_with_double_target():
    total_x, total_y = make_dataset()

    # Declare basic parameters
    cat_feature = 'category'
    feature_col_list = total_x.columns.drop(cat_feature)
    clusters = total_x[cat_feature].unique()

    # Split into train and test
    train_index, test_index = train_test_split(total_x.index, test_size=0.33, random_state=5)
    train_x, train_y = total_x.loc[train_index, :], total_y.loc[train_index]
    test_x, test_y = total_x.loc[test_index, :], total_y.loc[test_index]

    # Make the target two-dimensional (double target)
    train_y = pd.DataFrame({'target_1': train_y, 'target_2': 2 * train_y})

    # Keep all the features for every cluster
    selected_features = {}
    for gp_key in clusters:
        selected_features[gp_key] = feature_col_list
    nominal_features = ['feature_0']
    ordinal_features = ['feature_1']

    # Imitate params given by hyperparameter tuning
    params = {
        'A': {
            'model_name': 'XGBRegressorChain',
            'order': [0, 1],
            'max_depth': 2,
            'min_samples_leaf': 400,
            'min_samples_split': 400,
            'n_estimators': 6,
            'transformer_nominal': 'TargetEncoder',
            'transformer_ordinal': 'OrdinalEncoder'
        },
        'B': {
            'model_name': 'ExtraTreesRegressor',
            'max_depth': 2,
            'min_samples_leaf': 400,
            'min_samples_split': 400,
            'n_estimators': 5,
            'transformer_nominal': 'TargetEncoder',
            'transformer_ordinal': 'OrdinalEncoder'
        }
    }

    # Initialize model
    model = MultiModel(group_col=cat_feature, clusters=clusters, params=params,
                       selected_features=selected_features, nominals=nominal_features,
                       ordinals=ordinal_features)
    model.fit(train_x, train_y)
    pred_test_y = model.predict(test_x)

    # Predictions should have one column per target
    assert pred_test_y.shape[1] == 2
    # Sub-model hyperparameters should match the supplied params
    assert model.sub_models['B'].n_estimators == 5
    assert model.sub_models['A'].base_estimator.n_estimators == 6
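

# Minimal convenience entry point (not part of the original test suite): assuming
# pytest is installed, this lets the module be run directly with
# `python test_multi_model.py` instead of invoking pytest on the tests directory.
if __name__ == '__main__':
    import pytest

    pytest.main([__file__, '-v'])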