# greedy.py
import pandas as pd
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import roc_auc_score, mean_squared_error
from sklearn.preprocessing import LabelEncoder
from rgf.sklearn import RGFClassifier, RGFRegressor, FastRGFClassifier, FastRGFRegressor


class RGF:
    """
    Wrapper for the (Fast) Regularized Greedy Forest, based on
    RGFClassifier/FastRGFClassifier (classification) and
    RGFRegressor/FastRGFRegressor (regression).
    https://github.com/RGF-team/rgf_python

    Parameters
    ----------
    task: string ("classification", "regression")
        Either a classification or a regression task.
    fast: bool
        Whether the FastRGF implementation should be used.

    # To Dos
    ----------
    - ...
    """
    def __init__(self, task, fast=False):
        if task == 'classification':
            self.metric = 'roc_auc'
            self.task = "classification"
            if fast:
                self.model = FastRGFClassifier()
            else:
                self.model = RGFClassifier(loss="Log")
        else:
            self.metric = 'neg_mean_squared_error'
            self.task = "regression"
            if fast:
                self.model = FastRGFRegressor()
            else:
                self.model = RGFRegressor(loss="LS", normalize=True)
        self.X_test = None
        self.X_train = None
        self.y_test = None
        self.y_train = None
        self.grid_search = None
        self.y_predict = None
        self.test_score = None

    def load_data(self, path_train, path_test):
        """
        Method for loading train and test data from csv paths
        :param path_train: path to training csv
        :param path_test: path to test csv
        :return: None
        """
        # Load data
        df_train = pd.read_csv(path_train)
        df_test = pd.read_csv(path_test)
        # Columns
        cols_train = df_train.columns.tolist()
        cols_test = df_test.columns.tolist()
        # Subset the columns present in both data sets
        # (a list is used because .loc does not accept a set as an indexer)
        use_these_cols = [col for col in cols_train if col in cols_test]
        df_train = df_train.loc[:, use_these_cols]
        df_test = df_test.loc[:, use_these_cols]
        # Target and features
        self.y_train = df_train.loc[:, "label"]
        self.X_train = df_train.drop("label", axis=1)
        self.y_test = df_test.loc[:, "label"]
        self.X_test = df_test.drop("label", axis=1)
        # Label encoding: fit on the training labels and reuse the same
        # encoder for the test labels so both share one mapping
        if self.task == 'classification':
            label_encoder = LabelEncoder()
            self.y_train = pd.Series(label_encoder.fit_transform(self.y_train))
            self.y_test = pd.Series(label_encoder.transform(self.y_test))

    def tune(self, grid, random=False, n_iter=10, folds=5, cores=4):
        """
        Method for parameter optimization via grid or randomized search
        :param grid: dict of parameters
        :param random: boolean, if True randomized search is used instead of a full grid search
        :param n_iter: int, number of parameter settings sampled in randomized search
        :param folds: number of CV folds
        :param cores: number of cores to use
        :return: None
        """
        # CV object
        # Randomized search
        if random:
            self.grid_search = RandomizedSearchCV(estimator=self.model,
                                                  param_distributions=grid,
                                                  n_iter=n_iter,
                                                  scoring=self.metric,
                                                  cv=folds,
                                                  n_jobs=cores,
                                                  verbose=3)
        # Grid search
        else:
            self.grid_search = GridSearchCV(estimator=self.model,
                                            param_grid=grid,
                                            scoring=self.metric,
                                            cv=folds,
                                            n_jobs=cores,
                                            verbose=3)
        # Run search
        self.grid_search.fit(X=self.X_train, y=self.y_train)

    def score(self):
        """
        Method for scoring the test data set
        :return: None
        """
        # Prediction and score on the test sample
        if self.task == 'classification':
            # ROC AUC is computed on the predicted probability of the
            # positive class (assumes a binary target), matching the
            # 'roc_auc' scoring used during tuning
            self.y_predict = self.grid_search.predict_proba(X=self.X_test)[:, 1]
            self.test_score = roc_auc_score(y_true=self.y_test, y_score=self.y_predict)
        else:
            self.y_predict = self.grid_search.predict(X=self.X_test)
            self.test_score = mean_squared_error(y_true=self.y_test, y_pred=self.y_predict)
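

# Minimal usage sketch of the wrapper above, for illustration only: it assumes
# files "train.csv" and "test.csv" exist with a shared "label" column and that
# the rgf_python package is installed. The hyperparameter grid values below
# (max_leaf, l2 for RGFClassifier) are arbitrary placeholders, not tuned settings.
if __name__ == "__main__":
    rgf = RGF(task="classification", fast=False)
    rgf.load_data(path_train="train.csv", path_test="test.csv")
    # Small grid search with 3-fold CV on 2 cores (values are illustrative)
    rgf.tune(grid={"max_leaf": [1000, 2000], "l2": [1.0, 0.1]}, folds=3, cores=2)
    rgf.score()
    print("Test ROC AUC:", rgf.test_score)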