import os

import numpy as np
import optuna
import pandas as pd
from optuna import Trial
from optuna.samplers import TPESampler
from sklearn.metrics import balanced_accuracy_score
from xgboost import XGBClassifier
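
# This script turns hourly road-friction measurements into a supervised
# learning problem (24 hourly lags of every feature predict the binary
# friction class of the current hour), then tunes an XGBoost classifier
# with Optuna, scoring each candidate by walk-forward validation.
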
def transform_to_supervised(df,
                            previous_steps=1,
                            forecast_steps=1,
                            dropnan=False):
    """
    https://gist.github.com/monocongo/6e0df19c9dd845f3f465a9a6ccfcef37
    Transforms a DataFrame containing time series data into a DataFrame
    containing data suitable for use as a supervised learning problem.
    Derived from code originally found at
    https://machinelearningmastery.com/convert-time-series-supervised-learning-problem-python/

    :param df: pandas DataFrame object containing columns of time series values
    :param previous_steps: the number of previous steps that will be included in the
                           output DataFrame corresponding to each input column
    :param forecast_steps: the number of forecast steps that will be included in the
                           output DataFrame corresponding to each input column
    :param dropnan: whether to drop the rows containing NaN values introduced
                    by the shifting
    :return: pandas DataFrame containing original columns, renamed <orig_name>(t),
             as well as columns for previous steps, <orig_name>(t-1) ... <orig_name>(t-n),
             and columns for forecast steps, <orig_name>(t+1) ... <orig_name>(t+n)
    """
    # original column names
    col_names = df.columns

    # list of columns and corresponding names we'll build from
    # the originals found in the input DataFrame
    cols, names = list(), list()

    # input sequence (t-n, ... t-1)
    for i in range(previous_steps, 0, -1):
        cols.append(df.shift(i))
        names += [('%s(t-%d)' % (col_name, i)) for col_name in col_names]

    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, forecast_steps):
        cols.append(df.shift(-i))
        if i == 0:
            names += [('%s(t)' % col_name) for col_name in col_names]
        else:
            names += [('%s(t+%d)' % (col_name, i)) for col_name in col_names]

    # put all the columns together into a single aggregated DataFrame
    agg = pd.concat(cols, axis=1)
    agg.columns = names

    # drop rows with NaN values introduced by the shifts
    if dropnan:
        agg.dropna(inplace=True)
    return agg
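
# For example, a DataFrame with a single column "Friction" and
# previous_steps=2, forecast_steps=1 yields the columns
# Friction(t-2), Friction(t-1), Friction(t).
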
df = pd.read_csv("smart_road_measurements_new_d_weather.csv", header=0)

# drop columns that are unused or contain N/A values
df = df.drop(["Height", "Distance", "State", "Ta", "Tsurf", "Water",
              "moon_illumination", "uvIndex"], axis=1)

# keep only the hour of each timestamp, then average the measurements
# down to one row per (Date, hour)
df['Time(+01:00)'] = pd.to_datetime(df['Time(+01:00)'], format='%H:%M:%S').dt.hour
df = df.groupby(['Date', 'Time(+01:00)']).mean()
df = df.drop_duplicates()
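
# binarize Friction: values in (0, 0.5] become class 0 (low friction),
# values in (0.5, 1] become class 1 (high friction)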
bins = [0, 0.5, 1]
labels = [0, 1]
df["Friction"] = pd.cut(df["Friction"], bins, labels=labels)
#df = df.drop("Date", axis=1)
#df = df.drop("Time(+01:00)", axis=1)
df = transform_to_supervised(df, previous_steps=24, forecast_steps=1, dropnan=True)

Y = df.loc[:, "Friction(t)"].to_numpy()

# keep only the lag columns as features; every "(t)" column would leak
# the present, so the model predicts from the previous 24 hours alone
cols = [c for c in df.columns if '(t)' not in c]
data = df[cols].copy()  # .copy() avoids pandas' SettingWithCopyWarning
data['Friction'] = Y
data.to_csv('test.csv')
data = data.values.tolist()
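
# Each row now holds the 24 hourly lags of every feature followed by the
# binary Friction label in the last column: the layout (features first,
# target last) that the walk-forward code below expects.
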
# split a dataset into train/test sets, holding out the last n_test rows
def train_test_split(data, n_test):
    data = np.array(data)
    return data[:-n_test, :], data[-n_test:, :]
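
# Walk-forward validation refits the model at every test step, so each
# one-step forecast is made only from observations that precede it; this
# avoids the look-ahead leakage that a random train/test split would
# cause with time series data.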
# walk-forward validation over the held-out test rows
def walk_forward_validation(params, data, n_test):
    predictions = list()
    # split dataset
    train, test = train_test_split(data, n_test)
    # seed history with training dataset
    history = [x for x in train]
    # step over each time-step in the test set
    for i in range(len(test)):
        # split test row into input and output columns
        testX, testy = test[i, :-1], test[i, -1]
        # fit model on history and make a prediction
        yhat = xgboost_forecast(params, history, testX)
        # store forecast in list of predictions
        predictions.append(yhat)
        # add actual observation to history for the next loop
        history.append(test[i])
        # summarize progress
        print('>expected=%.1f, predicted=%.1f' % (testy, yhat))
    # score the predictions (higher is better, hence "maximize" below)
    score = balanced_accuracy_score(test[:, -1], predictions)
    return score, test[:, -1], predictions

# fit an xgboost model on the full history and make a one-step prediction
def xgboost_forecast(params, train, testX):
    # transform list into array
    train = np.array(train)
    # split into input and output columns
    trainX, trainy = train[:, :-1], train[:, -1]
    # fit model
    model = XGBClassifier(**params)
    model.fit(trainX, trainy)
    # make a one-step prediction
    yhat = model.predict(np.array([testX]))
    return yhat[0]
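
# Note: a fresh XGBoost model is trained from scratch for every one of
# the n_test steps, so each Optuna trial pays for n_test full fits; this
# is likely the dominant cost of the search.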

def objective(trial: Trial, data) -> float:
    # suggest_float replaces the deprecated suggest_loguniform /
    # suggest_discrete_uniform; n_estimators must be at least 1
    params = {
        "booster": "gbtree",
        #"tree_method": "gpu_hist",
        "n_estimators": trial.suggest_int("n_estimators", 1, 1000),
        "max_depth": trial.suggest_int("max_depth", 2, 10),
        "reg_alpha": trial.suggest_int("reg_alpha", 0, 5),
        "reg_lambda": trial.suggest_int("reg_lambda", 0, 5),
        "min_child_weight": trial.suggest_int("min_child_weight", 0, 5),
        "gamma": trial.suggest_int("gamma", 0, 5),
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.5, log=True),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1, step=0.01),
        "nthread": -1,
        "use_label_encoder": False,
        "eval_metric": "logloss"
    }
    score, y, yhat = walk_forward_validation(params, data, 20)
    return score
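
# The objective returns balanced accuracy on the 20-row holdout, so the
# study is created with direction="maximize".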
if not os.path.exists('output'):
    os.makedirs('output')

study = optuna.create_study(
    direction="maximize",
    sampler=TPESampler(seed=1337),
    study_name="res",
    storage="sqlite:///output/res.db",
    load_if_exists=True
)
study.optimize(
    lambda trial: objective(trial, data),
    n_trials=50,
    show_progress_bar=True
)

# export the trial history (tab-separated) and report the best trial;
# "results" avoids shadowing the dataset DataFrame named df above
results = study.trials_dataframe(attrs=("number", "value", "params", "state"))
results.to_csv("output/res.csv", sep="\t")
print(study.best_trial)
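
# The study is persisted in output/res.db, so a later session could
# resume or inspect it, e.g.:
#   study = optuna.load_study(study_name="res", storage="sqlite:///output/res.db")
#   print(study.best_params)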