model_training_forecasting.py
import pandas as pd
import pickle
import os
import shutil
from sys import argv
from datetime import datetime, timedelta
from copy import deepcopy
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor
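# the input CSV is expected to provide a date column 'ds' and, for the target
# and every covariate, a 7-day-averaged column carrying the 'avg' prefix
# (see col_prefix below)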
dataset = pd.read_csv('../data_for_SFS/SARI/sari.csv', parse_dates=['ds'])
print(dataset)
# create the 19 quantile levels (0.05 to 0.95 in steps of 0.05) for the probabilistic forecasts
lquant, uquant = [], []
for qq in range(5, 50, 5):
    lquant.append(qq / 100)
    uquant.append(1.0 - qq / 100)
quant_levels = lquant + [0.5] + sorted(uquant)
print('Create these quantile levels:', quant_levels)
# these settings are passed as command line arguments on NeSI
# example call: "python model_training_forecasting.py 2019-07-17 NONE Chronos"
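# optional argument guard (an added sketch, not part of the original NeSI call):
# fail early with a usage hint when the three expected arguments are missing
if len(argv) != 4:
    raise SystemExit('usage: python model_training_forecasting.py '
                     '<YYYY-MM-DD> <NONE|AGE|LC|AGE+LC> <TFT|DeepAR|AutoML|Chronos>')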
split_date = argv[1] # defines the date used to split into training and testing set
covars = argv[2] # covariates used: NONE, AGE, LC, AGE+LC
algo = argv[3] # algorithm used: TFT, DeepAR, AutoML, Chronos
sdds = datetime.strptime(split_date, '%Y-%m-%d')  # also validates the expected YYYY-MM-DD format
# some default settings
eval_metric = 'MSE'
data_set = 'SARI'
target = 'SARI'
preset_used = None
time_limit = 150000
# prepare the list of time series variables that need to be integrated in the dataset
all_vars = [target]
if covars != 'NONE':
    all_vars += covars.split('+')
if 'AGE' in covars:
    all_vars.remove('AGE')
    all_vars += ['AGE_jt1', 'AGE_1to4', 'AGE_5to14', 'AGE_15to64', 'AGE_65plus']
if 'LC' in covars:
    all_vars.remove('LC')
    all_vars += ['FLU', 'RSV', 'RV', 'ENTV', 'ADV', 'HMPV', 'PIV1', 'PIV2', 'PIV3']
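# 'AGE' thus expands into five age-group series and 'LC' into nine virus
# series (presumably laboratory-confirmed detections, as in the manuscript)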
# define a SEED, the forecasting horizon and the output folder (depending on the algorithm)
SEED = 482020
horizon = 21
base_dir = 'out_' + algo
out_path = os.path.join(base_dir, eval_metric, data_set,
                        target, covars, split_date)
print('These are the vars:', all_vars)
# remove rows that have NaNs in one of the important columns
# the prefix "avg" marks columns in which SARI cases were averaged over
# the 7-day sliding window as described in the manuscript
col_prefix = 'avg'
for col in all_vars:
    dataset.dropna(subset=[col_prefix + col], inplace=True)
# create a new DataFrame holding everything needed
data = {'start': list(dataset['ds'])}
data['item_id'] = dataset.shape[0] * [target]
for col in all_vars:
    data[col] = list(dataset['avg' + col])
data = pd.DataFrame(data)
temp_train = data.loc[data['start'] <= split_date]
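# pandas coerces the 'YYYY-MM-DD' string to a timestamp for this comparison,
# so all rows up to and including the split date form the training set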
# fill the summer gap with (near-)zero values (0.00001) as described in the manuscript
print('Filling with 0s now...')
min_ds, max_ds = temp_train['start'].min(), temp_train['start'].max()
print(min_ds, max_ds)
filled_train = {'start': None, 'item_id': None}
filling_value = 0.00001
for col in temp_train.columns:
    if col in filled_train:
        continue
    value_map = dict(zip(temp_train['start'], temp_train[col]))
    running_ds = deepcopy(min_ds)
    new_ds, temp_col = [], []
    while running_ds <= max_ds:
        temp_col.append(value_map.get(running_ds, filling_value))
        new_ds.append(running_ds)
        running_ds += timedelta(days=1)
    filled_train[col] = temp_col
    if filled_train['start'] is None:
        filled_train['start'] = new_ds
filled_train['item_id'] = len(filled_train['start']) * [target]
filled_train = pd.DataFrame(filled_train)
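# filled_train now holds one row per calendar day between min_ds and max_ds,
# so each series has a gapless daily index before it is handed to AutoGluon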
# prepare a sequentially arranged (long-format) dataset for DeepAR
# this is required by DeepAR for the multivariate-to-multivariate forecast
sequential = {'start': [], 'item_id': [], 'TS': []}
for col in filled_train.columns:
    if col in ['start', 'item_id']:
        continue
    sequential['start'] += list(filled_train['start'])
    sequential['item_id'] += filled_train.shape[0] * [col]
    sequential['TS'] += list(filled_train[col])
sequential = pd.DataFrame(sequential)
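# in the long format every variable becomes its own item_id with a shared
# value column 'TS', so DeepAR treats the covariates as additional series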
train_data = None
if algo == 'DeepAR':
    train_data = TimeSeriesDataFrame.from_data_frame(sequential,
                                                     id_column="item_id", timestamp_column="start")
else:
    train_data = TimeSeriesDataFrame.from_data_frame(filled_train,
                                                     id_column="item_id", timestamp_column="start")
# convert training data into the format required for autogluon
train_data = train_data.convert_frequency('D')
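# convert_frequency('D') makes the daily frequency explicit on the index;
# it should effectively be a no-op here, since the gap filling above
# already produced one observation per day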
predictor = None
if algo == 'TFT':
    # define the basic predictor
    predictor = TimeSeriesPredictor(prediction_length=horizon, path=out_path,
                                    target=target, eval_metric=eval_metric, quantile_levels=quant_levels)
    agts_algo = 'TemporalFusionTransformerModel'
    # set up and train the transformer
    predictor.fit(train_data, time_limit=time_limit,
                  verbosity=4, random_seed=SEED,
                  hyperparameters={agts_algo: {}})
if algo == 'DeepAR':
    predictor = TimeSeriesPredictor(prediction_length=horizon, path=out_path,
                                    target='TS', eval_metric=eval_metric, quantile_levels=quant_levels)
    agts_algo = 'DeepAR'
    predictor.fit(train_data, time_limit=time_limit,
                  verbosity=4, random_seed=SEED, hyperparameters={agts_algo: {}})
if algo == 'AutoML':
    predictor = TimeSeriesPredictor(prediction_length=horizon, path=out_path,
                                    target=target, eval_metric=eval_metric, quantile_levels=quant_levels)
    preset_used = 'best_quality'  # use "fast_training" instead for quick test runs
    predictor.fit(train_data, presets=preset_used, time_limit=time_limit,
                  verbosity=4, random_seed=SEED)
if algo == 'Chronos':
    predictor = TimeSeriesPredictor(prediction_length=horizon, path=out_path,
                                    target=target, eval_metric=eval_metric, quantile_levels=quant_levels)
    preset_used = 'chronos_large'
    predictor.fit(train_data, presets=preset_used, time_limit=time_limit,
                  verbosity=4, random_seed=SEED)
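# exactly one of the four branches above should have run; if 'algo' does not
# match any of them, predictor is still None and predict() below will fail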
predictions = predictor.predict(train_data, random_seed=SEED)
print(predictions)
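# predictions is a TimeSeriesDataFrame indexed by (item_id, timestamp) with a
# 'mean' column plus one column per requested quantile level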
# save agts predictions
pred_fp = os.path.join(out_path, 'agts_predictions.pkl')
with open(pred_fp, 'wb') as fh:
    pickle.dump(predictions, fh)
print('AG-TS pred: ', list(predictions['mean']))
# write out the main information
summary = predictor.fit_summary(verbosity=4)
leaderboard_train = summary['leaderboard']
out_fp = os.path.join(out_path, 'train_LB.csv')
leaderboard_train.to_csv(out_fp)
out_fp = os.path.join(out_path, 'train_summary.pkl')
with open(out_fp, 'wb') as fh:
    pickle.dump(summary, fh)
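# the leaderboard records every trained model and its validation score, so
# the winning configuration remains traceable after the model files are
# deleted below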
# delete the saved models to save space in the project folder
model_dir = os.path.join(out_path, 'models')
shutil.rmtree(model_dir, ignore_errors=True)
# convert predictions to a simpler DataFrame structure
new_dates, items = [], []
for ii in predictions.index:
    items.append(ii[0])
    new_dates.append(ii[1])
preds_df_all = {'ds': new_dates, 'item': items}
for col in predictions.columns:
    preds_df_all[col] = list(predictions[col])
preds_df_all = pd.DataFrame(preds_df_all)
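# keep only the rows for the target series; in the DeepAR case the frame
# also holds forecasts for every covariate item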
preds_df = preds_df_all.loc[preds_df_all['item'] == target]
pred_fp = os.path.join(out_path, 'predictions.csv')
preds_df.to_csv(pred_fp, index=False)
print(preds_df)