-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathutils.py
183 lines (147 loc) · 7.14 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
def prophet_fit(df, prophet_model, today_index, lookback_days=None, predict_days=21):
    """
    Fit the model to the time-series data and generate forecast for specified time frames

    Args
    ----
    df : pandas DataFrame
        The daily time-series data set contains ds column for
        dates (datetime types such as datetime64[ns]) and y column for numerical values.
        Rows are addressed by position, so the index of df does not matter.
    prophet_model : Prophet model
        Prophet model with configured parameters
    today_index : int
        The position in df that splits training from prediction, where
        Day (today_index-lookback_days)th to Day (today_index-1)th is the time frame for training
    lookback_days: int, optional (default=None)
        As described above, use all the available dates until today_index as
        training set if no value assigned
    predict_days: int, optional (default=21)
        Make prediction for Day (today_index)th to Day (today_index+predict_days-1)th

    Returns
    -------
    fig : matplotlib Figure
        A plot with actual data, predicted values and the interval
    forecast : pandas DataFrame
        The predicted result in a format of dataframe
    prophet_model : Prophet model
        Trained model

    Raises
    ------
    ValueError
        If today_index or lookback_days does not describe a non-empty
        training window that lies inside df
    """
    n_rows = len(df)
    if not 0 < today_index <= n_rows:
        raise ValueError('today_index must be between 1 and {} (rows in df), got {}'
                         .format(n_rows, today_index))
    if lookback_days and not 0 < lookback_days <= today_index:
        raise ValueError('lookback_days must be between 1 and today_index ({}), got {}'
                         .format(today_index, lookback_days))

    # segment the time frames; a falsy lookback_days means "train on everything before today"
    first_index = today_index - lookback_days if lookback_days else 0
    dates = df['ds']
    # .iloc keeps every access positional, matching the meaning of today_index
    baseline_ts = dates.iloc[first_index:today_index]
    baseline_y = df['y'].iloc[first_index:today_index]
    print('Use the data from {} to {} ({} days)'.format(dates.iloc[first_index],
                                                        dates.iloc[today_index - 1],
                                                        today_index - first_index))

    if today_index < n_rows and today_index + predict_days <= n_rows:
        print('Predict {} to {} ({} days)'.format(dates.iloc[today_index],
                                                  dates.iloc[today_index + predict_days - 1],
                                                  predict_days))
    else:
        # df holds no rows for (part of) the forecast window, so its dates
        # cannot be looked up; describe the window relative to the last training day
        print('Predict {} days after {}'.format(predict_days, dates.iloc[today_index - 1]))

    # fit the model on a fresh frame so the model never sees df's own index
    prophet_model.fit(pd.DataFrame({'ds': baseline_ts.values,
                                    'y': baseline_y.values}))
    future = prophet_model.make_future_dataframe(periods=predict_days)
    # make prediction
    forecast = prophet_model.predict(future)
    # generate the plot
    fig = prophet_model.plot(forecast)
    return fig, forecast, prophet_model
def prophet_plot(df, fig, today_index, lookback_days=None, predict_days=21, outliers=None):
    """
    Plot the actual, predictions, and anomalous values

    Draws onto the first axes of fig and then shows the figure; nothing is returned.

    Args
    ----
    df : pandas DataFrame
        The daily time-series data set contains ds column for
        dates (datetime types such as datetime64[ns]) and y column for numerical values
    fig : matplotlib Figure
        A plot with actual data, predicted values and the interval which we previously obtained
        from Prophet's model.plot(forecast).
    today_index : int
        The position in the dataframe dividing the baseline and prediction time frames.
    lookback_days : int, optional (default=None)
        Day (today_index-lookback_days)th to Day (today_index-1)th is the baseline time frame for training.
        When not given, the baseline starts at the first row.
    predict_days : int, optional (default=21)
        Make prediction for Day (today_index)th to Day (today_index+predict_days-1)th.
    outliers : a list of (datetime, int) tuple, optional (default=None)
        The outliers we want to highlight on the plot. None means no outliers.
    """
    # None instead of a shared mutable default list
    if outliers is None:
        outliers = []

    # retrieve the subplot in the generated Prophets matplotlib figure
    ax = fig.get_axes()[0]

    # a plain list of datetimes is always indexed by position
    x_pydatetime = [ts.to_pydatetime() for ts in df['ds']]
    last_index = len(x_pydatetime) - 1
    end = today_index + predict_days

    # highlight the actual values of the entire time frame
    ax.plot(x_pydatetime[:end],
            df['y'].iloc[:end],
            color='orange', label='Actual')

    # plot each outlier in red dot and annotate the date
    for outlier in outliers:
        ax.scatter(outlier[0], outlier[1], color='red', label='Anomaly')
        ax.text(outlier[0], outlier[1], str(outlier[0])[:10], color='red')

    # highlight baseline time frame with gray background
    start = today_index - lookback_days if lookback_days else 0
    # clamp to the last available date so a window reaching the end of df still plots
    baseline_end = min(today_index, last_index)
    ax.axvspan(x_pydatetime[start],
               x_pydatetime[baseline_end],
               color=sns.xkcd_rgb['grey'],
               alpha=0.2)

    # annotate the areas, and position the text at the bottom 5% by using ymin + (ymax - ymin) / 20
    ymin, ymax = ax.get_ylim()
    label_y = ymin + (ymax - ymin) / 20
    baseline_mid = (start + today_index) // 2
    prediction_mid = min((today_index * 2 + predict_days) // 2, last_index)
    ax.text(x_pydatetime[baseline_mid], label_y, 'Baseline area')
    ax.text(x_pydatetime[prediction_mid], label_y, 'Prediction area')

    # re-organize the legend; attach it to ax explicitly rather than to
    # whichever axes pyplot currently considers active
    patch1 = mpatches.Patch(color='red', label='Anomaly')
    patch2 = mpatches.Patch(color='orange', label='Actual')
    patch3 = mpatches.Patch(color='skyblue', label='Predict and interval')
    patch4 = mpatches.Patch(color='grey', label='Baseline area')
    ax.legend(handles=[patch1, patch2, patch3, patch4])
    plt.show()
def get_outliers(df, forecast, today_index, predict_days=21):
"""
Combine the actual values and forecast in a data frame and identify the outliers
Args
----
df : pandas DataFrame
The daily time-series data set contains ds column for
dates (datetime types such as datetime64[ns]) and y column for numerical values
forecast : pandas DataFrame
The predicted result in a dataframe which was previously generated by
Prophet's model.predict(future)
today_index : int
The summary statistics of the right tree node.
predict_days : int, optional (default=21)
The time frame we segment as prediction period
Returns
-------
outliers : a list of (datetime, int) tuple
A list of outliers, the date and the value for each
df_pred : pandas DataFrame
The data set contains actual and predictions for the forecast time frame
"""
df_pred = forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail(predict_days)
df_pred.index = df_pred['ds'].dt.to_pydatetime()
df_pred.columns = ['ds', 'preds', 'lower_y', 'upper_y']
df_pred['actual'] = df['y'][today_index: today_index + predict_days].values
# construct a list of outliers
outlier_index = list()
outliers = list()
for i in range(df_pred.shape[0]):
actual_value = df_pred['actual'][i]
if actual_value < df_pred['lower_y'][i] or actual_value > df_pred['upper_y'][i]:
outlier_index += [i]
outliers.append((df_pred.index[i], actual_value))
# optional, print out the evaluation for each outlier
print('=====')
print('actual value {} fall outside of the prediction interval'.format(actual_value))
print('interval: {} to {}'.format(df_pred['lower_y'][i], df_pred['upper_y'][i]))
print('Date: {}'.format(str(df_pred.index[i])[:10]))
return outliers, df_pred