-
Notifications
You must be signed in to change notification settings - Fork 0
/
predictor.py
235 lines (210 loc) · 9.8 KB
/
predictor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
from models.tree_models import MyExtraTreesRegressor, DoubleExtraTreesRegressor
from models.monthly_average import MonthlyAverageClassifier
from data_helper import DataHelper
from days_statistics import DaysStatistics
from datetime import timedelta, date, datetime
import pandas as pd
import os
import json
import requests
from bs4 import BeautifulSoup
import urllib.request
import sys
import unicodedata
from utils import Day
# OpenWeatherMap API key. It must be set to a valid key (a string) before
# Predictor.get_weather() is called, because the key is appended to WEATHER_URL.
WEATHER_API_KEY = None
# The URL is missing the API key. The full address, including the API key,
# is constructed in the get_weather() method.
WEATHER_URL = 'http://api.openweathermap.org/data/2.5/forecast?id=3067696&units=metric&appid='
class Predictor():
    """
    Builds feature vectors for one day, runs a trained model on them and
    exports the resulting prediction as a CSV file.
    """

    def __init__(self):
        """
        Constructor
        """
        self.dh = DataHelper()
        self.ds = DaysStatistics()
        # 288 steps of 5 minutes cover one whole day (24 * 60 / 5).
        self.prediction_steps = 288

    def get_weather(self):
        """
        Downloads the current weather forecast. WEATHER_API_KEY must hold a valid API key for this to work.
        :return: List of [time, temperature, wind, precipitation, pressure] items, one per forecast entry
        :raises ValueError: If WEATHER_API_KEY is not set
        """
        if not WEATHER_API_KEY:
            raise ValueError('WEATHER_API_KEY must be set to a valid OpenWeatherMap API key')
        response = requests.get(url=WEATHER_URL + WEATHER_API_KEY, timeout=30)
        response.raise_for_status()
        # The response object itself cannot be passed to json.loads(); decode its body instead.
        json_data = response.json()
        weather = []
        for weather_prediction in json_data['list']:
            time = datetime.fromtimestamp(weather_prediction['dt'])
            temperature = weather_prediction['main']['temp']
            wind = weather_prediction['wind']['speed']
            # Rain and snow volumes are optional and are reported next to 'main'
            # (not inside it) in the forecast entry.
            rain = weather_prediction.get('rain', {}).get('3h', 0.0)
            snow = weather_prediction.get('snow', {}).get('3h', 0.0)
            precipitation = rain + snow
            pressure = weather_prediction['main']['pressure']
            weather.append([time, temperature, wind, precipitation, pressure])
        return weather

    def get_date_string(self, date):
        """
        Generates part of URL address for Sutka pool web page with table of reserved lines for given date.
        :param date: Datetime.date
        :return: The rest of URL that starts with: http://www.sutka.eu/obsazenost-bazenu
        """
        next_day = date + timedelta(days=1)
        day_str = date.strftime('%d.%m.%Y')
        next_day_str = next_day.strftime('%d.%m.%Y')
        return '?from=' + day_str + '&to=' + next_day_str + '&send=Zobrazit&do=form-submit'

    def parse_line_usage_table(self, table_rows):
        """
        Parse table with reserved lines from web page http://www.sutka.eu/obsazenost-bazenu
        :param table_rows: HTML table rows from web that are prepared in method get_lines_usage_for_day
        :return: list with 64 items. Each item is 15 minute time slot starting from 6:00 and holds
                 the ASCII-folded titles of reserved cells, each followed by a comma
        """
        time_slots = [''] * 64
        # Rows 1..8 of the table are read; row 0 is skipped (presumably the header - verify against the page).
        for row_id in range(1, 9):
            row = table_rows[row_id]
            slot = 0
            for col in row.find_all('td'):
                # Only cells with exactly three child nodes are treated as reservations.
                if len(col) != 3:
                    slot += 1
                    continue
                # Strip diacritics so that names match ASCII feature names.
                title = unicodedata.normalize('NFKD', col.attrs['title'])
                title = title.encode('ascii', 'ignore').decode('ascii')
                # A cell with a colspan attribute occupies two consecutive slots.
                span = 2 if 'colspan' in col.attrs else 1
                for offset in range(span):
                    time_slots[slot + offset] += title + ','
                slot += span
        # Shift the tail of the day by one slot; slot 61 is copied to 62 and 63 first.
        time_slots[63] = time_slots[61]
        time_slots[62] = time_slots[61]
        time_slots[61] = time_slots[60]
        return time_slots

    def get_lines_usage_for_day(self, day):
        """
        Generates list with lines reservations for the day
        :param day: Datetime.date of interest
        :return: list with 64 items. Each item is 15 minute time slot starting from 6:00
        """
        url = 'http://www.sutka.eu/obsazenost-bazenu' + self.get_date_string(day)
        # Close the HTTP response as soon as the page is read.
        with urllib.request.urlopen(url) as response:
            page = response.read()
        soup = BeautifulSoup(page, 'html.parser')
        table = soup.find('table', attrs={'class': 'pooltable'})
        return self.parse_line_usage_table(table.find_all('tr'))

    def predict(self, predictor, columns, data_dict, time_steps_back, lines_reservations=None):
        """
        Prepares feature vector for prediction algorithms and generates prediction
        :param predictor: sklearn predictor or other with similar interface
        :param columns: List with names of columns that must be in feature vector
        :param data_dict: Dictionary with time data containing columns names and default values.
                          It is not modified; a copy is advanced in 5 minute steps.
        :param time_steps_back: Number of time steps for one input to prediction algorithm
        :param lines_reservations: List with line reservations for predicted day generated by get_lines_usage_for_day method
        :return: Vector with prediction for the day, as returned by DataHelper.predict_day_from_features
        """
        # Work on a copy so the caller's dictionary keeps its initial time values.
        data_dict = dict(data_dict)

        lines_reserved_id = -1
        org_ids = dict()
        for org_id, column in enumerate(columns):
            if column.startswith('reserved_'):
                org_ids[column] = org_id
            if column == 'lines_reserved':
                lines_reserved_id = org_id

        if lines_reservations is None:
            lines_reservations = [''] * 64

        data = list()
        for _ in range(self.prediction_steps):
            row = [data_dict.get(column, 0) for column in columns]
            # Reservation slots are 15 minutes long and start at 6:00 (minute 360 of the day).
            slot_id = (data_dict['minute_of_day'] - 360) // 15
            if 0 <= slot_id < 64:
                # Every name is followed by a comma, so the last split item is always empty.
                for name in lines_reservations[slot_id].split(',')[:-1]:
                    feature_name = 'reserved_' + name
                    # org_ids holds exactly the columns starting with 'reserved_'.
                    if feature_name in org_ids:
                        row[org_ids[feature_name]] += 1
                    elif 'reserved_other' in org_ids:
                        row[org_ids['reserved_other']] += 1
                    if lines_reserved_id >= 0:
                        row[lines_reserved_id] += 1
            data.append(row)

            # Advance the clock by one 5 minute step.
            data_dict['minute'] += 5
            data_dict['minute_of_day'] += 5
            if data_dict['minute'] == 60:
                data_dict['minute'] = 0
                data_dict['hour'] += 1

        df = pd.DataFrame(data, columns=columns)
        day = Day('ts')
        day.data = df
        x, y = self.dh.get_feature_vectors_from_days([day], [], time_steps_back, 1, True)
        return self.dh.predict_day_from_features(x, predictor, time_steps_back)

    def daterange(self, start_date, end_date):
        """
        Generator of dates from start_date to end_date
        :param start_date: Datetime.date of first date to be generated
        :param end_date: Datetime.date where to stop generator. This day will not be generated.
        """
        for n in range((end_date - start_date).days):
            yield start_date + timedelta(days=n)

    def export_prediction(self, y_pred, prediction_date, export_folder):
        """
        Exports predictions to CSV file named <YYYY-MM-DD>.csv after the prediction date
        :param y_pred: Vector with prediction; items 0..275 (00:00 - 22:55) are written,
                       the last hour of the day is always exported as 0
        :param prediction_date: Datetime with prediction date
        :param export_folder: Directory where to store csv with prediction. Will be created
                              (including missing parents) if does not exist
        """
        os.makedirs(export_folder, exist_ok=True)

        # Use the real year of the prediction date instead of a fixed one.
        date_str = '%04d-%02d-%02d' % (prediction_date.year, prediction_date.month, prediction_date.day)

        lines = ['time,pool,lines_reserved\n']
        slot_id = 0
        for hour in range(24):
            for minute in range(0, 60, 5):
                prediction = int(y_pred[slot_id]) if hour < 23 else 0
                lines.append('%s %02d:%02d:00,%d,0\n' % (date_str, hour, minute, prediction))
                slot_id += 1

        with open(os.path.join(export_folder, date_str + '.csv'), 'w') as text_file:
            text_file.writelines(lines)

    def generate_predictions(self, estimator, prediction_date, export_folder):
        """
        Generates prediction vector and saves it to CSV file recognized by web server
        :param estimator: Estimator class with weekday_model, weekend_model, time_steps_back and columns members
        :param prediction_date: Datetime with prediction date
        :param export_folder: Directory where to store csv with prediction. Will be created if does not exist
        """
        data_dict = {
            'month': prediction_date.month,
            'day': prediction_date.day,
            'hour': 0,
            'minute': 0,
            'minute_of_day': 0,
            'year': prediction_date.year,
            'day_of_week': prediction_date.weekday(),
            # Fixed weather bins; the forecast from get_weather() is not used here.
            'temperature_binned': 3,
            'wind_binned': 1,
            'humidity_binned': 3,
            'precipitation_binned': 0,
            'pressure_binned': 2,
        }
        for column in estimator.columns:
            if column.startswith('reserved'):
                data_dict[column] = 0

        # Monday..Friday (weekday() 0..4) use the weekday model, the rest the weekend model.
        if prediction_date.weekday() < 5:
            model = estimator.weekday_model
        else:
            model = estimator.weekend_model
        lines_reservations = self.get_lines_usage_for_day(prediction_date)
        y_pred = self.predict(model, estimator.columns, data_dict, estimator.time_steps_back, lines_reservations)
        self.export_prediction(y_pred, prediction_date, export_folder)