-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhelpers.py
400 lines (344 loc) · 21.2 KB
/
helpers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
import tabula
import numpy as np
import pandas as pd
# When importing schedules from PDF files, they might contain invalid characters or other formatting issues, this function does the cleanup.
def cleanup_trainschedule(df):
    """Normalise the cells of a schedule table into ``H:MM``-style time strings.

    Every cell is converted to text, ``.`` is turned into ``:``, every
    character that is neither a digit nor ``:`` is removed, and a time that
    ends in a single digit after the colon gets a trailing ``0``
    (``7:5`` -> ``7:50``). Cells that end up empty (for example ``nan``
    strings, dashes or free text) become NaN.

    Args:
        df: DataFrame with the raw schedule cells.

    Returns:
        A new DataFrame with the cleaned strings; ``df`` is not modified.
    """
    df = df.astype(str)
    # Raw string: "\." is an invalid escape sequence in a normal literal.
    df = df.replace(r'\.', ':', regex=True)
    for col in df.columns:
        # Keep only digits and ":"; placeholders such as "–" or "nan"
        # collapse to an empty string here.
        df[col] = df[col].str.replace('[^0-9:]', '', regex=True)
        # Restore a trailing zero dropped by the import ("12:5" -> "12:50").
        df[col] = df[col].str.replace(r'(:\d)$', r'\g<1>0', regex=True)
    # np.nan instead of np.NaN: the latter alias was removed in NumPy 2.0.
    df = df.replace('', np.nan)
    return df
# Time-string converters: hora_a_minuts turns hh:mm into minutes; hora_a_segons turns hh:mm or hh:mm:ss into seconds
def hora_a_minuts(hora):
    """Convert an ``hh:mm`` string into the number of minutes since 00:00.

    Raises ValueError when ``hora`` does not have exactly two
    colon-separated integer fields.
    """
    hours, minutes = (int(field) for field in hora.split(":"))
    return 60 * hours + minutes
def hora_a_segons(hora):
    """Convert an ``hh:mm`` or ``hh:mm:ss`` string into seconds since 00:00.

    When the seconds field is missing it counts as zero. Raises ValueError
    for any other number of colon-separated fields or non-integer fields.
    """
    fields = hora.split(":")
    if len(fields) == 2:
        # hh:mm form: supply the missing seconds field.
        fields.append("0")
    hours, minutes, seconds = fields
    return 3600 * int(hours) + 60 * int(minutes) + int(seconds)
# Some schedules contain hours beyond 23:59 (24, 25, 26, ...); wrap them back to 00, 01, 02, ... and return the time as hh:mm:ss
def convert_24_to_00(time):
    """Return ``time`` as ``hh:mm:ss`` with hours of 24 or more wrapped modulo 24.

    Accepts ``hh:mm`` (seconds taken as zero) or ``hh:mm:ss``.
    """
    fields = time.split(":")
    if len(fields) == 2:
        fields.append("0")
    raw_hours, raw_minutes, raw_seconds = fields
    total = int(raw_hours) * 3600 + int(raw_minutes) * 60 + int(raw_seconds)

    # Re-split the total so minute/second overflow is carried as well.
    hh, leftover = divmod(total, 3600)
    mm, ss = divmod(leftover, 60)
    if hh >= 24:
        hh %= 24
    return f"{hh:02d}:{mm:02d}:{ss:02d}"
# Main function that returns the position of a train according to a schedule.
def busca_estacions_multiday(df, time, row, inverse=False):
    """Return the position of one train along its row of stops at ``time``.

    The row ``df.iloc[row]`` holds one stop time per column (``hh:mm`` or
    ``hh:mm:ss`` strings, NaN where the train does not stop). A trip whose
    last stop has an earlier clock time than its first stop is treated as
    running past midnight: clock times up to and including the last stop's
    time are then taken to belong to the following day.

    Args:
        df: schedule DataFrame, one train per row.
        time: clock time to look up, ``hh:mm`` or ``hh:mm:ss``.
        row: positional index of the train within ``df``.
        inverse: when true, the position is mirrored, i.e. returned as
            ``(number of columns - 1) - position``.

    Returns:
        ``-1`` when the train is not running at ``time`` (or the row has no
        stop times at all). Otherwise the column position of the train: the
        integer column index when it is exactly at a stop, or the index of
        the previous stop plus the elapsed fraction of the leg (scaled by
        the number of columns the leg spans) when it is between stops.
    """
    seconds_per_day = 86400
    train = df.iloc[row]

    # A row without any stop time cannot be circulating.
    first_label = train.first_valid_index()
    if first_label is None:
        return -1

    # Times of the first and last stops of the trip.
    firsttrain = train.loc[first_label]
    lasttrain = train.loc[train.last_valid_index()]
    seg_firsttrain = hora_a_segons(firsttrain)
    last_clock = hora_a_segons(lasttrain)

    # The trip ends on the next day when the last stop is "earlier" than the first.
    multiday = last_clock < seg_firsttrain

    def absolute_seconds(clock):
        # Seconds since 00:00 of the departure day. On an overnight trip every
        # clock time up to AND including the last stop's time is on the next
        # day (the original strict "<" left the last stop on the first day,
        # which broke the search and interpolation on the final leg).
        seconds = hora_a_segons(clock)
        if multiday and seconds <= last_clock:
            seconds += seconds_per_day
        return seconds

    time_seconds = absolute_seconds(time)
    seg_lasttrain = absolute_seconds(lasttrain)

    # The train only has a position between its first stop (inclusive) and
    # its last stop (exclusive).
    if not (seg_firsttrain <= time_seconds < seg_lasttrain):
        return -1  # That trip is not circulating right now

    print(f"Train {row} at {time} is circulating from {firsttrain} to {lasttrain}")
    if multiday:
        print("Trip is multiday")

    # Find the next stop (i): the first stop whose time is not earlier than
    # the requested time. Positional access goes through .iloc because
    # integer keys in Series[...] are label-based on a label index.
    for i in range(len(train)):
        stop = train.iloc[i]
        if pd.isna(stop):
            continue  # the train does not stop at this station
        time_at_stop = absolute_seconds(stop)
        print(f"Time at stop {i}, {time_at_stop}")
        print(f"Current time, {time_seconds}")
        if time_at_stop >= time_seconds:
            print("Ok, this one")
            break

    if time_at_stop == time_seconds:
        # The train is standing exactly at station i.
        print(f"El tren {row} està aturat a l'estació de {train.index[i]} ({i}) a les {time}.")
        position = i
    else:
        # The train is between two stops. Walk back over skipped stations
        # (NaN cells) to find the last station where it actually stopped.
        j = 1
        while pd.isna(train.iloc[i - j]):
            j += 1
        previousstop = train.iloc[i - j]
        time_at_previous_stop = absolute_seconds(previousstop)
        print("time at stop", time_at_stop)
        print("time at previous stop", time_at_previous_stop)
        print("previousstop", previousstop)
        print("segons:", time_seconds)

        secondsdifference = time_at_stop - time_at_previous_stop
        print("seconds difference:",secondsdifference)
        if secondsdifference == 0:
            # Guard against a division by zero on an inconsistent schedule.
            secondsdifference = 1
            print("WARNING: THERE IS SOMETHING WRONG WITH THE SCHEDULE.")
        secondscurrent = time_seconds - time_at_previous_stop
        print("seconds current:", secondscurrent)

        # Fraction of the leg already covered, spread over the j columns it spans.
        routepercent = round(secondscurrent / secondsdifference, 3)
        print("route %:",routepercent)
        position = (i - j) + routepercent * j
        print(f"El tren {row} està entre les estacions de {train.index[i-j]} i {train.index[i]} ({position}) a les {time}.")

    if inverse:
        # Mirror the position: (number of stations - 1) - position.
        return (len(train) - 1) - position
    return position
# Runs busca_estacions_multiday() for every row (train) of the schedule and collects the positions of the trains that are circulating
def find_alltrains(df, time, inverse=False):
    """Return the positions of every train in ``df`` that is running at ``time``.

    Calls ``busca_estacions_multiday`` once per row and drops the ``-1``
    results it returns for trains that are not circulating.

    Args:
        df: schedule DataFrame, one train per row.
        time: clock time to look up, ``hh:mm`` or ``hh:mm:ss``.
        inverse: forwarded to ``busca_estacions_multiday``.

    Returns:
        List with one position per circulating train, in row order.
    """
    # The former "insert random NaNs" scaffolding (df.sample(frac=0.0)) was a
    # no-op that still wrote into the caller's DataFrame; it is gone.
    positions = (
        busca_estacions_multiday(df, time, row, inverse)
        for row in range(len(df))
    )
    # Remove non-valid positions (-1).
    return [position for position in positions if position != -1]
# Filters out trains whose first stop is not between two hours
from datetime import datetime, timedelta
import pandas as pd
def filter_interval(df, beginning, end):
    """Keep the trains whose first-column time lies between ``beginning`` and ``end``.

    Rows with no time in the first column are placed in the interval using
    the time of the closest previous row that has one (the rows are assumed
    to be sorted by time); leading rows without a time use the first known
    time minus one minute. These stand-in times are only used for filtering
    and ordering: the returned rows keep their original cells and ``df`` is
    never modified.

    Times whose hour is below 4 are treated as belonging to the following
    day, and ``end`` is moved to the following day when it is not later
    than ``beginning``.

    NOTE(review): because ``beginning``/``end`` are not given the same
    "before 04:00 means next day" treatment, an interval that starts before
    04:00 looks like it cannot match early-morning trains - confirm whether
    callers ever pass one.

    Args:
        df: schedule DataFrame; the first column holds ``HH:MM`` strings or NaN.
        beginning: start of the interval, ``HH:MM``.
        end: end of the interval, ``HH:MM`` (inclusive).

    Returns:
        A new DataFrame with the matching rows, ordered by the clock time
        used for their first column.
    """
    start_times = df.iloc[:, 0].copy()
    known_times = start_times.dropna()
    if known_times.empty:
        # No row has a usable time, so nothing can fall inside the interval.
        return df.iloc[0:0].copy()

    # Leading rows without a time: use the first known time minus one minute.
    if pd.isna(start_times.iloc[0]):
        stand_in = datetime.strptime(known_times.iloc[0], '%H:%M') - timedelta(minutes=1)
        start_times.iloc[0] = stand_in.strftime('%H:%M')
    # Every other gap inherits the previous known time. Done after fixing
    # row 0 so that several leading gaps are filled as well.
    start_times = start_times.ffill()

    beginning_time = datetime.strptime(beginning, '%H:%M')
    end_time = datetime.strptime(end, '%H:%M')
    # If the end time is not later than the beginning, it refers to the next day.
    if end_time <= beginning_time:
        end_time += timedelta(days=1)

    def to_service_datetime(value):
        # Hours before 04:00 are assumed to refer to the next day.
        moment = datetime.strptime(value, '%H:%M')
        if moment.hour < 4:
            moment += timedelta(days=1)
        return moment

    times = start_times.apply(to_service_datetime)
    mask = ((times >= beginning_time) & (times <= end_time)).to_numpy()

    # Order the selected rows by plain clock time (as before, without the
    # next-day shift); a stable sort keeps the input order on ties.
    clock_times = pd.to_datetime(start_times[mask], format='%H:%M')
    order = np.argsort(clock_times.to_numpy(), kind="stable")
    return df.loc[mask].iloc[order]
# Controls the cases when, if a time goes from 23:50 to 23:02, correct to 00:02
def fix_time_discontinuity(df):
    """Push the hour forward wherever the minutes of a row run backwards.

    Each row is scanned left to right, skipping NaN cells. Whenever the
    minute value is lower than that of the previous non-NaN cell, one more
    hour is carried; from the second non-NaN cell on, the carried hours are
    added to the cell's hour (modulo 24), e.g. ``23:50, 23:02`` becomes
    ``23:50, 00:02``. Every non-NaN cell is rewritten as ``HH:MM``.

    The DataFrame is modified in place and also returned.
    """
    row_count, column_count = df.shape
    for r in range(row_count):
        last_minute = None  # minutes of the previous non-NaN cell in this row
        carried_hours = 0
        for c in range(column_count):
            cell = df.iloc[r, c]
            if pd.isna(cell):
                continue
            hour, minute = (int(field) for field in cell.split(":"))
            if last_minute is not None:
                if minute < last_minute:
                    # Minutes went backwards: the clock rolled into the next hour.
                    carried_hours += 1
                hour = (hour + carried_hours) % 24
            last_minute = minute
            df.iloc[r, c] = f"{hour:02d}:{minute:02d}"
    return df
# For those schedules that only show the minutes, add the hours (h_values) (e.g. S2 and S6 schedules obtained through tabula)
def generate_hours(df, h_values):
    """Expand a minutes-only schedule into one full schedule per hour.

    For every hour ``h`` in ``h_values`` the text ``0.`` in each cell is
    replaced by ``h:`` (so ``0.05`` becomes ``h:05``) and the result is
    passed through ``cleanup_trainschedule``. The per-hour tables are
    stacked vertically in the order of ``h_values``.

    Args:
        df: DataFrame whose cells carry the minutes as ``0.MM`` values.
        h_values: iterable of hour values to generate.

    Returns:
        A new DataFrame with ``len(df)`` rows per hour and a fresh
        0..n-1 index.
    """
    # The text form does not depend on the hour, so build it once.
    as_text = df.astype(str)
    dfs = [
        # Raw string: "\." is an invalid escape sequence in a normal literal.
        # cleanup_trainschedule also restores dropped trailing zeros (12:5 -> 12:50).
        cleanup_trainschedule(as_text.replace(r'0\.', f'{h}:', regex=True))
        for h in h_values
    ]
    return pd.concat(dfs, axis=0, ignore_index=True)
# Fixes the station names returned by GTFS and adds any missing intermediate station
from fuzzywuzzy import fuzz
def fix_stationnames(df, route):
    """Rename the schedule columns to the canonical station names of ``route``.

    A fixed table of special cases is applied first. Every column name is
    then replaced by its closest entry in ``stations_dict[route]`` when the
    fuzzy similarity ratio is at least 70, and kept unchanged otherwise.
    If the renamed columns are all distinct, the table is concatenated
    after an empty frame that carries the canonical columns, so stations
    missing from ``df`` show up as additional columns; otherwise a message
    is printed and the renamed table is returned as is.

    Works on a copy; ``df`` itself is not modified.
    """
    print("Fixing station names")
    df = df.copy()
    print(route)
    print(df.shape,df.columns)

    def match_columns(column_name, column_names):
        # Closest canonical name, accepted only when it is similar enough;
        # otherwise the original name is kept.
        best_match = max(column_names, key=lambda candidate: fuzz.ratio(column_name, candidate))
        if fuzz.ratio(column_name, best_match) < 70:
            return column_name
        return best_match

    # Names that are renamed explicitly before the fuzzy matching.
    special_cases = {
        "Cabrera De Mar": "Cabrera de Mar-Vilassar de Mar",
        "La Tour De Carol": "Latour-de-Carol-Enveig",
        "Els Hostalets De Balenyà": "Balenyà-Els Hostalets",
        "Balenyà": "Balenyà-Tona-Seva",
        "Barcelona-Torre Del Baró": "Torre del Baró-Vallbona",
        "Sant Andreu Arenal": "Barcelona St.Andreu Arenal",
        "Barcelona-Fabra I Puig": "Barcelona St.Andreu Arenal",
        "Montserrat-Aeri": "Aeri de Montserrat",
        "Pl. Catalunya": "Barcelona - Pl. Catalunya",
        "Martorell Vila": "Martorell Vila | Castellbisbal",
        "El Prat Aeroport": "Aeroport",
        "Terrassa Estacio Nord": "Terrassa Estació del Nord",
    }
    df = df.rename(columns=special_cases)

    # Fuzzy-match whatever is still misspelled against the canonical names.
    df = df.rename(columns=lambda name: match_columns(name, stations_dict[route]))
    _df = pd.DataFrame(columns=stations_dict[route])
    print(df.shape, df.columns)
    print(_df.shape, _df.columns)

    # Two source columns mapped to the same station mean the matching failed.
    if len(df.columns) != len(set(df.columns)):
        print("Duplicated columns, matching didn't work")
    else:
        df = pd.concat([_df, df], join='outer')
    return df
# Usage:
# df = fix_stationnames(df, route)
# Check if the columns of a schedule are reversed horizontally, and returns the df in the correct order
def check_df_needsreversing(df):
    """Return ``df`` with its columns in reverse order if any row runs backwards.

    Within each row the non-NaN values are compared pairwise in column
    order with ``<``; the first row in which a value is smaller than the
    one before it causes the column-reversed table to be returned.
    Otherwise ``df`` itself is returned unchanged.
    """
    for row in df.itertuples(index=False):
        present = [value for value in row if not pd.isna(value)]
        runs_backwards = any(
            later < earlier for earlier, later in zip(present, present[1:])
        )
        if runs_backwards:
            return df[df.columns[::-1]]
    return df
# Usage: df = check_df_needsreversing(df)
# Data imported from gtfs might not include all columns (stops/stations).
# We might need to standarize the column names before passing them to the map
# Using dict, perhaps?
# Canonical station names per route: maps a route identifier ("R1", "R2", ...,
# "S1", "S2") to a list of station names. fix_stationnames() uses the list of
# a route both as the target names for its fuzzy column matching and as the
# columns of the frame it aligns the schedule with.
# NOTE(review): the lists look like they follow the order of the stations along
# each line - confirm before relying on it.
# NOTE(review): the same station is spelled differently across routes (e.g.
# "Barcelona - Sants", "Barcelona-Sants", "Barcelona Sants"), so names are only
# comparable within one route.
stations_dict = {
"R1": ["L'Hospitalet de Llobregat", "Barcelona - Sants", "Barcelona - Plaça de Catalunya",
"Barcelona - Arc de Triomf", "Barcelona - El Clot Aragó", "St. Adrià de Besòs",
"Badalona", "Montgat", "Montgat Nord", "El Masnou", "Ocata", "Premià de Mar", "Vilassar de Mar",
"Cabrera de Mar-Vilassar de Mar", "Mataró", "St. Andreu de Llavaneres", "Caldes d'Estrac",
"Arenys de Mar", "Canet de Mar", "St. Pol de Mar","Calella","Pineda de Mar","Santa Susanna",
"Malgrat de Mar","Blanes","Tordera","Maçanet-Massanes"],
"R2": ["Maçanet-Massanes", "Hostalric", "Riells i Viabrea-Breda", "Gualba", "Sant Celoni",
"Palautordera", "Llinars del Vallès", "Cardedeu", "Les Franqueses del Vallès-Granollers Nord",
"Granollers Centre", "Montmeló", "Mollet-Sant Fost", "La Llagosta", "Montcada i Reixac",
"Barcelona - Sant Andreu", "Barcelona-El Clot-Aragó", "Barcelona - Estació de França",
"Barcelona-Passeig de Gràcia", "Barcelona-Sants", "Bellvitge | Gornal", "El Prat de Llobregat",
"Aeroport", "Viladecans", "Gavà", "Castelldefels", "Platja de Castelldefels", "Garraf",
"Sitges", "Vilanova i la Geltrú", "Cubelles", "Cunit", "Segur de Calafell", "Calafell", "Sant Vicenç de Calders"],
"R2N": ["Maçanet-Massanes", "Hostalric", "Riells i Viabrea-Breda", "Gualba", "Sant Celoni",
"Palautordera", "Llinars del Vallès", "Cardedeu", "Les Franqueses del Vallès-Granollers Nord",
"Granollers Centre", "Montmeló", "Mollet-Sant Fost", "La Llagosta", "Montcada i Reixac",
"Barcelona - Sant Andreu", "Barcelona-El Clot-Aragó", "Barcelona-Passeig de Gràcia",
"Barcelona-Sants", "Bellvitge | Gornal", "El Prat de Llobregat", "Aeroport"],
"R2 Centre": ["Granollers Centre", "Montmeló", "Mollet-Sant Fost", "La Llagosta", "Montcada i Reixac",
"Barcelona - Sant Andreu", "Barcelona-El Clot-Aragó", "Barcelona-Passeig de Gràcia",
"Barcelona-Sants", "Bellvitge | Gornal", "El Prat de Llobregat", "Viladecans", "Gavà", "Castelldefels"],
"R2S": ["Barcelona - Estació de França", "Barcelona-Passeig de Gràcia", "Barcelona-Sants",
"Bellvitge | Gornal", "El Prat de Llobregat", "Viladecans", "Gavà", "Castelldefels", "Platja de Castelldefels",
"Garraf", "Sitges", "Vilanova i la Geltrú", "Cubelles", "Cunit", "Segur de Calafell", "Calafell",
"Sant Vicenç de Calders"],
"R3": ["L'Hospitalet de Llobregat", "Barcelona-Sants", "Barcelona-Plaça Catalunya", "Barcelona-Arc de Triomf",
"Barcelona-La Sagrera-Meridiana", "Sant Andreu Arenal", "Torre del Baró-Vallbona", "Montcada Bifurcació",
"Montcada Ripollet", "Santa Perpètua de Mogoda-La Florida", "Mollet-Santa Rosa", "Parets del Vallès",
"Granollers-Canovelles", "Les Franqueses del Vallès", "La Garriga", "Figaró", "Sant Martí de Centelles",
"Centelles", "Balenyà-Els Hostalets", "Balenyà-Tona-Seva", "Vic", "Manlleu", "Torelló", "Borgonyà",
"Sant Quirze de Besora", "La Farga de Bebié", "Ripoll", "Campdevànol", "Ribes de Freser", "Planoles",
"Toses", "La Molina", "Urtx-Alp", "Puigcerdà","Latour-de-Carol-Enveig"],
"R4": ["St. Vicenç de Calders", "El Vendrell", "L'Arboç", "Els Monjos", "Vilafranca del Penedès",
"La Granada", "Lavern-Subirats", "St. Sadurní d'Anoia", "Gelida", "Martorell Central", "Castellbisbal",
"El Papiol","Molins de Rei","St. Feliu de Llobregat","St. Joan Despí","Cornellà","L'Hospitalet de Llobregat",
"Barcelona Sants","Barcelona Plaça de Catalunya","Barcelona Arc de Triomf","Barcelona La Sagrera-Meridiana",
"Barcelona St.Andreu Arenal","Torre del Baró-Vallbona","Montcada Bifurcació","Montcada i Reixac-Manresa",
"Montcada i Reixac-Sta. Maria","Cerdanyola del Vallès","Barberà del Vallès","Sabadell Sud","Sabadell Centre",
"Sabadell Nord","Terrassa Est" ,"Terrassa Estació del Nord" ,"St. Miquel de Gonteres" ,"Viladecavalls" ,"Vacarisses-Torreblanca",
"Vacarisses" ,"Castellbell i el Vilar-Monistrol de Montserrat" ,"St. Vicenç de Castellet" ,"Manresa"],
"R5": ['Barcelona - Plaça Espanya', 'Magòria-La Campana', 'Ildefons Cerdà', 'Europa | Fira', 'Gornal',
'Sant Josep', 'L’Hospitalet-Av. Carrilet', 'Almeda', 'Cornellà Riera', 'Sant Boi',
'Molí Nou-Ciutat Cooperativa', 'Colònia Güell', 'Santa Coloma de Cervelló', 'Sant Vicenç dels Horts',
'Can Ros', 'Quatre Camins', 'Pallejà', 'Sant Andreu de la Barca', 'El Palau',
'Martorell Vila | Castellbisbal', 'Martorell Central', 'Martorell Enllaç', 'Abrera',
'Olesa de Montserrat', 'Aeri de Montserrat', 'Monistrol de Montserrat', 'Castellbell i el Vilar',
'Sant Vicenç | CastellGalí', 'Manresa-Viladordis', 'Manresa-Alta', 'Manresa-Baixador'],
"R6": ['Barcelona - Plaça Espanya', 'Magòria-La Campana', 'Ildefons Cerdà', 'Europa | Fira', 'Gornal', 'Sant Josep',
'L’Hospitalet-Av. Carrilet', 'Almeda', 'Cornellà Riera', 'Sant Boi', 'Molí Nou-Ciutat Cooperativa',
'Colònia Güell', 'Santa Coloma de Cervelló', 'Sant Vicenç dels Horts', 'Can Ros', 'Quatre Camins',
'Pallejà', 'Sant Andreu de la Barca', 'El Palau', 'Martorell Vila | Castellbisbal', 'Martorell Central',
'Martorell Enllaç', 'Sant Esteve Sesrovires', 'La Beguda', 'Can Parellada', 'Masquefa', 'Piera',
'Vallbona d’Anoia', 'Capellades', 'La Pobla de Claramunt', 'Vilanova del Camí', 'Igualada'],
"R7": ["Molins de Rei","St. Feliu de Llobregat","St. Joan Despí","Cornellà","L'Hospitalet de Llobregat",
"Barcelona Sants","Barcelona Plaça de Catalunya","Barcelona Arc de Triomf","Barcelona La Sagrera-Meridiana",
"Barcelona St.Andreu Arenal",'Torre del Baró-Vallbona','Montcada Bifurcació','Montcada i Reixac-Manresa',
'Montcada i Reixac-Sta. Maria', 'Cerdanyola del Vallès','Cerdanyola Universitat'],
"R8": ['Martorell Central', 'Castellbisbal', 'Rubi Can Vallhonrat', 'Sant Cugat Coll Favà', 'Cerdanyola Universitat',
'Santa Perpètua de Mogoda - Riera de caldes', 'Mollet - Sant Fost', 'Montmeló', 'Granollers Centre'],
"S1": ['Barcelona - Pl. Catalunya', 'Provença', 'Gràcia', 'Sant Gervasi', 'Muntaner', 'La Bonanova',
'Les Tres Torres', 'Sarrià', 'Peu del Funicular', 'Baixador de Vallvidrera', 'Les Planes', 'La Floresta',
'Valldoreix', 'Sant Cugat Centre', 'Mira-sol', 'Hospital General', 'Rubí Centre', 'Les Fonts', 'Terrassa Rambla',
'Vallparadís Universitat', 'Terrassa Estació del Nord', 'Terrassa Nacions Unides'],
"S2": ['Barcelona - Pl. Catalunya', 'Provença', 'Gràcia', 'Sant Gervasi', 'Muntaner', 'La Bonanova',
'Les Tres Torres', 'Sarrià', 'Peu del Funicular', 'Baixador de Vallvidrera', 'Les Planes', 'La Floresta',
'Valldoreix', 'Sant Cugat Centre', 'Volpelleres', 'Sant Joan', 'Bellaterra', 'Universitat Autònoma', 'Sant Quirze',
'Can Feu | Gràcia', 'Sabadell Plaça Major', 'La Creu Alta', 'Sabadell Nord', 'Sabadell Parc del Nord']
}
# Some keys are redundant, we can generate them dynamically:
# 'R50' and 'R60' refer to the very same list objects as 'R5' and 'R6'.
stations_dict['R50'] = stations_dict['R5']
stations_dict['R60'] = stations_dict['R6']
# The S routes are copies of the leading stations of the 'R5' list.
stations_dict['S3'] = stations_dict['R5'][:15]
stations_dict['S4'] = stations_dict['R5'][:24]
stations_dict['S8'] = stations_dict['R5'][:22]
stations_dict['S9'] = stations_dict['R5'][:16]