# C2Scrape.py
import requests
import json
import os
import shutil
import sys  # sys.exit on login failure
import time  # sleep while waiting for the job queue to empty
from datetime import datetime
from time import strftime, gmtime
from lxml import html  # HTML parsing
from multi_webbing import multi_webbing as mw
#TODO: os.path.join(), aws rds
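# Illustrative sketch of the C2config.json structure this script reads. The key names
# are taken from the config lookups throughout this file; the values shown are
# placeholders only and are not the project's actual defaults.
# {
#     "threads": 4, "write_buffer": 60, "use_cache": true, "max_ranking_tables": "",
#     "C2_login": false, "url_login": "...", "C2_username": "...", "C2_password": "...",
#     "get_profile_data": true, "get_extended_workout_data": false, "url_profile_base": "...",
#     "workouts_file": "...", "athletes_file": "...", "extended_file": "...",
#     "athletes_cache_file": "...", "extended_cache_file": "...",
#     "url_parameters": {"url_base": "...", "url_years": ["..."]},
#     "machine_parameters": {"<machine>": {"events": ["..."],
#                                          "query": {<exactly four keys, each mapping to a list of values>}}}
# }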
class Scraper():
"""Instance this to set up the scraper"""
def __init__(self, config_path):
self.config = self.load_config(config_path)
#initialise data structures and output files
self.data = Data(self.config)
if self.config["use_cache"] == True:
self.cache = Cache(self.config)
else:
self.cache=None
# initialize threads
self.threads = mw.MultiWebbing(self.config["threads"])
#use same session as threads, log in to the website
self.s = self.threads.session
if self.config["C2_login"]:
#TODO move loading of username password to environment vars rather than config file
C2_login(self.s, self.config["url_login"], self.config["C2_username"], self.config["C2_password"], "https://log.concept2.com/log")
# start threads
self.threads.start()
#generate urls to visit
self.ranking_pages = generate_ranking_pages(self.config, self.threads, self.data, self.cache)
self.num_ranking_pages = len(self.ranking_pages)
#check for override of maximum ranking tables
if self.config["max_ranking_tables"] != "":
self.num_ranking_pages = int(self.config["max_ranking_tables"])
self.ranking_page_count = 0 #counts the number of ranking table objects processed
self.queue_added = 0 #counts the total number of objects added to the queue
    def load_config(self, path):
        """Load configuration from a JSON file."""
        try:
            with open(path) as fo:
                return json.load(fo)
        except (OSError, json.JSONDecodeError):
            print("Could not open config file. Quitting")
            quit()
def scrape(self):
"""Call to start the scraper"""
#main loop for master process over each ranking table
for ranking_page in self.ranking_pages[0:self.num_ranking_pages]:
self.ranking_page_count += 1
self.queue_added = ranking_page.scrape(self.ranking_page_count, self.queue_added, self.num_ranking_pages)
print("Finished scraping ranking tables, waiting for profile threads to finish...")
# wait for queue to be empty, then join the threads
while not self.threads.job_queue.empty():
time.sleep(1)
print(f"Queue size: {str(self.threads.job_queue.qsize())}/{self.queue_added}")
self.data.files.write()
if self.cache != None:
self.cache.files.write()
        #write before thread join to minimise any data loss from failure
self.data.files.write(lock=self.threads.lock, force=True)
if self.cache != None:
self.cache.files.write(lock=self.threads.lock, force=True)
if self.threads.job_queue.empty():
#join threads
self.threads.finish()
#final write
self.data.files.write(force=True)
if self.cache != None:
self.cache.files.write(force=True)
print("Finished!")
class RankingPage():
"""Scrapes the raning pages of the logbook"""
def __init__(self, base_url, year, machine, event, config, threads, data, cache=None, query_parameters={}):
#query should be a dictionary of url query keys and values
self.machine = machine
self.base_url = base_url
self.year = year
self.event = event
self.query_parameters = query_parameters
self.url_parts = (base_url, year, machine, event)
self.url_string = self.get_url_string()
self.config = config
self.threads = threads
self.data = data
self.cache = cache
def get_url_string(self):
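        """Build the full ranking-table URL, appending any non-empty query parameters."""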
#construct url string
url_string = "/".join(map(str,self.url_parts)) + "?"
#construct url with query string
for key,val in self.query_parameters.items():
if (val != None and val != "") and (key != None and key != ""):
url_string = url_string + key + "=" + val + "&"
return url_string.strip("&")
def scrape(self, ranking_table_count, queue_added, num_ranking_tables):
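        """Scrape every page of this ranking table, storing workout rows and queueing profile/extended-workout jobs."""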
r = get_url(self.threads.session, self.url_string)
if r != None:
tree = html.fromstring(r.text)
pagination_block = tree.xpath('//div[@class="pagination-block"]')
if pagination_block != []:
page_a_tag = pagination_block[0].xpath('ul/li/a')
#find the second to last one
pages = int(page_a_tag[-2].text)
else:
#no pagination block, only one page
pages = 1
for page in range(1,pages+1):
#master process sub-loop over each page
url_string = self.url_string + "&page=" + str(page)
print(f"{get_str_ranking_table_progress(self.threads.job_queue.qsize(), queue_added, ranking_table_count, num_ranking_tables, page,pages)} | Getting ranking page: {url_string}")#, end="\r")
                if page > 1:
                    #don't get the first page again (if page is omitted, page 1 is loaded)
                    r = get_url(self.threads.session, url_string)
#master process checks each row, adds URLs to queue for threads to visit
if r != None:
tree = html.fromstring(r.text)
table_tree = tree.xpath('//html/body/div[2]/div/main/section[@class="content"]/table')
#get column headings for this page
if table_tree != []:
columns = table_tree[0].xpath('thead/tr/th')
column_headings = [column.text for column in columns]
rows_tree = table_tree[0].xpath('tbody/tr')
num_rows = len(rows_tree)
for row in range(0,num_rows):
row_tree = rows_tree[row]
if row_tree != []:
#get profile ID and workout ID
workout_info_link = row_tree.xpath('td/a')[0].attrib["href"]
profile_ID = None
workout_ID = None
#get profile and workout IDs from workout link
if workout_info_link.split("/")[-2] == "individual" or workout_info_link.split("/")[-2] == "race":
profile_ID = workout_info_link.split("/")[-1]
workout_ID = workout_info_link.split("/")[-3]
else:
workout_ID = workout_info_link.split("/")[-2]
#get workout data from row
self.data.workouts[workout_ID] = get_workout_data(row_tree, column_headings, self, profile_ID)
if self.config["get_profile_data"] and profile_ID != None:
#add athlete profile object to thread queue
self.threads.job_queue.put(mw.Job(get_athlete, self.config["url_profile_base"] + profile_ID, [self.data.athletes, self.cache.athletes, profile_ID]))
queue_added += 1
if self.config["get_extended_workout_data"]:
self.threads.job_queue.put(mw.Job(get_ext_workout, workout_info_link, [self.data.ext_workouts, self.cache.ext_workouts, workout_ID]))
queue_added += 1
#after each page, check to see if we should write to file
self.data.files.write(lock=self.threads.lock)
if self.cache != None:
self.cache.files.write(lock=self.threads.lock)
return queue_added
class Data():
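    """Holds the scraped athlete, workout and extended-workout dictionaries and their output files."""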
def __init__(self, config):
self.athletes = {}
self.workouts = {}
self.ext_workouts = {}
self.list = [self.athletes, self.workouts, self.ext_workouts]
self.files = DataFiles(config)
self.files.set_data(self)
class DataFiles():
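    """Handles backing up, initialising and writing the main output JSON files."""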
def __init__(self, config):
self.workouts = config["workouts_file"]
self.athletes = config["athletes_file"]
self.extended = config["extended_file"]
self.list = [self.workouts, self.athletes, self.extended]
self.timestamp_last_write = 0
        self.write_buffer = config["write_buffer"] #minimum number of seconds between writes
#backup previous output
self.backup_files()
self.init_files()
def set_data(self, data):
self.data = data
    def write(self, lock=None, force=False):
        """Dump each data dictionary to its JSON file, respecting the write buffer and optional lock."""
        if check_write_buffer(self.timestamp_last_write, self.write_buffer) or force:
            if lock != None:
                lock.acquire()
            for out_file, data in zip(self.list, self.data.list):
                try:
                    with open(out_file, "w") as fw:
                        fw.write(json.dumps(data, ensure_ascii=False))
                    print("Write complete: " + out_file)
                except Exception:
                    print("Write failed: " + out_file)
                    with open("log", "a+") as fl:
                        fl.write("Write failed: " + out_file + "\n")
            if lock != None:
                lock.release()
            self.timestamp_last_write = datetime.now().timestamp()
def backup_files(self):
for path in self.list:
if os.path.isfile(path):
try:
shutil.copyfile(path, path + "_backup")
                except OSError:
print("Could not back up: " + path)
    def init_files(self):
        """Create (or truncate) each output file so later writes always have a target."""
        for path in self.list:
            try:
                open(path, "w+").close()
            except OSError:
                print("Init failed: " + path)
                with open("log", "a+") as fl:
                    fl.write("Init failed: " + path + "\n")
        self.timestamp_last_write = datetime.now().timestamp()
class Cache():
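    """Loads previously scraped athlete and extended-workout data so they are not re-fetched."""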
def __init__(self, config):
self.files = CacheFiles(config)
try:
self.athletes = self.files.load(config["athletes_cache_file"])
self.ext_workouts = self.files.load(config["extended_cache_file"])
except FileNotFoundError:
print("Couldn't find cache files." )
self.athletes = {}
self.ext_workouts = {}
self.list = [self.athletes, self.ext_workouts]
self.files.set_cache(self)
class CacheFiles():
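    """Handles loading, backing up and writing the cache JSON files."""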
def __init__(self, config):
self.athletes = config["athletes_cache_file"]
self.extended = config["extended_cache_file"]
self.list = [self.athletes, self.extended]
self.timestamp_last_write = datetime.now().timestamp()
        self.write_buffer = config["write_buffer"] #minimum number of seconds between writes
#backup previous output
self.backup_files()
def set_cache(self, data):
self.cache = data
    def load(self, path):
        """Load a cache dictionary from a JSON file."""
        with open(path) as fo:
            cache = json.load(fo)
        print(f"Loaded cache file: {path}")
        return cache
    def write(self, lock=None, force=False):
        """Dump each cache dictionary to its JSON file, respecting the write buffer and optional lock."""
        if check_write_buffer(self.timestamp_last_write, self.write_buffer) or force:
            if lock != None:
                lock.acquire()
            for out_file, data in zip(self.list, self.cache.list):
                try:
                    with open(out_file, "w") as fw:
                        fw.write(json.dumps(data, ensure_ascii=False))
                    print("Write complete: " + out_file)
                except Exception:
                    print("Write failed: " + out_file)
                    with open("log", "a+") as fl:
                        fl.write("Write failed: " + out_file + "\n")
            self.timestamp_last_write = datetime.now().timestamp()
            if lock != None:
                lock.release()
def backup_files(self):
for path in self.list:
if os.path.isfile(path):
try:
shutil.copyfile(path, path + "_backup")
                except OSError:
print("Could not back up: " + path)
def get_url(session, url, exception_on_error = False):
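    """GET a URL with the given session; return the response on HTTP 200, otherwise return None or raise, depending on exception_on_error."""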
try:
r = session.get(url)
if r.status_code == 200:
return r
else:
if exception_on_error == False:
return None
else:
raise ValueError("A server error occured, status code: " + str(r.status_code))
except requests.exceptions.ConnectionError:
if exception_on_error == False:
return None
else:
raise ValueError("Could not access url: " + url)
def lists2dict(listkey, listval):
    """Takes two lists, uses the first as keys and the second as values, returns a dictionary."""
    return dict(zip(listkey, listval))
def generate_ranking_pages(config, threads, data, cache):
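    """Build a RankingPage object for every combination of year, machine, event and query parameters in the config."""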
machine_parameters = config["machine_parameters"]
url_years = config["url_parameters"]["url_years"]
url_base = config["url_parameters"]["url_base"]
    #Exactly four query parameters are supported per machine type. The keys can differ between
    #machine types, but all four must be present in the config; their value lists may be empty.
    #More could be supported by adding further nested for loops when constructing the query string.
    #TODO: to make this fully dynamic, use a recursive algorithm (e.g. a recursive generator)
    #generate URLs for scraping
    urls = []
for url_year in url_years:
for machine_type_key, machine_type_values in machine_parameters.items():
for url_event in machine_parameters[machine_type_key]["events"]:
param_keys=[]
for param_key,param_values in machine_type_values["query"].items():
#get all the parameter keys for this machine type
param_keys.append(param_key)
if len(param_values) == 0:
                        #safeguard against an empty list: the nested loops below would otherwise produce no URLs for this machine type
machine_type_values["query"][param_key] = [""]
#now iterate through them and construct the URL
for val0 in machine_parameters[machine_type_key]["query"][param_keys[0]]:
for val1 in machine_parameters[machine_type_key]["query"][param_keys[1]]:
for val2 in machine_parameters[machine_type_key]["query"][param_keys[2]]:
for val3 in machine_parameters[machine_type_key]["query"][param_keys[3]]:
query_parameters = lists2dict(param_keys,(val0,val1,val2,val3))
urls.append(RankingPage(url_base, url_year, machine_type_key, url_event, config, threads, data, cache, query_parameters))
return urls
def get_athlete_data(r):
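    """Parse an athlete profile page into a dictionary of profile fields, including an availability flag."""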
#r: requests object
tree = html.fromstring(r.text)
#profile labels that are contained in <a> tags
a_tag_labels = ["Affiliation:", "Team:"]
athlete_profile = {}
content = tree.xpath('//section[@class="content"]')
athlete_profile["name"] = content[0].xpath('h2')[0].text
athlete_profile_labels = content[0].xpath('p/strong')
#store as list
athlete_profile_labels = [label.text for label in athlete_profile_labels]
    #check whether the profile requires login or is otherwise restricted
    if "You must be <a href=\"/login\">logged in</a> to see this user\'s profile" in r.text:
        athlete_profile["availability"] = "logged in"
    elif "<div class=\"stats\">" in r.text:
        #stat boxes only appear when the profile is accessible
        athlete_profile["availability"] = "available"
    elif "This user's profile is only accessible to training partners." in r.text:
        athlete_profile["availability"] = "training partner"
    else:
        athlete_profile["availability"] = "private"
    #profile values are not wrapped in their own tags, so they are pulled from the text following each label
for profile_label in athlete_profile_labels:
#cycle through each profile label and search for the matching value
if profile_label in a_tag_labels:
profile_value = content[0].xpath('p/strong[contains(text(), "' + profile_label +'")]/following-sibling::a/text()')
else:
profile_value = content[0].xpath('p/strong[contains(text(), "' + profile_label +'")]/following-sibling::text()[1]')
#clean up
profile_label = profile_label.strip(":").lower()
#add to profile dictionary
athlete_profile[profile_label] = profile_value[0].strip(" ")
return athlete_profile
def get_workout_data(row_tree, column_headings, ranking_table, profile_ID):
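    """Turn one ranking-table row into a workout dictionary keyed by the table's column headings."""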
workout_data = []
row_data_tree = row_tree.xpath('td | td/a')
    del row_data_tree[1] #hacky: removes the extra element produced by the <a> tag inside the name cell
row_list = [x.text for x in row_data_tree]
workout_data = lists2dict(map(str.lower, column_headings),row_list)
workout_data["year"] = ranking_table.year
workout_data["machine"] = ranking_table.machine
workout_data["event"] = ranking_table.event
workout_data["retrieved"] = strftime("%d-%m-%Y %H:%M:%S", gmtime())
workout_data["profile_id"] = profile_ID
for key, val in ranking_table.query_parameters.items():
workout_data[key]=val
return workout_data
def get_ext_workout_data(r):
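    """Parse an extended workout page into a dictionary of labelled values."""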
#r: requests object
tree = html.fromstring(r.text)
label_tree = tree.xpath('/html/body/div/div/div[1]/strong')
data_labels = [label.text for label in label_tree]
profile = {}
for data_label in data_labels:
value = tree.xpath(f'/html/body/div/div/div[1]/strong[contains(text(), "{data_label}")]/following-sibling::text()[1]')
label = data_label.strip(":").lower()
profile[label] = value[0]
return profile
def get_str_ranking_table_progress(queue_size, queue_added, ranking_url_count, num_ranking_urls, page, pages):
    """Format a one-line progress summary for console output."""
    return f"Queue size: {queue_size}/{queue_added} | Ranking Table: {ranking_url_count}/{num_ranking_urls} | Page: {page}/{pages}"
def check_write_buffer(timestamp_last_write, write_buffer):
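    """Return True if more than write_buffer seconds have passed since the last write."""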
return datetime.now().timestamp() > timestamp_last_write + write_buffer
def C2_login(session, url_login, username, password, url_login_success):
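    """Log in to the Concept2 logbook, reusing the hidden CSRF token from the login form; returns the session."""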
login = session.get(url_login)
login_tree = html.fromstring(login.text)
hidden_inputs = login_tree.xpath(r'//form//input[@type="hidden"]')
form = {x.attrib["name"]: x.attrib["value"] for x in hidden_inputs} #get csrf token
form['username'] = username
form['password'] = password
response = session.post(url_login, data=form)
    if response.url != url_login_success: #check that we were redirected to the expected page
        sys.exit("Unable to log in to the logbook, quitting.")
    else:
        print("Login successful")
return session
def get_athlete(job):
    #TODO these job functions fail fairly silently (error prints get swallowed by other console output) on a non-200 response code or a connection error
#function executed by thread, updates cache and data dictionary
job_data = {}
athletes = job.custom_data[0]
cache = job.custom_data[1]
profile_id = job.custom_data[2]
#check if already in data dictionary, if so, do nothing
if profile_id not in athletes.keys():
#check if in cache.
if profile_id in cache.keys():
job_data = cache[profile_id]#retrieve from cache
else:
get_url_success = job.get_url() #get the URL
if get_url_success:
                if job.request.status_code == 200: #check that the URL was received OK
job_data = get_athlete_data(job.request)
job_data["retrieved"] = strftime("%d-%m-%Y %H:%M:%S", gmtime())
job.lock.acquire()
cache.update({profile_id:job_data}) #cache
job.lock.release()
else:
print(f"There was a problem with {job.url}, status code: {job.request.status_code}")
if job_data != {}:
            job.lock.acquire() #dict.update is thread safe, but other functions used elsewhere (e.g. json.dumps) may not be, so lock here
athletes.update({profile_id:job_data}) #main data
job.lock.release()
def get_ext_workout(job):
#function executed by thread, updates cache and data dictionary
job_data = {}
ext_workouts = job.custom_data[0]
cache = job.custom_data[1]
workout_id = job.custom_data[2]
#check if already in data dictionary, if so, do nothing
if workout_id not in ext_workouts.keys():
#check if in cache.
if workout_id in cache.keys():
job_data = cache[workout_id]#retrieve from cache
else:
get_url_success = job.get_url() #get the URL
if get_url_success:
                if job.request.status_code == 200: #check that the URL was received OK
job_data = get_ext_workout_data(job.request)
job_data["retrieved"] = strftime("%d-%m-%Y %H:%M:%S", gmtime())
                    job.lock.acquire() #dict.update is thread safe, but other functions used elsewhere (e.g. json.dumps) may not be, so lock here
cache.update({workout_id:job_data}) #cache
job.lock.release()
else:
print(f"There was a problem with {job.url}, status code: {job.request.status_code}")
if job_data != {}:
            job.lock.acquire() #dict.update is thread safe, but other functions used elsewhere (e.g. json.dumps) may not be, so lock here
ext_workouts.update({workout_id:job_data}) #main data
job.lock.release()
if __name__ == "__main__":
scraper = Scraper("C2config.json")
scraper.scrape()