-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfunc_read_vaccines_web.py
executable file
·73 lines (60 loc) · 2.88 KB
/
func_read_vaccines_web.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import os
from selenium import webdriver
# Path to the chromedriver executable: a copy under ./tools on Windows
# (os.name == 'nt'), the /usr/local/bin install on every other platform.
chromedriver_dir = (
    './tools/chromedriver.exe'
    if os.name == 'nt'
    else '/usr/local/bin/chromedriver'
)

# Page that the vaccination statistics are scraped from.
vaccines_url = 'https://coronavirus.bg/bg/statistika'

# Province codes. get_vaccines_data_web attaches them to the scraped rows
# purely by position, so the order here has to match the row order of the
# table on the page.
codes_vaccines = [
    'BLG', 'BGS', 'VAR', 'VTR', 'VID', 'VRC', 'GAB',
    'DOB', 'KRZ', 'KNL', 'LOV', 'MON', 'PAZ', 'PER',
    'PVN', 'PDV', 'RAZ', 'RSE', 'SLS', 'SLV', 'SML',
    'SFO', 'SOF', 'SZR', 'TGV', 'HKV', 'SHU', 'JAM',
]
def _parse_vaccines_row(row_text):
    """Convert one text row of the statistics table into a list of values.

    The row is expected to hold a province name (one or more words)
    followed by eight space-separated counts.

    Args:
        row_text: A single line of text taken from the scraped table.

    Returns:
        A list ``[province, total, new_pfizer, new_astrazeneca, new_moderna,
        new_johnson, second_dose, booster_jab, booster_jab2]`` in which every
        count has been converted to ``int``.

    Raises:
        ValueError: If the row has fewer than nine tokens or one of the
            counts is not an integer.
    """
    counts_per_row = 8

    # '-' is replaced by '0' before parsing (presumably the page's marker for
    # "no data" -- confirm against the site). The replacement also touches
    # the province name, which is harmless because the caller discards it.
    tokens = row_text.replace('-', '0').split(' ')
    if len(tokens) <= counts_per_row:
        raise ValueError(f'unexpected vaccines table row: {row_text!r}')

    # The trailing tokens are the counts; everything in front of them is the
    # province name, so names made of any number of words are supported.
    province = ' '.join(tokens[:-counts_per_row])
    counts = [int(token) for token in tokens[-counts_per_row:]]

    # The third and fourth counts are emitted in swapped order so that they
    # land in the 'new_astrazeneca' and 'new_moderna' columns respectively
    # (presumably the page lists Moderna before AstraZeneca -- confirm).
    counts[2], counts[3] = counts[3], counts[2]
    return [province] + counts


def get_vaccines_data_web(url, chromedriver_dir):
    """Scrape the per-province vaccination table and return it as a DataFrame.

    A headless Chrome session loads ``url``, reads the text of the
    ``<div class="col stats">`` element, drops its first and last five text
    lines (headers and footer) and parses every remaining line as one
    province row. The browser is always shut down before the data is parsed,
    even if loading the page or locating the element fails.

    Args:
        url: Address of the statistics page.
        chromedriver_dir: Path to the chromedriver executable.

    Returns:
        A pandas DataFrame with the columns ``code``, ``total``,
        ``new_pfizer``, ``new_astrazeneca``, ``new_moderna``, ``new_johnson``,
        ``second_dose``, ``booster_jab`` and ``booster_jab2``. ``code`` comes
        from the module-level ``codes_vaccines`` list and is attached by row
        position; if the number of scraped rows differs from the number of
        codes, the shorter side is padded with NaN.

    Raises:
        ValueError: If a table row cannot be parsed.
    """
    import pandas as pd

    options = webdriver.ChromeOptions()
    # options.binary_location = str(os.environ.get('GOOGLE_CHROME_BIN'))  # REQUIRED FOR HEROKU
    # options.add_argument('--disable-gpu')  # REQUIRED FOR HEROKU
    # options.add_argument('--no-sandbox')  # REQUIRED FOR HEROKU
    # NOTE(review): the "browser.*" entries look like Firefox preference
    # names rather than Chrome switches; they are passed through unchanged.
    browser_arguments = (
        "browser.download.folderList=2",
        "browser.helperApps.alwaysAsk.force=False",
        "browser.download.manager.showWhenStarting=False",
        "browser.download.manager.showAlertOnComplete=False",
        "browser.helperApps.neverAsk.saveToDisk=True",
        '--no-proxy-server',
        "--proxy-server='direct://'",
        "--proxy-bypass-list=*",
    )
    for argument in browser_arguments:
        options.add_argument(argument)
    options.headless = True

    # NOTE(review): `chrome_options=` and `find_element_by_xpath` are the
    # older Selenium spellings; confirm the pinned Selenium version before
    # upgrading the dependency.
    driver = webdriver.Chrome(chromedriver_dir, chrome_options=options)
    try:
        driver.get(url)
        table_vac = driver.find_element_by_xpath("//div[@class='col stats']")
        table_text = table_vac.text
    finally:
        # Always release the browser and chromedriver processes.
        driver.quit()

    # strip headers and footer
    vaccines_raw = table_text.split('\n')[5:-5]

    # Build the frame in one go instead of concatenating row by row; this
    # also yields a correctly-labelled empty frame when no rows were found.
    vaccines_df = pd.DataFrame(
        [_parse_vaccines_row(row_text) for row_text in vaccines_raw],
        columns=['province', 'total', 'new_pfizer', 'new_astrazeneca',
                 'new_moderna', 'new_johnson', 'second_dose', 'booster_jab',
                 'booster_jab2'],
    )

    # The scraped province names are replaced by the fixed province codes.
    vaccines_df = vaccines_df.drop('province', axis=1)
    vaccines_df = pd.concat([pd.DataFrame(codes_vaccines, columns=['code']),
                             vaccines_df], axis=1)
    return vaccines_df