scraper.py
import json
import logging
import logging.config
from time import time

import scraperwiki

from healthtools.config import LOGGING
from healthtools.scrapers import *


def setup_logging(default_level=logging.INFO):
    """
    Set up logging configuration.
    """
    try:
        logging.config.dictConfig(LOGGING)
    except Exception:
        # Fall back to a basic configuration if the LOGGING dict cannot be applied.
        logging.basicConfig(level=default_level)
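

# For reference, LOGGING (imported from healthtools.config) is expected to be a
# standard logging.config.dictConfig-style dictionary. A minimal sketch of such
# a config (illustrative only, not the project's actual settings):
#
#   LOGGING = {
#       "version": 1,
#       "formatters": {
#           "simple": {
#               "format": "%(asctime)s %(levelname)s %(name)s: %(message)s",
#           },
#       },
#       "handlers": {
#           "console": {
#               "class": "logging.StreamHandler",
#               "formatter": "simple",
#           },
#       },
#       "root": {"level": "INFO", "handlers": ["console"]},
#   }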


if __name__ == "__main__":
    setup_logging()
    time_start = time()

    # Initialize the scrapers
    doctors_scraper = DoctorsScraper()
    foreign_doctors_scraper = ForeignDoctorsScraper()
    healthfacilities_scraper = HealthFacilitiesScraper()
    nhif_inpatient_scraper = NhifInpatientScraper()
    nhif_outpatient_scraper = NhifOutpatientScraper()
    nhif_outpatient_cs_scraper = NhifOutpatientCsScraper()

    # Run the scrapers
    '''
    Doctors Scraper
    ---------------
    Doctors are a combination of local and foreign doctors. If the local
    doctors' scraper fails, we shouldn't scrape the foreign doctors.
    FIX: For now we are storing the doctors and foreign doctors in separate
    doc types. We shall combine the results in the API response.
    '''
    doctors_result = doctors_scraper.run_scraper()
    # Per the note above: only scrape foreign doctors if the local doctors'
    # scraper succeeded (assumes run_scraper() returns a falsy value on failure).
    foreign_docs_result = (
        foreign_doctors_scraper.run_scraper() if doctors_result else None)
    '''
    Health Facilities Scraper
    -------------------------
    Scrapes the government's Kenya Health Facilities Master List.
    '''
    healthfacilities_result = healthfacilities_scraper.run_scraper()
    '''
    NHIF Scraper
    ------------
    Scrapes the NHIF website for accredited hospitals / facilities.
    '''
    nhif_inpatient_result = nhif_inpatient_scraper.run_scraper()
    nhif_outpatient_result = nhif_outpatient_scraper.run_scraper()
    nhif_outpatient_cs_result = nhif_outpatient_cs_scraper.run_scraper()
    '''
    STATS
    -----
    Log time taken and other stats.
    '''
    # TODO: Move this to its own module
    time_total = time() - time_start
    m, s = divmod(time_total, 60)
    h, m = divmod(m, 60)
    time_total_formatted = ("%dhr:%02dmin:%02dsec" % (h, m, s)
                            if time_total > 60
                            else "{:.2f} seconds".format(time_total))
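    # Log how long the full scraping run took.
    logging.info("Scrapers completed in %s", time_total_formatted)
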
    scraping_statistics = {
        'doctors_scraper': doctors_scraper.stat_log,
        'foreign_doctors_scraper': foreign_doctors_scraper.stat_log,
        'healthfacilities_scraper': healthfacilities_scraper.stat_log,
        'nhif_inpatient_scraper': nhif_inpatient_scraper.stat_log,
        'nhif_outpatient_cs_scraper': nhif_outpatient_cs_scraper.stat_log,
        'nhif_outpatient_scraper': nhif_outpatient_scraper.stat_log,
    }
    # Save stats to sqlite
    for stat_desc, stat in scraping_statistics.items():
        scraperwiki.sqlite.save(
            unique_keys=['description'],
            data={"description": stat_desc, "stat": json.dumps(stat)}
        )
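
    # The saved rows can later be read back from the same sqlite store, e.g.
    # (illustrative; "swdata" is scraperwiki's default table name):
    #
    #   for row in scraperwiki.sqlite.select("* from swdata"):
    #       print(row["description"], json.loads(row["stat"]))
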
    # Initialize a generic scraper to archive the scraping statistics
    scraper_stats = Scraper()
    scraper_stats.data_key = "stats.json"
    scraper_stats.data_archive_key = "stats/stats-{}.json"
    scraper_stats.archive_data(json.dumps(scraping_statistics))