create_dataset.py
import json
import sys

import bs4
import wikipedia


def main():
    # Fetch the Wikipedia article and parse its HTML.
    page = wikipedia.page("List_of_suicide_crisis_lines")
    html = bs4.BeautifulSoup(page.html(), "html.parser")

    dataset = {}
    # Skip the header row of the first table on the page.
    for row in html.find("tbody").find_all("tr")[1:]:
        # The country name is taken from the first link whose target matches
        # its text (a plain article link rather than a flag icon or footnote).
        country = ""
        for link in row.find_all("a"):
            href = link.get("href")
            text = link.text
            if href != f"/wiki/{text}":
                continue
            if ".svg" in href:
                continue
            country = text
            break

        country_data = {
            # Keep the raw cell markup so nothing is lost in parsing.
            "raw_html": "".join(str(x) for x in row.find("td").contents).strip(),
            "parsed_batches": [],
        }
        # Each <li> groups one or more phone numbers with a description.
        for phone_batch in row.find_all("li"):
            phone_numbers = [
                lnk.get("href")
                for lnk in phone_batch.find_all("a")
                if lnk.get("href") and "tel:" in lnk.get("href")
            ]
            country_data["parsed_batches"].append(
                {"phones": phone_numbers, "description": phone_batch.text}
            )
        dataset[country] = country_data

    # Emit the dataset as pretty-printed JSON on stdout.
    json.dump(dataset, sys.stdout, indent=4)


if __name__ == "__main__":
    main()
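A minimal sketch of how the emitted JSON might be consumed afterwards. It assumes the script's stdout was redirected to a file named dataset.json (the filename and the example country key are assumptions for illustration, not part of the script above):

# Hypothetical follow-up, assuming: python create_dataset.py > dataset.json
import json

with open("dataset.json") as fh:  # assumed output path, not defined by the script
    dataset = json.load(fh)

# Print the parsed phone batches for one example country, if present.
for batch in dataset.get("Australia", {}).get("parsed_batches", []):
    print(batch["description"], batch["phones"])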