fetcher.py
"""
Fetch the data from Johns Hopkins' github
"""
import pandas as pd
import urllib.parse
URL_BASE = (
    "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/"
    "csse_covid_19_data/csse_covid_19_time_series/"
)
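# CSV filenames in the JHU CSSE time-series directory, keyed by case type.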
FILENAME_JOHN_HOPKINS = {
    "confirmed": "time_series_covid19_confirmed_global.csv",
    "death": "time_series_covid19_deaths_global.csv",
}
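# Gist mapping country names to ISO codes and average coordinates.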
URL_COUNTRY_ISO = (
    "https://gist.githubusercontent.com/tadast/8827699/raw/"
    "7255fdfbf292c592b75cf5f7a19c16ea59735f74/"
    "countries_codes_and_coordinates.csv"
)
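# JHU country names mapped to the names used in the ISO gist.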
MAP_UNMATCHED_COUNTRIES = {
    'Congo (Kinshasa)': "Congo, the Democratic Republic of the",
    "Cote d'Ivoire": "Côte d'Ivoire",
    'Czechia': "Czech Republic",
    'Eswatini': "Swaziland",
    'Holy See': "Italy",
    'Iran': "Iran, Islamic Republic of",
    'Korea, South': "South Korea",
    'Moldova': "Moldova, Republic of",
    'North Macedonia': "Macedonia, the former Yugoslav Republic of",
    'Reunion': "Réunion",
    'Taiwan*': "Taiwan",
    'US': "United States",
    'occupied Palestinian territory': "Palestinian Territory, Occupied",
}
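# Countries reported by JHU but missing from the ISO gist.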
MISSING_COUNTRIES = pd.DataFrame({
    "Country": ["Curacao"],
    "Alpha-2 code": ["CW"],
    "Alpha-3 code": ["CUW"],
    "Numeric code": [531.0],
    "Latitude (average)": [12.169570],
    "Longitude (average)": [-68.990021],
})
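# JHU entries with no ISO counterpart, listed for reference; the inner merge
# below drops them.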
UNMATCHED_COUNTRIES = ['Cruise Ship']
def fetch_john_hopkins_data():
    """Return a tidy per-country, per-date DataFrame of cumulative confirmed
    cases and deaths, joined with ISO country codes and coordinates."""
    # Download one cumulative time series per case type.
    df_covid = {
        key: pd.read_csv(urllib.parse.urljoin(URL_BASE, filename))
        for key, filename in FILENAME_JOHN_HOPKINS.items()
    }
    # Province-level and metadata columns are not needed at country level.
    columns_drop = ["Province/State", "Province_State",
                    "Lat", "Long", "Long_", "Population",
                    "UID", "iso2", "iso3", "code3", "FIPS", "Admin2",
                    ]
    for key in df_covid:
        for col in columns_drop:
            if col in df_covid[key].columns:
                df_covid[key] = df_covid[key].drop(columns=[col, ])
        df_covid[key] = df_covid[key].rename(
            columns={'Country_Region': 'Country/Region'})
    for key in df_covid:
        # Aggregate provinces/states into a single row per country.
        df_covid[key] = df_covid[key].groupby("Country/Region").sum()
        # Move each date column as a row entry to get a "date" column instead.
        df_covid[key] = df_covid[key].stack().to_frame().reset_index().rename(
            columns={"level_1": "date", 0: "cases"}
        )
        df_covid[key]["date"] = df_covid[key]["date"].apply(
            lambda x: pd.to_datetime(x)
        )
        df_covid[key]["type"] = key
    df_covid = pd.concat(df_covid.values(), ignore_index=True)
    # Align JHU country names with the names used in the ISO table.
    df_covid = df_covid.replace({"Country/Region": MAP_UNMATCHED_COUNTRIES})
    df_countries = pd.read_csv(URL_COUNTRY_ISO)
    # The gist wraps every value in quotes: strip quotes, spaces and dots.
    for col_name in df_countries.columns:
        df_countries[col_name] = df_countries[col_name].str.strip('. "."')
    float_column = [
        "Numeric code", "Latitude (average)", "Longitude (average)"
    ]
    df_countries[float_column] = df_countries[float_column].astype(float)
    df_countries = pd.concat(
        [df_countries, MISSING_COUNTRIES], ignore_index=True
    )
    df = pd.merge(
        left=df_covid,
        right=df_countries,
        left_on="Country/Region",
        right_on="Country",
        how="inner",
    )
    assert df["Country"].unique().shape[0] >= 143, \
        "Missing countries when making the merge: not enough countries"
    assert 'China' in df["Country"].unique(), \
        "Missing countries when making the merge: China missing"
    assert 'South Korea' in df["Country"].unique(), \
        "Missing countries when making the merge: South Korea missing"
    df = df.rename(
        columns={
            "Country/Region": "name",
            "Country": "country_region",
            "Alpha-3 code": "iso",
            "Latitude (average)": "lat",
            "Longitude (average)": "long",
        }
    )
    return df
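

# Minimal usage sketch: print the latest cumulative confirmed count per
# country. The column names used here ("name", "date", "type", "cases")
# follow the rename at the end of fetch_john_hopkins_data().
if __name__ == "__main__":
    data = fetch_john_hopkins_data()
    latest_confirmed = (
        data[data["type"] == "confirmed"]
        .sort_values("date")
        .groupby("name")
        .tail(1)  # last (most recent) row per country
        .sort_values("cases", ascending=False)
    )
    print(latest_confirmed[["name", "date", "cases"]].head(10))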