Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

update get_number_of_deaths to match changed FHI schema #13

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 22 additions & 12 deletions code/04_fhi_daily_reports/02_get_number_of_deaths.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,36 @@
"""Update the FHI total-deaths time series from FHI's daily demographics CSVs.

Reads the accumulated ``deaths_total_fhi.csv`` (indexed by date), fetches any
FHI ``data_covid19_demographics`` datafiles whose date is not yet present,
sums the per-sex/per-age rows ourselves, and rewrites the CSV/XLSX outputs
when anything new was added.
"""
from sys import path
import re
import pandas as pd
from datetime import datetime

path.append("code")
from utils import get_fhi_datafiles

updated = False
# index_col="date" lets us test membership with `date in df.index` and
# upsert rows with df.loc[date, ...]
df = pd.read_csv('data/04_deaths/deaths_total_fhi.csv', index_col="date")
datafiles = get_fhi_datafiles('data_covid19_demographics')

for datafile in datafiles:
    # Datafile URLs normally embed an ISO date (YYYY-MM-DD).
    match = re.search(r"\d{4}-\d{2}-\d{2}", datafile)
    if match:
        date = match.group()
    else:
        # fhi now includes a 'latest' file with no date in its name; skip it
        print(f"No date in {datafile}")
        continue

    if date not in df.index:
        print(f"Fetching {datafile}")
        data = pd.read_csv(datafile)
        # drop internal totals, do sums ourselves
        # (csvs stopped including redundant totals on 2021-03-09)
        data = data[(data["sex"] != "total") & (data["age"] != "total")]
        # store new total for this date
        df.loc[date, "deaths"] = data.n.sum()
        updated = True


if updated:
    print("Writing updated deaths_total_fhi")
    # .loc assignment upcasts to float; restore int and show newest dates first
    df = df.astype({'deaths': int}).sort_index(ascending=False)
    df.to_csv("data/04_deaths/deaths_total_fhi.csv", encoding="utf-8")
    df.to_excel("data/04_deaths/deaths_total_fhi.xlsx", encoding="utf-8")
47 changes: 37 additions & 10 deletions code/utils.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,44 @@
import os
import sys

import requests


def get_fhi_datafiles(filestr):
    """Return raw-download URLs for FHI covid19 CSV files matching *filestr*.

    Lists the ``covid19`` directory of the
    ``folkehelseinstituttet/surveillance_data`` repository via the GitHub
    git-trees API and keeps every ``.csv`` whose name contains *filestr*.

    Returns an empty list if the API cannot be reached or the directory
    is not found.
    """
    files = []
    s = requests.Session()
    # NOTE(review): assumes GITHUB_TOKEN is set; os.environ.get returns None
    # otherwise and the concatenation would raise TypeError — verify in CI.
    s.headers = {"Authorization": "token " + os.environ.get("GITHUB_TOKEN")}
    repo = "folkehelseinstituttet/surveillance_data"
    repo_url = f"https://api.github.com/repos/{repo}"
    covid19_dir = "covid19"
    # use trees API to get large list of files
    # (contents API truncates to 1000 files)
    try:
        # get the sha for the latest commit on master
        r = s.get(repo_url + "/git/ref/heads/master")
        r.raise_for_status()
        head_sha = r.json()["object"]["sha"]
        # get tree contents for the top-level directory
        r = s.get(f"{repo_url}/git/trees/{head_sha}")
        r.raise_for_status()
        tree_root = r.json()
        # locate the covid19 subdirectory's own tree URL
        covidtree_url = None
        for subtree in tree_root["tree"]:
            if subtree["path"] == covid19_dir:
                covidtree_url = subtree["url"]
                break
        if covidtree_url is None:
            print(f"No '{covid19_dir}' directory in {repo}", file=sys.stderr)
            return files
        # finally, list covid19 directory contents
        r = requests.get(covidtree_url)
        r.raise_for_status()
        blobs = r.json()["tree"]
    # requests.RequestException is the package's base exception
    # (requests.HTTPException does not exist); it also covers HTTPError
    # raised by raise_for_status and connection/timeout failures.
    except requests.RequestException as e:
        print(f"Error accessing GitHub API: {e}", file=sys.stderr)
    else:
        # only reached when all API calls succeeded, so `blobs` is bound
        for blob in blobs:
            name = blob["path"]
            if filestr in name and name.endswith(".csv"):
                # trees API doesn't include a download URL; build the raw one
                download_url = (
                    f"https://raw.githubusercontent.com/{repo}/"
                    f"{head_sha}/{covid19_dir}/{name}"
                )
                files.append(download_url)

    return files