Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

update get_number_of_deaths to match changed FHI schema #13

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 22 additions & 12 deletions code/04_fhi_daily_reports/02_get_number_of_deaths.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,36 @@
"""Update the FHI total-deaths time series from FHI's daily demographics CSVs.

Reads the accumulated ``deaths_total_fhi.csv`` (indexed by date), fetches any
FHI ``data_covid19_demographics`` datafiles whose date is not yet present,
sums the per-sex/per-age rows ourselves, and rewrites the CSV/XLSX outputs
when anything new was added.
"""
from sys import path
import re
import pandas as pd
from datetime import datetime

path.append("code")
from utils import get_fhi_datafiles

updated = False
# index_col="date" lets us test membership with `date in df.index` and
# upsert rows with df.loc[date, ...]
df = pd.read_csv('data/04_deaths/deaths_total_fhi.csv', index_col="date")
datafiles = get_fhi_datafiles('data_covid19_demographics')

for datafile in datafiles:
    # Datafile URLs normally embed an ISO date (YYYY-MM-DD).
    match = re.search(r"\d{4}-\d{2}-\d{2}", datafile)
    if match:
        date = match.group()
    else:
        # fhi now includes a 'latest' file with no date in its name; skip it
        print(f"No date in {datafile}")
        continue

    if date not in df.index:
        print(f"Fetching {datafile}")
        data = pd.read_csv(datafile)
        # drop internal totals, do sums ourselves
        # (csvs stopped including redundant totals on 2021-03-09)
        data = data[(data["sex"] != "total") & (data["age"] != "total")]
        # store new total for this date
        df.loc[date, "deaths"] = data.n.sum()
        updated = True


if updated:
    print("Writing updated deaths_total_fhi")
    # .loc assignment upcasts to float; restore int and show newest dates first
    df = df.astype({'deaths': int}).sort_index(ascending=False)
    df.to_csv("data/04_deaths/deaths_total_fhi.csv", encoding="utf-8")
    df.to_excel("data/04_deaths/deaths_total_fhi.xlsx", encoding="utf-8")
47 changes: 37 additions & 10 deletions code/utils.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,44 @@
import os
import sys

import requests


def get_fhi_datafiles(filestr):
    """Return raw-download URLs for FHI covid19 CSV files matching *filestr*.

    Lists the ``covid19`` directory of the
    ``folkehelseinstituttet/surveillance_data`` repository via the GitHub
    git-trees API and keeps every ``.csv`` whose name contains *filestr*.

    Returns an empty list if the API cannot be reached or the directory
    is not found.
    """
    files = []
    s = requests.Session()
    # NOTE(review): assumes GITHUB_TOKEN is set; os.environ.get returns None
    # otherwise and the concatenation would raise TypeError — verify in CI.
    s.headers = {"Authorization": "token " + os.environ.get("GITHUB_TOKEN")}
    repo = "folkehelseinstituttet/surveillance_data"
    repo_url = f"https://api.github.com/repos/{repo}"
    covid19_dir = "covid19"
    # use trees API to get large list of files
    # (contents API truncates to 1000 files)
    try:
        # get the sha for the latest commit on master
        r = s.get(repo_url + "/git/ref/heads/master")
        r.raise_for_status()
        head_sha = r.json()["object"]["sha"]
        # get tree contents for the top-level directory
        r = s.get(f"{repo_url}/git/trees/{head_sha}")
        r.raise_for_status()
        tree_root = r.json()
        # locate the covid19 subdirectory's own tree URL
        covidtree_url = None
        for subtree in tree_root["tree"]:
            if subtree["path"] == covid19_dir:
                covidtree_url = subtree["url"]
                break
        if covidtree_url is None:
            print(f"No '{covid19_dir}' directory in {repo}", file=sys.stderr)
            return files
        # finally, list covid19 directory contents
        r = requests.get(covidtree_url)
        r.raise_for_status()
        blobs = r.json()["tree"]
    # requests.RequestException is the package's base exception
    # (requests.HTTPException does not exist); it also covers HTTPError
    # raised by raise_for_status and connection/timeout failures.
    except requests.RequestException as e:
        print(f"Error accessing GitHub API: {e}", file=sys.stderr)
    else:
        # only reached when all API calls succeeded, so `blobs` is bound
        for blob in blobs:
            name = blob["path"]
            if filestr in name and name.endswith(".csv"):
                # trees API doesn't include a download URL; build the raw one
                download_url = (
                    f"https://raw.githubusercontent.com/{repo}/"
                    f"{head_sha}/{covid19_dir}/{name}"
                )
                files.append(download_url)

    return files