Skip to content

Commit 93d4b21

Browse files
Merge pull request #1250 from Annabella-Hines/main
Weekly summaries for CFA
2 parents 8ce93b7 + 7a9b389 commit 93d4b21

10 files changed

+54297
-0
lines changed

weekly-summaries/2024-11-23/2024-11-23-flu_forecasts_data.csv

+6,473
Large diffs are not rendered by default.

weekly-summaries/2024-11-23/2024-11-23-flu_map_data.csv

+160
Large diffs are not rendered by default.

weekly-summaries/2024-11-23/2024-11-23-flu_target_hospital_admisssions_data.csv

+11,344
Large diffs are not rendered by default.

weekly-summaries/2024-11-30/2024-11-30-flu_forecasts_data.csv

+6,473
Large diffs are not rendered by default.

weekly-summaries/2024-11-30/2024-11-30-flu_map_data.csv

+160
Large diffs are not rendered by default.

weekly-summaries/2024-11-30/2024-11-30-flu_target_hospital_admisssions_data.csv

+11,344
Large diffs are not rendered by default.

weekly-summaries/2024-12-07/2024-12-07-flu_forecasts_data.csv

+6,473
Large diffs are not rendered by default.

weekly-summaries/2024-12-07/2024-12-07-flu_map_data.csv

+160
Large diffs are not rendered by default.

weekly-summaries/2024-12-07/2024-12-07-flu_target_hospital_admisssions_data.csv

+11,344
Large diffs are not rendered by default.

weekly-summaries/get_weekly_summary.R

+366
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,366 @@
1+
#' Generate the All Forecasts file containing all FluSight hub model submissions. This script fetches all forecast submissions
2+
#' from the `flusight-forecast-hub` based on the `reference_date`. The forecast data is then pivoted to create a wide format with
3+
#' quantile levels as columns.
4+
#'
5+
#' The resulting csv file contains the following columns:
6+
#' - `location_name`: full state name
7+
#' (including "US" for the US state)
8+
#' - `abbreviation`: state abbreviation
9+
#' - `horizon`: forecast horizon
10+
#' - `forecast_date`: date the forecast was generated
11+
#' - `target_end_date`: target date for the forecast
12+
#' - `model`: model name
13+
#' - `quantile_*`: forecast values for various
14+
#' quantiles (e.g., 0.025, 0.5, 0.975)
15+
#' - `forecast_teams`: name of the team that generated the model
16+
#' - `forecast_fullnames`: full model name
17+
#'
18+
#' To run:
19+
#' Rscript gen_forecast_data.R --reference_date 2024-11-23 --base_hub_path ../../
20+
21+
ref_date <- lubridate::ceiling_date(Sys.Date(), "week") - days(1)
22+
reference_date <- ref_date
23+
base_hub_path <- paste0("C:/Users/", Sys.info()["user"], "/Desktop/GitHub/FluSight-forecast-hub")
24+
25+
# Load per-model metadata (team names, full model names) for the hub.
model_metadata <- hubData::load_model_metadata(
  base_hub_path, model_ids = NULL
)

# Connect to the `flusight-forecast-hub` model-output store and pull only the
# submissions for the current reference date.
# BUG FIX: the filter referenced `current_ref_date`, which is never defined
# anywhere in this script; the intended variable is `ref_date` defined above.
hub_content <- hubData::connect_hub(base_hub_path)
current_forecasts <- hub_content |>
  dplyr::filter(
    reference_date == ref_date
  ) |>
  dplyr::collect() |>
  # FIX: qualify the namespace; `as_model_out_tbl` was called unqualified
  # although nothing is attached with library() in this script.
  hubData::as_model_out_tbl()
# Build the All Forecasts table: every model's submission for the current
# reference date, pivoted so each requested quantile level becomes a column.
all_forecasts_data <- forecasttools::pivot_hubverse_quantiles_wider(
  hubverse_table = current_forecasts,
  pivot_quantiles = c(
    "quantile_0.025" = 0.025,
    "quantile_0.25" = 0.25,
    "quantile_0.5" = 0.5,
    "quantile_0.75" = 0.75,
    "quantile_0.975" = 0.975
  )
) |>
  # convert location codes to full location
  # names and to abbreviations
  dplyr::mutate(
    location_name = forecasttools::location_lookup(
      location,
      location_input_format = "hub",
      location_output_format = "long_name"
    ),
    abbreviation = forecasttools::us_loc_code_to_abbr(location),
    # round each quantile to the nearest integer, keeping the original
    # values alongside "<col>_rounded" copies
    dplyr::across(
      dplyr::starts_with("quantile_"),
      round,
      .names = "{.col}_rounded"
    )
  ) |>
  # Attach team/model metadata. Metadata can contain duplicate model_ids, so
  # de-duplicate first to avoid fanning out forecast rows on the join.
  # BUG FIX: the original passed `model_metadata` a second time as a stray
  # positional argument to left_join (it landed on the `copy` parameter).
  dplyr::left_join(
    dplyr::distinct(model_metadata, model_id, .keep_all = TRUE),
    by = "model_id"
  ) |>
  dplyr::mutate(
    # submissions are due the Wednesday before the Saturday reference date
    # (FIX: `days()` qualified with lubridate:: -- nothing is attached here)
    forecast_due_date = reference_date - lubridate::days(3),
    forecast_due_date_formatted = format(forecast_due_date, "%B %d, %Y"), # "Month DD, YYYY"
    forecast_due_date = format(forecast_due_date, "%Y-%m-%d") # "YYYY-MM-DD"
  ) |>
  # horizon 3 is not shown in the weekly summary
  dplyr::filter(horizon != 3) |>
  dplyr::select(
    location_name,
    abbreviation,
    horizon,
    reference_date,
    target_end_date,
    model = model_id,
    quantile_0.025,
    quantile_0.25,
    quantile_0.5,
    quantile_0.75,
    quantile_0.975,
    quantile_0.025_rounded,
    quantile_0.25_rounded,
    quantile_0.5_rounded,
    quantile_0.75_rounded,
    quantile_0.975_rounded,
    forecast_team = team_name,
    model_full_name = model_name,
    forecast_due_date,
    forecast_due_date_formatted
  )
# Destination for the All Forecasts csv:
# <hub>/weekly-summaries/<ref_date>/<ref_date>-flu_forecasts_data.csv
output_folder_path <- fs::path(base_hub_path, "weekly-summaries", ref_date)
output_filename <- paste0(ref_date, "-flu_forecasts_data.csv")
output_filepath <- fs::path(output_folder_path, output_filename)

# make sure the weekly output folder exists (no-op if already present)
fs::dir_create(output_folder_path)
message("Directory is ready: ", output_folder_path)

# never overwrite a previously published weekly file
if (fs::file_exists(output_filepath)) {
  stop("File already exists: ", output_filepath)
}
readr::write_csv(all_forecasts_data, output_filepath)
message("File saved as: ", output_filepath)
112+
113+
114+
#' Generate the Map file containing ensemble forecast data.
115+
#'
116+
#' This script loads the latest ensemble forecast data from the `FluSight-ensemble` folder and processes it into the required
117+
#' format. The resulting CSV file contains forecast values for all regions (including US, DC, and Puerto Rico), for various forecast
118+
#' horizons, and quantiles (0.025, 0.5, and 0.975).
119+
#'
120+
#' The ensemble data is expected to contain the following columns:
121+
#' - `reference_date`: the date of the forecast
122+
#' - `location`: state abbreviation
123+
#' - `horizon`: forecast horizon
124+
#' - `target`: forecast target (e.g., "wk inc
125+
#' flu hosp")
126+
#' - `target_end_date`: the forecast target date
127+
#' - `output_type`: type of output (e.g., "quantile")
128+
#' - `output_type_id`: quantile value (e.g.,
129+
#' 0.025, 0.5, 0.975)
130+
#' - `value`: forecast value
131+
#'
132+
#' The resulting `map.csv` file will have the following columns:
133+
#' - `location_name`: full state name (
134+
#' including "US" for the US state)
135+
#' - `quantile_*`: the quantile forecast values
136+
#' (rounded to two decimal places)
137+
#' - `horizon`: forecast horizon
138+
#' - `target`: forecast target (e.g., "7 day
139+
#' ahead inc hosp")
140+
#' - `target_end_date`: target date for the
141+
#' forecast (Ex: 2024-11-30)
142+
#' - `reference_date`: date that the forecast
143+
#' was generated (Ex: 2024-11-23)
144+
#' - `target_end_date_formatted`: target date
145+
#' for the forecast, prettily re-formatted as
146+
#' a string (Ex: "November 30, 2024")
147+
#' - `reference_date_formatted`: date that the
148+
#' forecast was generated, prettily re-formatted
149+
#' as a string (Ex: "November 23, 2024")
150+
#'
151+
#' To run:
152+
#' Rscript gen_map_data.R --reference_date 2024-11-23 --base_hub_path ../../
153+
154+
155+
# load the latest ensemble data from the
156+
# model-output folder
157+
ensemble_folder <- file.path(base_hub_path, "model-output", "FluSight-ensemble")
158+
ensemble_file_current <- file.path(ensemble_folder, paste0(ref_date, "-FluSight-ensemble.csv"))
159+
if (file.exists(ensemble_file_current)) {
160+
ensemble_file <- ensemble_file_current
161+
} else {
162+
stop("Ensemble file for reference date ", ref_date, " not found in the directory: ", ensemble_folder)
163+
}
164+
ensemble_data <- readr::read_csv(ensemble_file)
165+
required_columns <- c("reference_date", "target_end_date", "value", "location", "target")
166+
missing_columns <- setdiff(required_columns, colnames(ensemble_data))
167+
if (length(missing_columns) > 0) {
168+
stop(paste("Missing columns in ensemble data:", paste(missing_columns, collapse = ", ")))
169+
}
170+
171+
# population data, add later to forecasttools
pop_data_path <- file.path(base_hub_path, "auxiliary-data", "locations.csv")
pop_data <- readr::read_csv(pop_data_path)
# FIX: also require `location_name` -- the map pipeline below joins pop_data
# by location_name, so its absence must be caught here, not at join time.
pop_required_columns <- c("abbreviation", "population", "location_name")
missing_pop_columns <- setdiff(pop_required_columns, colnames(pop_data))
if (length(missing_pop_columns) > 0) {
  stop(paste(
    "Missing columns in population data:",
    paste(missing_pop_columns, collapse = ", ")
  ))
}
180+
# process ensemble data into the required
181+
# format for Map file
182+
map_data <-forecasttools::pivot_hubverse_quantiles_wider(
183+
hubverse_table = ensemble_data,
184+
pivot_quantiles = c(
185+
"quantile_0.025_count" = 0.025,
186+
"quantile_0.5_count" = 0.5,
187+
"quantile_0.975_count" = 0.975)
188+
) |>
189+
dplyr::mutate(
190+
reference_date = as.Date(reference_date),
191+
target_end_date = as.Date(target_end_date),
192+
model = "FluSight-ensemble"
193+
) |>
194+
dplyr::filter(target != "peak inc flu hosp", horizon != -1, horizon !=3) |>
195+
# convert location column codes to full
196+
# location names
197+
dplyr::mutate(
198+
location = forecasttools::location_lookup(
199+
location,
200+
location_input_format = "hub",
201+
location_output_format = "long_name"
202+
)
203+
) |>
204+
# long name "United States" to "US"
205+
dplyr::mutate(
206+
location = dplyr::if_else(
207+
location == "United States",
208+
"US",
209+
location)
210+
) |>
211+
# add population data for later calculations
212+
dplyr::left_join(
213+
pop_data,
214+
by = c("location" = "location_name")
215+
) |>
216+
# add quantile columns for per-100k rates
217+
# and rounded values
218+
# add quantile columns for per-100k rates
219+
# and rounded values
220+
dplyr::mutate(
221+
quantile_0.025_per100k = quantile_0.025_count/as.numeric(population) * 100000,
222+
quantile_0.5_per100k = quantile_0.5_count / as.numeric(population) * 100000,
223+
quantile_0.975_per100k = quantile_0.975_count / as.numeric(population) * 100000,
224+
quantile_0.025_per100k_rounded = round(quantile_0.025_per100k, 2),
225+
quantile_0.5_per100k_rounded = round(quantile_0.5_per100k, 2),
226+
quantile_0.975_per100k_rounded = round(quantile_0.975_per100k, 2),
227+
quantile_0.025_count_rounded = round(quantile_0.025_count),
228+
quantile_0.5_count_rounded = round(quantile_0.5_count),
229+
quantile_0.975_count_rounded = round(quantile_0.975_count),
230+
target_end_date_formatted = format(target_end_date, "%B %d, %Y"),
231+
reference_date_formatted = format(reference_date, "%B %d, %Y"),
232+
forecast_due_date = reference_date - days(3), # Wednesday is 3 days before Saturday
233+
forecast_due_date_formatted = format(forecast_due_date, "%B %d, %Y"), # Format as "Month DD, YYYY"
234+
forecast_due_date = format(forecast_due_date, "%Y-%m-%d") # Format as "YYYY-MM-DD"
235+
) |>
236+
dplyr::select(
237+
location_name = location, # rename location col
238+
horizon,
239+
quantile_0.025_per100k,
240+
quantile_0.5_per100k,
241+
quantile_0.975_per100k,
242+
quantile_0.025_count,
243+
quantile_0.5_count,
244+
quantile_0.975_count,
245+
quantile_0.025_per100k_rounded,
246+
quantile_0.5_per100k_rounded,
247+
quantile_0.975_per100k_rounded,
248+
quantile_0.025_count_rounded,
249+
quantile_0.5_count_rounded,
250+
quantile_0.975_count_rounded,
251+
target_end_date,
252+
reference_date,
253+
target_end_date_formatted,
254+
reference_date_formatted,
255+
forecast_due_date,
256+
forecast_due_date_formatted,
257+
model
258+
)|>
259+
dplyr::distinct()
260+
261+
# output folder and file paths for Map Data
262+
output_folder_path <- fs::path(base_hub_path, "weekly-summaries", ref_date)
263+
output_filename <- paste0(ref_date, "-flu_map_data.csv")
264+
output_filepath <- fs::path(output_folder_path, output_filename)
265+
266+
# determine if the output folder exists,
267+
# create it if not
268+
fs::dir_create(output_folder_path)
269+
message("Directory is ready: ", output_folder_path)
270+
271+
# check if the file exists, and if not,
272+
# save to csv, else throw an error
273+
if (!fs::file_exists(output_filepath)) {
274+
readr::write_csv(map_data, output_filepath)
275+
message("File saved as: ", output_filepath)
276+
} else {
277+
stop("File already exists: ", output_filepath)
278+
}
279+
280+
281+
282+
#' Generate the Truth Data file containing the most recent observed NHSN hospital admissions data.
#' This script fetches the most recent observed influenza hospital admissions data for all regions
#' (including US, DC, and Puerto Rico) and processes it into the required format. The data is sourced from the NHSN hospital respiratory
#' data: (https://www.cdc.gov/nhsn/psc/hospital-respiratory-reporting.html).
#'
#' The resulting csv file contains the following columns:
#' - `week_ending_date`: week ending date of
#'   observed data per row (Ex: 2024-11-16)
#' - `location`: two-digit FIPS code
#'   associated with each state (Ex: 06)
#' - `location_name`: full state name
#'   (including "US" for the US state)
#' - `value`: the number of hospital
#'   admissions (integer)
#'
#' To run:
#' Rscript gen_truth_data.R --reference_date 2024-11-23 --base_hub_path ../../

# fetch all NHSN influenza hospital admissions
# BUG FIX: the original call had a trailing comma after `columns`, which
# passes an empty (missing) argument to pull_nhsn and errors at evaluation.
flu_data <- forecasttools::pull_nhsn(
  api_endpoint = "https://data.cdc.gov/resource/mpgq-jmmr.json",
  columns = c("totalconfflunewadm")
) |>
  # NOTE(review): the renames below assume pull_nhsn always returns
  # `weekendingdate` and `jurisdiction` even when only `totalconfflunewadm`
  # is requested -- confirm against the forecasttools API.
  dplyr::rename(
    value = totalconfflunewadm,
    date = weekendingdate,
    state = jurisdiction
  ) |>
  dplyr::mutate(
    date = as.Date(date),
    value = as.numeric(value),
    # NHSN labels the national aggregate "USA"; hub convention is "US"
    state = stringr::str_replace(
      state,
      "USA",
      "US"
    )
  )
# Derive the hub location code and the long location name from the state
# abbreviation, then normalize the national row's long name to "US".
flu_data <- flu_data |>
  dplyr::mutate(
    location = forecasttools::us_loc_abbr_to_code(state),
    location_name = forecasttools::location_lookup(
      location,
      location_input_format = "hub",
      location_output_format = "long_name"
    ),
    # long name "United States" to "US"
    location_name = dplyr::if_else(
      location_name == "United States",
      "US",
      location_name
    )
  )
# Keep only the columns the Truth Data file needs, dropping the
# 2024-04-27 .. 2024-10-31 window.
# NOTE(review): presumably this window is excluded because NHSN reporting
# was not mandatory then -- confirm the rationale before changing the dates.
truth_data <- flu_data |>
  dplyr::filter(
    !(date >= as.Date("2024-04-27") & date <= as.Date("2024-10-31"))
  ) |>
  dplyr::select(
    week_ending_date = date,
    location,
    location_name,
    value
  )
# Destination for the Truth Data csv:
# <hub>/weekly-summaries/<reference_date>/<reference_date>-flu_target_hospital_admisssions_data.csv
# NOTE(review): "admisssions" (triple s) matches the filenames already
# published in weekly-summaries/, so it is preserved here.
output_folder_path <- fs::path(base_hub_path, "weekly-summaries", reference_date)
output_filename <- paste0(reference_date, "-flu_target_hospital_admisssions_data.csv")
output_filepath <- fs::path(output_folder_path, output_filename)

# make sure the weekly output folder exists (no-op if already present)
fs::dir_create(output_folder_path)
message("Directory is ready: ", output_folder_path)

# never overwrite a previously published weekly file
if (fs::file_exists(output_filepath)) {
  stop("File already exists: ", output_filepath)
}
readr::write_csv(truth_data, output_filepath)
message("File saved as: ", output_filepath)

0 commit comments

Comments
 (0)