#' Generate the All Forecasts file containing all FluSight hub model submissions.
#'
#' This script fetches all forecast submissions from the `FluSight-forecast-hub`
#' for the given `reference_date`, then pivots the forecast data to a wide format
#' with quantile levels as columns.
#'
#' The resulting csv file contains the following columns:
#' - `location_name`: full state name (including "US" for the national total)
#' - `abbreviation`: state abbreviation
#' - `horizon`: forecast horizon
#' - `reference_date`: date the forecast was generated
#' - `target_end_date`: target date for the forecast
#' - `model`: model name
#' - `quantile_*`: forecast values at the 0.025, 0.25, 0.5, 0.75, and 0.975
#'   quantile levels, plus `quantile_*_rounded` copies rounded to the nearest integer
#' - `forecast_team`: name of the team that built the model
#' - `model_full_name`: full model name
#' - `forecast_due_date` / `forecast_due_date_formatted`: forecast submission
#'   due date (the Wednesday before the reference date)
#'
#' To run:
#' Rscript gen_forecast_data.R --reference_date 2024-11-23 --base_hub_path ../../

ref_date <- lubridate::ceiling_date(Sys.Date(), "week") - lubridate::days(1)
reference_date <- ref_date
base_hub_path <- paste0("C:/Users/", Sys.info()["user"], "/Desktop/GitHub/FluSight-forecast-hub")
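
# Optional sketch (assumption, not part of the original workflow): parse
# `--reference_date` and `--base_hub_path` from the command line, as the
# "To run" usage above suggests, falling back to the defaults set above.
cli_args <- commandArgs(trailingOnly = TRUE)
if ("--reference_date" %in% cli_args) {
  reference_date <- as.Date(cli_args[which(cli_args == "--reference_date") + 1])
  ref_date <- reference_date
}
if ("--base_hub_path" %in% cli_args) {
  base_hub_path <- cli_args[which(cli_args == "--base_hub_path") + 1]
}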

# load model metadata
model_metadata <- hubData::load_model_metadata(
  base_hub_path,
  model_ids = NULL
)

# get `FluSight-forecast-hub` submissions for the reference date
hub_content <- hubData::connect_hub(base_hub_path)
current_forecasts <- hub_content |>
  dplyr::filter(reference_date == ref_date) |>
  dplyr::collect() |>
  hubUtils::as_model_out_tbl()
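
# Optional sanity check (assumption): stop early if no submissions were found
# for this reference date, rather than writing an empty All Forecasts file.
if (nrow(current_forecasts) == 0) {
  stop("No forecast submissions found for reference date: ", ref_date)
}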

# get data for All Forecasts file
all_forecasts_data <- forecasttools::pivot_hubverse_quantiles_wider(
  hubverse_table = current_forecasts,
  pivot_quantiles = c(
    "quantile_0.025" = 0.025,
    "quantile_0.25" = 0.25,
    "quantile_0.5" = 0.5,
    "quantile_0.75" = 0.75,
    "quantile_0.975" = 0.975
  )
) |>
  # convert location codes to full location names and abbreviations
  dplyr::mutate(
    location_name = forecasttools::location_lookup(
      location,
      location_input_format = "hub",
      location_output_format = "long_name"
    ),
    abbreviation = forecasttools::us_loc_code_to_abbr(location),
    # round the quantiles to the nearest integer in "_rounded" copies
    dplyr::across(
      dplyr::starts_with("quantile_"),
      round,
      .names = "{.col}_rounded"
    )
  ) |>
  dplyr::left_join(
    # guard against duplicate model_ids in the metadata
    dplyr::distinct(model_metadata, model_id, .keep_all = TRUE),
    by = "model_id"
  ) |>
  dplyr::mutate(
    forecast_due_date = reference_date - lubridate::days(3), # Wednesday, 3 days before the Saturday reference date
    forecast_due_date_formatted = format(forecast_due_date, "%B %d, %Y"), # "Month DD, YYYY"
    forecast_due_date = format(forecast_due_date, "%Y-%m-%d") # "YYYY-MM-DD"
  ) |>
  dplyr::filter(horizon != 3) |>
  dplyr::select(
    location_name,
    abbreviation,
    horizon,
    reference_date,
    target_end_date,
    model = model_id,
    quantile_0.025,
    quantile_0.25,
    quantile_0.5,
    quantile_0.75,
    quantile_0.975,
    quantile_0.025_rounded,
    quantile_0.25_rounded,
    quantile_0.5_rounded,
    quantile_0.75_rounded,
    quantile_0.975_rounded,
    forecast_team = team_name,
    model_full_name = model_name,
    forecast_due_date,
    forecast_due_date_formatted
  )

# output folder and file paths for All Forecasts
output_folder_path <- fs::path(base_hub_path, "weekly-summaries", ref_date)
output_filename <- paste0(ref_date, "-flu_forecasts_data.csv")
output_filepath <- fs::path(output_folder_path, output_filename)

# determine if the output folder exists, create it if not
fs::dir_create(output_folder_path)
message("Directory is ready: ", output_folder_path)

# check if the file exists; if not, save to csv, else throw an error
if (!fs::file_exists(output_filepath)) {
  readr::write_csv(all_forecasts_data, output_filepath)
  message("File saved as: ", output_filepath)
} else {
  stop("File already exists: ", output_filepath)
}


#' Generate the Map file containing ensemble forecast data.
#'
#' This script loads the latest ensemble forecast data from the
#' `FluSight-ensemble` folder and processes it into the required format. The
#' resulting CSV file contains forecast values for all regions (including US,
#' DC, and Puerto Rico), for various forecast horizons, at the 0.025, 0.5, and
#' 0.975 quantile levels.
#'
#' The ensemble data is expected to contain the following columns:
#' - `reference_date`: the date of the forecast
#' - `location`: location code (two-digit FIPS code, or "US")
#' - `horizon`: forecast horizon
#' - `target`: forecast target (e.g., "wk inc flu hosp")
#' - `target_end_date`: the forecast target date
#' - `output_type`: type of output (e.g., "quantile")
#' - `output_type_id`: quantile level (e.g., 0.025, 0.5, 0.975)
#' - `value`: forecast value
#'
#' The resulting map csv file has the following columns:
#' - `location_name`: full state name (including "US" for the national total)
#' - `horizon`: forecast horizon
#' - `quantile_*_count`: quantile forecast values as admission counts,
#'   with `_rounded` copies rounded to the nearest integer
#' - `quantile_*_per100k`: quantile forecast values per 100k population,
#'   with `_rounded` copies rounded to two decimal places
#' - `target_end_date`: target date for the forecast (Ex: 2024-11-30)
#' - `reference_date`: date that the forecast was generated (Ex: 2024-11-23)
#' - `target_end_date_formatted`: target date for the forecast, prettily
#'   re-formatted as a string (Ex: "November 30, 2024")
#' - `reference_date_formatted`: date that the forecast was generated, prettily
#'   re-formatted as a string (Ex: "November 23, 2024")
#' - `forecast_due_date` / `forecast_due_date_formatted`: forecast submission
#'   due date (the Wednesday before the reference date)
#' - `model`: model name ("FluSight-ensemble")
#'
#' To run:
#' Rscript gen_map_data.R --reference_date 2024-11-23 --base_hub_path ../../


# load the latest ensemble data from the model-output folder
ensemble_folder <- file.path(base_hub_path, "model-output", "FluSight-ensemble")
ensemble_file_current <- file.path(ensemble_folder, paste0(ref_date, "-FluSight-ensemble.csv"))
if (file.exists(ensemble_file_current)) {
  ensemble_file <- ensemble_file_current
} else {
  stop("Ensemble file for reference date ", ref_date, " not found in the directory: ", ensemble_folder)
}
ensemble_data <- readr::read_csv(ensemble_file)
required_columns <- c("reference_date", "target_end_date", "value", "location", "target")
missing_columns <- setdiff(required_columns, colnames(ensemble_data))
if (length(missing_columns) > 0) {
  stop("Missing columns in ensemble data: ", paste(missing_columns, collapse = ", "))
}

# population data (to be added to forecasttools later)
pop_data_path <- file.path(base_hub_path, "auxiliary-data", "locations.csv")
pop_data <- readr::read_csv(pop_data_path)
# the join below uses `location_name` and the rate calculations need `population`
pop_required_columns <- c("location_name", "population")
missing_pop_columns <- setdiff(pop_required_columns, colnames(pop_data))
if (length(missing_pop_columns) > 0) {
  stop("Missing columns in population data: ", paste(missing_pop_columns, collapse = ", "))
}

# process ensemble data into the required format for the Map file
map_data <- forecasttools::pivot_hubverse_quantiles_wider(
  hubverse_table = ensemble_data,
  pivot_quantiles = c(
    "quantile_0.025_count" = 0.025,
    "quantile_0.5_count" = 0.5,
    "quantile_0.975_count" = 0.975
  )
) |>
  dplyr::mutate(
    reference_date = as.Date(reference_date),
    target_end_date = as.Date(target_end_date),
    model = "FluSight-ensemble"
  ) |>
  dplyr::filter(target != "peak inc flu hosp", horizon != -1, horizon != 3) |>
  # convert location codes to full location names
  dplyr::mutate(
    location = forecasttools::location_lookup(
      location,
      location_input_format = "hub",
      location_output_format = "long_name"
    )
  ) |>
  # long name "United States" to "US"
  dplyr::mutate(
    location = dplyr::if_else(
      location == "United States",
      "US",
      location
    )
  ) |>
  # add population data for later calculations
  dplyr::left_join(
    pop_data,
    by = c("location" = "location_name")
  ) |>
  # add quantile columns for per-100k rates and rounded values
  dplyr::mutate(
    quantile_0.025_per100k = quantile_0.025_count / as.numeric(population) * 100000,
    quantile_0.5_per100k = quantile_0.5_count / as.numeric(population) * 100000,
    quantile_0.975_per100k = quantile_0.975_count / as.numeric(population) * 100000,
    quantile_0.025_per100k_rounded = round(quantile_0.025_per100k, 2),
    quantile_0.5_per100k_rounded = round(quantile_0.5_per100k, 2),
    quantile_0.975_per100k_rounded = round(quantile_0.975_per100k, 2),
    quantile_0.025_count_rounded = round(quantile_0.025_count),
    quantile_0.5_count_rounded = round(quantile_0.5_count),
    quantile_0.975_count_rounded = round(quantile_0.975_count),
    target_end_date_formatted = format(target_end_date, "%B %d, %Y"),
    reference_date_formatted = format(reference_date, "%B %d, %Y"),
    forecast_due_date = reference_date - lubridate::days(3), # Wednesday, 3 days before the Saturday reference date
    forecast_due_date_formatted = format(forecast_due_date, "%B %d, %Y"), # "Month DD, YYYY"
    forecast_due_date = format(forecast_due_date, "%Y-%m-%d") # "YYYY-MM-DD"
  ) |>
  dplyr::select(
    location_name = location, # rename location col
    horizon,
    quantile_0.025_per100k,
    quantile_0.5_per100k,
    quantile_0.975_per100k,
    quantile_0.025_count,
    quantile_0.5_count,
    quantile_0.975_count,
    quantile_0.025_per100k_rounded,
    quantile_0.5_per100k_rounded,
    quantile_0.975_per100k_rounded,
    quantile_0.025_count_rounded,
    quantile_0.5_count_rounded,
    quantile_0.975_count_rounded,
    target_end_date,
    reference_date,
    target_end_date_formatted,
    reference_date_formatted,
    forecast_due_date,
    forecast_due_date_formatted,
    model
  ) |>
  dplyr::distinct()
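
# Optional sanity check (assumption): a location name that fails to match a row
# in `locations.csv` leaves `population` missing, which propagates NA into the
# per-100k columns; warn so the population join can be inspected.
if (anyNA(map_data$quantile_0.5_per100k)) {
  warning("Some per-100k rates are NA; check the population join on location name.")
}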

# output folder and file paths for Map Data
output_folder_path <- fs::path(base_hub_path, "weekly-summaries", ref_date)
output_filename <- paste0(ref_date, "-flu_map_data.csv")
output_filepath <- fs::path(output_folder_path, output_filename)

# determine if the output folder exists, create it if not
fs::dir_create(output_folder_path)
message("Directory is ready: ", output_folder_path)

# check if the file exists; if not, save to csv, else throw an error
if (!fs::file_exists(output_filepath)) {
  readr::write_csv(map_data, output_filepath)
  message("File saved as: ", output_filepath)
} else {
  stop("File already exists: ", output_filepath)
}


#' Generate the Truth Data file containing the most recent observed NHSN hospital admissions data.
#'
#' This script fetches the most recent observed influenza hospital admissions
#' data for all regions (including US, DC, and Puerto Rico) and processes it
#' into the required format. The data are sourced from the NHSN hospital
#' respiratory data (https://www.cdc.gov/nhsn/psc/hospital-respiratory-reporting.html).
#'
#' The resulting csv file contains the following columns:
#' - `week_ending_date`: week ending date of the observed data in each row (Ex: 2024-11-16)
#' - `location`: two-digit FIPS code associated with each state (Ex: 06)
#' - `location_name`: full state name (including "US" for the national total)
#' - `value`: the number of hospital admissions (integer)
#'
#' To run:
#' Rscript gen_truth_data.R --reference_date 2024-11-23 --base_hub_path ../../

# fetch all NHSN influenza hospital admissions
flu_data <- forecasttools::pull_nhsn(
  api_endpoint = "https://data.cdc.gov/resource/mpgq-jmmr.json",
  columns = c("totalconfflunewadm")
) |>
  dplyr::rename(
    value = totalconfflunewadm,
    date = weekendingdate,
    state = jurisdiction
  ) |>
  dplyr::mutate(
    date = as.Date(date),
    value = as.numeric(value),
    state = stringr::str_replace(
      state,
      "USA",
      "US"
    )
  )

# convert state abbreviation to location code and to long name
flu_data <- flu_data |>
  dplyr::mutate(
    location = forecasttools::us_loc_abbr_to_code(state),
    location_name = forecasttools::location_lookup(
      location,
      location_input_format = "hub",
      location_output_format = "long_name"
    )
  ) |>
  # long name "United States" to "US"
  dplyr::mutate(
    location_name = dplyr::if_else(
      location_name == "United States",
      "US",
      location_name
    )
  )
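
# Optional sanity check (assumption): jurisdictions without a matching location
# code (e.g., territories absent from the lookup) come back as NA; warn so they
# can be reviewed before filtering.
if (anyNA(flu_data$location)) {
  warning(
    "Unmatched jurisdictions in NHSN data: ",
    paste(unique(flu_data$state[is.na(flu_data$location)]), collapse = ", ")
  )
}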

# filter and format the data
truth_data <- flu_data |>
  dplyr::select(
    week_ending_date = date,
    location,
    location_name,
    value
  ) |>
  # exclude the 2024-04-27 through 2024-10-31 window (off-season reporting gap)
  dplyr::filter(
    !(week_ending_date >= as.Date("2024-04-27") &
        week_ending_date <= as.Date("2024-10-31"))
  )

# output folder and file paths for Truth Data
output_folder_path <- fs::path(base_hub_path, "weekly-summaries", reference_date)
output_filename <- paste0(reference_date, "-flu_target_hospital_admissions_data.csv")
output_filepath <- fs::path(output_folder_path, output_filename)

# determine if the output folder exists, create it if not
fs::dir_create(output_folder_path)
message("Directory is ready: ", output_folder_path)

# check if the file exists; if not, save to csv, else throw an error
if (!fs::file_exists(output_filepath)) {
  readr::write_csv(truth_data, output_filepath)
  message("File saved as: ", output_filepath)
} else {
  stop("File already exists: ", output_filepath)
}