From 9374054dad440c1a430be9907a2e4140580b2fc6 Mon Sep 17 00:00:00 2001 From: Andrew Pulsipher <45372570+apulsipher@users.noreply.github.com> Date: Thu, 16 Jan 2025 12:26:13 -0700 Subject: [PATCH] Make forecasts customizable (#54) * Start migrating code to separate R files * Refactor first part of report (getting data) * Refactor get_season_starts * Refactor up until end of LFMCMC * Refactor rest of index.qmd * Finish refactor, compartmentalizing code into separate files * Minor edits to report * Document functions in forecast.pkg.R and plot-forecast.R * Document functions in covid-forecast.R --- covid-forecast.R | 362 +++++++++++++++++++++++++++++++++++++++++++ forecast-pkg.R | 107 +++++++++++++ index.qmd | 390 ++--------------------------------------------- plot-forecast.R | 168 ++++++++++++++++++++ 4 files changed, 650 insertions(+), 377 deletions(-) create mode 100644 covid-forecast.R create mode 100644 forecast-pkg.R create mode 100644 plot-forecast.R diff --git a/covid-forecast.R b/covid-forecast.R new file mode 100644 index 0000000..aef672a --- /dev/null +++ b/covid-forecast.R @@ -0,0 +1,362 @@ +## ---------------------------------- +## GENERAL FUNCTIONS +## ---------------------------------- +source("forecast-pkg.R") +source("plot-forecast.R") +library(tidyr) + +## ---------------------------------- +## HELPER FUNCTIONS +## ---------------------------------- + +#' Get COVID-19 case counts from UDHHS +#' +#' @description +#' `get_covid_data` downloads the COVID-19 dataset from UDHHS. +#' +#' @details +#' This function downloads the COVID-19 daily case count dataset +#' from UDHHS, returning the data from the last N days. +#' +#' @param n_days Integer value. +#' +#' @returns The COVID-19 case count data. +get_covid_data <- function(n_days) { + # Download the Trends data from Utah DHSS + # - URL for UDHHS COVID-19 data (returns a zip archive with multiple files) + data_url <- "https://coronavirus-dashboard.utah.gov/Utah_COVID19_data.zip" + # - Target file from the above zip archive + target_file_regex <- "Trends_Epidemic+" + # - Perform data extraction + covid_data <- get_data_from_url(data_url, target_file_regex) + # - Format date column properly + covid_data$Date <- as.Date(covid_data$Date) + + # Extract last n_days of data + last_date <- max(covid_data$Date) + covid_data <- covid_data[covid_data$Date > (last_date - n_days), ] + return(covid_data) +} + +## ---------------------------------- +## LFMCMC FUNCTIONS +## ---------------------------------- + +#' Define the LFMCMC summary function +#' +#' @description +#' `lfmcmc_summary_fun` defines the summary function for LFMCMC. +#' +#' @details +#' In LFMCMC, the summary function computes summary statistics which +#' are used to compare the simulation function output with the observed +#' data. It must perform the same with the observed dataset +#' as it does with the results of the LFMCMC simulation function. +#' This function takes a vector of case counts and computes +#' `time_to_peak` (how many days to the highest case count), +#' `size_of_peak`, as well as the mean and standard deviation. +#' +#' @param case_counts Vector of integer case counts. +#' +#' @returns A vector of summary statistics. +lfmcmc_summary_fun <- function(case_counts) { + + time_to_peak <- which.max(case_counts) + size_of_peak <- case_counts[time_to_peak] + mean_cases <- mean(case_counts) + sd_cases <- sd(case_counts) + + c( + time_to_peak, + size_of_peak, + mean_cases, + sd_cases + ) +} + +#' Define the LFMCMC proposal function +#' +#' @description +#' `lfmcmc_proposal_fun` defines the proposal function for LFMCMC. +#' +#' @details +#' In LFMCMC, the proposal function generates new model parameters +#' to be used in the next run of the simulation function. +#' This function takes a vector of parameters (the set used in +#' the previous run) and takes a random step from those values +#' to compute new ("proposed") values. +#' +#' @param params_prev Vector of numeric parameter values. +#' +#' @returns A vector of proposed parameter values. +lfmcmc_proposal_fun <- function(params_prev) { + # Propose new model parameters + params_1_to_5 <- plogis( + qlogis(params_prev[1:5]) + + rnorm(length(params_prev[1:5]), mean = 0, sd = 0.025) + ) + + params_6_to_7 <- params_prev[6:7] + + rnorm(2, mean = 0, sd = 0.025) + + + # Reflect contact rates + if (params_6_to_7[1] < 0) { + params_6_to_7[1] <- params_prev[6] - + (params_6_to_7[1] - params_prev[6]) + } + if (params_6_to_7[2] < 0) { + params_6_to_7[2] <- params_prev[7] - + (params_6_to_7[2] - params_prev[7]) + } + + # Return proposed parameters + c(params_1_to_5, params_6_to_7) +} + +#' Define the LFMCMC kernel function +#' +#' @description +#' `lfmcmc_kernel_fun` defines the kernel function for LFMCMC. +#' +#' @details +#' In LFMCMC, the kernel function weighs the summary statistics from +#' the latest run of the simulation function against the summary +#' statistics of the observed data. +#' +#' @param simulated_stats Vector of numeric summary stats for simulated data. +#' @param observed_stats Vector of numeric summary stats for observed data. +#' @param epsilon Numeric epsilon value. +#' +#' @returns A numeric kernel score. +lfmcmc_kernel_fun <- function(simulated_stats, observed_stats, epsilon) { + diff <- ((simulated_stats - observed_stats)^2)^epsilon + dnorm(sqrt(sum(diff))) +} + +#' Get a sample of accepted parameters from LFMCMC +#' +#' @description +#' `get_params_sample` gets a sample of accepted parameters from the +#' LFMCMC run after the burn-in period. +#' +#' @param lfmcmc_obj Object of class [LFMCMC]. +#' @param total_samples Integer total samples from the LFMCMC object. +#' @param burnin Integer burn-in period for the LFMCMC object. +#' @param sample_size Integer number of samples to take. +#' +#' @returns An array of numeric vectors where each +#' vector represents a single sample set of parameters. +get_params_sample <- function(lfmcmc_obj, total_samples, burnin, sample_size) { + accepted_params <- get_accepted_params(lfmcmc_obj) + burnin_sample <- tail(accepted_params, n = (total_samples - burnin)) + params_sample <- burnin_sample[sample(nrow(burnin_sample), sample_size), ] + return(params_sample) +} + +## ---------------------------------- +## Setup Forecast +## ---------------------------------- +# Define forecast parameters +n_days <- 90 # Calibrate model last 90 days of data + +## ---------------------------------- +## Get Data +## ---------------------------------- + +# Get COVID-19 data +covid_data <- get_covid_data(n_days) +# Compute start date for each season +seasons <- get_season_starts(covid_data$Date) +# Get observed case counts +covid_cases <- covid_data$Daily.Cases + +## ---------------------------------- +## Create SIR CONN Model +## ---------------------------------- + +# Define SIRCONN model parameters +model_seed <- 112 # Random seed +model_ndays <- n_days # How many days to run the model +model_n <- 10000 # model population size + +# Define initial disease parameters +init_prevalence <- covid_cases[1] / model_n +init_contact_rate <- 10 +init_transmission_rate <- 0.05 +init_recovery_rate <- 1 / 7 + +# Create the SIRCONN model +covid_sirconn_model <- ModelSIRCONN( + name = "COVID-19", + n = model_n, + prevalence = init_prevalence, + contact_rate = init_contact_rate, + transmission_rate = init_transmission_rate, + recovery_rate = init_recovery_rate +) + +## ---------------------------------- +## Setup LFMCMC Calibration +## ---------------------------------- + +# Define LFMCMC parameters +lfmcmc_n_samples <- 6000 # number of LFMCMC iterations +lfmcmc_burnin <- 2000 # burn-in period +lfmcmc_epsilon <- 0.25 + +init_lfmcmc_params <- c( + 1 / 7, # r_rate + 0.05, # t_rate_spring + 0.04, # t_rate_summer + 0.06, # t_rate_fall + 0.07, # t_rate_winter + 10, # c_rate_weekday + 2 # c_rate_weekend +) +param_names <- c( + "Recovery rate", + "Transmission rate (spring)", + "Transmission rate (summer)", + "Transmission rate (fall)", + "Transmission rate (winter)", + "Contact rate (weekday)", + "Contact rate (weekend)" +) + +stats_names <- c( + "Time to peak", + "Size of peak", + "Mean (cases)", + "Standard deviation (cases)" +) + +#' Define the LFMCMC simulation function +#' +#' @description +#' `lfmcmc_simulation_fun` defines the simulation function for LFMCMC. +#' +#' @details +#' In LFMCMC, the simulation function run the model with the proposed +#' parameters and returns a simulated dataset that looks like the +#' observed dataset. This simulation function runs the SIR CONN +#' model created in earlier steps with the given model parameters, +#' adjusting contact rates for weekday vs weekend and adjusting +#' transmission rates based on the season. It then returns a set +#' of COVID-19 case counts for the same period as the observed case +#' counts. +#' +#' @param params Vector of numeric model parameters. +#' +#' @returns A simulated set of COVID-19 case counts. +lfmcmc_simulation_fun <- function(params) { + # Extract parameters + r_rate <- params[1] + t_rate_spring <- params[2] + t_rate_summer <- params[3] + t_rate_fall <- params[4] + t_rate_winter <- params[5] + c_rate_weekday <- params[6] + c_rate_weekend <- params[7] + + # Set recovery rate + set_param(covid_sirconn_model, "Recovery rate", r_rate) + + # Global event to change contact and transmission rates + change_c_and_t_rates <- function(model) { + # Get the current model day (step) + current_model_day <- today(model) + + ## Update contact rate based on weekday/weekend + if (any(c(6, 0) %in% (current_model_day %% 7L))) { + set_param(model, "Contact rate", c_rate_weekend) + } else { + set_param(model, "Contact rate", c_rate_weekday) + } + + ## Update transmission rate each season + if (current_model_day == seasons[["spring"]]) { + set_param(model, "Transmission rate", t_rate_spring) + } else if (current_model_day == seasons[["summer"]]) { + set_param(model, "Transmission rate", t_rate_summer) + } else if (current_model_day == seasons[["fall"]]) { + set_param(model, "Transmission rate", t_rate_fall) + } else if (current_model_day == seasons[["winter"]]) { + set_param(model, "Transmission rate", t_rate_winter) + } + + invisible(model) + } + + # Add global event to the model + change_c_and_t_event_name <- "Change Contact and Transmission Rates" + globalevent_fun(change_c_and_t_rates, name = change_c_and_t_event_name) |> + add_globalevent(model = covid_sirconn_model) + + # Run the model + verbose_off(covid_sirconn_model) + run(covid_sirconn_model, ndays = model_ndays) + + # Remove global event (new event set each simulation run) + rm_globalevent(covid_sirconn_model, change_c_and_t_event_name) + + # Get infected cases + hist_matrix <- get_hist_transition_matrix(covid_sirconn_model) + # - Drop the last day of data because the model returns data for (model_ndays + 1) days + infected_cases <- head( + hist_matrix[ + hist_matrix$state_to == "Infected" & + hist_matrix$state_from == "Susceptible", + c("counts")], + -1 + ) + + return(as.double(infected_cases)) +} + +# Create the LFMCMC object +calibration_lfmcmc <- LFMCMC(covid_sirconn_model) |> + set_simulation_fun(lfmcmc_simulation_fun) |> + set_summary_fun(lfmcmc_summary_fun) |> + set_proposal_fun(lfmcmc_proposal_fun) |> + set_kernel_fun(lfmcmc_kernel_fun) |> + set_observed_data(covid_cases) + +# Run LFMCMC calibration +run_lfmcmc( + lfmcmc = calibration_lfmcmc, + params_init_ = init_lfmcmc_params, + n_samples_ = lfmcmc_n_samples, + epsilon_ = lfmcmc_epsilon, + seed = model_seed +) +set_params_names(calibration_lfmcmc, param_names) +set_stats_names(calibration_lfmcmc, stats_names) + +## ---------------------------------- +## RUN FORECAST +## ---------------------------------- +# Create a new SIR CONN model +# - Compute prevalance based on most recent day +forecast_prevalence <- covid_cases[90] / model_n +# - Init the model +covid_sirconn_model <- ModelSIRCONN( + name = "COVID-19", + n = model_n, + prevalence = forecast_prevalence, + contact_rate = init_contact_rate, + transmission_rate = init_transmission_rate, + recovery_rate = init_recovery_rate +) + +# Run the simulation for each set of params in the sample +# - Select sample of accepted params from LFMCMC +forecast_sample_n <- 200 # Sample size +params_sample <- get_params_sample(calibration_lfmcmc, + lfmcmc_n_samples, + lfmcmc_burnin, + forecast_sample_n) +# - Set forecast length +model_ndays <- 14 +# - Run simulation function for each set of params from the sample +forecast_dist <- apply(params_sample, 1, lfmcmc_simulation_fun) diff --git a/forecast-pkg.R b/forecast-pkg.R new file mode 100644 index 0000000..241084b --- /dev/null +++ b/forecast-pkg.R @@ -0,0 +1,107 @@ +# Needed libraries +library(epiworldR) + +#' Download data from a URL +#' +#' @description +#' `get_data_from_url` downloads a CSV file from a URL and returns the file's data. +#' +#' @details +#' Downloads a zip archive from the given 'data_url', +#' selects the CSV file matching 'target_file' from the zip archive, +#' extracts the data from the target CSV file, and returns the data. +#' +#' @param data_url String URL pointing to the data file (zip archive). +#' @param target_file String filename of a CSV file in the zip archive. +#' +#' @returns The contents of the CSV data in `target_file` using `read.csv()` +get_data_from_url <- function(data_url, target_file) { + tryCatch({ + # Download the zipped data + temp_file <- tempfile() + download.file(data_url, temp_file) + # Select the appropriate data file + datafile_name <- grep( + target_file, + unzip(temp_file, list = TRUE)$Name, + ignore.case = TRUE, + value = TRUE + ) + datafile <- unz(temp_file, datafile_name) + # Extract the data + collected_data <- read.csv(datafile) + return(collected_data) + }, + error = function(cond) { + message("Error: ", conditionMessage(cond)) + return(-1) + }, + warning = function(cond) { + message("Warning: ", conditionMessage(cond)) + return(-2) + }, + finally = { + # Cleanup + unlink(temp_file) + } + ) +} + +#' Get the season of a given date +#' +#' @description +#' `get_date_season` returns the season (spring, summer, fall, winter) for the given date. +#' +#' @param date Date object. +#' +#' @returns A string representing the season for the given date: +#' "spring", "summer", "fall", or "winter" +get_date_season <- function(date) { + date_month <- as.integer(format(as.Date(date, format = "%d/%m/%Y"), "%m")) + + if (date_month >= 3 && date_month <= 5) { + return("spring") + } else if (date_month >= 6 && date_month <= 8) { + return("summer") + } else if (date_month >= 9 && date_month <= 11) { + return("fall") + } else { + return("winter") + } +} + +#' Get start date of each season from a list of dates +#' +#' @description +#' `get_season_starts` returns the start of each season (spring, summer, +#' fall, winter) from a list of dates +#' +#' @details +#' This function takes a list of Date objects in chronological order and uses +#' the [get_date_season] function assign a season to each date in the list. +#' The function then finds the position of the first instance of each season +#' in the list, returning a dictionary of four integer values corresponding to +#' the start of each season. The dictionary values are accessed using the +#' lowercase string name of the season ("spring", "summer", "fall", +#' or "winter"). If no match was found, the value of the season start +#' will be '-1'. +#' +#' @param dates A list of Date objects in chronological order. +#' +#' @returns A dictionary containing the starting positiong of +#' each season, accessed with the lowercase string name of +#' the season ("spring", "summer", "fall", or "winter"). +get_season_starts <- function(dates) { + seasons <- mapply(get_date_season, dates) + season_names <- c("spring", "summer", "fall", "winter") + + spring_start <- match(season_names[1], seasons, nomatch = -1) + summer_start <- match(season_names[2], seasons, nomatch = -1) + fall_start <- match(season_names[3], seasons, nomatch = -1) + winter_start <- match(season_names[4], seasons, nomatch = -1) + + season_starts <- c(spring_start, summer_start, fall_start, winter_start) + names(season_starts) <- season_names + + return(season_starts) +} diff --git a/index.qmd b/index.qmd index a3acf9c..5f3da77 100644 --- a/index.qmd +++ b/index.qmd @@ -15,50 +15,16 @@ Our forecast is calibrated on case counts from the last 90 days. ```{r} #| label: get-data -#| cache: true -# Download the Trends data from Utah DHHS -source("get-forecast-data.R") -data_url <- "https://coronavirus-dashboard.utah.gov/Utah_COVID19_data.zip" -target_file_regex <- "Trends_Epidemic+" -forecast_data <- get_forecast_data(data_url, target_file_regex) -forecast_data$Date <- as.Date(forecast_data$Date) - -# Get data from last 90 days -last_date <- max(forecast_data$Date) -forecast_data <- forecast_data[forecast_data$Date > (last_date - 90), ] +# Load in needed files +source("covid-forecast.R") ``` ```{r} -#| label: plot-data +#| label: plot-observed-data #| fig-width: 7 #| fig-height: 3 #| fig-align: center -library(ggplot2) -ggplot(forecast_data, aes(x = Date, y = Daily.Cases)) + - geom_line() + - labs(x = "Date", y = "Daily Cases") + - theme_classic() - -# Identify the start date of each season (spring, summer, fall, winter) in data -get_date_season <- function(date) { - date_month <- as.integer(format(as.Date(date, format = "%d/%m/%Y"), "%m")) - - if (date_month >= 3 && date_month <= 5) { - return("spring") - } else if (date_month >= 6 && date_month <= 8) { - return("summer") - } else if (date_month >= 9 && date_month <= 11) { - return("fall") - } else { - return("winter") - } -} -forecast_seasons <- mapply(get_date_season, forecast_data$Date) - -spring_start <- match("spring", forecast_seasons, nomatch = -1) -summer_start <- match("summer", forecast_seasons, nomatch = -1) -fall_start <- match("fall", forecast_seasons, nomatch = -1) -winter_start <- match("winter", forecast_seasons, nomatch = -1) +plot_covid_data(covid_data) ``` ## Calibrating the Forecasting Model @@ -68,7 +34,7 @@ We use epiworldR's implementation of Likelihood-Free Markhov Chain Monte Carlo ( * Transmission rates for each season (spring, summer, fall, winter) * Contact rates for weekdays and weekends -The LFMCMC simulation runs for 3,000 iterations. +The LFMCMC simulation runs for 6,000 iterations. In each iteration, it does the following: * Proposes a new set of parameter values @@ -78,271 +44,18 @@ In each iteration, it does the following: We note that while the population of Utah over 3 million, we only need to run our model with 10,000 agents. This is because we don't expect more than 10,000 COVID-19 cases and the model automatically scales the contact rate to account for the difference between the model population and Utah's population. -```{r} -#| label: prep-model -library(epiworldR) - -# Extract cases from observed data -forecast_data_cases <- forecast_data$Daily.Cases - -# Set model parameters -model_seed <- 112 -model_ndays <- 90 -model_n <- 10000 # model population size -n_samples <- 6000 # number of LFMCMC iterations -n_burnin <- 2000 # burn-in period -lfmcmc_epsilon <- 0.25 - -init_prevalence <- forecast_data_cases[1] / model_n -init_contact_rate <- 10 -init_transmission_rate <- 0.05 - -init_lfmcmc_params <- c( - 1 / 7, # recovery_rate - 0.05, # t_rate_spring - 0.04, # t_rate_summer - 0.06, # t_rate_fall - 0.07, # t_rate_winter - 10, # contact_rate_weekday - 2 # contact_rate_weekend -) - -# Create the base SIRCONN model -ef_model <- ModelSIRCONN( - name = "COVID-19", - n = model_n, - prevalence = init_prevalence, - contact_rate = init_contact_rate, - transmission_rate = init_transmission_rate, - recovery_rate = init_lfmcmc_params[1] -) -``` - -```{r} -#| label: setup-lfmcmc -# Define the LFMCMC simulation function -lfmcmc_simulation_fun <- function(params) { - # Extract parameters - recovery_rate <- params[1] - t_rate_spring <- params[2] - t_rate_summer <- params[3] - t_rate_fall <- params[4] - t_rate_winter <- params[5] - contact_rate_weekday <- params[6] - contact_rate_weekend <- params[7] - - # Set recovery rate - set_param(ef_model, "Recovery rate", recovery_rate) - - # Global event to change contact and transmission rates - change_c_and_t_rates <- function(model) { - # Get the current model day (step) - current_model_day <- today(model) - - ## Update contact rate based on weekday/weekend - if (any(c(6, 0) %in% (current_model_day %% 7L))) { - set_param(model, "Contact rate", contact_rate_weekend) - } else { - set_param(model, "Contact rate", contact_rate_weekday) - } - - ## Update transmission rate each season - if (current_model_day == spring_start) { - set_param(model, "Transmission rate", t_rate_spring) - } else if (current_model_day == summer_start) { - set_param(model, "Transmission rate", t_rate_summer) - } else if (current_model_day == fall_start) { - set_param(model, "Transmission rate", t_rate_fall) - } else if (current_model_day == winter_start) { - set_param(model, "Transmission rate", t_rate_winter) - } - - invisible(model) - } - - # Add global event to the model - change_c_and_t_event_name <- "Change Contact and Transmission Rates" - globalevent_fun(change_c_and_t_rates, name = change_c_and_t_event_name) |> - add_globalevent(model = ef_model) - - # Run the model - verbose_off(ef_model) - run(ef_model, ndays = model_ndays) - - # Remove global event (new event set each simulation run) - rm_globalevent(ef_model, change_c_and_t_event_name) - - # Get infected cases - hist_matrix <- get_hist_transition_matrix(ef_model) - # - Drop the last day of data because the model returns data for (model_ndays + 1) days - infected_cases <- head( - hist_matrix[ - hist_matrix$state_to == "Infected" & - hist_matrix$state_from == "Susceptible", - c("counts")], - -1 - ) - - return(as.double(infected_cases)) -} - -# Define the LFMCMC summary function -lfmcmc_summary_fun <- function(case_counts) { - # Extract summary statistics from the data - time_to_peak <- which.max(case_counts) - size_of_peak <- case_counts[time_to_peak] - mean_cases <- mean(case_counts) - sd_cases <- sd(case_counts) - - c( - time_to_peak, - size_of_peak, - mean_cases, - sd_cases - ) -} - -# Define the LFMCMC summary function -lfmcmc_proposal_fun <- function(params_prev) { - # Propose new model parameters - params_1_to_5 <- plogis( - qlogis(params_prev[1:5]) + - rnorm(length(params_prev[1:5]), mean = 0, sd = 0.025) - ) - - params_6_to_7 <- params_prev[6:7] + - rnorm(2, mean = 0, sd = 0.025) - - - # Reflect contact rates - if (params_6_to_7[1] < 0) { - params_6_to_7[1] <- params_prev[6] - - (params_6_to_7[1] - params_prev[6]) - } - if (params_6_to_7[2] < 0) { - params_6_to_7[2] <- params_prev[7] - - (params_6_to_7[2] - params_prev[7]) - } - - # Return proposed parameters - c(params_1_to_5, params_6_to_7) -} - -# Define the LFMCMC kernel function -# - Weighs simulation results against observed data -lfmcmc_kernel_fun <- function(simulated_stats, observed_stats, epsilon) { - diff <- ((simulated_stats - observed_stats)^2)^epsilon - dnorm(sqrt(sum(diff))) -} - -# Create the LFMCMC model -lfmcmc_model <- LFMCMC(ef_model) |> - set_simulation_fun(lfmcmc_simulation_fun) |> - set_summary_fun(lfmcmc_summary_fun) |> - set_proposal_fun(lfmcmc_proposal_fun) |> - set_kernel_fun(lfmcmc_kernel_fun) |> - set_observed_data(forecast_data_cases) -``` - -```{r} -#| label: run-lfmcmc -run_lfmcmc( - lfmcmc = lfmcmc_model, - params_init_ = init_lfmcmc_params, - n_samples_ = n_samples, - epsilon_ = lfmcmc_epsilon, - seed = model_seed -) -``` - -When the simulation is finished, we use a burn-in period of n = 2000 (66% of simulation iterations). +When the simulation is finished, we use a burn-in period of n = 2,000 (33% of simulation iterations). The epiworldR results printout (below) shows the mean parameter/statistic value, the 95% credible interval (in [ ]), and the initial/observed value (in ()). ```{r} -#| label: print-results -# Print the results with 50% burnin -param_names <- c( - "Recovery rate", - "Transmission rate (spring)", - "Transmission rate (summer)", - "Transmission rate (fall)", - "Transmission rate (winter)", - "Contact rate (weekday)", - "Contact rate (weekend)" -) -set_params_names(lfmcmc_model, param_names) - -stats_names <- c( - "Time to peak", - "Size of peak", - "Mean (cases)", - "Standard deviation (cases)" -) -set_stats_names(lfmcmc_model, stats_names) - -print(lfmcmc_model, burnin = n_burnin) +#| label: print-lfmcmc-results +print(calibration_lfmcmc, burnin = lfmcmc_burnin) ``` -Here is the posterior distribution of the LFMCMC samples with vertical lines representing the initial parameter values. -```{r} -#| label: get-posterior-dist - -accepted_params <- get_accepted_params(lfmcmc_model) -accepted_params <- lapply(seq_along(param_names), \(i) { - data.frame( - step = seq_along(nrow(accepted_params)), - param = param_names[i], - value = accepted_params[, i] - ) -}) |> do.call(what = "rbind") - -# Select transmission rates to plot -t_params_used <- character() -t_values_used <- numeric() -if (spring_start >= 0) { - t_params_used <- c(t_params_used, param_names[2]) - t_values_used <- c(t_values_used, init_lfmcmc_params[2]) -} -if (summer_start >= 0) { - t_params_used <- c(t_params_used, param_names[3]) - t_values_used <- c(t_values_used, init_lfmcmc_params[3]) -} -if (fall_start >= 0) { - t_params_used <- c(t_params_used, param_names[4]) - t_values_used <- c(t_values_used, init_lfmcmc_params[4]) -} -if (winter_start >= 0) { - t_params_used <- c(t_params_used, param_names[5]) - t_values_used <- c(t_values_used, init_lfmcmc_params[5]) -} - -# Extract transmission, recovery, and contact rates -t_rates <- accepted_params[accepted_params$param == t_params_used, ] -r_rate <- accepted_params[accepted_params$param == param_names[1], ] -c_rates <- accepted_params[accepted_params$param == param_names[6:7], ] -t_r_c_rates <- rbind(r_rate, c_rates, t_rates) - -# Generate initial values for vertical lines in each plot -init_values <- c(init_lfmcmc_params[6], - init_lfmcmc_params[7], - init_lfmcmc_params[1], - t_values_used) -init_names <- c(param_names[6], - param_names[7], - param_names[1], - t_params_used) - -init_df <- data.frame(param = init_names, - value = init_values) +Here is the posterior distribution of the LFMCMC samples with vertical lines representing the initial parameter values. -# Plot parameter distributions -ggplot(t_r_c_rates, aes(x = value, - fill = param, - y = after_stat(scaled))) + - geom_density(alpha = .3) + - geom_vline(aes(xintercept = value, color = param), - data = init_df) + - facet_wrap(~param, scales = "free", ncol = 1) + - theme_light() +```{r} +#| label: plot-posterior-dist +plot_lfmcmc_post_dist(calibration_lfmcmc, init_lfmcmc_params, param_names, seasons) ``` ## Epiworld Forecast @@ -355,84 +68,7 @@ The forecast mean is shown below along with the 50% and 95% confidence intervals ```{r} #| label: run-forecast -library(tidyr) - -forecast_sample_n <- 200 - -# Run model for 2 weeks -model_ndays <- 14 - -# Compute prevalance based on most recent day -forecast_prevalence <- forecast_data_cases[90] / model_n - -# Select parameters from second quantile of accepted params -accepted_params <- get_accepted_params(lfmcmc_model) -burnin_sample <- tail(accepted_params, n = (n_samples - n_burnin)) -params_sample <- burnin_sample[sample(nrow(burnin_sample), forecast_sample_n), ] - -# Create the new model -ef_model <- ModelSIRCONN( - name = "COVID-19", - n = model_n, - prevalence = forecast_prevalence, - contact_rate = init_contact_rate, - transmission_rate = init_transmission_rate, - recovery_rate = init_lfmcmc_params[1] -) - -# Run the simulation for each set of params in the sample -forecast_dist <- apply(params_sample, 1, lfmcmc_simulation_fun) - -# Find 2.5%, 25%, 50%, 75%, and 97.5% quantiles -forecast_quantiles <- apply(forecast_dist, 1, quantile, probs = c(0.025, 0.25, 0.5, 0.75, 0.975)) - -# Combine observed data with sample median for plotting forecast -observed_df <- data.frame( - date = forecast_data$Date[60:90], - counts = forecast_data$Daily.Cases[60:90], - observed = TRUE, - lb_95 = NA, - ub_95 = NA, - lb_50 = NA, - ub_50 = NA -) -sample_df <- data.frame( - date = forecast_data$Date[90] + 0:13, - counts = forecast_quantiles["50%", ], - observed = FALSE, - lb_95 = forecast_quantiles["2.5%", ], - ub_95 = forecast_quantiles["97.5%", ], - lb_50 = forecast_quantiles["25%", ], - ub_50 = forecast_quantiles["75%", ] -) - -forecast_df <- rbind(observed_df, sample_df) -``` -```{r} -#| label: plot-forecast -# Use color-blind-friendly palette: -cbb_light_blue <- "#56B4E9" -cbb_palette <- c(cbb_light_blue, "black") - -ggplot(forecast_df, - aes(x = date)) + - geom_ribbon(aes(ymin = lb_95, ymax = ub_95), - fill = cbb_light_blue, - alpha = 0.4, - na.rm = TRUE) + - geom_ribbon(aes(ymin = lb_50, ymax = ub_50), - fill = cbb_light_blue, - alpha = 0.4, - na.rm = TRUE) + - geom_point(aes(y = counts, - color = observed)) + - geom_line(aes(y = counts, - color = observed)) + - labs(x = "Date", y = "Daily Cases") + - scale_colour_manual(values = cbb_palette, - labels = c("Forecasted Cases", "Observed Cases")) + - scale_y_continuous(n.breaks = 20) + - theme_bw() +plot_forecast(forecast_dist, covid_data) ``` ## Acknowledgements diff --git a/plot-forecast.R b/plot-forecast.R new file mode 100644 index 0000000..7d8ed33 --- /dev/null +++ b/plot-forecast.R @@ -0,0 +1,168 @@ +# Needed libraries +library(ggplot2) + +#' Plot observed COVID-19 data from UDHHS +#' +#' @description +#' `plot_covid_data` plots the COVID-19 dataset downloaded from UDHHS. +#' +#' @details +#' This function uses `ggplot` to plot the COVID-19 data +#' as a time-series with Date along the x-axis and +#' case counts along the y-axis. +#' +#' @param data The data to plot. +#' +#' @returns The plot of the data. +plot_covid_data <- function(data) { + ggplot(data, aes(x = Date, y = Daily.Cases)) + + geom_line() + + labs(x = "Date", y = "Daily Cases") + + theme_classic() +} + +#' Plot posterior distribution of model parameters +#' +#' @description +#' `plot_lfmcmc_post_dist` plots posterior distribution of the +#' model parameters accepted during calibration. +#' +#' @details +#' This function plots the distribution of accepted parameters +#' from the calibration run of LFMCMC. It also plots a +#' vertical line representing the initial parameter value. +#' +#' @param lfmcmc_object Object of class [LFMCMC] which performed the simulation. +#' @param init_params Vector of the initial parameter values. +#' @param param_names The string names of the parameters. +#' @param seasons The dictionary of season start positions. +#' +#' @returns The plot of the posterior distribution of model parameters. +plot_lfmcmc_post_dist <- function(lfmcmc_object, init_params, param_names, seasons) { + + accepted_params <- get_accepted_params(lfmcmc_object) + accepted_params <- lapply(seq_along(param_names), \(i) { + data.frame( + step = seq_len(nrow(accepted_params)), + param = param_names[i], + value = accepted_params[, i] + ) + }) |> do.call(what = "rbind") + + # Select transmission rates to plot + t_params_used <- character() + t_values_used <- numeric() + if (seasons[["spring"]] >= 0) { + t_params_used <- c(t_params_used, param_names[2]) + t_values_used <- c(t_values_used, init_params[2]) + } + if (seasons[["summer"]] >= 0) { + t_params_used <- c(t_params_used, param_names[3]) + t_values_used <- c(t_values_used, init_params[3]) + } + if (seasons[["fall"]] >= 0) { + t_params_used <- c(t_params_used, param_names[4]) + t_values_used <- c(t_values_used, init_params[4]) + } + if (seasons[["winter"]] >= 0) { + t_params_used <- c(t_params_used, param_names[5]) + t_values_used <- c(t_values_used, init_params[5]) + } + + # Extract transmission, recovery, and contact rates + t_rates <- accepted_params[accepted_params$param == t_params_used, ] + r_rate <- accepted_params[accepted_params$param == param_names[1], ] + c_rates <- accepted_params[accepted_params$param == param_names[6:7], ] + t_r_c_rates <- rbind(r_rate, c_rates, t_rates) + + # Generate initial values for vertical lines in each plot + init_values <- c(init_params[6], + init_params[7], + init_params[1], + t_values_used) + + init_names <- c(param_names[6], + param_names[7], + param_names[1], + t_params_used) + + init_df <- data.frame(param = init_names, + value = init_values) + + # Plot parameter distributions + ggplot(t_r_c_rates, aes(x = value, + fill = param, + y = after_stat(scaled))) + + geom_density(alpha = .3) + + geom_vline(aes(xintercept = value, color = param), + data = init_df) + + facet_wrap(~param, scales = "free", ncol = 1) + + theme_light() + +} + +#' Plot forecasted COVID-19 case counts +#' +#' @description +#' `plot_forecast` plots the forecasted COVID case counts with 50% and 95% confidence intervals. +#' +#' @details +#' This function plots the observed COVID-19 case count data from +#' the last 30 days (in black), appending to the end of this data the +#' forecasted case counts (in blue). It also plots the 50% and 95% +#' confidence intervals for the forecasted case counts. +#' +#' @param forecast_dist An array of forecasted case counts. +#' @param covid_data A data frame of observed COVID-19 case counts. +#' +#' @returns The plot of the forecasted COVID-19 case counts. +plot_forecast <- function(forecast_dist, covid_data) { + # Find 2.5%, 25%, 50%, 75%, and 97.5% quantiles + forecast_quantiles <- apply(forecast_dist, 1, quantile, probs = c(0.025, 0.25, 0.5, 0.75, 0.975)) + + # Combine observed data with sample median for plotting forecast + observed_df <- data.frame( + date = covid_data$Date[60:90], + counts = covid_data$Daily.Cases[60:90], + observed = TRUE, + lb_95 = NA, + ub_95 = NA, + lb_50 = NA, + ub_50 = NA + ) + sample_df <- data.frame( + date = covid_data$Date[90] + 0:13, + counts = forecast_quantiles["50%", ], + observed = FALSE, + lb_95 = forecast_quantiles["2.5%", ], + ub_95 = forecast_quantiles["97.5%", ], + lb_50 = forecast_quantiles["25%", ], + ub_50 = forecast_quantiles["75%", ] + ) + + forecast_df <- rbind(observed_df, sample_df) + + # Use color-blind-friendly palette: + cbb_light_blue <- "#56B4E9" + cbb_palette <- c(cbb_light_blue, "black") + + ggplot(forecast_df, + aes(x = date)) + + geom_ribbon(aes(ymin = lb_95, ymax = ub_95), + fill = cbb_light_blue, + alpha = 0.4, + na.rm = TRUE) + + geom_ribbon(aes(ymin = lb_50, ymax = ub_50), + fill = cbb_light_blue, + alpha = 0.4, + na.rm = TRUE) + + geom_point(aes(y = counts, + color = observed)) + + geom_line(aes(y = counts, + color = observed)) + + labs(x = "Date", y = "Daily Cases") + + scale_colour_manual(values = cbb_palette, + labels = c("Forecasted Cases", "Observed Cases")) + + scale_y_continuous(n.breaks = 20) + + theme_bw() +}