Merge pull request #17 from aliceni7/main

added new sites and functions
EcoForecast · May 8, 2024 · 51af710 · 51af710
2 parents 39cbd1c + f1a18d4
commit 51af710
Show file tree

Hide file tree

Showing 5 changed files with 179 additions and 132 deletions.
diff --git a/05_historic_forecast.R b/05_historic_forecast.R
@@ -3,27 +3,30 @@ library(lubridate)
 library(tidyr)
 library(ecoforecastR)
 
+sites = c("HARV", "RMNP", "DSNY", "SRER", "HEAL")
+# deciduous broadleaf, evergreen needleleaf, grassland, shrubland, tundra 
+
 # Function for downloading historic data
 
-download_historic_data <- function(){
+download_historic_data <- function(sites){
   URL = "https://data.ecoforecast.org/neon4cast-targets/phenology/phenology-targets.csv.gz"
 
   # Get gcc data
   data = readr::read_csv(URL, col_names = c("Date", "Siteid", "Variable", "Observation"))
-  harv = na.omit(data[data$Siteid == "HARV",]) # filter by Harvard Forest site
-  gcc_90 = harv[harv$Variable == "gcc_90",]
+  data.s = na.omit(data[data$Siteid %in% sites,]) # filter by sites
+  gcc_90 = data.s[data.s$Variable == "gcc_90",]
 
   # Get historic temperature data
   weather_stage3_historic <- neon4cast::noaa_stage3()
   ds_historic <- weather_stage3_historic |> 
-    dplyr::filter(site_id == "HARV") |>
+    dplyr::filter(site_id %in% sites) |>
     dplyr::collect()
 
-  start_date <- Sys.Date() - lubridate::days(741) # hindcasting 2 years
+  start_date <- Sys.Date() - lubridate::days(1096) # hindcasting 3 years
   end_date <- Sys.Date() - 1
 
   # currently not using precip, but it works 
-  TMP_historic = ds_historic[ds_historic$variable == c("air_temperature", "precipitation_flux"),]
+  TMP_historic = ds_historic[ds_historic$variable == c("air_temperature"),]#, "precipitation_flux"),]
 
   data_historic <- TMP_historic %>% 
     mutate(datetime = lubridate::as_date(datetime)) %>%
@@ -35,58 +38,68 @@ download_historic_data <- function(){
   data_historic <- as.data.frame(data_historic) # save temperature data as dataframe
   data_historic$date <- date(data_historic$datetime)
   data_historic$date <- as.character(data_historic$date)
+
   # Get maximum temperature within each ensemble per day: 31 ensembles x 36 days
   temp_max_h <- data_historic %>% 
-    group_by(date, variable) %>%
+    group_by(date, variable, site_id) %>%
     summarize(prediction = max(prediction)) 
-  
+
   # For multiple covariates
   vars <- pivot_wider(temp_max_h, names_from = variable, values_from = prediction)
   data1 <- gcc_90[which(gcc_90$Date == temp_max_h$date[1]):nrow(gcc_90), ]
   covariates <- vars[vars$date %in% data1$Date, ]
+  covariates <- covariates[order(covariates$site_id),]
+  covariates <- covariates[covariates$date %in% data1$Date, ]
+  colnames(covariates) <- c("Date", "Siteid", "air_temperature")
+  cov <- covariates
+
+  data1 <- right_join(data1, select(cov, c("Date" = "Date","Siteid" = "Siteid"))) # subset by available temperature data
 
   # Add temperature as column in data dataframe
   data1$Temperature <- covariates$air_temperature - 273.15 # convert to celsius
   #data1$PcpFlux <- covariates$precipitation_flux
   data1$Observation <- as.numeric(data1$Observation)
+  data1 <- drop_na(data1)
 
   return(data1)
 }
 
 # Function for running a DLM hindcast
 
-historic_forecast <- function(data, observation, temperature) {
-
-  # Setup priors and observation and covariates for DLM
-  data2 <- list(x_ic = mean(observation, na.rm = T),
-                tau_ic = 1/sd(observation, na.rm = T),
-                a_obs = 1,
-                r_obs = 1,
-                a_add = 1,
-                r_add = 1,
-                n = length(nrow(data)),
-                Observation = observation,
-                Temperature = temperature)
-
-  # Run the DLM
-  gcc.out <- ecoforecastR::fit_dlm(model=list(obs="Observation", fixed="~ 1 + X + Temperature"), data2)
-
-  return(gcc.out)
+historic_forecast <- function(data, sites) {
+  models <- list()
+  for(i in 1:length(sites)) { # run DLM per site
+    dat <- data[data$Siteid == sites[i],]
+    data2 <- list(x_ic = mean(dat$Observation, na.rm = T),
+                  tau_ic = 1/sd(dat$Observation, na.rm = T),
+                  a_obs = 1,
+                  r_obs = 1,
+                  a_add = 1,
+                  r_add = 1,
+                  n = length(nrow(dat)),
+                  Observation = dat$Observation,
+                  Temperature = dat$Temperature)
+
+    # Run the DLM
+    gcc.out <- ecoforecastR::fit_dlm(model=list(obs="Observation", fixed="~ 1 + X + Temperature"), data2)
+    models[[sites[i]]] <- gcc.out
+  }
+  return(models)
 }
 
 # Conducting a historic forecast
-h_data <- download_historic_data()
-obs_HARV <- h_data$Observation
-temp_HARV <- h_data$Temperature
-out_HARV <- historic_forecast(h_data, obs_HARV, temp_HARV)
+h_data <- download_historic_data(sites)
+out_sites <- historic_forecast(h_data, sites)
 
 # Print the model
-#strsplit(out_HARV$model,"\n",fixed = TRUE)[[1]]
+strsplit(out_sites$HARV$model,"\n",fixed = TRUE)[[1]]
+
+# Plot the historic forecast: plotting rocky mountains site, change site name to get other sites
+site_name = "HEAL"
+plot_out <- as.matrix(out_sites[[site_name]]$predict) # get model predictions
+ci_out <- apply(plot_out, 2, quantile, c(0.025,0.5,0.975)) # get model confidence intervals
+plot(1:nrow(h_data[h_data$Siteid == site_name,]), ci_out[2,], type='n', ylab="GCC_90", xlab="Time") # create empty plot with proper axes
+ecoforecastR::ciEnvelope(1:nrow(h_data[h_data$Siteid == site_name,]),ci_out[1,],ci_out[3,],col=ecoforecastR::col.alpha("lightBlue",0.75)) # plot model CI
+points(1:nrow(h_data[h_data$Siteid == site_name,]), h_data[h_data$Siteid == site_name,]$Observation, pch="+",cex=0.5) # plot gcc_90 original points
+points(1:nrow(h_data[h_data$Siteid == site_name,]), ci_out[2,], pch=19, cex=0.2) # plot model 50% CI
 
-# Plot the historic forecast
-#plot_out <- as.matrix(out_HARV$predict) # get model predictions
-#ci_out <- apply(plot_out, 2, quantile, c(0.025,0.5,0.975)) # get model confidence intervals
-#plot(1:736, ci_out[2,], type='n', ylab="GCC_90", xlab="Time") # create empty plot with proper axes
-#ecoforecastR::ciEnvelope(1:736,ci_out[1,],ci_out[3,],col=ecoforecastR::col.alpha("lightBlue",0.75)) # plot model CI
-#points(1:736, out_HARV$Observation, pch="+",cex=0.5) # plot gcc_90 original points
-#points(1:736, ci_out[2,], pch=19) # plot model 50% CI
diff --git a/06_forecast.R b/06_forecast.R
@@ -3,9 +3,9 @@ source("05_historic_forecast.R")
 # Function for grabbing future forecast data based on the current date
 
 download_future_met <- function() {
-  weather_stage2 <- neon4cast::noaa_stage2(start_date = as.character(Sys.Date() - 1))
+  weather_stage2 <- neon4cast::noaa_stage2(start_date = as.character(Sys.Date()-2))
   ds1 <- weather_stage2 |> 
-    dplyr::filter(site_id == "HARV") |>
+    dplyr::filter(site_id %in% sites) |>
     dplyr::collect()
 
   TMP = ds1[ds1$variable == "air_temperature",]
@@ -17,29 +17,9 @@ download_future_met <- function() {
     select(datetime, site_id, variable, prediction, parameter)
 
   tmps <- as.data.frame(TMP) # save temperature data as dataframe
-
-  # Get maximum temperature within each ensemble per day: 31 ensembles x 36 days
-  #temp_max <- tmps %>% 
-  #  group_by(datetime, parameter) %>%
-  #  summarize(temp.max = max(prediction) - 273.15) # convert to celsius
-
-  # Get mean of maximum temperatures across ensembles
-  #temp_max <- temp_max %>%
-  #  group_by(datetime) %>% 
-  #  summarize(temp.mean = mean(temp.max))
-
-  #temps <- as.data.frame(temp_max)
-
-  # Define these outside the function to have them accessible to be used in forecast
-  # Get dataframe: columns = ensemble number, rows = prediction by time step
-  #temps_ensemble <- as.data.frame(pivot_wider(tmps, names_from = parameter, values_from = prediction)[,4:34] - 273.15)
-  # Get one maximum value for daily prediction
-  #temps.max <- matrix(apply(temps_ensemble,1,max),1, 36)
-
   return(tmps)
 }
 
-
 # Forecasting Function
 
 ##` @param IC    Initial Conditions
@@ -59,43 +39,46 @@ forecast <- function(IC, temp, betaTemp, betaX, betaI, Q, n){
   return(N)
 }
 
-# Making one deterministic run -- for submission?
-
-# Get future forecast temperature data
-temps_data <- download_future_met()
-c_temps <- temps_data |> 
-  select(prediction, parameter)
 
-#temps_ensemble <- as.data.frame(pivot_wider(c_temps, 
-#                                            names_from = parameter, 
-#                                            values_from = prediction))#[,1] - 273.15)
-temps_ensemble <- as.data.frame(pivot_wider(temps_data, 
-                                            names_from = parameter, 
-                                            values_from = prediction)[,4:34] - 273.15)
-temps_rot <- t(temps_ensemble)[,1:30]
-temps <- matrix(apply(temps_rot,2,mean), 1, 30) 
+# Making forecast
+forecast_sites <- function(f_temps, models, sites){
+  forecasts <- list()
+  for (i in 1:length(sites)) {
+    temps_ensemble <- as.data.frame(pivot_wider(f_temps[f_temps$site_id == sites[1],], 
+                                                names_from = parameter, 
+                                                values_from = prediction)[,4:34] - 273.15)
+    temps_rot <- t(temps_ensemble)[,1:30]
+    temps <- matrix(apply(temps_rot,2,mean), 1, 30) 
+    gcc.out <- models[[sites[1]]] # going through each sites models for the params
+    params <- as.matrix(gcc.out$params) # get model parameters
+    param.mean <- apply(params, 2, mean)
+    predicts <- as.matrix(gcc.out$predict) # get model predictions
+    data <- gcc.out$data
+
+    ci <- apply(predicts, 2, quantile, c(0.025,0.5,0.975))
+    prow <- sample.int(nrow(params), Nmc, replace=TRUE)
+    drow = sample.int(nrow(temps_rot), Nmc, replace=TRUE)
+    Qmc <- 1 / sqrt(params[prow,"tau_add"])  ## convert from precision to standard deviation
+
+    IC <- predicts
+    pheno_forecast <- forecast(IC[prow, ncol(predicts)],
+                               temp = temps_rot[drow,],
+                               betaTemp = params[prow, "betaTemperature"],
+                               betaX = params[prow, "betaX"],
+                               betaI = params[prow, "betaIntercept"],
+                               Q = Qmc,
+                               n = Nmc)
+    forecasts[[sites[i]]] <- pheno_forecast 
+  }
+  return(forecasts)
+}
 
 Nmc = 1000 # number of Monte Carlo draws
 NT = 30 # number of time steps into the future
-gcc.out <- out_HARV
-params <- as.matrix(gcc.out$params) # get model parameters
-param.mean <- apply(params, 2, mean)
-predicts <- as.matrix(gcc.out$predict) # get model predictions
-data <- gcc.out$data
 
-ci <- apply(predicts, 2, quantile, c(0.025,0.5,0.975))
-prow <- sample.int(nrow(params), Nmc, replace=TRUE)
-drow = sample.int(nrow(temps_rot), Nmc, replace=TRUE)
-Qmc <- 1 / sqrt(params[prow,"tau_add"])  ## convert from precision to standard deviation
+future_temps <- download_future_met()
+forecasts <- forecast_sites(future_temps, out_sites, sites)
 
-IC <- predicts
-pheno_forecast <- forecast(IC[prow, ncol(predicts)],
-                           temp = temps_rot[drow,],
-                           betaTemp = params[prow, "betaTemperature"],
-                           betaX = params[prow, "betaX"],
-                           betaI = params[prow, "betaIntercept"],
-                           Q = Qmc,
-                           n = Nmc)
 
 
 

diff --git a/06_submit_forecast.R b/06_submit_forecast.R
@@ -5,39 +5,59 @@ library(ecoforecastR)
 source("06_forecast.R")
 
 # Reformat the forecast so it fits the standard
-fcast <- as.data.frame(pheno_forecast)
-fcast <- as.data.frame(t(pheno_forecast))
-colnames(fcast) <- seq(1:1000)
 start_date <- Sys.Date() + 1
 end_date <- Sys.Date() + 30
-fcast$datetime <- seq(start_date, end_date, by = "+1 day")
-fcast$reference_datetime <- seq(start_date-1, end_date-1, by = "+1 day")
-n_fcast <- pivot_longer(fcast, cols=seq(1:1000))
-colnames(n_fcast) <- c("datetime", "reference_datetime", "parameter", "prediction")
-n_fcast$model_id <- "ChlorophyllCrusaders" # perhaps change?
-n_fcast$site_id <- "HARV" # change if we add more sites
-n_fcast$variable <- "gcc_90" 
-n_fcast$family <- "ensemble" # there are 1000
-n_fcast$duration <- "P1D" # daily forecasts
-n_fcast$project_id <- "neon4cast"
+sub_fcast <- data.frame()
+for(i in 1:length(sites)){
+  #fcast <- as.data.frame(forecasts)
+  fcast <- as.data.frame(t(forecasts[[sites[i]]]))
+  colnames(fcast) <- seq(1:1000)
+  fcast$datetime <- seq(start_date, end_date, by = "+1 day")
+  fcast$reference_datetime <- seq(start_date-1, end_date-1, by = "+1 day")
+  n_fcast <- pivot_longer(fcast, cols=seq(1:1000))
+  colnames(n_fcast) <- c("datetime", "reference_datetime", "parameter", "prediction")
+  n_fcast$model_id <- "ChlorophyllCrusaders" 
+  n_fcast$site_id <- sites[i] # change if we add more sites
+  n_fcast$variable <- "gcc_90" 
+  n_fcast$family <- "ensemble" # there are 1000
+  n_fcast$duration <- "P1D" # daily forecasts
+  n_fcast$project_id <- "neon4cast"
+  if (dim(sub_fcast)[1] == 0 & dim(sub_fcast)[2] == 0){ 
+    sub_fcast = n_fcast 
+  } else {
+    sub_fcast <- rbind(sub_fcast, n_fcast)
+  }
+}
 
-n_fcast <- n_fcast |> relocate(project_id, model_id, datetime, reference_datetime, duration, site_id, family, parameter, variable, prediction)
+sub_fcast <- sub_fcast |> relocate(project_id, model_id, datetime, reference_datetime, duration, site_id, family, parameter, variable, prediction)
 year <- year(Sys.Date())
 month <- month(Sys.Date())
 day <- day(Sys.Date())
 forecast_file <- paste0("phenology-", year, "-", month, "-", day, "-ChlorophyllCrusaders.csv.gz")
 write.csv(n_fcast, forecast_file)
 neon4cast::forecast_output_validator(forecast_file) # validated!
 
-# Plot the ensembles
-#pheno_means <- n_fcast %>% 
+# Plot the ensembles; change site name for different ensembles
+#pheno_means <- sub_fcast[sub_fcast$site_id == "RMNP",] %>% 
 #  group_by(datetime) |> 
 #  summarize(pred_mean = mean(prediction),.groups = "drop") |> 
 #  select(datetime, pred_mean)
 
-#plot(1:30, pheno_means$pred_mean, type="l")
-#ecoforecastR::ciEnvelope(time1,N.IPDE.ci[1,],N.IPDE.ci[3,],col="darkgoldenrod1")
-# Metadata part- A little unsure what to do here
+#plot(as.Date(pheno_means$datetime), pheno_means$pred_mean, type="l", ylim=c(0.25,0.50),
+#     ylab="Mean ensemble GCC", xlab="Time", main="Prediction Ensemble means")
+#ecoforecastR::ciEnvelope(pheno_means$datetime,N.IPDE.ci[1,],N.IPDE.ci[3,],col="darkgoldenrod1")
+#lines(as.Date(pheno_means$datetime), pheno_means$pred_mean)
+
+# Plot 10 random ensembles
+#samp <- sample.int(nrow(forecasts$HARV), 10, replace=TRUE)
+#plot(as.Date(pheno_means$datetime), forecasts$HARV[1,], type="b", col=1, ylim=c(0.25, 0.50),
+#     main="10 random ensemble forecasts", ylab="GCC_90", xlab="Time")
+#for(s in 2:10) {
+#  lines(as.Date(pheno_means$datetime), forecasts$HARV[s,], type="b", col=s)
+#}
+
+
+# Metadata part
 
 team_info <- list(team_name = "ChlorophyllCrusaders",
                   team_list = list(list(individualName = list(givenName = "Alice", 
@@ -73,7 +93,7 @@ model_metadata = list(
       repository = "https://github.com/EcoForecast/ChlorophyllCrusaders/"
     ),
     initial_conditions = list(
-      status = "abset"
+      status = "absent"
     ),
     drivers = list(
       status = "absent"