Skip to content

Commit

Permalink
Merge pull request #17 from aliceni7/main
Browse files Browse the repository at this point in the history
added new sites and functions
  • Loading branch information
aliceni7 authored May 8, 2024
2 parents 39cbd1c + f1a18d4 commit 51af710
Show file tree
Hide file tree
Showing 5 changed files with 179 additions and 132 deletions.
87 changes: 50 additions & 37 deletions 05_historic_forecast.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,27 +3,30 @@ library(lubridate)
library(tidyr)
library(ecoforecastR)

sites = c("HARV", "RMNP", "DSNY", "SRER", "HEAL")
# deciduous broadleaf, evergreen needleleaf, grassland, shrubland, tundra

# Function for downloading historic data

download_historic_data <- function(){
download_historic_data <- function(sites){
URL = "https://data.ecoforecast.org/neon4cast-targets/phenology/phenology-targets.csv.gz"

# Get gcc data
data = readr::read_csv(URL, col_names = c("Date", "Siteid", "Variable", "Observation"))
harv = na.omit(data[data$Siteid == "HARV",]) # filter by Harvard Forest site
gcc_90 = harv[harv$Variable == "gcc_90",]
data.s = na.omit(data[data$Siteid %in% sites,]) # filter by sites
gcc_90 = data.s[data.s$Variable == "gcc_90",]

# Get historic temperature data
weather_stage3_historic <- neon4cast::noaa_stage3()
ds_historic <- weather_stage3_historic |>
dplyr::filter(site_id == "HARV") |>
dplyr::filter(site_id %in% sites) |>
dplyr::collect()

start_date <- Sys.Date() - lubridate::days(741) # hindcasting 2 years
start_date <- Sys.Date() - lubridate::days(1096) # hindcasting 3 years
end_date <- Sys.Date() - 1

# currently not using precip, but it works
TMP_historic = ds_historic[ds_historic$variable == c("air_temperature", "precipitation_flux"),]
TMP_historic = ds_historic[ds_historic$variable == c("air_temperature"),]#, "precipitation_flux"),]

data_historic <- TMP_historic %>%
mutate(datetime = lubridate::as_date(datetime)) %>%
Expand All @@ -35,58 +38,68 @@ download_historic_data <- function(){
data_historic <- as.data.frame(data_historic) # save temperature data as dataframe
data_historic$date <- date(data_historic$datetime)
data_historic$date <- as.character(data_historic$date)

# Get maximum temperature within each ensemble per day: 31 ensembles x 36 days
temp_max_h <- data_historic %>%
group_by(date, variable) %>%
group_by(date, variable, site_id) %>%
summarize(prediction = max(prediction))

# For multiple covariates
vars <- pivot_wider(temp_max_h, names_from = variable, values_from = prediction)
data1 <- gcc_90[which(gcc_90$Date == temp_max_h$date[1]):nrow(gcc_90), ]
covariates <- vars[vars$date %in% data1$Date, ]
covariates <- covariates[order(covariates$site_id),]
covariates <- covariates[covariates$date %in% data1$Date, ]
colnames(covariates) <- c("Date", "Siteid", "air_temperature")
cov <- covariates

data1 <- right_join(data1, select(cov, c("Date" = "Date","Siteid" = "Siteid"))) # subset by available temperature data

# Add temperature as column in data dataframe
data1$Temperature <- covariates$air_temperature - 273.15 # convert to celsius
#data1$PcpFlux <- covariates$precipitation_flux
data1$Observation <- as.numeric(data1$Observation)
data1 <- drop_na(data1)

return(data1)
}

# Function for running a DLM hindcast

historic_forecast <- function(data, observation, temperature) {

# Setup priors and observation and covariates for DLM
data2 <- list(x_ic = mean(observation, na.rm = T),
tau_ic = 1/sd(observation, na.rm = T),
a_obs = 1,
r_obs = 1,
a_add = 1,
r_add = 1,
n = length(nrow(data)),
Observation = observation,
Temperature = temperature)

# Run the DLM
gcc.out <- ecoforecastR::fit_dlm(model=list(obs="Observation", fixed="~ 1 + X + Temperature"), data2)

return(gcc.out)
historic_forecast <- function(data, sites) {
models <- list()
for(i in 1:length(sites)) { # run DLM per site
dat <- data[data$Siteid == sites[i],]
data2 <- list(x_ic = mean(dat$Observation, na.rm = T),
tau_ic = 1/sd(dat$Observation, na.rm = T),
a_obs = 1,
r_obs = 1,
a_add = 1,
r_add = 1,
n = length(nrow(dat)),
Observation = dat$Observation,
Temperature = dat$Temperature)

# Run the DLM
gcc.out <- ecoforecastR::fit_dlm(model=list(obs="Observation", fixed="~ 1 + X + Temperature"), data2)
models[[sites[i]]] <- gcc.out
}
return(models)
}

# Conducting a historic forecast
h_data <- download_historic_data()
obs_HARV <- h_data$Observation
temp_HARV <- h_data$Temperature
out_HARV <- historic_forecast(h_data, obs_HARV, temp_HARV)
h_data <- download_historic_data(sites)
out_sites <- historic_forecast(h_data, sites)

# Print the model
#strsplit(out_HARV$model,"\n",fixed = TRUE)[[1]]
strsplit(out_sites$HARV$model,"\n",fixed = TRUE)[[1]]

# Plot the historic forecast: plotting rocky mountains site, change site name to get other sites
site_name = "HEAL"
plot_out <- as.matrix(out_sites[[site_name]]$predict) # get model predictions
ci_out <- apply(plot_out, 2, quantile, c(0.025,0.5,0.975)) # get model confidence intervals
plot(1:nrow(h_data[h_data$Siteid == site_name,]), ci_out[2,], type='n', ylab="GCC_90", xlab="Time") # create empty plot with proper axes
ecoforecastR::ciEnvelope(1:nrow(h_data[h_data$Siteid == site_name,]),ci_out[1,],ci_out[3,],col=ecoforecastR::col.alpha("lightBlue",0.75)) # plot model CI
points(1:nrow(h_data[h_data$Siteid == site_name,]), h_data[h_data$Siteid == site_name,]$Observation, pch="+",cex=0.5) # plot gcc_90 original points
points(1:nrow(h_data[h_data$Siteid == site_name,]), ci_out[2,], pch=19, cex=0.2) # plot model 50% CI

# Plot the historic forecast
#plot_out <- as.matrix(out_HARV$predict) # get model predictions
#ci_out <- apply(plot_out, 2, quantile, c(0.025,0.5,0.975)) # get model confidence intervals
#plot(1:736, ci_out[2,], type='n', ylab="GCC_90", xlab="Time") # create empty plot with proper axes
#ecoforecastR::ciEnvelope(1:736,ci_out[1,],ci_out[3,],col=ecoforecastR::col.alpha("lightBlue",0.75)) # plot model CI
#points(1:736, out_HARV$Observation, pch="+",cex=0.5) # plot gcc_90 original points
#points(1:736, ci_out[2,], pch=19) # plot model 50% CI
89 changes: 36 additions & 53 deletions 06_forecast.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@ source("05_historic_forecast.R")
# Function for grabbing future forecast data based on the current date

download_future_met <- function() {
weather_stage2 <- neon4cast::noaa_stage2(start_date = as.character(Sys.Date() - 1))
weather_stage2 <- neon4cast::noaa_stage2(start_date = as.character(Sys.Date()-2))
ds1 <- weather_stage2 |>
dplyr::filter(site_id == "HARV") |>
dplyr::filter(site_id %in% sites) |>
dplyr::collect()

TMP = ds1[ds1$variable == "air_temperature",]
Expand All @@ -17,29 +17,9 @@ download_future_met <- function() {
select(datetime, site_id, variable, prediction, parameter)

tmps <- as.data.frame(TMP) # save temperature data as dataframe

# Get maximum temperature within each ensemble per day: 31 ensembles x 36 days
#temp_max <- tmps %>%
# group_by(datetime, parameter) %>%
# summarize(temp.max = max(prediction) - 273.15) # convert to celsius

# Get mean of maximum temperatures across ensembles
#temp_max <- temp_max %>%
# group_by(datetime) %>%
# summarize(temp.mean = mean(temp.max))

#temps <- as.data.frame(temp_max)

# Define these outside the function to have them accessible to be used in forecast
# Get dataframe: columns = ensemble number, rows = prediction by time step
#temps_ensemble <- as.data.frame(pivot_wider(tmps, names_from = parameter, values_from = prediction)[,4:34] - 273.15)
# Get one maximum value for daily prediction
#temps.max <- matrix(apply(temps_ensemble,1,max),1, 36)

return(tmps)
}


# Forecasting Function

##` @param IC Initial Conditions
Expand All @@ -59,43 +39,46 @@ forecast <- function(IC, temp, betaTemp, betaX, betaI, Q, n){
return(N)
}

# Making one deterministic run -- for submission?

# Get future forecast temperature data
temps_data <- download_future_met()
c_temps <- temps_data |>
select(prediction, parameter)

#temps_ensemble <- as.data.frame(pivot_wider(c_temps,
# names_from = parameter,
# values_from = prediction))#[,1] - 273.15)
temps_ensemble <- as.data.frame(pivot_wider(temps_data,
names_from = parameter,
values_from = prediction)[,4:34] - 273.15)
temps_rot <- t(temps_ensemble)[,1:30]
temps <- matrix(apply(temps_rot,2,mean), 1, 30)
# Making forecast
forecast_sites <- function(f_temps, models, sites){
forecasts <- list()
for (i in 1:length(sites)) {
temps_ensemble <- as.data.frame(pivot_wider(f_temps[f_temps$site_id == sites[1],],
names_from = parameter,
values_from = prediction)[,4:34] - 273.15)
temps_rot <- t(temps_ensemble)[,1:30]
temps <- matrix(apply(temps_rot,2,mean), 1, 30)
gcc.out <- models[[sites[1]]] # going through each sites models for the params
params <- as.matrix(gcc.out$params) # get model parameters
param.mean <- apply(params, 2, mean)
predicts <- as.matrix(gcc.out$predict) # get model predictions
data <- gcc.out$data

ci <- apply(predicts, 2, quantile, c(0.025,0.5,0.975))
prow <- sample.int(nrow(params), Nmc, replace=TRUE)
drow = sample.int(nrow(temps_rot), Nmc, replace=TRUE)
Qmc <- 1 / sqrt(params[prow,"tau_add"]) ## convert from precision to standard deviation

IC <- predicts
pheno_forecast <- forecast(IC[prow, ncol(predicts)],
temp = temps_rot[drow,],
betaTemp = params[prow, "betaTemperature"],
betaX = params[prow, "betaX"],
betaI = params[prow, "betaIntercept"],
Q = Qmc,
n = Nmc)
forecasts[[sites[i]]] <- pheno_forecast
}
return(forecasts)
}

Nmc = 1000 # number of Monte Carlo draws
NT = 30 # number of time steps into the future
gcc.out <- out_HARV
params <- as.matrix(gcc.out$params) # get model parameters
param.mean <- apply(params, 2, mean)
predicts <- as.matrix(gcc.out$predict) # get model predictions
data <- gcc.out$data

ci <- apply(predicts, 2, quantile, c(0.025,0.5,0.975))
prow <- sample.int(nrow(params), Nmc, replace=TRUE)
drow = sample.int(nrow(temps_rot), Nmc, replace=TRUE)
Qmc <- 1 / sqrt(params[prow,"tau_add"]) ## convert from precision to standard deviation
future_temps <- download_future_met()
forecasts <- forecast_sites(future_temps, out_sites, sites)

IC <- predicts
pheno_forecast <- forecast(IC[prow, ncol(predicts)],
temp = temps_rot[drow,],
betaTemp = params[prow, "betaTemperature"],
betaX = params[prow, "betaX"],
betaI = params[prow, "betaIntercept"],
Q = Qmc,
n = Nmc)



Expand Down
60 changes: 40 additions & 20 deletions 06_submit_forecast.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,39 +5,59 @@ library(ecoforecastR)
source("06_forecast.R")

# Reformat the forecast so it fits the standard
fcast <- as.data.frame(pheno_forecast)
fcast <- as.data.frame(t(pheno_forecast))
colnames(fcast) <- seq(1:1000)
start_date <- Sys.Date() + 1
end_date <- Sys.Date() + 30
fcast$datetime <- seq(start_date, end_date, by = "+1 day")
fcast$reference_datetime <- seq(start_date-1, end_date-1, by = "+1 day")
n_fcast <- pivot_longer(fcast, cols=seq(1:1000))
colnames(n_fcast) <- c("datetime", "reference_datetime", "parameter", "prediction")
n_fcast$model_id <- "ChlorophyllCrusaders" # perhaps change?
n_fcast$site_id <- "HARV" # change if we add more sites
n_fcast$variable <- "gcc_90"
n_fcast$family <- "ensemble" # there are 1000
n_fcast$duration <- "P1D" # daily forecasts
n_fcast$project_id <- "neon4cast"
sub_fcast <- data.frame()
for(i in 1:length(sites)){
#fcast <- as.data.frame(forecasts)
fcast <- as.data.frame(t(forecasts[[sites[i]]]))
colnames(fcast) <- seq(1:1000)
fcast$datetime <- seq(start_date, end_date, by = "+1 day")
fcast$reference_datetime <- seq(start_date-1, end_date-1, by = "+1 day")
n_fcast <- pivot_longer(fcast, cols=seq(1:1000))
colnames(n_fcast) <- c("datetime", "reference_datetime", "parameter", "prediction")
n_fcast$model_id <- "ChlorophyllCrusaders"
n_fcast$site_id <- sites[i] # change if we add more sites
n_fcast$variable <- "gcc_90"
n_fcast$family <- "ensemble" # there are 1000
n_fcast$duration <- "P1D" # daily forecasts
n_fcast$project_id <- "neon4cast"
if (dim(sub_fcast)[1] == 0 & dim(sub_fcast)[2] == 0){
sub_fcast = n_fcast
} else {
sub_fcast <- rbind(sub_fcast, n_fcast)
}
}

n_fcast <- n_fcast |> relocate(project_id, model_id, datetime, reference_datetime, duration, site_id, family, parameter, variable, prediction)
sub_fcast <- sub_fcast |> relocate(project_id, model_id, datetime, reference_datetime, duration, site_id, family, parameter, variable, prediction)
year <- year(Sys.Date())
month <- month(Sys.Date())
day <- day(Sys.Date())
forecast_file <- paste0("phenology-", year, "-", month, "-", day, "-ChlorophyllCrusaders.csv.gz")
write.csv(n_fcast, forecast_file)
neon4cast::forecast_output_validator(forecast_file) # validated!

# Plot the ensembles
#pheno_means <- n_fcast %>%
# Plot the ensembles; change site name for different ensembles
#pheno_means <- sub_fcast[sub_fcast$site_id == "RMNP",] %>%
# group_by(datetime) |>
# summarize(pred_mean = mean(prediction),.groups = "drop") |>
# select(datetime, pred_mean)

#plot(1:30, pheno_means$pred_mean, type="l")
#ecoforecastR::ciEnvelope(time1,N.IPDE.ci[1,],N.IPDE.ci[3,],col="darkgoldenrod1")
# Metadata part- A little unsure what to do here
#plot(as.Date(pheno_means$datetime), pheno_means$pred_mean, type="l", ylim=c(0.25,0.50),
# ylab="Mean ensemble GCC", xlab="Time", main="Prediction Ensemble means")
#ecoforecastR::ciEnvelope(pheno_means$datetime,N.IPDE.ci[1,],N.IPDE.ci[3,],col="darkgoldenrod1")
#lines(as.Date(pheno_means$datetime), pheno_means$pred_mean)

# Plot 10 random ensembles
#samp <- sample.int(nrow(forecasts$HARV), 10, replace=TRUE)
#plot(as.Date(pheno_means$datetime), forecasts$HARV[1,], type="b", col=1, ylim=c(0.25, 0.50),
# main="10 random ensemble forecasts", ylab="GCC_90", xlab="Time")
#for(s in 2:10) {
# lines(as.Date(pheno_means$datetime), forecasts$HARV[s,], type="b", col=s)
#}


# Metadata part

team_info <- list(team_name = "ChlorophyllCrusaders",
team_list = list(list(individualName = list(givenName = "Alice",
Expand Down Expand Up @@ -73,7 +93,7 @@ model_metadata = list(
repository = "https://github.com/EcoForecast/ChlorophyllCrusaders/"
),
initial_conditions = list(
status = "abset"
status = "absent"
),
drivers = list(
status = "absent"
Expand Down
Loading

0 comments on commit 51af710

Please sign in to comment.