prioritizr · jeffreyhanson · Jul 10, 2024 · Jul 7, 2024 · Jul 7, 2024 · Jul 7, 2024
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: surveyvoi
 Type: Package
-Version: 1.0.6
+Version: 1.1.0.0
 Title: Survey Value of Information
 Description: Decision support tool for prioritizing sites for ecological
     surveys based on their potential to improve plans for conserving
@@ -103,6 +103,7 @@ Collate:
     'fit_hglm_occupancy_models.R'
     'fit_xgb_occupancy_models.R'
     'geo_cov_survey_scheme.R'
+    'greedy_heuristic_optimization.R'
     'n_states.R'
     'optimal_survey_scheme.R'
     'package.R'

diff --git a/NAMESPACE b/NAMESPACE
@@ -10,6 +10,7 @@ export(feasible_survey_schemes)
 export(fit_hglm_occupancy_models)
 export(fit_xgb_occupancy_models)
 export(geo_cov_survey_scheme)
+export(greedy_heuristic_prioritization)
 export(n_states)
 export(optimal_survey_scheme)
 export(prior_probability_matrix)

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,19 @@
+# surveyvoi 1.1.0.0
+
+- New `greedy_heuristic_algorithm()` function that can be used to generate
+  reserve selection prioritizations. This function provides access to the
+  internal algorithm used for the reserve selection component of the
+  value of information calculations.
+- Update internal greedy reserve selection algorithm to find better quality
+  prioritizations. This impacts `approx_near_optimal_survey_scheme()`,
+  `approx_optimal_survey_scheme()`, `optimal_survey_scheme()`,
+  `approx_evdsi()`, `evdsi()`, and `evdci()`.
+- Update `sim_sites` and `sim_features` example datasets.
+- Fix bug in `approx_near_optimal_survey_scheme()`,
+  `approx_optimal_survey_scheme()`, `optimal_survey_scheme()`,
+  `approx_evdsi()`, `evdsi()`, and `evdci()` where all species were incorrectly
+  assigned the same target as the first species during calculations.
+
 # surveyvoi 1.0.6
 
 - Fix installation for Windows on arm64 (#50).

diff --git a/R/greedy_heuristic_optimization.R b/R/greedy_heuristic_optimization.R
@@ -0,0 +1,179 @@
+#' @include internal.R
+NULL
+
+#' Greedy heuristic prioritization
+#'
+#' Generate a prioritization for protected area establishment.
+#'
+#' @inheritParams evdci
+#'
+#' @details
+#' The prioritization is generated using a greedy heuristic algorithm.
+#' The objective function for this algorithm is calculated by:
+#' (i) estimating the probability that each species meets its target,
+#' and (ii) calculating the sum of these probabilities.
+#' Note that this function underpins the value of information calculations
+#' because it is used to assess a potential management decision
+#' given updated information on the presence of particular species
+#' in particular sites.
+#'
+#' @return
+#' A `list` containing the following elements:
+#' \describe{
+#' \item{x}{`logical` vector indicating if each site is selected for
+#' protection or not.}
+#' \item{objval}{`numeric` value denoting the objective value for
+#' the prioritization.}
+#' }
+#'
+#' @examples
+#' # set seeds for reproducibility
+#' set.seed(123)
+#'
+#' # load example site data
+#' data(sim_sites)
+#' print(sim_sites)
+#'
+#' # load example feature data
+#' data(sim_features)
+#' print(sim_features)
+#'
+#' # set total budget for managing sites for conservation
+#'  # (i.e. 50% of the cost of managing all sites)
+#' total_budget <- sum(sim_sites$management_cost) * 0.5
+#'
+#' # generate reserve selection prioritization
+#' results <- greedy_heuristic_prioritization(
+#'   sim_sites, sim_features,
+#'   c("p1", "p2", "p3"),
+#'   "management_cost",
+#'   "target",
+#'   total_budget
+#' )
+#'
+#' # print results
+#' print(results)
+#'
+#' @export
+greedy_heuristic_prioritization <- function(
+  site_data,
+  feature_data,
+  site_probability_columns,
+  site_management_cost_column,
+  feature_target_column,
+  total_budget,
+  site_management_locked_in_column = NULL,
+  site_management_locked_out_column = NULL
+) {
+  # assert arguments are valid
+  assertthat::assert_that(
+    ## site_data
+    inherits(site_data, "sf"), ncol(site_data) > 0,
+    nrow(site_data) > 0,
+    ## feature_data
+    inherits(feature_data, "data.frame"), ncol(feature_data) > 0,
+    nrow(feature_data) > 0,
+    ## feature_target_column
+    assertthat::is.string(feature_target_column),
+    all(assertthat::has_name(feature_data, feature_target_column)),
+    is.numeric(feature_data[[feature_target_column]]),
+    assertthat::noNA(feature_data[[feature_target_column]]),
+    all(feature_data[[feature_target_column]] >= 0),
+    ## site_management_cost_column
+    assertthat::is.string(site_management_cost_column),
+    all(assertthat::has_name(site_data, site_management_cost_column)),
+    is.numeric(site_data[[site_management_cost_column]]),
+    assertthat::noNA(site_data[[site_management_cost_column]]),
+    ## total_budget
+    assertthat::is.number(total_budget), assertthat::noNA(total_budget),
+    isTRUE(total_budget > 0)
+  )
+  ## site_management_locked_in_column
+  if (!is.null(site_management_locked_in_column)) {
+    assertthat::assert_that(
+      assertthat::is.string(site_management_locked_in_column),
+      all(assertthat::has_name(site_data, site_management_locked_in_column)),
+      is.logical(site_data[[site_management_locked_in_column]]),
+      assertthat::noNA(site_data[[site_management_locked_in_column]]))
+    assertthat::assert_that(
+      sum(site_data[[site_management_locked_in_column]] *
+          site_data[[site_management_cost_column]]) <=
+      total_budget,
+      msg = "cost of managing locked in sites exceeds total budget")
+  }
+  ## site_management_locked_out_column
+  if (!is.null(site_management_locked_out_column)) {
+    assertthat::assert_that(
+      assertthat::is.string(site_management_locked_out_column),
+      all(assertthat::has_name(site_data, site_management_locked_out_column)),
+      is.logical(site_data[[site_management_locked_out_column]]),
+      assertthat::noNA(site_data[[site_management_locked_out_column]]))
+    if (all(site_data[[site_management_locked_out_column]]))
+      warning("all sites locked out")
+  }
+  ## validate locked arguments if some locked in and some locked out
+  if (!is.null(site_management_locked_in_column) &&
+      !is.null(site_management_locked_out_column)) {
+    assertthat::assert_that(
+      all(site_data[[site_management_locked_in_column]] +
+          site_data[[site_management_locked_out_column]] <= 1),
+      msg = "at least one planning unit is locked in and locked out")
+  }
+  ## validate targets
+  validate_target_data(feature_data, feature_target_column)
+  ## validate pij values
+  validate_site_probability_data(site_data, site_probability_columns)
+
+  # drop spatial information
+  if (inherits(site_data, "sf"))
+    site_data <- sf::st_drop_geometry(site_data)
+
+  ## prepare locked in data
+  if (!is.null(site_management_locked_in_column)) {
+    site_management_locked_in <- site_data[[site_management_locked_in_column]]
+  } else {
+    site_management_locked_in <- rep(FALSE, nrow(site_data))
+  }
+
+  ## prepare locked out data
+  if (!is.null(site_management_locked_out_column)) {
+    site_management_locked_out <- site_data[[site_management_locked_out_column]]
+  } else {
+    site_management_locked_out <- rep(FALSE, nrow(site_data))
+  }
+
+  ## validate that targets are feasible given budget and locked out units
+  sorted_costs <- sort(
+    site_data[[site_management_cost_column]][!site_management_locked_out])
+  sorted_costs <- sorted_costs[
+    seq_len(max(feature_data[[feature_target_column]]))]
+  assertthat::assert_that(
+    sum(sorted_costs) <= total_budget,
+    msg = paste("targets cannot be achieved given budget and locked out",
+                "planning units"))
+
+  # create prior data
+  prior_data <-
+    t(as.matrix(site_data[, site_probability_columns, drop = FALSE]))
+
+  # main calculation
+  out <- rcpp_greedy_heuristic_prioritization(
+    rij = prior_data,
+    pu_costs = site_data[[site_management_cost_column]],
+    pu_locked_in = as.numeric(site_management_locked_in),
+    pu_locked_out = as.numeric(site_management_locked_out),
+    target = round(feature_data[[feature_target_column]]),
+    budget = total_budget
+  )
+
+  # update objective value with exact calculation
+  out$objval <- rcpp_expected_value_of_action(
+    out$x,
+    prior_data,
+    round(feature_data[[feature_target_column]])
+  )
+
+  # return result
+  out
+
+}
diff --git a/README.Rmd b/README.Rmd
@@ -17,11 +17,11 @@ output:
 [![Coverage Status](https://img.shields.io/codecov/c/github/prioritizr/surveyvoi?label=Coverage)](https://app.codecov.io/gh/prioritizr/surveyvoi/branch/master)
 [![CRAN_Status_Badge](http://www.r-pkg.org/badges/version/surveyvoi)](https://CRAN.R-project.org/package=surveyvoi)
 
-```{r, include = FALSE}
+```{r "knitr_config", include = FALSE}
 knitr::opts_chunk$set(fig.path = "man/figures/README-", fig.align = "center")
 ```
 
-```{r, include = FALSE}
+```{r "initialization", include = FALSE}
 devtools::load_all()
 h = 2.75
 w = 3.0
@@ -86,7 +86,7 @@ Please cite the _surveyvoi R_ package when using it in publications. To cite the
 Here we provide a short example showing how to use the _surveyvoi R_ package to prioritize funds for ecological surveys. In this example, we will generate plans for conducting ecological surveys (termed "survey schemes") using simulated data for six sites and three conservation features (e.g. bird species). To start off, we will set the seed for the random number generator for reproducibility and load some R packages.
 
 ```{r, message = FALSE}
-set.seed(501)      # set RNG for reproducibility
+set.seed(502)      # set RNG for reproducibility
 library(surveyvoi) # package for value of information analysis
 library(dplyr)     # package for preparing data
 library(tidyr)     # package for preparing data
@@ -96,19 +96,22 @@ library(ggplot2)   # package for plotting data
 Now we will load some datasets that are distributed with the package. First, we will load the `sim_sites` object. This spatially explicit dataset (i.e. `sf` object) contains information on the sites within our study area. Critically, it contains (i) sites that have already been surveyed, (ii) candidate sites for additional surveys, (iii) sites that have already been protected, and (iv) candidate sites that could be protected in the future. Each row corresponds to a different site, and each column describes different properties associated with each site. In this table, the `"management_cost"` column indicates the cost of protecting each site; `"survey_cost"` column indicates the cost of conducting an ecological survey within each site; and `"e1"` and `"e2"` columns contain environmental data for each site (not used in this example). The remaining columns describe the existing survey data and the spatial distribution of the features across the sites. The `"n1"`, `"n2"`, and `"n3"` columns indicate the number of surveys conducted within each site that looked for each of the three features (respectively); and `"f1"`, `"f2"`, and `"f3"` columns describe the proportion of surveys within each site that looked for each feature where the feature was detected (respectively). For example, if `"n1"` has a value of 2 and `"f1"` has a value of 0.5 for a given site, then the feature `"f1"` was detected in only one of the two surveys conducted in this site that looked for the feature. Finally, the `"p1"`, `"p2"`, and `"p3"` columns contain modeled probability estimates of each species being present in each site (see `fit_hglm_occupancy_models()` and `fit_xgb_occupancy_models()` to generate such estimates for your own data).
 
 
-```{r, fig.height = h, fig.width = w}
+```{r "load_site_data"}
 # load data
 data(sim_sites)
 
 # print table
 print(sim_sites, width = Inf)
 
+```{r "management_cost_plot", fig.height = h, fig.width = w}
 # plot cost of protecting each site
 ggplot(sim_sites) +
 geom_sf(aes(color = management_cost), size = 4) +
 ggtitle("management_cost") +
 theme(legend.title = element_blank())
+```
 
+```{r "survey_cost_plot", fig.height = h, fig.width = w}
 # plot cost of conducting an additional survey in each site
 # note that these costs are much lower than the protection costs
 ggplot(sim_sites) +
@@ -154,7 +157,7 @@ scale_color_continuous(limits = c(0, 1))
 
 Next, we will load the `sim_features` object. This table contains information on the conservation features (e.g. species). Specifically, each row corresponds to a different feature, and each column contains information associated with the features. In this table, the `"name"` column contains the name of each feature; `"survey"` column indicates whether future surveys would look for this species; `"survey_sensitivity"` and `"survey_specificity"` columns denote the sensitivity (true positive rate) and specificity (true negative rate) for the survey methodology for correctly detecting the feature;  `"model_sensitivity"` and `"model_specificity"` columns denote the sensitivity (true positive rate) and specificity (true negative rate) for the species distribution models fitted for each feature; and `"target"` column denotes the required number of protected sites for each feature (termed "representation target", each feature has a target of `r sim_features$target[1]` site).
 
-```{r}
+```{r "load_feature_data"}
 # load data
 data(sim_features)
 
@@ -166,9 +169,9 @@ After loading the data, we will now generate an optimized ecological survey sche
 
 To perform the optimization, we will set a total budget for (i) protecting sites and (ii) surveying sites. Although you might be hesitant to specify a budget, please recall that you would make very different plans depending on available funds. For instance, if you have near infinite funds then you wouldn't bother conducting any surveys and simply protect everything. Similarly, if you had very limited funds, then you wouldn't survey any sites to ensure that at least one site could be protected. Generally, conservation planning problems occur somewhere between these two extremes---but the optimization process can't take that into account if you don't specify a budget. For brevity, here we will set the total budget as 90% of the total costs for protecting sites.
 
-```{r, message = FALSE, results = FALSE}
+```{r "generate_optimal_survey_scheme", message = FALSE, results = FALSE}
 # calculate budget
-budget <- sum(0.90 * sim_sites$management_cost)
+budget <- sum(0.4 * sim_sites$management_cost)
 
 # generate optimized survey scheme
 opt_scheme <-
@@ -188,18 +191,19 @@ opt_scheme <-
     feature_target_column = "target",
     total_budget = budget,
     survey_budget = budget,
-    verbose = TRUE)
+    verbose = TRUE
+  )
 ```
 
-```{r, include = FALSE}
+```{r "validation", include = FALSE}
 # ensure that at least one site is selected
 assertthat::assert_that(
   sum(c(opt_scheme[1, ])) >= 1,
   msg = "no sites selected, not a compelling example"
 )
 ```
 
-```{r, fig.height = h, fig.width = w * 1.5}
+```{r "survey_scheme_plot", fig.height = h, fig.width = w * 1.5}
 # the opt_scheme object is a matrix that contains the survey schemes
 # each column corresponds to a different site,
 # and each row corresponds to a different solution