Variable population size update (#78)

* Variable pop size for simulate functions * Update GeoTox.print * Message format consistency * Ran devtools::document() * Added set_population function * Updated paper doi * Increment version number to 0.2.0.9001 * Updated NEWS * Added set_population to _pkgdown.yml * Updated documentation * Fix for devtools::check() note
NIEHS · Jan 31, 2025 · c1a6017 · c1a6017
1 parent f2b0809
commit c1a6017
Show file tree

Hide file tree

Showing 26 changed files with 567 additions and 155 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: GeoTox
 Title: Spatiotemporal Mixture Risk Assessment
-Version: 0.2.0.9000
+Version: 0.2.0.9001
 Authors@R: c(
     person("Skylar", "Marvel", , "skylar.marvel@nih.gov", role = c("aut"),
            comment = c(ORCID = "0000-0002-2971-9743")),
@@ -12,8 +12,8 @@ Authors@R: c(
   )
 Description: Connecting spatiotemporal exposure to individual and
     population-level risk via source-to-outcome continuum modeling. The package,
-    methods, and case-studies are described in Messier, Reif, and Marvel (2024)
-    <doi:10.1101/2024.09.23.24314096> and Eccles et al. (2023)
+    methods, and case-studies are described in Messier, Reif, and Marvel (2025)
+    <doi:10.1186/s40246-024-00711-8> and Eccles et al. (2023)
     <doi:10.1016/j.scitotenv.2022.158905>.
 License: MIT + file LICENSE
 URL: https://niehs.github.io/GeoTox/, https://github.com/NIEHS/GeoTox
@@ -23,6 +23,7 @@ Imports:
     dplyr,
     ggplot2,
     ggridges,
+    purrr,
     rlang,
     sf,
     stats,
@@ -37,7 +38,6 @@ Suggests:
     httk,
     httr2,
     knitr,
-    purrr,
     readr,
     readxl,
     rmarkdown,

diff --git a/NAMESPACE b/NAMESPACE
@@ -26,6 +26,7 @@ export(sample_Css)
 export(sensitivity_analysis)
 export(set_boundaries)
 export(set_hill_params)
+export(set_population)
 export(simulate_age)
 export(simulate_exposure)
 export(simulate_inhalation_rate)

diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,10 @@
 # GeoTox (development version)
 
+* Added set_population() function.
+
+* The simulate\_\* functions can now handle population sizes that vary
+across regions.
+
 # GeoTox 0.2.0
 
 * Initial CRAN submission.
diff --git a/R/GeoTox.R b/R/GeoTox.R
@@ -41,11 +41,11 @@
 #' # Plot exposure data
 #' plot(geoTox, type = "exposure", ncol = 5)
 #' # Plot response data
-#' plot(geoTox)
 #' plot(geoTox, assays = "TOX21_H2AX_HTRF_CHO_Agonist_ratio")
 #' # Plot sensitivity data
-#' plot(geoTox, type = "sensitivity")
-#' plot(geoTox, type = "sensitivity", assay = "TOX21_H2AX_HTRF_CHO_Agonist_ratio")
+#' plot(geoTox,
+#'      type = "sensitivity",
+#'      assay = "TOX21_H2AX_HTRF_CHO_Agonist_ratio")
 GeoTox <- function() {
   structure(
     list(
@@ -70,78 +70,104 @@ GeoTox <- function() {
 #' @export
 print.GeoTox <- function(x, ...) {
 
-  names_simulated <- c("age", "IR", "obesity", "C_ext", "C_ss")
-  names_computed <- c("D_int", "C_invitro", "resp", "sensitivity")
-  names_other <- setdiff(names(x),
-                         c(names_simulated, names_computed))
-
-  get_info <- function(names) {
-    info <- lapply(names, \(name) {
-      class <- dim <- ""
-      if (is.null(x[[name]])) {
-        return(data.frame(Name = name, Class = "", Dim = ""))
-      }
-      is_list <- inherits(x[[name]], "list")
-      if (is_list && length(x[[name]]) > 0) {
-        item <- x[[name]][[1]]
-      } else if (!is_list) {
-        item <- x[[name]]
-      } else {
-        item <- NULL
-      }
-      class <- class(item)
-      if (any(c("matrix", "data.frame") %in% class)) {
-        dim <- paste(dim(item), collapse = " x ")
-      } else {
-        dim <- length(item)
-      }
-      if (is_list) {
-        dim <- paste0(length(x[[name]]), " x (", dim, ")")
-        class <- paste0("list(", class[[1]], ")")
-      } else {
-        class <- paste(class, collapse = ", ")
-      }
-      data.frame(Name = name, Class = class, Dim = dim)
-    })
-    do.call(rbind, info)
-  }
-
-  info_simulated <- get_info(names_simulated)
-  info_simulated <- info_simulated[info_simulated$Class != "", , drop = FALSE]
-  info_computed <- get_info(names_computed)
-  info_computed <- info_computed[info_computed$Class != "", , drop = FALSE]
-
-  cat("GeoTox object\n")
+  # Get n_assay and n_chem from GeoTox()$hill_params
   if (is.null(x$hill_params)) {
-    n_assays <- 0
-    n_chems <- 0
+    n_assay <- 0
+    n_chem <- 0
   } else {
     if ("assay" %in% names(x$hill_params)) {
-      n_assays <- length(unique(x$hill_params$assay))
+      n_assay <- length(unique(x$hill_params$assay))
     } else {
-      n_assays <- 1
+      n_assay <- 1
     }
     if ("chem" %in% names(x$hill_params)) {
-      n_chems <- length(unique(x$hill_params$chem))
+      n_chem <- length(unique(x$hill_params$chem))
     } else {
-      n_chems <- 1
+      n_chem <- 1
+    }
+  }
+
+  # Categorize different GeoTox() fields
+  names_data_vec      <- c("age", "IR", "obesity")
+  names_data_mat      <- c("C_ext", "C_ss")
+  names_computed_mat  <- c("D_int", "C_invitro")
+  names_computed_df   <- c("resp")
+  names_computed_list <- c("sensitivity")
+  names_other <- setdiff(names(x),
+                         c(names_data_vec, names_data_mat,
+                           names_computed_mat, names_computed_df,
+                           names_computed_list))
+
+  # Functions to get size info for each type of field
+  # m = number of regions
+  # n = population size
+  get_info_vec <- function(name) {
+    size <- ifelse(is.null(x[[name]]), "", "m * (n)")
+    data.frame(Name = name, Size = size)
+  }
+  get_info_mat <- function(name) {
+    size <- ""
+    if (!is.null(x[[name]])) {
+      dim <- dim(x[[name]][[1]])
+      size <- paste0("m * (n x ", dim[2], ")")
+    }
+    data.frame(Name = name, Size = size)
+  }
+  get_info_df <- function(name) {
+    size <- ""
+    if (!is.null(x[[name]])) {
+      dim <- dim(x[[name]][[1]])
+      size <- paste0("m * (", n_assay, " * n x ", dim[2], ")")
     }
+    data.frame(Name = name, Size = size)
   }
-  cat("Assays: ", n_assays, "\n", sep = "")
-  cat("Chemicals: ", n_chems, "\n", sep = "")
-  if (nrow(info_simulated) > 0) {
-    n_regions <- length(x[[info_simulated$Name[1]]])
-  } else if (nrow(info_computed) > 0) {
-    n_regions <- length(x[[info_computed$Name[1]]])
+  get_info_list <- function(name) {
+    size <- ""
+    if (!is.null(x[[name]])) {
+      n_list <- length(x[[name]])
+      dim <- dim(x[[name]][[1]][[1]])
+      size <- paste0(n_list, " * (m * (", n_assay, " * n x ", dim[2], "))")
+    }
+    data.frame(Name = name, Size = size)
+  }
+
+  # Get size info for each type of field
+  info_data <- dplyr::bind_rows(
+    purrr::map(names_data_vec, \(name) get_info_vec(name)),
+    purrr::map(names_data_mat, \(name) get_info_mat(name))) |> 
+    dplyr::filter(.data$Size != "")
+
+  info_computed <- dplyr::bind_rows(
+    purrr::map(names_computed_mat,  \(name) get_info_mat(name)),
+    purrr::map(names_computed_df,   \(name) get_info_df(name)),
+    purrr::map(names_computed_list, \(name) get_info_list(name))) |> 
+    dplyr::filter(.data$Size != "")
+
+  # Get population size from GeoTox()$par$n
+  if (is.null(x$par$n)) {
+    n_pop <- 0
+  } else if (length(unique(x$par$n)) == 1) {
+    n_pop <- x$par$n[[1]]
   } else {
-    n_regions <- 0
+    n_pop <- paste0("[", paste(range(x$par$n), collapse = ", "), "]")
   }
-  cat("Regions: ", n_regions, "\n", sep = "")
-  cat("Population: ", x$par$n, "\n", sep = "")
+
+  # Get number of regions from potential data fields
+  n_region <- purrr::map_int(c(names_data_vec, names_data_mat,
+                               names_computed_mat, names_computed_df),
+                             \(name) length(x[[name]])) |> 
+    max()
+
+  # Output info
+  cat("GeoTox object\n")
+  cat("Assays: ", n_assay, "\n", sep = "")
+  cat("Chemicals: ", n_chem, "\n", sep = "")
+  cat("Regions: m = ", n_region, "\n", sep = "")
+  cat("Population: n = ", n_pop, "\n", sep = "")
   cat("Data Fields:")
-  if (nrow(info_simulated) > 0) {
+  if (nrow(info_data) > 0) {
     cat("\n")
-    print(info_simulated, row.names = FALSE, print.gap = 2)
+    print(info_data, row.names = FALSE, print.gap = 2)
   } else {
     cat(" None\n")
   }

diff --git a/R/set_population.R b/R/set_population.R
@@ -0,0 +1,78 @@
+#' Set population data
+#'
+#' @param x GeoTox object.
+#' @param age numeric vector or list of numeric vectors of age values.
+#' @param obesity character vector or list of character vectors of obesity
+#' status.
+#'
+#' @return The same object with simulated fields added.
+#' @export
+#'
+#' @examples
+#' # Single region
+#' age <- round(runif(10, 1, 100))
+#' obesity <- sample(c("Normal", "Obese"), 10, replace = TRUE)
+#' geoTox <- set_population(GeoTox(), age = age, obesity = obesity)
+#' 
+#' # Multiple regions
+#' age <- list("37001" = round(runif(10, 1, 100)),
+#'             "37007" = round(runif(8, 1, 100)))
+#' obesity <- list("37001" = sample(c("Normal", "Obese"), 10, replace = TRUE),
+#'                 "37007" = sample(c("Normal", "Obese"), 8, replace = TRUE))
+#' geoTox <- set_population(GeoTox(), age = age, obesity = obesity)
+set_population <- function(x, age = NULL, obesity = NULL) {
+
+  set_age <- !is.null(age)
+  set_obesity <- !is.null(obesity)
+
+  if (set_age) {
+    age <- .check_types(age,
+                        c("numeric", "integer"),
+                        paste0("`age` must be a numeric vector or list of ",
+                               "numeric vectors"))
+  }
+
+  if (set_obesity) {
+    obesity <- .check_types(obesity,
+                            "character",
+                            paste0("`obesity` must be a character vector or ",
+                                   "list of character vectors"))
+    if (any(purrr::map_lgl(obesity,
+                           \(x) !all(x %in% c("Normal", "Obese"))))) {
+      stop("`obesity` values must be 'Normal' or 'Obese'", call. = FALSE)
+    }
+  }
+
+  # Update population size
+  if (set_age) n_age <- purrr::map_int(age, length)
+  if (set_obesity) n_obesity <- purrr::map_int(obesity, length)
+  if (set_age & set_obesity) {
+    if (!identical(n_age, n_obesity)) {
+      stop("Population sizes for `age` and `obesity` do not match",
+           call. = FALSE)
+    }
+    x$par$n <- n_age
+  } else if (set_age) {
+    x$par$n <- n_age
+  } else if (set_obesity) {
+    x$par$n <- n_obesity
+  }
+
+  # Set fields
+  if (set_age) x$age <- age
+  if (set_obesity) x$obesity <- obesity
+
+  # Clear downstream fields
+  if (set_age & !is.null(x$IR)) {
+    warning("Clearing `IR` field", call. = FALSE)
+    x$IR <- NULL
+  }
+  if ((set_age | set_obesity) &
+      !(is.null(x$C_ss) & is.null(x$css_sensitivity))) {
+    warning("Clearing `C_ss` and `css_sensitivity` fields", call. = FALSE)
+    x$C_ss <- NULL
+    x$css_sensitivity <- NULL
+  }
+
+  x
+}
diff --git a/R/simulate_age.R b/R/simulate_age.R
@@ -2,13 +2,18 @@
 #'
 #' @param x data frame or list of data frames containing population data for age
 #' groups. Each data frame must contain columns "AGEGRP" and "TOT_POP".
-#' @param n simulated sample size.
+#' @param n simulated sample size(s).
 #'
 #' @details
 #' Each data frame must contain 19 rows. The first row represents the total
 #' population of all age groups while the next 18 rows represent age groups
 #' from 0 to 89 in increments of 5 years.
 #' 
+#' The sample size can be either a single value or a vector the same length as
+#' the number of data frames in x. If a single value is provided, the same
+#' sample size is used for all data frames. If a vector is provided, each
+#' element corresponds to the sample size for each data frame in x.
+#' 
 #' @return List of arrays containing simulated ages.
 #'
 #' @examples
@@ -25,25 +30,30 @@
 #' # set population total for all age groups
 #' y$TOT_POP[1] <- sum(y$TOT_POP)
 #' simulate_age(list(x = x, y = y), 15)
+#' # different sample sizes
+#' simulate_age(list(x = x, y = y), c(15, 10))
 #'
 #' @export
 simulate_age <- function(x, n = 1e3) {
 
-  if (!any(c("data.frame", "list") %in% class(x))) {
-    stop("x must be a data.frame or list")
-  }
-
-  if (is.data.frame(x)) x <- list(x)
+  x <- .check_types(x,
+                    "data.frame",
+                    "`x` must be a data frame or list of data frames")
 
   if (.check_names(x, c("AGEGRP", "TOT_POP"))) {
-    stop("x data frames must contain columns 'AGEGRP' and 'TOT_POP'")
+    stop("`x` data frames must contain columns 'AGEGRP' and 'TOT_POP'")
+  }
+
+  if (any(unlist(lapply(x, \(y) nrow(y) != 19)))) {
+    stop("`x` data frames must contain 19 rows")
   }
 
-  if (any(unlist(lapply(x, function(y) nrow(y) != 19)))) {
-    stop("x data frames must contain 19 rows")
+  if (!(length(n) == 1 | length(n) == length(x))) {
+    stop("`n` must be a single value or a vector with values for ",
+         "each data frame in `x`")
   }
 
-  lapply(x, function(df) .simulate_age(df, n))
+  purrr::pmap(list(x, n), \(df, n) .simulate_age(df, n))
 
 }