add standardize_sdtm_id()

billdenney · Sep 7, 2021 · b16a1a7 · b16a1a7
1 parent 1ac13c7
commit b16a1a7
Show file tree

Hide file tree

Showing 5 changed files with 197 additions and 1 deletion.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: Rsdtm
 Title: Study Data Tabulation Model (SDTM) import, export, and management assistance
-Version: 0.0.0.9011
+Version: 0.0.0.9012
 Authors@R:
   c(
     person("William", "Denney", email = "wdenney@humanpredictions.com", role = c("aut", "cre")),

diff --git a/NAMESPACE b/NAMESPACE
@@ -18,6 +18,7 @@ export(sdtm_dtc_to_datetime)
 export(sdtm_first_dose)
 export(sdtm_time_actual)
 export(simplify_sdtm_names)
+export(standardize_sdtm_id)
 export(strip_attributes)
 export(supp_reformat)
 importFrom(dplyr,anti_join)

diff --git a/R/standardize_ids.R b/R/standardize_ids.R
@@ -0,0 +1,68 @@
+#' Standardize the SDTM ID columns (STUDYID, USUBJID, and SUBJID)
+#' 
+#' @param data data.frame or similar to have standard IDs added
+#' @param id a data.frame or list of data.frames to merge other identifiers into
+#'   the data
+#' @param studyid a character scalar indicating the study number
+#' @param sep_usubjid a character scalar separating the STUDYID from the SUBJID
+#' @param allow_missing_id Is it acceptable for some USUBJID and SUBJID to be
+#'   missing?
+#' @return A data.frame with columns for STUDYID, USUBJID, and SUBJID and other
+#'   identifier columns removed.
+#' @export
+standardize_sdtm_id <- function(data, id, studyid, sep_usubjid="-", allow_missing_id=FALSE) {
+  stopifnot("'sep_usubjid' must be a character"=is.character(sep_usubjid))
+  stopifnot("'sep_usubjid' must not be NA"=!is.na(sep_usubjid))
+  stopifnot("'sep_usubjid' must be a scalar"=length(sep_usubjid) == 1)
+  # Track that no data were created or removed during this process (mainly to
+  # confirm id merge accuracy)
+  rowid_col <- paste0(max(names(data)), "X")
+  data[[rowid_col]] <- seq_len(nrow(data))
+  id_cols <- c("STUDYID", "USUBJID", "SUBJID")
+  ret <- data
+  if (!("STUDYID" %in% names(ret))) {
+    stopifnot("'studyid' must be a character"=is.character(studyid))
+    stopifnot("'studyid' must not be NA"=!is.na(studyid))
+    stopifnot("'studyid' must be a scalar"=length(studyid) == 1)
+    ret$STUDYID <- studyid
+  }
+  stopifnot("NA not allowed in STUDYID"=!any(is.na(ret$STUDYID)))
+  if (!any(c("SUBJID", "USUBJID") %in% names(ret))) {
+    if (missing(id)) {
+      stop("'SUBJID' and 'USUBJID' columns are not in 'data' and no 'id' was provided")
+    }
+    if (is.data.frame(id)) {
+      id <- list(id)
+    }
+    for (idx in seq_along(id)) {
+      current_id <- id[[idx]]
+      if (length(intersect(names(ret), setdiff(names(current_id), id_cols))) > 0) {
+        ret <- dplyr::left_join(ret, current_id, by=intersect(names(ret), names(current_id)))
+      }
+      # Drop columns from id (e.g. RANDID) that are not STUDYID, SUBJID, or USUBJID
+      ret <- ret[, setdiff(names(ret), setdiff(names(current_id), id_cols)), drop=FALSE]
+    }
+  }
+  if (!allow_missing_id) {
+    if ("SUBJID" %in% names(ret)) stopifnot("NA not allowed in SUBJID"=!any(is.na(ret$SUBJID)))
+    if ("USUBJID" %in% names(ret)) stopifnot("NA not allowed in USUBJID"=!any(is.na(ret$USUBJID)))
+  }
+  if ("SUBJID" %in% names(ret) & !("USUBJID" %in% names(ret))) {
+    # Put USUBJID in
+    ret$USUBJID <- paste(ret$STUDYID, ret$SUBJID, sep=sep_usubjid)
+  }
+  if ("USUBJID" %in% names(ret) & !("SUBJID" %in% names(ret))) {
+    ret$SUBJID <-
+      substr(
+        ret$USUBJID,
+        start=nchar(ret$STUDYID) + nchar(sep_usubjid) + 1,
+        stop=nchar(ret$USUBJID)
+      )
+  }
+  if (!all(id_cols %in% names(ret))) {
+    stop("Not all required columns are in the data.frame for return.  Report a bug.") # nocov
+  }
+  stopifnot("Likely merge error, different number of rows in output than input"=nrow(ret) == nrow(data))
+  stopifnot("Likely subtle merge error, input rows are missing from output"=!any(duplicated(ret[[rowid_col]])))
+  ret[, c(id_cols, setdiff(names(ret), c(rowid_col, id_cols)))]
+}
diff --git a/man/standardize_sdtm_id.Rd b/man/standardize_sdtm_id.Rd
diff --git a/tests/testthat/test-standardize_ids.R b/tests/testthat/test-standardize_ids.R
@@ -0,0 +1,93 @@
+test_that("standardize_sdtm_id", {
+  # Column order is standardized
+  expect_equal(
+    standardize_sdtm_id(data=data.frame(STUDYID="A", SUBJID="B", USUBJID="C")),
+    data.frame(STUDYID="A", USUBJID="C", SUBJID="B")
+  )
+  # STUDYID is added
+  expect_equal(
+    standardize_sdtm_id(data=data.frame(SUBJID="B", USUBJID="C"), studyid="A"),
+    data.frame(STUDYID="A", USUBJID="C", SUBJID="B")
+  )
+  # SUBJID is added
+  expect_equal(
+    standardize_sdtm_id(data=data.frame(USUBJID="A-C"), studyid="A"),
+    data.frame(STUDYID="A", USUBJID="A-C", SUBJID="C")
+  )
+  # USUBJID is added
+  expect_equal(
+    standardize_sdtm_id(data=data.frame(SUBJID="A-C"), studyid="B"),
+    data.frame(STUDYID="B", USUBJID="B-A-C", SUBJID="A-C")
+  )
+})
+
+test_that("standardize_sdtm_id id addition", {
+  expect_equal(
+    standardize_sdtm_id(
+      data=data.frame(RANDID="foo"), studyid="B",
+      id=data.frame(RANDID="foo", SUBJID="C")
+    ),
+    data.frame(STUDYID="B", USUBJID="B-C", SUBJID="C")
+  )
+  expect_error(
+    standardize_sdtm_id(
+      data=data.frame(RANDID="foo"), studyid="B",
+      id=data.frame(RANDID="foo", bar=c("bar", "baz"), SUBJID="C")
+    ),
+    regexp="Likely merge error, different number of rows in output than input"
+  )
+})
+
+test_that("standardize_sdtm_id alternate paths", {
+  expect_error(
+    standardize_sdtm_id(data=data.frame(STUDYID="A", SUBJID=NA_character_, USUBJID="C")),
+    regexp="NA not allowed in SUBJID"
+  )
+  expect_error(
+    standardize_sdtm_id(data=data.frame(STUDYID="A", SUBJID="B", USUBJID=NA_character_)),
+    regexp="NA not allowed in USUBJID"
+  )
+  expect_equal(
+    standardize_sdtm_id(data=data.frame(STUDYID="A", SUBJID=NA_character_, USUBJID="C"), allow_missing_id=TRUE),
+    data.frame(STUDYID="A", USUBJID="C", SUBJID=NA_character_)
+  )
+  expect_equal(
+    standardize_sdtm_id(data=data.frame(STUDYID="A", SUBJID="B", USUBJID=NA_character_), allow_missing_id=TRUE),
+    data.frame(STUDYID="A", USUBJID=NA_character_, SUBJID="B")
+  )
+})
+
+test_that("standardize_sdtm_id errors", {
+  expect_error(
+    standardize_sdtm_id(data=data.frame(SUBJID="B", USUBJID="C"), sep_usubjid=factor("A")),
+    regexp="'sep_usubjid' must be a character"
+  )
+  expect_error(
+    standardize_sdtm_id(data=data.frame(SUBJID="B", USUBJID="C"), sep_usubjid=NA_character_),
+    regexp="'sep_usubjid' must not be NA"
+  )
+  expect_error(
+    standardize_sdtm_id(data=data.frame(SUBJID="B", USUBJID="C"), sep_usubjid=c("A", "B")),
+    regexp="'sep_usubjid' must be a scalar"
+  )
+  expect_error(
+    standardize_sdtm_id(data=data.frame(SUBJID="B", USUBJID="C")),
+    regexp='argument "studyid" is missing, with no default'
+  )
+  expect_error(
+    standardize_sdtm_id(data=data.frame(SUBJID="B", USUBJID="C"), studyid=factor("A")),
+    regexp="'studyid' must be a character"
+  )
+  expect_error(
+    standardize_sdtm_id(data=data.frame(SUBJID="B", USUBJID="C"), studyid=NA_character_),
+    regexp="'studyid' must not be NA"
+  )
+  expect_error(
+    standardize_sdtm_id(data=data.frame(SUBJID="B", USUBJID="C"), studyid=c("A", "B")),
+    regexp="'studyid' must be a scalar"
+  )
+  expect_error(
+    standardize_sdtm_id(data=data.frame(STUDYID=NA_character_, SUBJID="B", USUBJID="C")),
+    regexp="NA not allowed in STUDYID"
+  )
+})