Skip to content

Commit

Permalink
add standardize_sdtm_id()
Browse files Browse the repository at this point in the history
  • Loading branch information
billdenney committed Sep 7, 2021
1 parent 1ac13c7 commit b16a1a7
Show file tree
Hide file tree
Showing 5 changed files with 197 additions and 1 deletion.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: Rsdtm
Title: Study Data Tabulation Model (SDTM) import, export, and management assistance
Version: 0.0.0.9011
Version: 0.0.0.9012
Authors@R:
c(
person("William", "Denney", email = "wdenney@humanpredictions.com", role = c("aut", "cre")),
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ export(sdtm_dtc_to_datetime)
export(sdtm_first_dose)
export(sdtm_time_actual)
export(simplify_sdtm_names)
export(standardize_sdtm_id)
export(strip_attributes)
export(supp_reformat)
importFrom(dplyr,anti_join)
Expand Down
68 changes: 68 additions & 0 deletions R/standardize_ids.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#' Standardize the SDTM ID columns (STUDYID, USUBJID, and SUBJID)
#'
#' @param data data.frame or similar to have standard IDs added
#' @param id a data.frame or list of data.frames to merge other identifiers into
#' the data
#' @param studyid a character scalar indicating the study number
#' @param sep_usubjid a character scalar separating the STUDYID from the SUBJID
#' @param allow_missing_id Is it acceptable for some USUBJID and SUBJID to be
#' missing?
#' @return A data.frame with columns for STUDYID, USUBJID, and SUBJID and other
#' identifier columns removed.
#' @export
standardize_sdtm_id <- function(data, id, studyid, sep_usubjid="-", allow_missing_id=FALSE) {
stopifnot("'sep_usubjid' must be a character"=is.character(sep_usubjid))
stopifnot("'sep_usubjid' must not be NA"=!is.na(sep_usubjid))
stopifnot("'sep_usubjid' must be a scalar"=length(sep_usubjid) == 1)
# Track that no data were created or removed during this process (mainly to
# confirm id merge accuracy)
rowid_col <- paste0(max(names(data)), "X")
data[[rowid_col]] <- seq_len(nrow(data))
id_cols <- c("STUDYID", "USUBJID", "SUBJID")
ret <- data
if (!("STUDYID" %in% names(ret))) {
stopifnot("'studyid' must be a character"=is.character(studyid))
stopifnot("'studyid' must not be NA"=!is.na(studyid))
stopifnot("'studyid' must be a scalar"=length(studyid) == 1)
ret$STUDYID <- studyid
}
stopifnot("NA not allowed in STUDYID"=!any(is.na(ret$STUDYID)))
if (!any(c("SUBJID", "USUBJID") %in% names(ret))) {
if (missing(id)) {
stop("'SUBJID' and 'USUBJID' columns are not in 'data' and no 'id' was provided")
}
if (is.data.frame(id)) {
id <- list(id)
}
for (idx in seq_along(id)) {
current_id <- id[[idx]]
if (length(intersect(names(ret), setdiff(names(current_id), id_cols))) > 0) {
ret <- dplyr::left_join(ret, current_id, by=intersect(names(ret), names(current_id)))
}
# Drop columns from id (e.g. RANDID) that are not STUDYID, SUBJID, or USUBJID
ret <- ret[, setdiff(names(ret), setdiff(names(current_id), id_cols)), drop=FALSE]
}
}
if (!allow_missing_id) {
if ("SUBJID" %in% names(ret)) stopifnot("NA not allowed in SUBJID"=!any(is.na(ret$SUBJID)))
if ("USUBJID" %in% names(ret)) stopifnot("NA not allowed in USUBJID"=!any(is.na(ret$USUBJID)))
}
if ("SUBJID" %in% names(ret) & !("USUBJID" %in% names(ret))) {
# Put USUBJID in
ret$USUBJID <- paste(ret$STUDYID, ret$SUBJID, sep=sep_usubjid)
}
if ("USUBJID" %in% names(ret) & !("SUBJID" %in% names(ret))) {
ret$SUBJID <-
substr(
ret$USUBJID,
start=nchar(ret$STUDYID) + nchar(sep_usubjid) + 1,
stop=nchar(ret$USUBJID)
)
}
if (!all(id_cols %in% names(ret))) {
stop("Not all required columns are in the data.frame for return. Report a bug.") # nocov
}
stopifnot("Likely merge error, different number of rows in output than input"=nrow(ret) == nrow(data))
stopifnot("Likely subtle merge error, input rows are missing from output"=!any(duplicated(ret[[rowid_col]])))
ret[, c(id_cols, setdiff(names(ret), c(rowid_col, id_cols)))]
}
34 changes: 34 additions & 0 deletions man/standardize_sdtm_id.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

93 changes: 93 additions & 0 deletions tests/testthat/test-standardize_ids.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
test_that("standardize_sdtm_id", {
# Column order is standardized
expect_equal(
standardize_sdtm_id(data=data.frame(STUDYID="A", SUBJID="B", USUBJID="C")),
data.frame(STUDYID="A", USUBJID="C", SUBJID="B")
)
# STUDYID is added
expect_equal(
standardize_sdtm_id(data=data.frame(SUBJID="B", USUBJID="C"), studyid="A"),
data.frame(STUDYID="A", USUBJID="C", SUBJID="B")
)
# SUBJID is added
expect_equal(
standardize_sdtm_id(data=data.frame(USUBJID="A-C"), studyid="A"),
data.frame(STUDYID="A", USUBJID="A-C", SUBJID="C")
)
# USUBJID is added
expect_equal(
standardize_sdtm_id(data=data.frame(SUBJID="A-C"), studyid="B"),
data.frame(STUDYID="B", USUBJID="B-A-C", SUBJID="A-C")
)
})

test_that("standardize_sdtm_id id addition", {
expect_equal(
standardize_sdtm_id(
data=data.frame(RANDID="foo"), studyid="B",
id=data.frame(RANDID="foo", SUBJID="C")
),
data.frame(STUDYID="B", USUBJID="B-C", SUBJID="C")
)
expect_error(
standardize_sdtm_id(
data=data.frame(RANDID="foo"), studyid="B",
id=data.frame(RANDID="foo", bar=c("bar", "baz"), SUBJID="C")
),
regexp="Likely merge error, different number of rows in output than input"
)
})

test_that("standardize_sdtm_id alternate paths", {
expect_error(
standardize_sdtm_id(data=data.frame(STUDYID="A", SUBJID=NA_character_, USUBJID="C")),
regexp="NA not allowed in SUBJID"
)
expect_error(
standardize_sdtm_id(data=data.frame(STUDYID="A", SUBJID="B", USUBJID=NA_character_)),
regexp="NA not allowed in USUBJID"
)
expect_equal(
standardize_sdtm_id(data=data.frame(STUDYID="A", SUBJID=NA_character_, USUBJID="C"), allow_missing_id=TRUE),
data.frame(STUDYID="A", USUBJID="C", SUBJID=NA_character_)
)
expect_equal(
standardize_sdtm_id(data=data.frame(STUDYID="A", SUBJID="B", USUBJID=NA_character_), allow_missing_id=TRUE),
data.frame(STUDYID="A", USUBJID=NA_character_, SUBJID="B")
)
})

test_that("standardize_sdtm_id errors", {
expect_error(
standardize_sdtm_id(data=data.frame(SUBJID="B", USUBJID="C"), sep_usubjid=factor("A")),
regexp="'sep_usubjid' must be a character"
)
expect_error(
standardize_sdtm_id(data=data.frame(SUBJID="B", USUBJID="C"), sep_usubjid=NA_character_),
regexp="'sep_usubjid' must not be NA"
)
expect_error(
standardize_sdtm_id(data=data.frame(SUBJID="B", USUBJID="C"), sep_usubjid=c("A", "B")),
regexp="'sep_usubjid' must be a scalar"
)
expect_error(
standardize_sdtm_id(data=data.frame(SUBJID="B", USUBJID="C")),
regexp='argument "studyid" is missing, with no default'
)
expect_error(
standardize_sdtm_id(data=data.frame(SUBJID="B", USUBJID="C"), studyid=factor("A")),
regexp="'studyid' must be a character"
)
expect_error(
standardize_sdtm_id(data=data.frame(SUBJID="B", USUBJID="C"), studyid=NA_character_),
regexp="'studyid' must not be NA"
)
expect_error(
standardize_sdtm_id(data=data.frame(SUBJID="B", USUBJID="C"), studyid=c("A", "B")),
regexp="'studyid' must be a scalar"
)
expect_error(
standardize_sdtm_id(data=data.frame(STUDYID=NA_character_, SUBJID="B", USUBJID="C")),
regexp="NA not allowed in STUDYID"
)
})

0 comments on commit b16a1a7

Please sign in to comment.