-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
1ac13c7
commit b16a1a7
Showing
5 changed files
with
197 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
#' Standardize the SDTM ID columns (STUDYID, USUBJID, and SUBJID) | ||
#' | ||
#' @param data data.frame or similar to have standard IDs added | ||
#' @param id a data.frame or list of data.frames to merge other identifiers into | ||
#' the data | ||
#' @param studyid a character scalar indicating the study number | ||
#' @param sep_usubjid a character scalar separating the STUDYID from the SUBJID | ||
#' @param allow_missing_id Is it acceptable for some USUBJID and SUBJID to be | ||
#' missing? | ||
#' @return A data.frame with columns for STUDYID, USUBJID, and SUBJID and other | ||
#' identifier columns removed. | ||
#' @export | ||
standardize_sdtm_id <- function(data, id, studyid, sep_usubjid="-", allow_missing_id=FALSE) { | ||
stopifnot("'sep_usubjid' must be a character"=is.character(sep_usubjid)) | ||
stopifnot("'sep_usubjid' must not be NA"=!is.na(sep_usubjid)) | ||
stopifnot("'sep_usubjid' must be a scalar"=length(sep_usubjid) == 1) | ||
# Track that no data were created or removed during this process (mainly to | ||
# confirm id merge accuracy) | ||
rowid_col <- paste0(max(names(data)), "X") | ||
data[[rowid_col]] <- seq_len(nrow(data)) | ||
id_cols <- c("STUDYID", "USUBJID", "SUBJID") | ||
ret <- data | ||
if (!("STUDYID" %in% names(ret))) { | ||
stopifnot("'studyid' must be a character"=is.character(studyid)) | ||
stopifnot("'studyid' must not be NA"=!is.na(studyid)) | ||
stopifnot("'studyid' must be a scalar"=length(studyid) == 1) | ||
ret$STUDYID <- studyid | ||
} | ||
stopifnot("NA not allowed in STUDYID"=!any(is.na(ret$STUDYID))) | ||
if (!any(c("SUBJID", "USUBJID") %in% names(ret))) { | ||
if (missing(id)) { | ||
stop("'SUBJID' and 'USUBJID' columns are not in 'data' and no 'id' was provided") | ||
} | ||
if (is.data.frame(id)) { | ||
id <- list(id) | ||
} | ||
for (idx in seq_along(id)) { | ||
current_id <- id[[idx]] | ||
if (length(intersect(names(ret), setdiff(names(current_id), id_cols))) > 0) { | ||
ret <- dplyr::left_join(ret, current_id, by=intersect(names(ret), names(current_id))) | ||
} | ||
# Drop columns from id (e.g. RANDID) that are not STUDYID, SUBJID, or USUBJID | ||
ret <- ret[, setdiff(names(ret), setdiff(names(current_id), id_cols)), drop=FALSE] | ||
} | ||
} | ||
if (!allow_missing_id) { | ||
if ("SUBJID" %in% names(ret)) stopifnot("NA not allowed in SUBJID"=!any(is.na(ret$SUBJID))) | ||
if ("USUBJID" %in% names(ret)) stopifnot("NA not allowed in USUBJID"=!any(is.na(ret$USUBJID))) | ||
} | ||
if ("SUBJID" %in% names(ret) & !("USUBJID" %in% names(ret))) { | ||
# Put USUBJID in | ||
ret$USUBJID <- paste(ret$STUDYID, ret$SUBJID, sep=sep_usubjid) | ||
} | ||
if ("USUBJID" %in% names(ret) & !("SUBJID" %in% names(ret))) { | ||
ret$SUBJID <- | ||
substr( | ||
ret$USUBJID, | ||
start=nchar(ret$STUDYID) + nchar(sep_usubjid) + 1, | ||
stop=nchar(ret$USUBJID) | ||
) | ||
} | ||
if (!all(id_cols %in% names(ret))) { | ||
stop("Not all required columns are in the data.frame for return. Report a bug.") # nocov | ||
} | ||
stopifnot("Likely merge error, different number of rows in output than input"=nrow(ret) == nrow(data)) | ||
stopifnot("Likely subtle merge error, input rows are missing from output"=!any(duplicated(ret[[rowid_col]]))) | ||
ret[, c(id_cols, setdiff(names(ret), c(rowid_col, id_cols)))] | ||
} |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
test_that("standardize_sdtm_id", { | ||
# Column order is standardized | ||
expect_equal( | ||
standardize_sdtm_id(data=data.frame(STUDYID="A", SUBJID="B", USUBJID="C")), | ||
data.frame(STUDYID="A", USUBJID="C", SUBJID="B") | ||
) | ||
# STUDYID is added | ||
expect_equal( | ||
standardize_sdtm_id(data=data.frame(SUBJID="B", USUBJID="C"), studyid="A"), | ||
data.frame(STUDYID="A", USUBJID="C", SUBJID="B") | ||
) | ||
# SUBJID is added | ||
expect_equal( | ||
standardize_sdtm_id(data=data.frame(USUBJID="A-C"), studyid="A"), | ||
data.frame(STUDYID="A", USUBJID="A-C", SUBJID="C") | ||
) | ||
# USUBJID is added | ||
expect_equal( | ||
standardize_sdtm_id(data=data.frame(SUBJID="A-C"), studyid="B"), | ||
data.frame(STUDYID="B", USUBJID="B-A-C", SUBJID="A-C") | ||
) | ||
}) | ||
|
||
test_that("standardize_sdtm_id id addition", { | ||
expect_equal( | ||
standardize_sdtm_id( | ||
data=data.frame(RANDID="foo"), studyid="B", | ||
id=data.frame(RANDID="foo", SUBJID="C") | ||
), | ||
data.frame(STUDYID="B", USUBJID="B-C", SUBJID="C") | ||
) | ||
expect_error( | ||
standardize_sdtm_id( | ||
data=data.frame(RANDID="foo"), studyid="B", | ||
id=data.frame(RANDID="foo", bar=c("bar", "baz"), SUBJID="C") | ||
), | ||
regexp="Likely merge error, different number of rows in output than input" | ||
) | ||
}) | ||
|
||
test_that("standardize_sdtm_id alternate paths", { | ||
expect_error( | ||
standardize_sdtm_id(data=data.frame(STUDYID="A", SUBJID=NA_character_, USUBJID="C")), | ||
regexp="NA not allowed in SUBJID" | ||
) | ||
expect_error( | ||
standardize_sdtm_id(data=data.frame(STUDYID="A", SUBJID="B", USUBJID=NA_character_)), | ||
regexp="NA not allowed in USUBJID" | ||
) | ||
expect_equal( | ||
standardize_sdtm_id(data=data.frame(STUDYID="A", SUBJID=NA_character_, USUBJID="C"), allow_missing_id=TRUE), | ||
data.frame(STUDYID="A", USUBJID="C", SUBJID=NA_character_) | ||
) | ||
expect_equal( | ||
standardize_sdtm_id(data=data.frame(STUDYID="A", SUBJID="B", USUBJID=NA_character_), allow_missing_id=TRUE), | ||
data.frame(STUDYID="A", USUBJID=NA_character_, SUBJID="B") | ||
) | ||
}) | ||
|
||
test_that("standardize_sdtm_id errors", { | ||
expect_error( | ||
standardize_sdtm_id(data=data.frame(SUBJID="B", USUBJID="C"), sep_usubjid=factor("A")), | ||
regexp="'sep_usubjid' must be a character" | ||
) | ||
expect_error( | ||
standardize_sdtm_id(data=data.frame(SUBJID="B", USUBJID="C"), sep_usubjid=NA_character_), | ||
regexp="'sep_usubjid' must not be NA" | ||
) | ||
expect_error( | ||
standardize_sdtm_id(data=data.frame(SUBJID="B", USUBJID="C"), sep_usubjid=c("A", "B")), | ||
regexp="'sep_usubjid' must be a scalar" | ||
) | ||
expect_error( | ||
standardize_sdtm_id(data=data.frame(SUBJID="B", USUBJID="C")), | ||
regexp='argument "studyid" is missing, with no default' | ||
) | ||
expect_error( | ||
standardize_sdtm_id(data=data.frame(SUBJID="B", USUBJID="C"), studyid=factor("A")), | ||
regexp="'studyid' must be a character" | ||
) | ||
expect_error( | ||
standardize_sdtm_id(data=data.frame(SUBJID="B", USUBJID="C"), studyid=NA_character_), | ||
regexp="'studyid' must not be NA" | ||
) | ||
expect_error( | ||
standardize_sdtm_id(data=data.frame(SUBJID="B", USUBJID="C"), studyid=c("A", "B")), | ||
regexp="'studyid' must be a scalar" | ||
) | ||
expect_error( | ||
standardize_sdtm_id(data=data.frame(STUDYID=NA_character_, SUBJID="B", USUBJID="C")), | ||
regexp="NA not allowed in STUDYID" | ||
) | ||
}) |