diff --git a/DESCRIPTION b/DESCRIPTION index 39ca68e..05655d0 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: Rsdtm Title: Study Data Tabulation Model (SDTM) import, export, and management assistance -Version: 0.0.0.9011 +Version: 0.0.0.9012 Authors@R: c( person("William", "Denney", email = "wdenney@humanpredictions.com", role = c("aut", "cre")), diff --git a/NAMESPACE b/NAMESPACE index a7337c2..2c0d0ab 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -18,6 +18,7 @@ export(sdtm_dtc_to_datetime) export(sdtm_first_dose) export(sdtm_time_actual) export(simplify_sdtm_names) +export(standardize_sdtm_id) export(strip_attributes) export(supp_reformat) importFrom(dplyr,anti_join) diff --git a/R/standardize_ids.R b/R/standardize_ids.R new file mode 100644 index 0000000..3d0dd17 --- /dev/null +++ b/R/standardize_ids.R @@ -0,0 +1,68 @@ +#' Standardize the SDTM ID columns (STUDYID, USUBJID, and SUBJID) +#' +#' @param data data.frame or similar to have standard IDs added +#' @param id a data.frame or list of data.frames to merge other identifiers into +#' the data +#' @param studyid a character scalar indicating the study number +#' @param sep_usubjid a character scalar separating the STUDYID from the SUBJID +#' @param allow_missing_id Is it acceptable for some USUBJID and SUBJID to be +#' missing? +#' @return A data.frame with columns for STUDYID, USUBJID, and SUBJID and other +#' identifier columns removed. +#' @export +standardize_sdtm_id <- function(data, id, studyid, sep_usubjid="-", allow_missing_id=FALSE) { + stopifnot("'sep_usubjid' must be a character"=is.character(sep_usubjid)) + stopifnot("'sep_usubjid' must not be NA"=!is.na(sep_usubjid)) + stopifnot("'sep_usubjid' must be a scalar"=length(sep_usubjid) == 1) + # Track that no data were created or removed during this process (mainly to + # confirm id merge accuracy) + rowid_col <- paste0(max(names(data)), "X") + data[[rowid_col]] <- seq_len(nrow(data)) + id_cols <- c("STUDYID", "USUBJID", "SUBJID") + ret <- data + if (!("STUDYID" %in% names(ret))) { + stopifnot("'studyid' must be a character"=is.character(studyid)) + stopifnot("'studyid' must not be NA"=!is.na(studyid)) + stopifnot("'studyid' must be a scalar"=length(studyid) == 1) + ret$STUDYID <- studyid + } + stopifnot("NA not allowed in STUDYID"=!any(is.na(ret$STUDYID))) + if (!any(c("SUBJID", "USUBJID") %in% names(ret))) { + if (missing(id)) { + stop("'SUBJID' and 'USUBJID' columns are not in 'data' and no 'id' was provided") + } + if (is.data.frame(id)) { + id <- list(id) + } + for (idx in seq_along(id)) { + current_id <- id[[idx]] + if (length(intersect(names(ret), setdiff(names(current_id), id_cols))) > 0) { + ret <- dplyr::left_join(ret, current_id, by=intersect(names(ret), names(current_id))) + } + # Drop columns from id (e.g. RANDID) that are not STUDYID, SUBJID, or USUBJID + ret <- ret[, setdiff(names(ret), setdiff(names(current_id), id_cols)), drop=FALSE] + } + } + if (!allow_missing_id) { + if ("SUBJID" %in% names(ret)) stopifnot("NA not allowed in SUBJID"=!any(is.na(ret$SUBJID))) + if ("USUBJID" %in% names(ret)) stopifnot("NA not allowed in USUBJID"=!any(is.na(ret$USUBJID))) + } + if ("SUBJID" %in% names(ret) & !("USUBJID" %in% names(ret))) { + # Put USUBJID in + ret$USUBJID <- paste(ret$STUDYID, ret$SUBJID, sep=sep_usubjid) + } + if ("USUBJID" %in% names(ret) & !("SUBJID" %in% names(ret))) { + ret$SUBJID <- + substr( + ret$USUBJID, + start=nchar(ret$STUDYID) + nchar(sep_usubjid) + 1, + stop=nchar(ret$USUBJID) + ) + } + if (!all(id_cols %in% names(ret))) { + stop("Not all required columns are in the data.frame for return. Report a bug.") # nocov + } + stopifnot("Likely merge error, different number of rows in output than input"=nrow(ret) == nrow(data)) + stopifnot("Likely subtle merge error, input rows are missing from output"=!any(duplicated(ret[[rowid_col]]))) + ret[, c(id_cols, setdiff(names(ret), c(rowid_col, id_cols)))] +} diff --git a/man/standardize_sdtm_id.Rd b/man/standardize_sdtm_id.Rd new file mode 100644 index 0000000..53f4893 --- /dev/null +++ b/man/standardize_sdtm_id.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/standardize_ids.R +\name{standardize_sdtm_id} +\alias{standardize_sdtm_id} +\title{Standardize the SDTM ID columns (STUDYID, USUBJID, and SUBJID)} +\usage{ +standardize_sdtm_id( + data, + id, + studyid, + sep_usubjid = "-", + allow_missing_id = FALSE +) +} +\arguments{ +\item{data}{data.frame or similar to have standard IDs added} + +\item{id}{a data.frame or list of data.frames to merge other identifiers into +the data} + +\item{studyid}{a character scalar indicating the study number} + +\item{sep_usubjid}{a character scalar separating the STUDYID from the SUBJID} + +\item{allow_missing_id}{Is it acceptable for some USUBJID and SUBJID to be +missing?} +} +\value{ +A data.frame with columns for STUDYID, USUBJID, and SUBJID and other + identifier columns removed. +} +\description{ +Standardize the SDTM ID columns (STUDYID, USUBJID, and SUBJID) +} diff --git a/tests/testthat/test-standardize_ids.R b/tests/testthat/test-standardize_ids.R new file mode 100644 index 0000000..a3fcdd9 --- /dev/null +++ b/tests/testthat/test-standardize_ids.R @@ -0,0 +1,93 @@ +test_that("standardize_sdtm_id", { + # Column order is standardized + expect_equal( + standardize_sdtm_id(data=data.frame(STUDYID="A", SUBJID="B", USUBJID="C")), + data.frame(STUDYID="A", USUBJID="C", SUBJID="B") + ) + # STUDYID is added + expect_equal( + standardize_sdtm_id(data=data.frame(SUBJID="B", USUBJID="C"), studyid="A"), + data.frame(STUDYID="A", USUBJID="C", SUBJID="B") + ) + # SUBJID is added + expect_equal( + standardize_sdtm_id(data=data.frame(USUBJID="A-C"), studyid="A"), + data.frame(STUDYID="A", USUBJID="A-C", SUBJID="C") + ) + # USUBJID is added + expect_equal( + standardize_sdtm_id(data=data.frame(SUBJID="A-C"), studyid="B"), + data.frame(STUDYID="B", USUBJID="B-A-C", SUBJID="A-C") + ) +}) + +test_that("standardize_sdtm_id id addition", { + expect_equal( + standardize_sdtm_id( + data=data.frame(RANDID="foo"), studyid="B", + id=data.frame(RANDID="foo", SUBJID="C") + ), + data.frame(STUDYID="B", USUBJID="B-C", SUBJID="C") + ) + expect_error( + standardize_sdtm_id( + data=data.frame(RANDID="foo"), studyid="B", + id=data.frame(RANDID="foo", bar=c("bar", "baz"), SUBJID="C") + ), + regexp="Likely merge error, different number of rows in output than input" + ) +}) + +test_that("standardize_sdtm_id alternate paths", { + expect_error( + standardize_sdtm_id(data=data.frame(STUDYID="A", SUBJID=NA_character_, USUBJID="C")), + regexp="NA not allowed in SUBJID" + ) + expect_error( + standardize_sdtm_id(data=data.frame(STUDYID="A", SUBJID="B", USUBJID=NA_character_)), + regexp="NA not allowed in USUBJID" + ) + expect_equal( + standardize_sdtm_id(data=data.frame(STUDYID="A", SUBJID=NA_character_, USUBJID="C"), allow_missing_id=TRUE), + data.frame(STUDYID="A", USUBJID="C", SUBJID=NA_character_) + ) + expect_equal( + standardize_sdtm_id(data=data.frame(STUDYID="A", SUBJID="B", USUBJID=NA_character_), allow_missing_id=TRUE), + data.frame(STUDYID="A", USUBJID=NA_character_, SUBJID="B") + ) +}) + +test_that("standardize_sdtm_id errors", { + expect_error( + standardize_sdtm_id(data=data.frame(SUBJID="B", USUBJID="C"), sep_usubjid=factor("A")), + regexp="'sep_usubjid' must be a character" + ) + expect_error( + standardize_sdtm_id(data=data.frame(SUBJID="B", USUBJID="C"), sep_usubjid=NA_character_), + regexp="'sep_usubjid' must not be NA" + ) + expect_error( + standardize_sdtm_id(data=data.frame(SUBJID="B", USUBJID="C"), sep_usubjid=c("A", "B")), + regexp="'sep_usubjid' must be a scalar" + ) + expect_error( + standardize_sdtm_id(data=data.frame(SUBJID="B", USUBJID="C")), + regexp='argument "studyid" is missing, with no default' + ) + expect_error( + standardize_sdtm_id(data=data.frame(SUBJID="B", USUBJID="C"), studyid=factor("A")), + regexp="'studyid' must be a character" + ) + expect_error( + standardize_sdtm_id(data=data.frame(SUBJID="B", USUBJID="C"), studyid=NA_character_), + regexp="'studyid' must not be NA" + ) + expect_error( + standardize_sdtm_id(data=data.frame(SUBJID="B", USUBJID="C"), studyid=c("A", "B")), + regexp="'studyid' must be a scalar" + ) + expect_error( + standardize_sdtm_id(data=data.frame(STUDYID=NA_character_, SUBJID="B", USUBJID="C")), + regexp="NA not allowed in STUDYID" + ) +})