diff --git a/DESCRIPTION b/DESCRIPTION index 8be1d693..51ef2a44 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: MotrpacBicQC Type: Package Title: QC/QA functions for the MoTrPAC community -Version: 0.9.5 -Date: 2024-05-22 +Version: 0.9.51 +Date: 2024-08-07 Author: MoTrPAC Bioinformatics Center Maintainer: David Jimenez-Morales Description: R Package for the analysis of MoTrPAC datasets. diff --git a/R/metabolomics_qc.R b/R/metabolomics_qc.R index ab50a226..cebcaaa5 100644 --- a/R/metabolomics_qc.R +++ b/R/metabolomics_qc.R @@ -181,7 +181,7 @@ check_metadata_samples <- function(df, # filter only expected columns df <- filter_required_columns(df = df, type = "m_s", - verbose = TRUE) + verbose = verbose) # Check every column # sample_id: si @@ -276,12 +276,17 @@ check_metadata_samples <- function(df, } if("acquisition_date" %in% colnames(df)){ - if( any(grepl(":", df$acquisition_date)) ){ - if(verbose) message(" + (i) Assuming `acquisition_date` is in `MM/DD/YYYY HH:MM:SS AM/PM` format. Validating:") - icdt <- validate_dates_times(df = df, column_name = "acquisition_date", verbose = verbose) + if(any(is.na(df$acquisition_date))){ + if(verbose) message(" - (-) `acquisition_date` has NA values: FAIL") + ic <- ic + 1 }else{ - icdate <- validate_yyyymmdd_dates(df = df, date_column = "acquisition_date", verbose = verbose) - ic <- ic + icdate + if( any(grepl(":", df$acquisition_date)) ){ + if(verbose) message(" + (i) Assuming `acquisition_date` is in `MM/DD/YYYY HH:MM:SS AM/PM` format. Validating:") + icdt <- validate_dates_times(df = df, column_name = "acquisition_date", verbose = verbose) + }else{ + icdate <- validate_yyyymmdd_dates(df = df, date_column = "acquisition_date", verbose = verbose) + ic <- ic + icdate + } } }else{ if(verbose) message(" - (-) `acquisition_date` column missed: FAIL") @@ -1200,15 +1205,17 @@ load_metabolomics_batch <- function(input_results_folder, assay <- validate_assay(input_results_folder) tissue_code <- validate_tissue(input_results_folder) - total_issues <- validate_metabolomics(input_results_folder = input_results_folder, - cas = cas, - return_n_issues = TRUE, - full_report = FALSE, - f_proof = FALSE, - verbose = FALSE) + total_issues <- + validate_metabolomics( + input_results_folder = input_results_folder, + cas = cas, + return_n_issues = TRUE, + full_report = FALSE, + f_proof = FALSE, + verbose = FALSE) if(total_issues > 0){ - message("\n\tWARNING!!! Too many issues identified (", total_issues,"). This batch should not be processed until the issues are solved") + message("\tWARNING!!! Too many issues identified (", total_issues,"). This batch should not be processed until the issues are solved") } # Load Metabolomics---- @@ -1376,8 +1383,10 @@ combine_metabolomics_batch <- function(input_results_folder, verbose = TRUE){ # Load all datasets - metab_dfs <- load_metabolomics_batch(input_results_folder = input_results_folder, - cas = cas) + metab_dfs <- + load_metabolomics_batch( + input_results_folder = input_results_folder, + cas = cas, verbose = verbose) if(verbose) message("\n## MERGE") if(verbose) message("\nAll metabolomics datasets + basic phenotypic information") @@ -1677,9 +1686,11 @@ write_metabolomics_releases <- function(input_results_folder, if(verbose) message("+ Writing out ", cas, " ", phase_details, " ", tissue_code, " ", assay, " files", appendLF = FALSE) # Load all datasets---- - metab_dfs <- load_metabolomics_batch(input_results_folder = input_results_folder, - cas = cas, - verbose = FALSE) + metab_dfs <- + load_metabolomics_batch( + input_results_folder = input_results_folder, + cas = cas, + verbose = FALSE) # Create output folder------- if (is.null(folder_root)){ diff --git a/R/misc.R b/R/misc.R index 2c226804..4a855d86 100644 --- a/R/misc.R +++ b/R/misc.R @@ -310,27 +310,37 @@ filter_required_columns <- function(df, colnames(df) <- tolower(colnames(df)) missing_cols <- setdiff(emeta_metabo_coln_named, colnames(df)) if (length(missing_cols) > 0) { - if(verbose) message(" - (-) `metadata_metabolite`: Expected COLUMN NAMES are missed: FAIL") - message(paste0("\t The following required columns are not present: `", paste(missing_cols, collapse = ", "), "`")) + if (verbose) message(" - (-) `metadata_metabolite`: Expected COLUMN NAMES are missed: FAIL") + message(paste0("\t The following required columns are not present: `", + paste(missing_cols, collapse = ", "), "`")) } else { - if(verbose) message(" + (+) All required columns present") + if (verbose) message(" + (+) All required columns present") df <- subset(df, select = emeta_metabo_coln_named) } return(df) - - } else if (type == "m_s"){ + } else if (type == "m_s") { emeta_sample_coln <- c("sample_id", "sample_type", "sample_order", "raw_file", "extraction_date", "acquisition_date", "lc_column_id") + required_cols <- setdiff(emeta_sample_coln, c("extraction_date", "acquisition_date", "lc_column_id")) missing_cols <- setdiff(emeta_sample_coln, colnames(df)) - - if (length(missing_cols) > 0) { - if(verbose) message(" - (-) `metadata_sample`: Expected COLUMN NAMES are missed: FAIL") - message(paste0("\t The following required columns are not present: `", paste(missing_cols, collapse = ", "), "`")) + missing_required_cols <- setdiff(required_cols, colnames(df)) + + if (length(missing_required_cols) > 0) { + if (verbose) message(" - (-) `metadata_sample`: Expected COLUMN NAMES are missed: FAIL") + message(paste0("\t The following required columns are not present: `", + paste(missing_required_cols, collapse = ", "), "`")) } else { - if(verbose) message(" + (+) All required columns present") + if (length(missing_cols) > 0) { + message(" - (-) `metadata_sample`: recently required COLUMN NAMES are missed: Adding with NA values: FAIL") + for (col in c("extraction_date", "acquisition_date", "lc_column_id")) { + if (!(col %in% colnames(df))) { + df[[col]] <- NA + } + } + } + if (verbose) message(" + (+) All required columns present") df <- subset(df, select = emeta_sample_coln) } return(df) - } else if (type == "v_m"){ emeta_sample_coln <- c("vial_label", "tmt_plex") if( all(emeta_sample_coln %in% colnames(df)) ){ @@ -348,10 +358,10 @@ filter_required_columns <- function(df, if(verbose) message(" + (+) All required columns present (tmt18 experiment)") df <- subset(df, select = emeta_sample_coln) }else{ - if(verbose) message(" - (-) Expected COLUMN NAMES are missed: FAIL") + message(" - (-) Expected COLUMN NAMES are missed: FAIL") } }else{ - if(verbose) message(" - (-) Expected COLUMN NAMES are missed: FAIL") + message(" - (-) Expected COLUMN NAMES are missed: FAIL") } return(df) } else if (type == "olproteins"){ diff --git a/R/validations.R b/R/validations.R index bfe9335b..bd26c071 100644 --- a/R/validations.R +++ b/R/validations.R @@ -396,7 +396,9 @@ validate_dates_times <- function(df, column_name, verbose = TRUE) { #' validate_lc_column_id(df, column_name = "lc_column_id") #' #' @export -validate_lc_column_id <- function(df, column_name, verbose = TRUE) { +validate_lc_column_id <- function(df, + column_name, + verbose = TRUE) { # issue counter ic <- 0 diff --git a/tests/testthat/test-metabolomics_qc.R b/tests/testthat/test-metabolomics_qc.R index 79cbbeeb..a76943e8 100644 --- a/tests/testthat/test-metabolomics_qc.R +++ b/tests/testthat/test-metabolomics_qc.R @@ -7,8 +7,8 @@ test_that("check_metadata_metabolites returns the right number of issues", { }) test_that("check_metadata_sample returns the right number of issues", { - expect_equal(check_metadata_samples(df = metadata_sample_named, cas = "umichigan", return_n_issues = TRUE, verbose = FALSE), 3) - expect_equal(check_metadata_samples(df = metadata_sample_unnamed, cas = "umichigan", return_n_issues = TRUE, verbose = FALSE), 3) + expect_equal(check_metadata_samples(df = metadata_sample_named, cas = "umichigan", return_n_issues = TRUE, verbose = FALSE), 2) + expect_equal(check_metadata_samples(df = metadata_sample_unnamed, cas = "umichigan", return_n_issues = TRUE, verbose = FALSE), 2) }) test_that("check_results returns the right number of issues", {