Merge pull request #244 from MoTrPAC/develop

MotrpacBicQC 0.9.6: download GCP with recursive, metabolomics updates
MoTrPAC · Sep 23, 2024 · 25e929c · 25e929c
2 parents e48e4cb + 017b1ca
commit 25e929c
Show file tree

Hide file tree

Showing 96 changed files with 436 additions and 361 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: MotrpacBicQC
 Type: Package
 Title: QC/QA functions for the MoTrPAC community
-Version: 0.9.5
-Date: 2024-05-22
+Version: 0.9.6
+Date: 2024-09-23
 Author: MoTrPAC Bioinformatics Center
 Maintainer: David Jimenez-Morales <davidjm@stanford.edu>
 Description: R Package for the analysis of MoTrPAC datasets. 

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,9 @@
+# MotrpacBicQC 0.9.6 (2024-09-23)
+
+* Download and read file from GCP function can create recursive folders (@christopherjin)
+* Adjustments in metabolomics metadata sample files QC to enable processing 
+of old submissions (before batch related variables were required)
+
 # MotrpacBicQC 0.9.5 (2024-05-22)
 
 * Proteomics: provide QC support for TMT-18
@@ -10,7 +16,6 @@
   + Improve detecting the source of errors
   + Improve verbosity and feedback to the user
 
-
 # MotrpacBicQC 0.9.3 (2024-03-25)
 
 * Critical update `validate_refmetname`: ensure checking the refmet standarized name. Update refmet tests

diff --git a/R/metabolomics_qc.R b/R/metabolomics_qc.R
@@ -181,7 +181,7 @@ check_metadata_samples <- function(df,
   # filter only expected columns
   df <- filter_required_columns(df = df,
                                 type = "m_s",
-                                verbose = TRUE)
+                                verbose = verbose)
 
   # Check every column
   # sample_id: si
@@ -276,12 +276,17 @@ check_metadata_samples <- function(df,
   }
 
   if("acquisition_date" %in% colnames(df)){
-    if( any(grepl(":", df$acquisition_date)) ){
-      if(verbose) message("  + (i) Assuming `acquisition_date` is in `MM/DD/YYYY HH:MM:SS AM/PM` format. Validating:")
-      icdt <- validate_dates_times(df = df, column_name = "acquisition_date", verbose = verbose)
+    if(any(is.na(df$acquisition_date))){
+      if(verbose) message("   - (-) `acquisition_date` has NA values: FAIL")
+      ic <- ic + 1
     }else{
-      icdate <- validate_yyyymmdd_dates(df = df, date_column = "acquisition_date", verbose = verbose)
-      ic <- ic + icdate
+      if( any(grepl(":", df$acquisition_date)) ){
+        if(verbose) message("  + (i) Assuming `acquisition_date` is in `MM/DD/YYYY HH:MM:SS AM/PM` format. Validating:")
+        icdt <- validate_dates_times(df = df, column_name = "acquisition_date", verbose = verbose)
+      }else{
+        icdate <- validate_yyyymmdd_dates(df = df, date_column = "acquisition_date", verbose = verbose)
+        ic <- ic + icdate
+      }
     }
   }else{
     if(verbose) message("   - (-) `acquisition_date` column missed: FAIL")
@@ -1200,15 +1205,17 @@ load_metabolomics_batch <- function(input_results_folder,
   assay <- validate_assay(input_results_folder)
   tissue_code <- validate_tissue(input_results_folder)
 
-  total_issues <- validate_metabolomics(input_results_folder = input_results_folder, 
-                                        cas = cas, 
-                                        return_n_issues = TRUE, 
-                                        full_report = FALSE, 
-                                        f_proof = FALSE, 
-                                        verbose = FALSE)
+  total_issues <- 
+    validate_metabolomics(
+      input_results_folder = input_results_folder, 
+      cas = cas, 
+      return_n_issues = TRUE, 
+      full_report = FALSE, 
+      f_proof = FALSE, 
+      verbose = FALSE)
 
   if(total_issues > 0){
-    message("\n\tWARNING!!! Too many issues identified (", total_issues,"). This batch should not be processed until the issues are solved")
+    message("\tWARNING!!! Too many issues identified (", total_issues,"). This batch should not be processed until the issues are solved")
   }
 
   # Load Metabolomics----
@@ -1376,8 +1383,10 @@ combine_metabolomics_batch <- function(input_results_folder,
                                        verbose = TRUE){
 
   # Load all datasets
-  metab_dfs <- load_metabolomics_batch(input_results_folder = input_results_folder,
-                                       cas = cas)
+  metab_dfs <- 
+    load_metabolomics_batch(
+      input_results_folder = input_results_folder,
+      cas = cas, verbose = verbose)
 
   if(verbose) message("\n## MERGE")
   if(verbose) message("\nAll metabolomics datasets + basic phenotypic information")
@@ -1677,9 +1686,11 @@ write_metabolomics_releases <- function(input_results_folder,
   if(verbose) message("+ Writing out ", cas, " ", phase_details, " ", tissue_code, " ", assay, " files", appendLF = FALSE)
 
   # Load all datasets----
-  metab_dfs <- load_metabolomics_batch(input_results_folder = input_results_folder,
-                                       cas = cas,
-                                       verbose = FALSE)
+  metab_dfs <- 
+    load_metabolomics_batch(
+      input_results_folder = input_results_folder,
+      cas = cas,
+      verbose = FALSE)
 
   # Create output folder-------
   if (is.null(folder_root)){

diff --git a/R/misc.R b/R/misc.R
@@ -54,33 +54,33 @@ create_folder <- function(folder_name = NULL,
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #' Download and Read File from Google Cloud Storage
 #'
-#' This function downloads a file from Google Cloud Storage (GCS) to a local 
-#' directory and reads it into R as a data frame. It uses the `gsutil` 
+#' This function downloads a file from Google Cloud Storage (GCS) to a local
+#' directory and reads it into R as a data frame. It uses the `gsutil`
 #' command-line tool to handle the file download.
 #'
 #' @param path Character. The path to the file in GCS, e.g., `gs://bucket-name/file-name.csv`.
 #' @param sep Character. The field separator character. Default is `\t`.
-#' @param header Logical. Whether the file contains the names of the variables 
+#' @param header Logical. Whether the file contains the names of the variables
 #'   as its first line. Default is TRUE.
-#' @param tmpdir Character. The local directory to which the file will be 
+#' @param tmpdir Character. The local directory to which the file will be
 #'   downloaded.
-#' @param gsutil_path Character. The path to the `gsutil` command-line tool. 
+#' @param gsutil_path Character. The path to the `gsutil` command-line tool.
 #'   Default is "gsutil".
-#' @param check_first Logical. Whether to check if the file already exists 
+#' @param check_first Logical. Whether to check if the file already exists
 #'   locally before downloading. Default is TRUE.
-#' @param verbose Logical. If TRUE, prints messages about the download process. 
+#' @param verbose Logical. If TRUE, prints messages about the download process.
 #'   Default is FALSE.
 #' @param ... Additional arguments passed to `readr::read_delim`.
 #'
 #' @details
-#' This function first checks if the specified file exists in GCS. If the file 
-#' exists, it downloads the file to the specified local directory (`tmpdir`). If 
-#' the local directory does not exist, it will be created. The function handles 
-#' spaces in directory paths by quoting them appropriately. If the file is 
+#' This function first checks if the specified file exists in GCS. If the file
+#' exists, it downloads the file to the specified local directory (`tmpdir`). If
+#' the local directory does not exist, it will be created. The function handles
+#' spaces in directory paths by quoting them appropriately. If the file is
 #' successfully downloaded, it is read into R using `readr::read_delim`.
 #'
-#' If the `check_first` argument is set to TRUE, the function will first check 
-#' if the file already exists locally to avoid redundant downloads. If the file 
+#' If the `check_first` argument is set to TRUE, the function will first check
+#' if the file already exists locally to avoid redundant downloads. If the file
 #' is already present locally, it will not be downloaded again.
 #'
 #' @return A data frame containing the contents of the downloaded file.
@@ -97,7 +97,7 @@ create_folder <- function(folder_name = NULL,
 #'   verbose = TRUE
 #' )
 #' }
-#' 
+#'
 #' @export
 dl_read_gcp <- function(path,
                         sep = "\t",
@@ -107,20 +107,20 @@ dl_read_gcp <- function(path,
                         check_first = TRUE,
                         verbose = FALSE,
                         ...){
-  
+
   # Detect the operating system
   os_name <- Sys.info()["sysname"]
-  
+
   # Default arguments for Mac
   ignore_std_err <- TRUE
   ignore_std_out <- TRUE
-  
+
   # Change default arguments if the OS is Windows
   if (os_name == "Windows") {
     ignore_std_err <- FALSE
     ignore_std_out <- FALSE
   }
-  
+
   # Validate gsutil path first
   validate_cmd <- sprintf('%s version', gsutil_path)
   if(verbose) message(paste0("- Validating `gsutil_path` on your system: ", gsutil_path))
@@ -131,40 +131,40 @@ dl_read_gcp <- function(path,
   }, error = function(e) {
     FALSE
   })
-  
+
   if(!gsutil_valid){
     stop("The gsutil path is incorrect or gsutil is not installed. Please ensure that gsutil is installed and the `gsutil_path` is correct.")
   }
-  
+
   # Check if the file exists in GCP
   check_cmd <- sprintf('%s ls %s', gsutil_path, path)
-  file_exists <- system(check_cmd, 
-                        ignore.stdout = ignore_std_out, 
+  file_exists <- system(check_cmd,
+                        ignore.stdout = ignore_std_out,
                         ignore.stderr = ignore_std_err) == 0
-  
+
   if(!file_exists){
     stop(paste0("\nThe file `", path, "` does not exist in GCP"))
   }
-  
+
   # Create directory
   if(!dir.exists(tmpdir)){
-    dir.create(tmpdir)
+    dir.create(tmpdir, recursive = TRUE)
     if(verbose) message(paste0("- New folder `", tmpdir, "` created successfully"))
   }else{
     if(verbose) message(paste0("- Folder `", tmpdir, "` already exists"))
   }
-  
+
   # create the normalized version of the destination path
   tmpdir_norm <- normalizePath(tmpdir)
-  
-  # if the normalized path name contains spaces, 
-  # add shell quotes before it is saved to tmpdir, 
+
+  # if the normalized path name contains spaces,
+  # add shell quotes before it is saved to tmpdir,
   # which ultimately goes to system()
   if(grepl("\\s", tmpdir_norm)){
     tmpdir <- shQuote(tmpdir_norm)
     if(verbose) message("- The temp folder has spaces")
   } else{
-  # Otherwise, tmpdir_norm and tmpdir can remain the same  
+  # Otherwise, tmpdir_norm and tmpdir can remain the same
     tmpdir <- tmpdir_norm
   }
 
@@ -175,7 +175,7 @@ dl_read_gcp <- function(path,
     new_path <- file.path(tmpdir_norm, basename(path))
   }
 
-  # only download if it doesn't exist to avoid conflicts when running this 
+  # only download if it doesn't exist to avoid conflicts when running this
   # script in parallel; clear scratch space when you're done
   if(check_first){
     if( !file.exists(new_path) ){
@@ -200,16 +200,16 @@ dl_read_gcp <- function(path,
 
   # read in the data using readr instead of data.table
   if(file.exists(new_path)){
-    df <- readr::read_delim(new_path, 
-                            delim = sep, 
-                            col_names = header, 
-                            skip_empty_rows = TRUE, 
+    df <- readr::read_delim(new_path,
+                            delim = sep,
+                            col_names = header,
+                            skip_empty_rows = TRUE,
                             show_col_types = FALSE, ...)
     df <- as.data.frame(df)
     return(df)
   }else{
     stop("Problems loading the file. Two possible reasons:
-         - Something might have gone wrong with the download. 
+         - Something might have gone wrong with the download.
          - This is not a tab-delimited file (default): if you are trying to download a csv file instead, then use `sep = \",\"` instead.
     Re-run the command again with `verbose = TRUE`)")
   }
@@ -282,10 +282,10 @@ get_full_path2batch <- function(input_results_folder){
 #' }
 #' @export
 filter_required_columns <- function(df,
-                                    type = c("m_m", 
-                                             "m_s", 
-                                             "v_m", 
-                                             "olproteins", 
+                                    type = c("m_m",
+                                             "m_s",
+                                             "v_m",
+                                             "olproteins",
                                              "olsamples"),
                                     name_id = NULL,
                                     verbose = TRUE){
@@ -310,27 +310,37 @@ filter_required_columns <- function(df,
     colnames(df) <- tolower(colnames(df))
     missing_cols <- setdiff(emeta_metabo_coln_named, colnames(df))
     if (length(missing_cols) > 0) {
-      if(verbose) message("   - (-) `metadata_metabolite`: Expected COLUMN NAMES are missed: FAIL")
-      message(paste0("\t The following required columns are not present: `", paste(missing_cols, collapse = ", "), "`"))
+      if (verbose) message("   - (-) `metadata_metabolite`: Expected COLUMN NAMES are missed: FAIL")
+      message(paste0("\t The following required columns are not present: `", 
+                     paste(missing_cols, collapse = ", "), "`"))
     } else {
-      if(verbose) message("  + (+) All required columns present")
+      if (verbose) message("  + (+) All required columns present")
       df <- subset(df, select = emeta_metabo_coln_named)
     }
     return(df)
-
-  } else if (type == "m_s"){
+  } else if (type == "m_s") {
     emeta_sample_coln <- c("sample_id", "sample_type", "sample_order", "raw_file", "extraction_date", "acquisition_date", "lc_column_id")
+    required_cols <- setdiff(emeta_sample_coln, c("extraction_date", "acquisition_date", "lc_column_id"))
     missing_cols <- setdiff(emeta_sample_coln, colnames(df))
-
-    if (length(missing_cols) > 0) {
-      if(verbose) message("   - (-) `metadata_sample`: Expected COLUMN NAMES are missed: FAIL")
-      message(paste0("\t The following required columns are not present: `", paste(missing_cols, collapse = ", "), "`"))
+    missing_required_cols <- setdiff(required_cols, colnames(df))
+
+    if (length(missing_required_cols) > 0) {
+      if (verbose) message("   - (-) `metadata_sample`: Expected COLUMN NAMES are missed: FAIL")
+      message(paste0("\t The following required columns are not present: `", 
+                     paste(missing_required_cols, collapse = ", "), "`"))
     } else {
-      if(verbose) message("  + (+) All required columns present")
+      if (length(missing_cols) > 0) {
+        message("   - (-) `metadata_sample`: recently required COLUMN NAMES are missed: Adding with NA values: FAIL")
+        for (col in c("extraction_date", "acquisition_date", "lc_column_id")) {
+          if (!(col %in% colnames(df))) {
+            df[[col]] <- NA
+          }
+        }
+      }
+      if (verbose) message("  + (+) All required columns present")
       df <- subset(df, select = emeta_sample_coln)
     }
     return(df)
-
   } else if (type == "v_m"){
     emeta_sample_coln <- c("vial_label", "tmt_plex")
     if( all(emeta_sample_coln %in% colnames(df)) ){
@@ -348,16 +358,16 @@ filter_required_columns <- function(df,
         if(verbose) message("  + (+) All required columns present (tmt18 experiment)")
         df <- subset(df, select = emeta_sample_coln)
       }else{
-        if(verbose) message("   - (-) Expected COLUMN NAMES are missed: FAIL")
+        message("   - (-) Expected COLUMN NAMES are missed: FAIL")
       }
     }else{
-      if(verbose) message("   - (-) Expected COLUMN NAMES are missed: FAIL")
+      message("   - (-) Expected COLUMN NAMES are missed: FAIL")
     }
     return(df)
   } else if (type == "olproteins"){
     emeta_sample_coln <- c("olink_id", "uniprot_entry", "assay", "missing_freq", "panel_name", "panel_lot_nr", "normalization")
     missing_cols <- setdiff(emeta_sample_coln, colnames(df))
-    
+
     if (length(missing_cols) > 0) {
       if(verbose) message("   - (-) `metadata_proteins`: Expected COLUMN NAMES are missed: FAIL")
       message(paste0("\t The following required columns are not present: `", paste(missing_cols, collapse = ", "), "`"))
@@ -369,7 +379,7 @@ filter_required_columns <- function(df,
   }else if (type == "olsamples"){
     emeta_sample_coln <- c("sample_id", "sample_type", "sample_order", "plate_id")
     missing_cols <- setdiff(emeta_sample_coln, colnames(df))
-    
+
     if (length(missing_cols) > 0) {
       if(verbose) message("   - (-) `metadata_samples`: Expected COLUMN NAMES are missed: FAIL")
       message(paste0("\t The following required columns are not present: `", paste(missing_cols, collapse = ", "), "`"))
@@ -422,7 +432,7 @@ open_file <- function(input_results_folder,
     ofile <- NULL
     filename <- NULL
   }else{
-    
+
     filename <- file_metametabolites[1]
     file_ext <- sub(".*\\.(.*)$", "\\1", filename)
     if (!file_ext %in% c("txt", "tsv")) {
@@ -530,7 +540,7 @@ set_phase <- function(input_results_folder,
                            ignore.case = TRUE,
                            full.names=TRUE,
                            recursive = TRUE)
-  
+
   if(length(file_phase) > 1){
     if(verbose) message("- (-) `More than one `metadata_phase.txt` file available. Only one is valid (place the valid one in the BATCH folder): FAIL")
   }