Skip to content

Commit

Permalink
Merge pull request #244 from MoTrPAC/develop
Browse files Browse the repository at this point in the history
MotrpacBicQC 0.9.6: download GCP with recursive, metabolomics updates
  • Loading branch information
biodavidjm authored Sep 23, 2024
2 parents e48e4cb + 017b1ca commit 25e929c
Show file tree
Hide file tree
Showing 96 changed files with 436 additions and 361 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
Package: MotrpacBicQC
Type: Package
Title: QC/QA functions for the MoTrPAC community
Version: 0.9.5
Date: 2024-05-22
Version: 0.9.6
Date: 2024-09-23
Author: MoTrPAC Bioinformatics Center
Maintainer: David Jimenez-Morales <davidjm@stanford.edu>
Description: R Package for the analysis of MoTrPAC datasets.
Expand Down
7 changes: 6 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
# MotrpacBicQC 0.9.6 (2024-09-23)

* Download and read file from GCP function can create recursive folders (@christopherjin)
* Adjustments in metabolomics metadata sample files QC to enable processing
of old submissions (before batch related variables were required)

# MotrpacBicQC 0.9.5 (2024-05-22)

* Proteomics: provide QC support for TMT-18
Expand All @@ -10,7 +16,6 @@
+ Improve detecting the source of errors
+ Improve verbosity and feedback to the user


# MotrpacBicQC 0.9.3 (2024-03-25)

* Critical update `validate_refmetname`: ensure checking the refmet standarized name. Update refmet tests
Expand Down
47 changes: 29 additions & 18 deletions R/metabolomics_qc.R
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ check_metadata_samples <- function(df,
# filter only expected columns
df <- filter_required_columns(df = df,
type = "m_s",
verbose = TRUE)
verbose = verbose)

# Check every column
# sample_id: si
Expand Down Expand Up @@ -276,12 +276,17 @@ check_metadata_samples <- function(df,
}

if("acquisition_date" %in% colnames(df)){
if( any(grepl(":", df$acquisition_date)) ){
if(verbose) message(" + (i) Assuming `acquisition_date` is in `MM/DD/YYYY HH:MM:SS AM/PM` format. Validating:")
icdt <- validate_dates_times(df = df, column_name = "acquisition_date", verbose = verbose)
if(any(is.na(df$acquisition_date))){
if(verbose) message(" - (-) `acquisition_date` has NA values: FAIL")
ic <- ic + 1
}else{
icdate <- validate_yyyymmdd_dates(df = df, date_column = "acquisition_date", verbose = verbose)
ic <- ic + icdate
if( any(grepl(":", df$acquisition_date)) ){
if(verbose) message(" + (i) Assuming `acquisition_date` is in `MM/DD/YYYY HH:MM:SS AM/PM` format. Validating:")
icdt <- validate_dates_times(df = df, column_name = "acquisition_date", verbose = verbose)
}else{
icdate <- validate_yyyymmdd_dates(df = df, date_column = "acquisition_date", verbose = verbose)
ic <- ic + icdate
}
}
}else{
if(verbose) message(" - (-) `acquisition_date` column missed: FAIL")
Expand Down Expand Up @@ -1200,15 +1205,17 @@ load_metabolomics_batch <- function(input_results_folder,
assay <- validate_assay(input_results_folder)
tissue_code <- validate_tissue(input_results_folder)

total_issues <- validate_metabolomics(input_results_folder = input_results_folder,
cas = cas,
return_n_issues = TRUE,
full_report = FALSE,
f_proof = FALSE,
verbose = FALSE)
total_issues <-
validate_metabolomics(
input_results_folder = input_results_folder,
cas = cas,
return_n_issues = TRUE,
full_report = FALSE,
f_proof = FALSE,
verbose = FALSE)

if(total_issues > 0){
message("\n\tWARNING!!! Too many issues identified (", total_issues,"). This batch should not be processed until the issues are solved")
message("\tWARNING!!! Too many issues identified (", total_issues,"). This batch should not be processed until the issues are solved")
}

# Load Metabolomics----
Expand Down Expand Up @@ -1376,8 +1383,10 @@ combine_metabolomics_batch <- function(input_results_folder,
verbose = TRUE){

# Load all datasets
metab_dfs <- load_metabolomics_batch(input_results_folder = input_results_folder,
cas = cas)
metab_dfs <-
load_metabolomics_batch(
input_results_folder = input_results_folder,
cas = cas, verbose = verbose)

if(verbose) message("\n## MERGE")
if(verbose) message("\nAll metabolomics datasets + basic phenotypic information")
Expand Down Expand Up @@ -1677,9 +1686,11 @@ write_metabolomics_releases <- function(input_results_folder,
if(verbose) message("+ Writing out ", cas, " ", phase_details, " ", tissue_code, " ", assay, " files", appendLF = FALSE)

# Load all datasets----
metab_dfs <- load_metabolomics_batch(input_results_folder = input_results_folder,
cas = cas,
verbose = FALSE)
metab_dfs <-
load_metabolomics_batch(
input_results_folder = input_results_folder,
cas = cas,
verbose = FALSE)

# Create output folder-------
if (is.null(folder_root)){
Expand Down
124 changes: 67 additions & 57 deletions R/misc.R
Original file line number Diff line number Diff line change
Expand Up @@ -54,33 +54,33 @@ create_folder <- function(folder_name = NULL,
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#' Download and Read File from Google Cloud Storage
#'
#' This function downloads a file from Google Cloud Storage (GCS) to a local
#' directory and reads it into R as a data frame. It uses the `gsutil`
#' This function downloads a file from Google Cloud Storage (GCS) to a local
#' directory and reads it into R as a data frame. It uses the `gsutil`
#' command-line tool to handle the file download.
#'
#' @param path Character. The path to the file in GCS, e.g., `gs://bucket-name/file-name.csv`.
#' @param sep Character. The field separator character. Default is `\t`.
#' @param header Logical. Whether the file contains the names of the variables
#' @param header Logical. Whether the file contains the names of the variables
#' as its first line. Default is TRUE.
#' @param tmpdir Character. The local directory to which the file will be
#' @param tmpdir Character. The local directory to which the file will be
#' downloaded.
#' @param gsutil_path Character. The path to the `gsutil` command-line tool.
#' @param gsutil_path Character. The path to the `gsutil` command-line tool.
#' Default is "gsutil".
#' @param check_first Logical. Whether to check if the file already exists
#' @param check_first Logical. Whether to check if the file already exists
#' locally before downloading. Default is TRUE.
#' @param verbose Logical. If TRUE, prints messages about the download process.
#' @param verbose Logical. If TRUE, prints messages about the download process.
#' Default is FALSE.
#' @param ... Additional arguments passed to `readr::read_delim`.
#'
#' @details
#' This function first checks if the specified file exists in GCS. If the file
#' exists, it downloads the file to the specified local directory (`tmpdir`). If
#' the local directory does not exist, it will be created. The function handles
#' spaces in directory paths by quoting them appropriately. If the file is
#' This function first checks if the specified file exists in GCS. If the file
#' exists, it downloads the file to the specified local directory (`tmpdir`). If
#' the local directory does not exist, it will be created. The function handles
#' spaces in directory paths by quoting them appropriately. If the file is
#' successfully downloaded, it is read into R using `readr::read_delim`.
#'
#' If the `check_first` argument is set to TRUE, the function will first check
#' if the file already exists locally to avoid redundant downloads. If the file
#' If the `check_first` argument is set to TRUE, the function will first check
#' if the file already exists locally to avoid redundant downloads. If the file
#' is already present locally, it will not be downloaded again.
#'
#' @return A data frame containing the contents of the downloaded file.
Expand All @@ -97,7 +97,7 @@ create_folder <- function(folder_name = NULL,
#' verbose = TRUE
#' )
#' }
#'
#'
#' @export
dl_read_gcp <- function(path,
sep = "\t",
Expand All @@ -107,20 +107,20 @@ dl_read_gcp <- function(path,
check_first = TRUE,
verbose = FALSE,
...){

# Detect the operating system
os_name <- Sys.info()["sysname"]

# Default arguments for Mac
ignore_std_err <- TRUE
ignore_std_out <- TRUE

# Change default arguments if the OS is Windows
if (os_name == "Windows") {
ignore_std_err <- FALSE
ignore_std_out <- FALSE
}

# Validate gsutil path first
validate_cmd <- sprintf('%s version', gsutil_path)
if(verbose) message(paste0("- Validating `gsutil_path` on your system: ", gsutil_path))
Expand All @@ -131,40 +131,40 @@ dl_read_gcp <- function(path,
}, error = function(e) {
FALSE
})

if(!gsutil_valid){
stop("The gsutil path is incorrect or gsutil is not installed. Please ensure that gsutil is installed and the `gsutil_path` is correct.")
}

# Check if the file exists in GCP
check_cmd <- sprintf('%s ls %s', gsutil_path, path)
file_exists <- system(check_cmd,
ignore.stdout = ignore_std_out,
file_exists <- system(check_cmd,
ignore.stdout = ignore_std_out,
ignore.stderr = ignore_std_err) == 0

if(!file_exists){
stop(paste0("\nThe file `", path, "` does not exist in GCP"))
}

# Create directory
if(!dir.exists(tmpdir)){
dir.create(tmpdir)
dir.create(tmpdir, recursive = TRUE)
if(verbose) message(paste0("- New folder `", tmpdir, "` created successfully"))
}else{
if(verbose) message(paste0("- Folder `", tmpdir, "` already exists"))
}

# create the normalized version of the destination path
tmpdir_norm <- normalizePath(tmpdir)
# if the normalized path name contains spaces,
# add shell quotes before it is saved to tmpdir,

# if the normalized path name contains spaces,
# add shell quotes before it is saved to tmpdir,
# which ultimately goes to system()
if(grepl("\\s", tmpdir_norm)){
tmpdir <- shQuote(tmpdir_norm)
if(verbose) message("- The temp folder has spaces")
} else{
# Otherwise, tmpdir_norm and tmpdir can remain the same
# Otherwise, tmpdir_norm and tmpdir can remain the same
tmpdir <- tmpdir_norm
}

Expand All @@ -175,7 +175,7 @@ dl_read_gcp <- function(path,
new_path <- file.path(tmpdir_norm, basename(path))
}

# only download if it doesn't exist to avoid conflicts when running this
# only download if it doesn't exist to avoid conflicts when running this
# script in parallel; clear scratch space when you're done
if(check_first){
if( !file.exists(new_path) ){
Expand All @@ -200,16 +200,16 @@ dl_read_gcp <- function(path,

# read in the data using readr instead of data.table
if(file.exists(new_path)){
df <- readr::read_delim(new_path,
delim = sep,
col_names = header,
skip_empty_rows = TRUE,
df <- readr::read_delim(new_path,
delim = sep,
col_names = header,
skip_empty_rows = TRUE,
show_col_types = FALSE, ...)
df <- as.data.frame(df)
return(df)
}else{
stop("Problems loading the file. Two possible reasons:
- Something might have gone wrong with the download.
- Something might have gone wrong with the download.
- This is not a tab-delimited file (default): if you are trying to download a csv file instead, then use `sep = \",\"` instead.
Re-run the command again with `verbose = TRUE`)")
}
Expand Down Expand Up @@ -282,10 +282,10 @@ get_full_path2batch <- function(input_results_folder){
#' }
#' @export
filter_required_columns <- function(df,
type = c("m_m",
"m_s",
"v_m",
"olproteins",
type = c("m_m",
"m_s",
"v_m",
"olproteins",
"olsamples"),
name_id = NULL,
verbose = TRUE){
Expand All @@ -310,27 +310,37 @@ filter_required_columns <- function(df,
colnames(df) <- tolower(colnames(df))
missing_cols <- setdiff(emeta_metabo_coln_named, colnames(df))
if (length(missing_cols) > 0) {
if(verbose) message(" - (-) `metadata_metabolite`: Expected COLUMN NAMES are missed: FAIL")
message(paste0("\t The following required columns are not present: `", paste(missing_cols, collapse = ", "), "`"))
if (verbose) message(" - (-) `metadata_metabolite`: Expected COLUMN NAMES are missed: FAIL")
message(paste0("\t The following required columns are not present: `",
paste(missing_cols, collapse = ", "), "`"))
} else {
if(verbose) message(" + (+) All required columns present")
if (verbose) message(" + (+) All required columns present")
df <- subset(df, select = emeta_metabo_coln_named)
}
return(df)

} else if (type == "m_s"){
} else if (type == "m_s") {
emeta_sample_coln <- c("sample_id", "sample_type", "sample_order", "raw_file", "extraction_date", "acquisition_date", "lc_column_id")
required_cols <- setdiff(emeta_sample_coln, c("extraction_date", "acquisition_date", "lc_column_id"))
missing_cols <- setdiff(emeta_sample_coln, colnames(df))

if (length(missing_cols) > 0) {
if(verbose) message(" - (-) `metadata_sample`: Expected COLUMN NAMES are missed: FAIL")
message(paste0("\t The following required columns are not present: `", paste(missing_cols, collapse = ", "), "`"))
missing_required_cols <- setdiff(required_cols, colnames(df))

if (length(missing_required_cols) > 0) {
if (verbose) message(" - (-) `metadata_sample`: Expected COLUMN NAMES are missed: FAIL")
message(paste0("\t The following required columns are not present: `",
paste(missing_required_cols, collapse = ", "), "`"))
} else {
if(verbose) message(" + (+) All required columns present")
if (length(missing_cols) > 0) {
message(" - (-) `metadata_sample`: recently required COLUMN NAMES are missed: Adding with NA values: FAIL")
for (col in c("extraction_date", "acquisition_date", "lc_column_id")) {
if (!(col %in% colnames(df))) {
df[[col]] <- NA
}
}
}
if (verbose) message(" + (+) All required columns present")
df <- subset(df, select = emeta_sample_coln)
}
return(df)

} else if (type == "v_m"){
emeta_sample_coln <- c("vial_label", "tmt_plex")
if( all(emeta_sample_coln %in% colnames(df)) ){
Expand All @@ -348,16 +358,16 @@ filter_required_columns <- function(df,
if(verbose) message(" + (+) All required columns present (tmt18 experiment)")
df <- subset(df, select = emeta_sample_coln)
}else{
if(verbose) message(" - (-) Expected COLUMN NAMES are missed: FAIL")
message(" - (-) Expected COLUMN NAMES are missed: FAIL")
}
}else{
if(verbose) message(" - (-) Expected COLUMN NAMES are missed: FAIL")
message(" - (-) Expected COLUMN NAMES are missed: FAIL")
}
return(df)
} else if (type == "olproteins"){
emeta_sample_coln <- c("olink_id", "uniprot_entry", "assay", "missing_freq", "panel_name", "panel_lot_nr", "normalization")
missing_cols <- setdiff(emeta_sample_coln, colnames(df))

if (length(missing_cols) > 0) {
if(verbose) message(" - (-) `metadata_proteins`: Expected COLUMN NAMES are missed: FAIL")
message(paste0("\t The following required columns are not present: `", paste(missing_cols, collapse = ", "), "`"))
Expand All @@ -369,7 +379,7 @@ filter_required_columns <- function(df,
}else if (type == "olsamples"){
emeta_sample_coln <- c("sample_id", "sample_type", "sample_order", "plate_id")
missing_cols <- setdiff(emeta_sample_coln, colnames(df))

if (length(missing_cols) > 0) {
if(verbose) message(" - (-) `metadata_samples`: Expected COLUMN NAMES are missed: FAIL")
message(paste0("\t The following required columns are not present: `", paste(missing_cols, collapse = ", "), "`"))
Expand Down Expand Up @@ -422,7 +432,7 @@ open_file <- function(input_results_folder,
ofile <- NULL
filename <- NULL
}else{

filename <- file_metametabolites[1]
file_ext <- sub(".*\\.(.*)$", "\\1", filename)
if (!file_ext %in% c("txt", "tsv")) {
Expand Down Expand Up @@ -530,7 +540,7 @@ set_phase <- function(input_results_folder,
ignore.case = TRUE,
full.names=TRUE,
recursive = TRUE)

if(length(file_phase) > 1){
if(verbose) message("- (-) `More than one `metadata_phase.txt` file available. Only one is valid (place the valid one in the BATCH folder): FAIL")
}
Expand Down
Loading

0 comments on commit 25e929c

Please sign in to comment.