Skip to content

Commit

Permalink
Merge pull request #222 from MoTrPAC/develop
Browse files Browse the repository at this point in the history
MotrpacBicQC 0.9.0: add OLINK QC support and more
  • Loading branch information
biodavidjm authored Jan 5, 2024
2 parents 5113b9b + d916a28 commit 4cd4486
Show file tree
Hide file tree
Showing 166 changed files with 28,622 additions and 667 deletions.
5 changes: 3 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
Package: MotrpacBicQC
Type: Package
Title: QC/QA functions for the MoTrPAC community
Version: 0.8.9
Date: 2023-07-07
Version: 0.9.0
Date: 2024-01-04
Author: MoTrPAC Bioinformatics Center
Maintainer: David Jimenez-Morales <davidjm@stanford.edu>
Description: R Package for the analysis of MoTrPAC datasets.
Expand All @@ -27,6 +27,7 @@ Imports:
ggplot2,
grDevices,
gridExtra,
httr,
inspectdf,
jsonlite,
knitr,
Expand Down
12 changes: 12 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,12 +1,17 @@
# Generated by roxygen2: do not edit by hand

export(check_crossfile_olink_validation)
export(check_failedsamples)
export(check_manifest_rawdata)
export(check_metadata_metabolites)
export(check_metadata_phase_file)
export(check_metadata_proteins)
export(check_metadata_samples)
export(check_metadata_samples_olink)
export(check_missing_values)
export(check_ratio_proteomics)
export(check_results)
export(check_results_olink)
export(check_rii_proteomics)
export(check_vial_metadata_proteomics)
export(check_viallabel_dmaqc)
Expand All @@ -18,12 +23,14 @@ export(generate_phase_details)
export(get_and_validate_mdd)
export(get_full_path2batch)
export(load_metabolomics_batch)
export(load_olink_batch)
export(load_proteomics)
export(merge_all_metabolomics)
export(merge_metabolomics_metadata)
export(merge_phenotype_metabolomics)
export(open_file)
export(plot_basic_metabolomics_qc)
export(plot_basic_olink_qc)
export(proteomics_plots_rii)
export(remove_empty_columns)
export(remove_empty_rows)
Expand All @@ -35,14 +42,17 @@ export(validate_dates_times)
export(validate_lc_column_id)
export(validate_metabolomics)
export(validate_na_empty)
export(validate_olink)
export(validate_phase)
export(validate_processFolder)
export(validate_proteomics)
export(validate_refmetname)
export(validate_tissue)
export(validate_two_phases)
export(validate_uniprot_ids_with_uniprot)
export(validate_yyyymmdd_dates)
export(write_metabolomics_releases)
export(write_olink_releases)
export(write_proteomics_releases)
import(dplyr)
import(forcats)
Expand All @@ -62,6 +72,8 @@ importFrom(grDevices,dev.off)
importFrom(grDevices,pdf)
importFrom(gridExtra,arrangeGrob)
importFrom(gridExtra,grid.arrange)
importFrom(httr,GET)
importFrom(httr,status_code)
importFrom(inspectdf,inspect_na)
importFrom(jsonlite,fromJSON)
importFrom(lubridate,parse_date_time)
Expand Down
8 changes: 8 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
# MotrpacBicQC 0.9.0 (2024-01-04)

* Add support for OLINK datasets (check `olink_qc` vignette to find out more)
* Update the function that downloads data from GCP (`dl_read_gcp`):
it now detects the operating system automatically (the arguments `ignore_std_err` and
`ignore_std_out` are deprecated)
* Multiple fixes and enhancements


# MotrpacBicQC 0.8.9 (2023-07-07)

Expand Down
116 changes: 76 additions & 40 deletions R/metabolomics_qc.R
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ check_metadata_metabolites <- function(df,


# Evaluate every column
flag_mm <- FALSE
if("metabolite_name" %in% colnames(df)){
if(length(unique(df$metabolite_name)) != dim(df)[1]){
duplis_details <- df$metabolite_name[duplicated(df$metabolite_name)]
Expand All @@ -45,7 +44,6 @@ check_metadata_metabolites <- function(df,
if(verbose) message("\n\t\t - ", paste(duplis_details, collapse = "\n\t\t - "))
ic <- ic + 1
}else{
flag_mm <- TRUE
if(verbose) message(" + (+) `metabolite_name` OK")
}

Expand Down Expand Up @@ -197,7 +195,7 @@ check_metadata_samples <- function(df,
if(verbose) message(" + (+) `sample_id` seems OK")
}
}else{
if(verbose) message(" - (-) `metabolite_name` is missed: FAIL")
if(verbose) message(" - (-) `sample_id` is missed: FAIL")
ic <- ic + 1
}

Expand All @@ -215,7 +213,7 @@ check_metadata_samples <- function(df,
if(verbose) message(" + (+) `sample_type` seems OK")
}
}else{
if(verbose) message(" - (-) `refmet_name` column missed: FAIL")
if(verbose) message(" - (-) `sample_type` column missed: FAIL")
ic <- ic + 1
}

Expand Down Expand Up @@ -324,6 +322,8 @@ check_results <- function(r_m,
return_n_issues = FALSE,
verbose = TRUE){

metabolite_name = NULL

# issue_count
ic = 0

Expand All @@ -333,13 +333,13 @@ check_results <- function(r_m,
flag_out <- TRUE
if(!setequal(colnames(r_m), eresults_coln)){
extra_in_results <- setdiff(colnames(r_m), eresults_coln)
if(length(extra_in_results > 0)){
if(length(extra_in_results) > 0){
if(verbose) message("\n - (-) Column(s) NOT expected in `results_metabolite` file which are missed in `metadata_samples`: \n\t\t - ",
paste(extra_in_results, collapse = "\n\t\t - "))
}

extra_in_msr <- setdiff(eresults_coln, colnames(r_m))
if(length(extra_in_msr)){
if(length(extra_in_msr) > 0){
if(verbose) message("\n - (-) Column(s) available in `metadata_samples` missed in `results_metabolite`: \n\t\t - ",
paste(extra_in_msr, collapse = "\n\t\t - "))
}
Expand Down Expand Up @@ -370,11 +370,13 @@ check_results <- function(r_m,
}
# No duplications allowed
if( any(duplicated(r_m$metabolite_name)) ){
if(verbose) message(" - (-) DUPLICATIONS in `metabolite_name` in [results]:\n\t\t- ",
paste(r_m$metabolite_name[duplicated(r_m$metabolite_name)], collapse = ", "))
dupli_meta <- r_m$metabolite_name[duplicated(r_m$metabolite_name)]
ic <- ic + 1

if(verbose){
message(" - (-) DUPLICATIONS in `metabolite_name` in [results]:\n\t\t- ",
paste(r_m$metabolite_name[duplicated(r_m$metabolite_name)], collapse = ", "))
dupli_meta <- r_m$metabolite_name[duplicated(r_m$metabolite_name)]
ic <- ic + 1

}
# remove duplications: uncommented for 20200630 internal release
# bef <- dim(r_m)[1]
# r_m <- unique(r_m)
Expand All @@ -389,8 +391,20 @@ check_results <- function(r_m,
if(verbose) message(" - (-) `metabolite_name` NA values detected: FAIL")
ic <- ic + 1
}

# Identify values with trailing whitespace in the "metabolite_name" column
values_with_whitespace <- r_m$metabolite_name[grep("\\s+$", r_m$metabolite_name)]

# Print the result
if(length(values_with_whitespace) > 0) {
if(verbose){
message(" - (-) Extra space detected at the end of the following `metabolite_name` ids:\n\t\t- ",
paste(values_with_whitespace, collapse = ", "))
}
ic <- ic + 1
}
}else{
if(verbose) message(" - (-) `metabolite_name` column is not available in both [results] and `metadata_metabolites`")
if(verbose) message(" - (-) `metabolite_name` column is not available in both `results` and `metadata_metabolites`: FAIL")
ic <- ic + 1
}
}else{
Expand All @@ -410,6 +424,18 @@ check_results <- function(r_m,
}else{
if(verbose) message(" + (+) `sample_id` columns are numeric: OK")
}

# Check for negative values in all columns except 'metabolite_name'
negative_values_exist <- r_m %>%
select(-metabolite_name) %>%
summarise(across(everything(), ~any(. < 0, na.rm = TRUE))) %>%
any()

if(negative_values_exist){
message(" - (-) NEGATIVE VALUES FOUND!!!: FAIL")
ic <- ic + 1
}

}else{
ic <- ic + 1
}
Expand Down Expand Up @@ -523,7 +549,7 @@ check_manifest_rawdata <- function(input_results_folder,
#' is not required since it should be extracted from the input folder or from the
#' new required file `metadata_phase.txt`. Please, ignore.
#' However, if this argument is provided,
#' it will take priority (and the phase from the input folder and the
#' it will take priority and this will be the phase (the phase from the input folder and the
#' `metadata_phase.txt` will be ignored). Examples
#' - Folder with `PASS1A-06`: type either `PASS1A-06` or leave it `NULL`
#' - Both `PASS1A-06` and `PASS1C-06`: type `PASS1A-06|PASS1C-06`
Expand Down Expand Up @@ -841,15 +867,6 @@ validate_metabolomics <- function(input_results_folder,
}else{
if(verbose) message("\n- (-) QC plots are not possible: critical datasets are missed")
}

if(f_rmn & f_mmn){
m_m_n <- filter_required_columns(df = m_m_n,
type = "m_m",
name_id = "named",
verbose = FALSE)

r_m_merge <- merge(r_m_n, m_m_n, by = "metabolite_name")
}
}

# MANIFEST all files-----
Expand Down Expand Up @@ -1113,6 +1130,7 @@ validate_metabolomics <- function(input_results_folder,
}
}

# RETURN report----
if(ic > 4){
message("\nTOTAL NUMBER OF CRITICAL ERROR: ", ic,"\n")
message("WARNING: Too many errors. Revise input folder")
Expand Down Expand Up @@ -1182,9 +1200,6 @@ load_metabolomics_batch <- function(input_results_folder,
assay <- validate_assay(input_results_folder)
tissue_code <- validate_tissue(input_results_folder)

# Output name----
output_name <- paste0(cas, ".", tissue_code, ".", tolower(phase), ".",tolower(assay), ".", tolower(processfolder))

total_issues <- validate_metabolomics(input_results_folder = input_results_folder,
cas = cas,
return_n_issues = TRUE,
Expand All @@ -1196,9 +1211,6 @@ load_metabolomics_batch <- function(input_results_folder,
message("\n\tWARNING!!! Too many issues identified (", total_issues,"). This batch should not be processed until the issues are solved")
}

vial_label <- NA
qc_samples <- NA

# Load Metabolomics----
if(verbose) message("# LOAD METABOLOMICS BATCH")
if(verbose) message("+ Site: ", cas)
Expand Down Expand Up @@ -1278,7 +1290,7 @@ load_metabolomics_batch <- function(input_results_folder,
}


# results --------------------------------------------------------------------
# results ---------
if(verbose) message("\n\n## Results\n")
if(verbose) message("\n*NAMED `results_metabolites`*\n")

Expand Down Expand Up @@ -1324,6 +1336,7 @@ load_metabolomics_batch <- function(input_results_folder,
}
}

# RETURN list of dfs----
if(untargeted){
list_df <- list ("m_m_n" = m_m_n,
"m_m_u" = m_m_u,
Expand Down Expand Up @@ -1605,17 +1618,22 @@ merge_all_metabolomics <- function(m_m_n,
#' @param input_results_folder (char) Path to the PROCESSED_YYYYMMDD folder
#' @param cas (char) Chemical Analytical Site code (e.g "umichigan")
#' @param folder_name (char) output files name. Must have a `.yaml` extension.
#' @param folder_root (char) absolute path to write the output files. Default: current directory
#' @param version_file (char) file version number (v#.#)
#' @param folder_root (char) absolute path to write the output files.
#' Default: current directory
#' @param version_file (char) file version number (`v#.#`)
#' @param verbose (logical) `TRUE` (default) shows messages
#' @return bic release folder/file structure `PHASE/OMICS/TCODE_NAME/ASSAY/` and file names, including:
#' `motrpac_YYYYMMDD_phasecode_tissuecode_omics_assay_file-details.txt` where files-details can be:
#' `named-experimentalDetails.txt`, `named-metadata-metabolites.txt`, `metadata-samples.txt`,
#' `named-results.txt`
#' @return bic release folder/file structure
#' `PHASE/OMICS/TCODE_NAME/ASSAY/` and file names, including:
#' - `motrpac_YYYYMMDD_phasecode_tissuecode_omics_assay_file-details.txt`
#' where `file-details` can be:
#' - `named-experimentalDetails.txt`
#' - `named-metadata-metabolites.txt`
#' - `metadata-samples.txt`
#' - `named-results.txt`
#' @examples
#' \dontrun{
#' write_metabolomics_releases(
#' input_results_folder = "/full/path/to/PROCESSED_YYYYMMDD/")
#' write_metabolomics_releases(input_results_folder = "/path/to/PROCESSED_YYYYMMDD/",
#' cas = "umichigan")
#' }
#' @export
write_metabolomics_releases <- function(input_results_folder,
Expand All @@ -1639,13 +1657,24 @@ write_metabolomics_releases <- function(input_results_folder,
tissue_code <- validate_tissue(input_results_folder)

folder_tissue <- bic_animal_tissue_code$tissue_name_release[which(bic_animal_tissue_code$bic_tissue_code == tissue_code)]

# # or make a function:
# get_folder_tissue <- function(tissue_code) {
# folder_tissue <- MotrpacBicQC::bic_animal_tissue_code$tissue_name_release[
# which(MotrpacBicQC::bic_animal_tissue_code$bic_tissue_code == tissue_code)
# ]
# return(folder_tissue)
# }
#
# folder_tissue2 <- get_folder_tissue(tissue_code)

if( length(assay_codes$assay_code[which(assay_codes$submission_code == assay)]) == 1 ){
folder_assay <- assay_codes$assay_code[which(assay_codes$submission_code == assay)]
}else{
stop("ASSAY code ", assay, " not available in `assay_codes`")
}

if(verbose) message("+ Writing out ", cas, " ", phase, " ", tissue_code, " ", assay, " files", appendLF = FALSE)
if(verbose) message("+ Writing out ", cas, " ", phase_details, " ", tissue_code, " ", assay, " files", appendLF = FALSE)

# Load all datasets----
metab_dfs <- load_metabolomics_batch(input_results_folder = input_results_folder,
Expand All @@ -1658,11 +1687,18 @@ write_metabolomics_releases <- function(input_results_folder,
}else{
folder_root <- normalizePath(folder_root)
}

# Exception for PASS1C-06: the main folder is pass1a
if(phase_details == "pass1c-06"){
phase_folder_release <- "pass1a-06"
}else{
phase_folder_release <- phase_details
}

if(cas %in% c("umichigan", "broad_met", "gtech")){
output_folder <- file.path(folder_root, folder_name, folder_phase, "metabolomics-untargeted", folder_tissue, folder_assay)
output_folder <- file.path(folder_root, folder_name, phase_folder_release, "metabolomics-untargeted", folder_tissue, folder_assay)
}else{
output_folder <- file.path(folder_root, folder_name, folder_phase, "metabolomics-targeted", folder_tissue, folder_assay)
output_folder <- file.path(folder_root, folder_name, phase_folder_release, "metabolomics-targeted", folder_tissue, folder_assay)
}

if(!dir.exists(file.path(output_folder))){
Expand Down
Loading

0 comments on commit 4cd4486

Please sign in to comment.