Merge pull request #217 from morinlab/cmattsson-dev

get_manta_sv, id_ease, website & documentation improvements + minor hot-fixes
morinlab · Jun 30, 2023 · 104ed3b · 104ed3b
2 parents 23d8815 + 4ea51da
commit 104ed3b
Show file tree

Hide file tree

Showing 183 changed files with 5,463 additions and 5,577 deletions.
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
@@ -14,8 +14,6 @@ This can be checked and addressed by running `check_functions.pl` and responding
 
 - [ ] I generated the documentation and checked for errors relating to the new function (e.g. `devtools::document()`) and added `NAMESPACE` and all other modified files in the root directory and under `man`. 
 
-- [ ] I have rebuilt the site with `pkgdown::build_site(lazy = TRUE)` to reflect any updated package documentation.
-
 ### Optional but preferred with PRs
 
 - [ ] I updated and/or successfully knitted a vignette that relies on the modified code (which ones?)
@@ -24,10 +22,12 @@ This can be checked and addressed by running `check_functions.pl` and responding
 
 ### Required
 
-- [ ] I documented my function using [ROxygen style](https://jozef.io/r102-addin-roxytags/#:~:text=Inserting%20a%20skeleton%20%2D%20Do%20this,Shift%2BAlt%2BR%20).)
+- [ ] I documented my function using [Roxygen style](https://jozef.io/r102-addin-roxytags/#:~:text=Inserting%20a%20skeleton%20%2D%20Do%20this,Shift%2BAlt%2BR%20).)
 
 - [ ] Adequate function documentation (see [new-function documentation template](https://github.com/morinlab/GAMBLR#title) for more info)
 
+- [ ] I have run `devtools::document()` to add the newly created function to NAMESPACE (do not manually add anything to this file!).
+
 Example:
 ```
 #' @title ASHM Rainbow Plot

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -3,9 +3,11 @@ Title: GAMBLR
 Version: 0.0.0.9500
 Authors@R: c(
     person("Ryan", "Morin", , "rdmorin@sfu.ca", role = c("aut", "cre"),
-           comment = c(ORCID = "YOUR-ORCID-ID")),
-    person("Kostia", "Dreval", role = "aut"),
-    person("Laura", "Hilton", role = "ctb"),
+           comment = c(ORCID = "0000-0003-2932-7800")),
+    person("Kostia", "Dreval", role = "aut",
+            comment = c(ORCID = "0000-0002-6214-2843")),
+    person("Laura", "Hilton", role = "ctb",
+           comment = c(ORCID = "0000-0002-6413-6586")),
     person("Adam", "Mattsson", , "cmattsson@bcgsc.ca", role = "aut",
            comment = c(ORCID = "0000-0002-6318-7912")),
     person("Haya", "Shaalan", role = "ctb"),

diff --git a/NAMESPACE b/NAMESPACE
@@ -75,6 +75,7 @@ export(get_ssm_by_regions)
 export(get_ssm_by_sample)
 export(get_ssm_by_samples)
 export(get_study_info)
+export(id_ease)
 export(intersect_maf)
 export(liftover_bedpe)
 export(maf_to_custom_track)

diff --git a/R/data.R b/R/data.R
@@ -433,3 +433,42 @@
 #'   \item{Weight_tValue}{Weight Value for the specified gene}
 #' }
 "wright_genes_with_weights"
+
+
+#' Default mapping table from mutation type (aka, variant classification) to mutation class
+#'
+#' A dataset containing the mapping table from genomic mutation type (aka, variant classification) to mutation class.
+#' This dataset comes from the g3viz package and was obtained via this URL:
+#' https://github.com/morinlab/g3viz/tree/master/data
+#'
+#' @format A data frame with three columns:
+#' \describe{
+#'   \item{Mutation_Type}{Mutation type, aka, variant classification}
+#'   \item{Mutation_Class}{mutation class}
+#'   \item{Short_Name}{short name of mutation type}
+#' }
+#' @examples
+#' mutation.table.df
+"mutation.table.df"
+
+#' Mapping table between gene.symbol, uniprot.id, and pfam
+#'
+#' A dataset containing the mapping table between Hugo symbol, UniProt ID, and
+#' Pfam ACC. This dataset comes from the g3viz package and was obtained via this URL:
+#' https://github.com/morinlab/g3viz/tree/master/data
+#'
+#' @format A data frame with columns:
+#' \describe{
+#'   \item{symbol}{Gene symbol}
+#'   \item{uniprot}{UniProt ID}
+#'   \item{length}{protein length}
+#'   \item{start}{starting position of Pfam domain}
+#'   \item{end}{ending position of Pfam domain}
+#'   \item{hmm.acc}{Pfam accession number}
+#'   \item{hmm.name}{Pfam name}
+#'   \item{type}{Pfam type, i.e., domain/family/motif/repeat/disordered/coiled-coil}
+#' }
+#' @examples
+#' hgnc2pfam.df
+#' @source Pfam (v31.0) and UniProt
+"hgnc2pfam.df"
diff --git a/R/database.R b/R/database.R
diff --git a/R/preprocessing_io.R b/R/preprocessing_io.R
@@ -1165,11 +1165,17 @@ liftover_bedpe = function(bedpe_file,
   if(!standard_bed){
     colnames(original_bedpe)[1] = "CHROM_A"
     original_bedpe = as.data.frame(original_bedpe)
-
-    #print(head(original_bedpe))
     original_bedpe = original_bedpe %>%
       dplyr::mutate(CHROM_A = ifelse(!grepl("chr", CHROM_A), paste0("chr", CHROM_A), CHROM_A),
                     CHROM_B = ifelse(!grepl("chr", CHROM_B), paste0("chr", CHROM_B), CHROM_B))
+    #convert to strings manually to avoid errors caused by scientific notation in rare cases when R coerces to strings
+    #Error in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
+    #scan() expected 'an integer', got '4.7e+07'
+
+    original_bedpe = original_bedpe %>% mutate(START_A = format(START_A,scientific=F),
+                                               START_B = format(START_B,scientific=F),
+                                               END_A = format(END_A,scientific=F),
+                                               END_B = format(END_B,scientific=F))
 
     if(verbose){
       print(head(original_bedpe))

diff --git a/R/utilities.R b/R/utilities.R
@@ -4553,6 +4553,7 @@ subset_cnstates = function(cn_segments,
 #' @param exclude_sex Boolean argument specifying whether to exclude sex chromosomes from calculation. Default is FALSE.
 #' @param return_heatmap Boolean argument specifying whether to return a heatmap of cnvKompare scores. Default is TRUE.
 #' @param compare_pairwise Boolean argument specifying whether to perform pairwise comparisons if there are more than 2 time points in the group. Default is TRUE.
+#' @param show_x_labels Optional boolean parameter for hiding/showing x axis labels, default is TRUE.
 #'
 #' @return A list of overall and pairwise percent concordance, concordant and discordant cytobands, comparison heatmap of cnvKompare scores, and time series ggplot object.
 #'
@@ -4568,7 +4569,8 @@ subset_cnstates = function(cn_segments,
 #'                                  "MYC",
 #'                                  "CREBBP",
 #'                                  "GNA13"),
-#'            projection = "hg38")
+#'            projection = "hg38", 
+#'            show_x_labels = FALSE)
 #'
 cnvKompare = function(patient_id,
                       these_sample_ids,
@@ -4582,7 +4584,9 @@ cnvKompare = function(patient_id,
                       min_concordance = 90,
                       exclude_sex = FALSE,
                       return_heatmap = TRUE,
-                      compare_pairwise = TRUE) {
+                      compare_pairwise = TRUE,
+                      show_x_labels = TRUE){
+
   # initialize output list
   output = list()
 
@@ -4742,6 +4746,7 @@ cnvKompare = function(patient_id,
       t %>%
       ComplexHeatmap::Heatmap(
         .,
+        show_column_names = show_x_labels,
         cluster_columns = FALSE,
         cluster_rows = FALSE,
         heatmap_legend_param = hmap_legend_param
@@ -4934,3 +4939,101 @@ supplement_maf <- function(incoming_maf,
   full_maf = rbind(incoming_maf, missing_sample_maf)
   return(full_maf)
 }
+
+
+#' @title ID Ease
+#'
+#' @aliases id_ease, id ease
+#'
+#' @description Convenience function that standardizes the way GAMBLR functions deal with sample IDs (these_sample_ids)
+#' and metadata (these_samples_metadata).
+#'
+#' @details This function can take sample IDs as a character vector, a metadata table in data frame format, or both.
+#' If no metadata is provided, the function retrieves metadata for the given seq type with `get_gambl_metadata`.
+#' If no sample IDs are provided, the function will operate on all sample IDs found in the metadata.
+#' If sample IDs are provided, the metadata is filtered down to these IDs.
+#' Requested sample IDs that are not found in the metadata are always reported (regardless of `verbose`),
+#' but they are still part of the returned sample IDs.
+#' It is recommended to run this function with `verbose = TRUE` (default), since the messages describe each step
+#' and report how many sample IDs and metadata samples are returned.
+#' This can also help with debugging functions that are internally calling this function.
+#'
+#' @param these_samples_metadata A data frame with metadata (needs a `sample_id` column), subset to sample IDs of interest.
+#' If not provided will retrieve GAMBL metadata for all available samples.
+#' @param these_sample_ids Sample IDs as a character vector. If not provided, all sample IDs in the metadata are used.
+#' @param this_seq_type The seq type of interest, only used when `these_samples_metadata` is not provided. Default is genome.
+#' @param verbose Set to FALSE to limit the information that gets printed to the console. Default is TRUE.
+#'
+#' @return A named list with metadata (data frame) as the first element (`this_metadata`)
+#' and sample IDs (character vector) as the second element (`these_samples`).
+#'
+#' @export
+#'
+#' @examples
+#' #give the function nothing (i.e return metadata and all sample IDs for the default seq type)
+#' all_genome = id_ease()
+#'
+#' #return metadata for all samples in the default seq type
+#' all_meta = id_ease()$this_metadata
+#'
+#' #return metadata based on a sample ID
+#' sample_meta = id_ease(these_sample_ids = "94-15772_tumorA")$this_metadata
+#'
+#' #return sample IDs based on an already filtered metadata
+#' this_metadata = get_gambl_metadata(seq_type_filter = "genome") %>%
+#'   head(5)
+#'
+#' these_ids = id_ease(these_samples_metadata = this_metadata)$these_samples
+#'
+id_ease = function(these_samples_metadata,
+                   these_sample_ids,
+                   this_seq_type = "genome",
+                   verbose = TRUE){
+
+  #check for provided metadata, else use GAMBL metadata
+  if(missing(these_samples_metadata)){
+    if(verbose){
+      message("id_ease: No metadata provided, the helper function will fetch metadata for all gambl samples in the selected seq type...")
+    }
+    metadata = get_gambl_metadata(seq_type_filter = this_seq_type) #useful to add other get_gambl_metadata parameters?
+  }else{
+    if(verbose){
+      message("id_ease: Metadata is provided...")
+    }
+    metadata = these_samples_metadata
+  }
+
+  #ensure metadata is subset to specified sample IDs
+  if(!missing(these_sample_ids)){
+    if(verbose){
+      message("id_ease: Sample IDs are provided, filtering the metadata for selected sample IDs...")
+    }
+    metadata = dplyr::filter(metadata, sample_id %in% these_sample_ids)
+
+    #check the existence of provided sample IDs in the metadata
+    not_in_meta = setdiff(these_sample_ids, metadata$sample_id)
+
+    #the requested IDs are returned as-is, also the ones that are missing from the metadata
+    sample_ids = these_sample_ids
+
+    #always reported (not gated by verbose), in one message so the IDs stay together with the warning text
+    if(length(not_in_meta) > 0){
+      message("id_ease: WARNING! The following sample IDs were not found in the metadata: ",
+              paste(not_in_meta, collapse = ", "))
+    }
+  }else{
+    if(verbose){
+      message("id_ease: No sample IDs provided, defaulting to all IDs in the metadata...")
+    }
+    sample_ids = metadata$sample_id
+  }
+
+  #report what is returned; the metadata count is taken from the (filtered) metadata itself,
+  #since it can be smaller than the number of requested sample IDs
+  if(verbose){
+    message("id_ease: Returning ", length(unique(sample_ids)), " sample IDs..")
+    message("id_ease: Returning metadata for ", length(unique(metadata$sample_id)), " samples...")
+  }
+
+  #bind the objects into a list for return
+  IDs = list(this_metadata = metadata, these_samples = sample_ids)
+
+  return(IDs)
+}
diff --git a/R/viz.R b/R/viz.R
@@ -564,6 +564,7 @@ focal_cn_plot = function(region,
 #' @param gene The gene symbol to plot.
 #' @param plot_title Optional (defaults to gene name).
 #' @param plot_theme Options: cbioportal(default), blue, simple, nature, nature2, ggplot2, and dark.
+#' @param out_name Optional, set the file name of the plot, if you export it to disk. Default name is my_lollipop_plot_{gene}.
 #'
 #' @return Nothing.
 #'
@@ -588,10 +589,16 @@ focal_cn_plot = function(region,
 pretty_lollipop_plot = function(maf_df,
                                 gene,
                                 plot_title,
-                                plot_theme = "cbioportal"){
+                                plot_theme = "cbioportal",
+                                out_name = paste0("my_lollipop_plot_", gene)){
+  if(missing(gene)){
+    stop("Plese provide a gene...")
+  }
+
   if(missing(plot_title)){
     plot_title = gene
   }
+
   maf_df = maf_df %>%
     dplyr::filter(Hugo_Symbol == gene)
 
@@ -601,7 +608,7 @@ pretty_lollipop_plot = function(maf_df,
   g3Lollipop(maf_df,
              gene.symbol = gene,
              plot.options = chart.options,
-             output.filename = "default_theme")
+             output.filename = out_name)
 }
 
 
@@ -2183,6 +2190,7 @@ prettyCoOncoplot = function(maf,
 #' @param custom_colours Provide named vector (or named list of vectors) containing custom annotation colours if you do not want to use standardized palette.
 #' @param classification_column Optional. Override default column for assigning the labels used for colouring in the figure.
 #' @param maf_data An already loaded maf, if not provided, this function will call `get_ssm_by_region`, using the regions supplied into `regions_bed`.
+#' @param verbose Set to FALSE to prevent printing the full regions bed file to the console. Default is TRUE.
 #'
 #' @return Nothing
 #'
@@ -2208,7 +2216,8 @@ ashm_multi_rainbow_plot = function(regions_bed,
                                    seq_type,
                                    custom_colours,
                                    classification_column = "lymphgen",
-                                   maf_data){
+                                   maf_data,
+                                   verbose = TRUE){
 
   table_name = check_config_value(config::get("results_tables")$ssm)
   db = check_config_value(config::get("database_name"))
@@ -2224,7 +2233,7 @@ ashm_multi_rainbow_plot = function(regions_bed,
     meta_arranged = dplyr::filter(meta_arranged, !get(classification_column) %in% exclude_classifications)
   }
   if(missing(regions_bed)){
-    regions_bed = grch37_ashm_regions
+    regions_bed = GAMBLR.data::somatic_hypermutation_locations_GRCh37_v_latest
     regions_bed = mutate(regions_bed, regions = paste0(chr_name, ":", hg19_start, "-", hg19_end))
     regions_bed = mutate(regions_bed, name = paste0(gene, "-", region))
   }else{
@@ -2235,7 +2244,11 @@ ashm_multi_rainbow_plot = function(regions_bed,
       regions_bed$name = regions_bed$regions
     }
   }
-  print(regions_bed)
+
+  if(verbose){
+    print(regions_bed)
+  }
+
   names = pull(regions_bed, name)
   names = c(names, "NFKBIZ-UTR", "MAF", "PAX5", "WHSC1", "CCND1",
                    "FOXP1-TSS1", "FOXP1-TSS2", "FOXP1-TSS3", "FOXP1-TSS4",
@@ -5055,17 +5068,16 @@ comp_report = function(this_sample_id,
 #' fl_genes_list = gene_to_region(gene_symbol = fl_genes,
 #'                                return_as = "bed")
 #'
-#' fancy_circos_plot_new(this_sample_id = "DOHH-2",
-#'                       ssm_calls = FALSE,
-#'                       gene_list = fl_genes_list,
-#'                       chr_select = c("chr8",
-#'                                      "chr14",
-#'                                      "chr18"),
-#'                       out = "../../plots/",
-#'                       plot_title = "DOHH-2 (SVs) Example Plot",
-#'                       pdf = FALSE,
-#'                       pdf = FALSE,
-#'                       file_name = "dohh2_example.png")
+#' fancy_circos_plot(this_sample_id = "DOHH-2",
+#'                   ssm_calls = FALSE,
+#'                   gene_list = fl_genes_list,
+#'                   chr_select = c("chr8",
+#'                                  "chr14",
+#'                                  "chr18"),
+#'                   out = "../../plots/",
+#'                   plot_title = "DOHH-2 (SVs) Example Plot",
+#'                   pdf = FALSE,
+#'                   file_name = "dohh2_example.png")
 #' }
 #'
 fancy_circos_plot = function(this_sample_id,
@@ -5191,7 +5203,7 @@ fancy_circos_plot = function(this_sample_id,
     ssm_ins = dplyr::filter(maf_tmp, Variant_Type == "INS") #subset on insertions
     ssm_snp = dplyr::filter(maf_tmp, Variant_Type == "SNP") #subset on single nucleotide polymorphism
     ssm_dnp = dplyr::filter(maf_tmp, Variant_Type == "DNP") #subset on dinucleotide polymorphism
-    message(paste0(nrow(ssm_del) + nrow(ssm_dnp) + nrow(ssm_ins) + nrow(ssm_snp)), " SSMs found for ", this_sample)
+    message(paste0(nrow(ssm_del) + nrow(ssm_dnp) + nrow(ssm_ins) + nrow(ssm_snp)), " SSMs found for ", this_sample_id)
   }
 
   #get SVs

diff --git a/README.html b/README.html