Skip to content

Commit

Permalink
Merge pull request #217 from morinlab/cmattsson-dev
Browse files Browse the repository at this point in the history
get_manta_sv, id_ease, website & documentation improvements + minor hot-fixes
  • Loading branch information
mattssca authored Jun 30, 2023
2 parents 23d8815 + 4ea51da commit 104ed3b
Show file tree
Hide file tree
Showing 183 changed files with 5,463 additions and 5,577 deletions.
6 changes: 3 additions & 3 deletions .github/PULL_REQUEST_TEMPLATE.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@ This can be checked and addressed by running `check_functions.pl` and responding

- [ ] I generated the documentation and checked for errors relating to the new function (e.g. `devtools::document()`) and added `NAMESPACE` and all other modified files in the root directory and under `man`.

- [ ] I have rebuilt the site with `pkgdown::build_site(lazy = TRUE)` to reflect any updated package documentation.

### Optional but preferred with PRs

- [ ] I updated and/or successfully knitted a vignette that relies on the modified code (which ones?)
Expand All @@ -24,10 +22,12 @@ This can be checked and addressed by running `check_functions.pl` and responding

### Required

- [ ] I documented my function using [ROxygen style](https://jozef.io/r102-addin-roxytags/#:~:text=Inserting%20a%20skeleton%20%2D%20Do%20this,Shift%2BAlt%2BR%20).)
- [ ] I documented my function using [Roxygen style](https://jozef.io/r102-addin-roxytags/#:~:text=Inserting%20a%20skeleton%20%2D%20Do%20this,Shift%2BAlt%2BR%20).)

- [ ] Adequate function documentation (see [new-function documentation template](https://github.com/morinlab/GAMBLR#title) for more info)

- [ ] I have run `devtools::document()` to add the newly created function to NAMESPACE (do not manually add anything to this file!).

Example:
```
#' @title ASHM Rainbow Plot
Expand Down
8 changes: 5 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@ Title: GAMBLR
Version: 0.0.0.9500
Authors@R: c(
person("Ryan", "Morin", , "rdmorin@sfu.ca", role = c("aut", "cre"),
comment = c(ORCID = "YOUR-ORCID-ID")),
person("Kostia", "Dreval", role = "aut"),
person("Laura", "Hilton", role = "ctb"),
comment = c(ORCID = "0000-0003-2932-7800")),
person("Kostia", "Dreval", role = "aut",
comment = c(ORCID = "0000-0002-6214-2843")),
person("Laura", "Hilton", role = "ctb",
comment = c(ORCID = "0000-0002-6413-6586")),
person("Adam", "Mattsson", , "cmattsson@bcgsc.ca", role = "aut",
comment = c(ORCID = "0000-0002-6318-7912")),
person("Haya", "Shaalan", role = "ctb"),
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ export(get_ssm_by_regions)
export(get_ssm_by_sample)
export(get_ssm_by_samples)
export(get_study_info)
export(id_ease)
export(intersect_maf)
export(liftover_bedpe)
export(maf_to_custom_track)
Expand Down
39 changes: 39 additions & 0 deletions R/data.R
Original file line number Diff line number Diff line change
Expand Up @@ -433,3 +433,42 @@
#' \item{Weight_tValue}{Weight Value for the specified gene}
#' }
"wright_genes_with_weights"


#' Default mapping table from mutation type (a.k.a. variant classification) to mutation class
#'
#' A dataset containing the mapping table from genomic mutation type (a.k.a. variant classification) to mutation class.
#' This dataset comes from the g3viz package and was obtained via this URL:
#' https://github.com/morinlab/g3viz/tree/master/data
#'
#' @format A data frame with three columns:
#' \describe{
#' \item{Mutation_Type}{Mutation type, aka, variant classification}
#' \item{Mutation_Class}{mutation class}
#' \item{Short_Name}{short name of mutation type}
#' }
#' @examples
#' mutation.table.df
"mutation.table.df"

#' Mapping table between gene.symbol, uniprot.id, and pfam
#'
#' A dataset containing the mapping table between Hugo symbol, UniProt ID, and
#' Pfam ACC. This dataset comes from the g3viz package and was obtained via this URL:
#' https://github.com/morinlab/g3viz/tree/master/data
#'
#' @format A data frame with columns:
#' \describe{
#' \item{symbol}{Gene symbol}
#' \item{uniprot}{UniProt ID}
#' \item{length}{protein length}
#' \item{start}{starting position of Pfam domain}
#' \item{end}{ending position of Pfam domain}
#' \item{hmm.acc}{Pfam accession number}
#' \item{hmm.name}{Pfam name}
#' \item{type}{Pfam type, i.e., domain/family/motif/repeat/disordered/coiled-coil}
#' }
#' @examples
#' hgnc2pfam.df
#' @source Pfam (v31.0) and UniProt
"hgnc2pfam.df"
340 changes: 214 additions & 126 deletions R/database.R

Large diffs are not rendered by default.

10 changes: 8 additions & 2 deletions R/preprocessing_io.R
Original file line number Diff line number Diff line change
Expand Up @@ -1165,11 +1165,17 @@ liftover_bedpe = function(bedpe_file,
if(!standard_bed){
colnames(original_bedpe)[1] = "CHROM_A"
original_bedpe = as.data.frame(original_bedpe)

#print(head(original_bedpe))
original_bedpe = original_bedpe %>%
dplyr::mutate(CHROM_A = ifelse(!grepl("chr", CHROM_A), paste0("chr", CHROM_A), CHROM_A),
CHROM_B = ifelse(!grepl("chr", CHROM_B), paste0("chr", CHROM_B), CHROM_B))
#convert to strings manually to avoid errors caused by scientific notation in rare cases when R coerces to strings
#Error in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
#scan() expected 'an integer', got '4.7e+07'

original_bedpe = original_bedpe %>% mutate(START_A = format(START_A,scientific=F),
START_B = format(START_B,scientific=F),
END_A = format(END_A,scientific=F),
END_B = format(END_B,scientific=F))

if(verbose){
print(head(original_bedpe))
Expand Down
107 changes: 105 additions & 2 deletions R/utilities.R
Original file line number Diff line number Diff line change
Expand Up @@ -4553,6 +4553,7 @@ subset_cnstates = function(cn_segments,
#' @param exclude_sex Boolean argument specifying whether to exclude sex chromosomes from calculation. Default is FALSE.
#' @param return_heatmap Boolean argument specifying whether to return a heatmap of cnvKompare scores. Default is TRUE.
#' @param compare_pairwise Boolean argument specifying whether to perform pairwise comparisons if there are more than 2 time points in the group. Default is TRUE.
#' @param show_x_labels Optional boolean parameter for hiding/showing x axis labels, default is TRUE.
#'
#' @return A list of overall and pairwise percent concordance, concordant and discordant cytobands, comparison heatmap of cnvKompare scores, and time series ggplot object.
#'
Expand All @@ -4568,7 +4569,8 @@ subset_cnstates = function(cn_segments,
#' "MYC",
#' "CREBBP",
#' "GNA13"),
#' projection = "hg38")
#' projection = "hg38",
#' show_x_labels = FALSE)
#'
cnvKompare = function(patient_id,
these_sample_ids,
Expand All @@ -4582,7 +4584,9 @@ cnvKompare = function(patient_id,
min_concordance = 90,
exclude_sex = FALSE,
return_heatmap = TRUE,
compare_pairwise = TRUE) {
compare_pairwise = TRUE,
show_x_labels = TRUE){

# initialize output list
output = list()

Expand Down Expand Up @@ -4742,6 +4746,7 @@ cnvKompare = function(patient_id,
t %>%
ComplexHeatmap::Heatmap(
.,
show_column_names = show_x_labels,
cluster_columns = FALSE,
cluster_rows = FALSE,
heatmap_legend_param = hmap_legend_param
Expand Down Expand Up @@ -4934,3 +4939,101 @@ supplement_maf <- function(incoming_maf,
full_maf = rbind(incoming_maf, missing_sample_maf)
return(full_maf)
}


#' @title ID Ease
#'
#' @aliases id_ease, id ease
#'
#' @description Convenience function that standardizes the way GAMBLR functions deal with sample IDs (these_sample_ids)
#' and metadata (these_samples_metadata).
#'
#' @details This function can take sample IDs as a character vector, a metadata table in data frame format, or both.
#' If no metadata is provided, the function retrieves metadata for the given seq type with `get_gambl_metadata`.
#' If sample IDs are provided, the metadata is subset to those IDs and the IDs are returned exactly as given,
#' including any that are not present in the metadata.
#' If no sample IDs are provided, all sample IDs in the metadata are returned.
#' The function notifies the user if any of the requested sample IDs are not found in the metadata.
#' This notification is emitted regardless of `verbose`.
#' With `verbose = TRUE` (default), the function also reports each step it takes, as well as the number of
#' sample IDs and the number of metadata samples that are returned.
#' This can help with debugging functions that are internally calling this function.
#'
#' @param these_samples_metadata A data frame with metadata, subset to sample IDs of interest.
#' Must contain a `sample_id` column. If not provided, GAMBL metadata will be retrieved for all available samples.
#' @param these_sample_ids Sample IDs as a character vector.
#' @param this_seq_type The seq type of interest. Only used when `these_samples_metadata` is not provided. Default is genome.
#' @param verbose Set to FALSE to limit the information that gets printed to the console. Default is TRUE.
#'
#' @return A list with metadata (data frame) as the first element (`this_metadata`)
#' and sample IDs (character vector) as the second element (`these_samples`).
#'
#' @export
#'
#' @examples
#' #give the function nothing (i.e return metadata and sample IDs for all samples in the default seq type)
#' all_samples = id_ease()
#'
#' #return metadata for all samples in the default seq type
#' all_meta = id_ease()$this_metadata
#'
#' #return metadata based on a sample ID
#' sample_meta = id_ease(these_sample_ids = "94-15772_tumorA")$this_metadata
#'
#' #return sample IDs based on an already filtered metadata
#' this_metadata = get_gambl_metadata(seq_type_filter = "genome") %>%
#'   head(5)
#'
#' these_ids = id_ease(these_samples_metadata = this_metadata)$these_samples
#'
id_ease = function(these_samples_metadata,
                   these_sample_ids,
                   this_seq_type = "genome",
                   verbose = TRUE){

  #check for provided metadata, else use GAMBL metadata
  if(missing(these_samples_metadata)){
    if(verbose){
      message("id_ease: No metadata provided, the helper function will fetch metadata for all gambl samples in the selected seq type...")
    }
    metadata = get_gambl_metadata(seq_type_filter = this_seq_type) #useful to add other get_gambl_metadata parameters?
  }else{
    if(verbose){
      message("id_ease: Metadata is provided...")
    }
    metadata = these_samples_metadata
  }

  #ensure metadata is subset to specified sample IDs
  if(!missing(these_sample_ids)){
    if(verbose){
      message("id_ease: Sample IDs are provided, filtering the metadata for selected sample IDs...")
    }
    metadata = dplyr::filter(metadata, sample_id %in% these_sample_ids)

    #check the existence of provided sample IDs in the metadata
    not_in_meta = setdiff(these_sample_ids, metadata$sample_id)

    #the requested IDs are returned as given, even if some are absent from the metadata
    sample_ids = these_sample_ids

    #always notify about missing IDs (independent of verbose), using message() for the
    #whole notice so that it ends up on a single stream and can be suppressed as a unit
    if(length(not_in_meta) > 0){
      message("id_ease: WARNING! The following sample IDs were not found in the metadata:")
      message(paste(not_in_meta, collapse = ", "))
    }
  }else{
    if(verbose){
      message("id_ease: No sample IDs provided, defaulting to all IDs in the metadata...")
    }
    sample_ids = metadata$sample_id
  }

  #report what is returned; the two counts are computed separately since the returned
  #sample IDs can include IDs that have no row in the returned metadata
  if(verbose){
    unique_samples = unique(sample_ids)
    unique_meta_samples = unique(metadata$sample_id)
    message(paste0("id_ease: Returning ", length(unique_samples), " sample IDs.."))
    message(paste0("id_ease: Returning metadata for ", length(unique_meta_samples), " samples..." ))
  }

  #bind the objects into a list for return
  IDs = list(this_metadata = metadata, these_samples = sample_ids)

  return(IDs)
}
46 changes: 29 additions & 17 deletions R/viz.R
Original file line number Diff line number Diff line change
Expand Up @@ -564,6 +564,7 @@ focal_cn_plot = function(region,
#' @param gene The gene symbol to plot.
#' @param plot_title Optional (defaults to gene name).
#' @param plot_theme Options: cbioportal(default), blue, simple, nature, nature2, ggplot2, and dark.
#' @param out_name Optional, set the file name of the plot, if you export it to disk. Default name is my_lollipop_plot_{gene}.
#'
#' @return Nothing.
#'
Expand All @@ -588,10 +589,16 @@ focal_cn_plot = function(region,
pretty_lollipop_plot = function(maf_df,
gene,
plot_title,
plot_theme = "cbioportal"){
plot_theme = "cbioportal",
out_name = paste0("my_lollipop_plot_", gene)){
if(missing(gene)){
stop("Plese provide a gene...")
}

if(missing(plot_title)){
plot_title = gene
}

maf_df = maf_df %>%
dplyr::filter(Hugo_Symbol == gene)

Expand All @@ -601,7 +608,7 @@ pretty_lollipop_plot = function(maf_df,
g3Lollipop(maf_df,
gene.symbol = gene,
plot.options = chart.options,
output.filename = "default_theme")
output.filename = out_name)
}


Expand Down Expand Up @@ -2183,6 +2190,7 @@ prettyCoOncoplot = function(maf,
#' @param custom_colours Provide named vector (or named list of vectors) containing custom annotation colours if you do not want to use the standardized palette.
#' @param classification_column Optional. Override default column for assigning the labels used for colouring in the figure.
#' @param maf_data An already loaded maf. If not provided, this function will call `get_ssm_by_region`, using the regions supplied in `regions_bed`.
#' @param verbose Set to FALSE to prevent printing the full regions bed file to the console. Default is TRUE.
#'
#' @return Nothing
#'
Expand All @@ -2208,7 +2216,8 @@ ashm_multi_rainbow_plot = function(regions_bed,
seq_type,
custom_colours,
classification_column = "lymphgen",
maf_data){
maf_data,
verbose = TRUE){

table_name = check_config_value(config::get("results_tables")$ssm)
db = check_config_value(config::get("database_name"))
Expand All @@ -2224,7 +2233,7 @@ ashm_multi_rainbow_plot = function(regions_bed,
meta_arranged = dplyr::filter(meta_arranged, !get(classification_column) %in% exclude_classifications)
}
if(missing(regions_bed)){
regions_bed = grch37_ashm_regions
regions_bed = GAMBLR.data::somatic_hypermutation_locations_GRCh37_v_latest
regions_bed = mutate(regions_bed, regions = paste0(chr_name, ":", hg19_start, "-", hg19_end))
regions_bed = mutate(regions_bed, name = paste0(gene, "-", region))
}else{
Expand All @@ -2235,7 +2244,11 @@ ashm_multi_rainbow_plot = function(regions_bed,
regions_bed$name = regions_bed$regions
}
}
print(regions_bed)

if(verbose){
print(regions_bed)
}

names = pull(regions_bed, name)
names = c(names, "NFKBIZ-UTR", "MAF", "PAX5", "WHSC1", "CCND1",
"FOXP1-TSS1", "FOXP1-TSS2", "FOXP1-TSS3", "FOXP1-TSS4",
Expand Down Expand Up @@ -5055,17 +5068,16 @@ comp_report = function(this_sample_id,
#' fl_genes_list = gene_to_region(gene_symbol = fl_genes,
#' return_as = "bed")
#'
#' fancy_circos_plot_new(this_sample_id = "DOHH-2",
#' ssm_calls = FALSE,
#' gene_list = fl_genes_list,
#' chr_select = c("chr8",
#' "chr14",
#' "chr18"),
#' out = "../../plots/",
#' plot_title = "DOHH-2 (SVs) Example Plot",
#' pdf = FALSE,
#' pdf = FALSE,
#' file_name = "dohh2_example.png")
#' fancy_circos_plot(this_sample_id = "DOHH-2",
#' ssm_calls = FALSE,
#' gene_list = fl_genes_list,
#' chr_select = c("chr8",
#' "chr14",
#' "chr18"),
#' out = "../../plots/",
#' plot_title = "DOHH-2 (SVs) Example Plot",
#' pdf = FALSE,
#' file_name = "dohh2_example.png")
#' }
#'
fancy_circos_plot = function(this_sample_id,
Expand Down Expand Up @@ -5191,7 +5203,7 @@ fancy_circos_plot = function(this_sample_id,
ssm_ins = dplyr::filter(maf_tmp, Variant_Type == "INS") #subset on insertions
ssm_snp = dplyr::filter(maf_tmp, Variant_Type == "SNP") #subset on single nucleotide polymorphism
ssm_dnp = dplyr::filter(maf_tmp, Variant_Type == "DNP") #subset on dinucleotide polymorphism
message(paste0(nrow(ssm_del) + nrow(ssm_dnp) + nrow(ssm_ins) + nrow(ssm_snp)), " SSMs found for ", this_sample)
message(paste0(nrow(ssm_del) + nrow(ssm_dnp) + nrow(ssm_ins) + nrow(ssm_snp)), " SSMs found for ", this_sample_id)
}

#get SVs
Expand Down
42 changes: 17 additions & 25 deletions README.html

Large diffs are not rendered by default.

Loading

0 comments on commit 104ed3b

Please sign in to comment.