# Load packages
library(GAMBLR.data)
-library(GAMBLR.viz)
-library(dplyr)
diff --git a/docs/search.json b/docs/search.json index 7b5c676..131c395 100644 --- a/docs/search.json +++ b/docs/search.json @@ -46,14 +46,42 @@ "href": "tutorials/forestplot.html#prepare-setup", "title": "Tutorial: The prettiest forestplot", "section": "Prepare setup", - "text": "Prepare setup\nWe will first import the necessary packages:\n\n# Load packages\nlibrary(GAMBLR.data)\nlibrary(GAMBLR.viz)\nlibrary(dplyr)\n\nNext, we will get some data to display. The metadata is expected to be a data frame with one required column: sample_id and another column that will contain sample annotations according to the comparison group. In this example, we will use as example the data set and variant calls from the study that identified genetic subgroup of Burkitt lymphoma (BL).\n\nmetadata <- get_gambl_metadata() %>%\n filter(cohort == \"BL_Thomas\")\n\nNext, we will obtain the coding mutations that will be used in the plotting. The data is a data frame in a standartized maf format.\n\nmaf <- get_ssm_by_samples(\n these_samples_metadata = metadata,\n tool_name = \"publication\",\n projection = \"hg38\"\n)\n\n# How does it look like?\ndim(maf)\n\n[1] 47043 45\n\nhead(maf) %>%\n select(\n Tumor_Sample_Barcode,\n Hugo_Symbol,\n Variant_Classification\n )\n\n Tumor_Sample_Barcode Hugo_Symbol Variant_Classification\n1: Akata CPTP Missense_Mutation\n2: Akata FNDC10 Missense_Mutation\n3: Akata MORN1 Missense_Mutation\n4: Akata MEGF6 Missense_Mutation\n5: Akata NPHP4 Silent\n6: Akata GPR157 Missense_Mutation\n\n\nFor the purpose of this tutorial, we will focus on a small subset of genes known to be significantly mutated in BL.\n\ngenes <- lymphoma_genes_bl_v_latest$Gene\nhead(genes)\n\n[1] \"ALPK2\" \"ARHGEF1\" \"ARID1A\" \"B2M\" \"BACH2\" \"BCL10\" \n\n\nNow we have our metadata and mutations we want to explore, so we are ready to start visualizing the data." + "text": "Prepare setup\nWe will first import the necessary packages:\n\n# Load packages\nlibrary(GAMBLR.data)\nlibrary(GAMBLR.utils)\nlibrary(GAMBLR.viz)\nlibrary(tibble)\nlibrary(dplyr)\n\nNext, we will get some data to display. The metadata is expected to be a data frame with one required column: sample_id and another column that will contain sample annotations according to the comparison group. In this example, we will use as example the data set and variant calls from the study that identified genetic subgroup of Burkitt lymphoma (BL).\n\nmetadata <- get_gambl_metadata() %>%\n filter(cohort == \"BL_Thomas\")\n\nNext, we will obtain the coding mutations that will be used in the plotting. The data is a data frame in a standartized maf format.\n\nmaf <- get_ssm_by_samples(\n these_samples_metadata = metadata,\n tool_name = \"publication\",\n projection = \"hg38\"\n)\n\n# How does it look like?\ndim(maf)\n\n[1] 47043 45\n\nhead(maf) %>%\n select(\n Tumor_Sample_Barcode,\n Hugo_Symbol,\n Variant_Classification\n )\n\n Tumor_Sample_Barcode Hugo_Symbol Variant_Classification\n1: Akata CPTP Missense_Mutation\n2: Akata FNDC10 Missense_Mutation\n3: Akata MORN1 Missense_Mutation\n4: Akata MEGF6 Missense_Mutation\n5: Akata NPHP4 Silent\n6: Akata GPR157 Missense_Mutation\n\n\nFor the purpose of this tutorial, we will focus on a small subset of genes known to be significantly mutated in BL.\n\ngenes <- lymphoma_genes_bl_v_latest$Gene\nhead(genes)\n\n[1] \"ALPK2\" \"ARHGEF1\" \"ARID1A\" \"B2M\" \"BACH2\" \"BCL10\" \n\n\nNow we have our metadata and mutations we want to explore, so we are ready to start visualizing the data." }, { "objectID": "tutorials/forestplot.html#the-default-forest-plot", "href": "tutorials/forestplot.html#the-default-forest-plot", "title": "Tutorial: The prettiest forestplot", "section": "The default forest plot", - "text": "The default forest plot\nThe forest plot is ready to be called with the default parameters after just providing the metadata and data frame with mutations in standard maf format. Here is an example of the output with all default parameters:\n\ncomparison_column <- \"EBV_status_inf\" # character of column name for comparison\nfp <- prettyForestPlot(\n metadata = metadata,\n maf = maf,\n genes = genes,\n comparison_column = comparison_column\n)" + "text": "The default forest plot\nThe forest plot is ready to be called with the default parameters after just providing the metadata and data frame with mutations in standard maf format. Here is an example of the output with all default parameters:\n\ncomparison_column <- \"EBV_status_inf\" # character of column name for comparison\nfp <- prettyForestPlot(\n metadata = metadata,\n maf = maf,\n genes = genes,\n comparison_column = comparison_column\n)\n\nThe output of the function is a list containing the following objects: - fisher: a data frame with detailed statistics of the Fisher’s test for each gene - mutmat: a binary matrix used for the Fisher’s test - forest: a ggplot2 object with the forest plot of the ORs from the Fisher’s test for each gene - bar: a ggplot2 object wiht mutation frequencies for each Gene - arranged: a display item where both the forest and bar plots are nicely arranged side-by-side\n\nnames(fp)\n\n[1] \"fisher\" \"forest\" \"bar\" \"arranged\" \"mutmat\"" + }, + { + "objectID": "tutorials/forestplot.html#report-only-significant-differences", + "href": "tutorials/forestplot.html#report-only-significant-differences", + "title": "Tutorial: The prettiest forestplot", + "section": "Report only significant differences", + "text": "Report only significant differences\nBy default, all of the genes of interest are reported in the output. After the Fisher’s test is performed, the prettyForestPlot also calculates FDR and we can use it to only report significant differences by providing a significance cutoff with the parameter max_q:\n\nmax_q <- 0.1 # only those qith Q value <= 0.1 will be reported\nfp <- prettyForestPlot(\n metadata = metadata,\n maf = maf,\n genes = genes,\n comparison_column = comparison_column,\n max_q = max_q\n)\n\nWe now can take a look at what genes are passing the significance cutoff:\n\nfp$arranged" + }, + { + "objectID": "tutorials/forestplot.html#comparing-categories-with-more-than-two-groups", + "href": "tutorials/forestplot.html#comparing-categories-with-more-than-two-groups", + "title": "Tutorial: The prettiest forestplot", + "section": "Comparing categories with more than two groups", + "text": "Comparing categories with more than two groups\nAs the prettyForestPlot construcst the 2x2 contingency tables to run Fisher’s test to find significant differences, it can only operate on comparing 2 groups between themselves - but what if you have more than that and want to see the difference between some of them? To handle this scenario, we can take advantage of the comparison_values parameter, which will be used to subset the metadata to only requested groups and only perform testing and plotting on this subset. Let’s see it in action:\n\ncomparison_column <- \"genetic_subgroup\" # change the comparison column\ncomparison_values <- c(\"IC-BL\", \"Q53-BL\")\nfp <- prettyForestPlot(\n metadata = metadata,\n maf = maf,\n genes = genes,\n comparison_column = comparison_column,\n comparison_values = comparison_values,\n max_q = max_q\n)\n\nfp$arranged\n\n\n\n\nThis plot is exactly reproducing the Supplemmental Figure 12D from the Thomas et al study!" + }, + { + "objectID": "tutorials/forestplot.html#separating-genes-with-hotspots", + "href": "tutorials/forestplot.html#separating-genes-with-hotspots", + "title": "Tutorial: The prettiest forestplot", + "section": "Separating genes with hotspots", + "text": "Separating genes with hotspots\nWe can additionally separate hotspots from the other mutations and compare those separately. First, we need to annotate the maf data, for which we will use the annotate_hotspots from GAMBLR family. This function will add a new column to the maf named hot_spot indicating whether or not the specific mutation is in the hotspot region.\n\n# Annotate hotspots\nmaf <- annotate_hotspots(maf)\n\n# What are the hotspots?\nmaf %>%\n filter(hot_spot) %>%\n select(Hugo_Symbol, hot_spot) %>%\n table()\n\n< table of extent 0 x 0 >\n\n\n\n\n\n\n\n\nNote\n\n\n\nThe GAMBLR.data version of the annotate_hotspots only handles very specific genes and does not have functionality to annotate all hotspots.\n\n\nOh no! Looks like there is no hotspots in this maf data. This does not make sense, so what happened? Aha, the hotspot annotation in GAMBLR.data works only on the data in grch37 projection. But our maf is in hg38, so what should we do? One way is to lift the maf data to another projection using the UCSC’s liftOver, and GAMBLR family has exactly the function that serves this purpose:\n\nmaf_grch37 <- liftover(\n maf,\n mode = \"maf\",\n target_build = \"grch37\"\n) %>%\nmutate(Chromosome = gsub(\"chr\", \"\", Chromosome)) %>%\nselect(-hot_spot) # since it is empty we can just drop it\n\nCan we annotate the hotspots now?\n\nmaf_grch37 <- annotate_hotspots(maf_grch37)\n\n# What are the hotspots?\nmaf_grch37 %>%\n filter(hot_spot) %>%\n select(Hugo_Symbol, hot_spot) %>%\n table()\n\n hot_spot\nHugo_Symbol TRUE\n CREBBP 1\n EZH2 1\n FOXO1 60\n MYD88 2\n STAT6 4\n\n\nIndeed, the hotspots are properly annotated once we have maf in correct projection. Now, we can simply toggle the separate_hotspots parameter to perform separate comparisons within hotspots:\n\ncomparison_column <- \"EBV_status_inf\"\nfp <- prettyForestPlot(\n metadata = metadata,\n maf = maf_grch37,\n genes = genes,\n comparison_column = comparison_column,\n max_q = max_q,\n separate_hotspots = TRUE\n)\n\nfp$arranged" + }, + { + "objectID": "tutorials/forestplot.html#using-binary-matrix-as-input", + "href": "tutorials/forestplot.html#using-binary-matrix-as-input", + "title": "Tutorial: The prettiest forestplot", + "section": "Using binary matrix as input", + "text": "Using binary matrix as input\nSometimes it might be useful to have different input format instead of maf - for example, binary matrix of features. Can we use the prettyForestPlot in this case? Yes, sure we can!\nFirst, let’s construct the binary matrix. We will supplement our maf with the non-coding mutations to look at the aSHM regions in addition to coding mutations, and this will already give us the data in correct projection:\n\nmaf <- get_ssm_by_samples(\n these_samples_metadata = metadata\n)\nmaf$Variant_Classification %>% table\n\n.\n 3'Flank 3'UTR 5'Flank \n 1457 513 2957 \n 5'UTR Frame_Shift_Del Frame_Shift_Ins \n 1102 124 97 \n IGR In_Frame_Del In_Frame_Ins \n 457 40 14 \n Intron Missense_Mutation Nonsense_Mutation \n 44397 1859 286 \n Nonstop_Mutation RNA Silent \n 6 74 481 \n Splice_Region Splice_Site Translation_Start_Site \n 148 114 22 \n\n\nNow we convert this maf into binary matrix:\n\n# Generate binary matrix\ncoding_matrix <- get_coding_ssm_status(\n these_samples_metadata = metadata,\n maf_data = maf,\n gene_symbols = genes,\n include_hotspots = TRUE,\n review_hotspots = TRUE\n)\n\nNext, supplement this with the matrix of non-coding mutation across aSHM regions\n\n# Use aSHM regions from GAMBLR.data\nregions_bed <- somatic_hypermutation_locations_GRCh37_v0.2\n\n# Add convenient name column\nregions_bed <- regions_bed %>%\n mutate(\n name = paste(gene, region, sep = \"-\")\n )\n\n# Generate matrix of mutations per each site\nashm_matrix <- get_ashm_count_matrix(\n regions_bed = regions_bed,\n maf_data = maf,\n these_samples_metadata = metadata\n)\n\n# Binarize matrix using arbitrary 3 muts/region cutoff\nashm_matrix[ashm_matrix <= 3] = 0\nashm_matrix[ashm_matrix > 3] = 1\nashm_matrix <- ashm_matrix %>%\n rownames_to_column(\"sample_id\")\n\nWe can now combine both coding and non-coding features into single matrix:\n\nfeature_matrix <- left_join(\n coding_matrix,\n ashm_matrix\n)\n\n# Drop any fearures absent across at least 10 samples to clean any noise\nfeature_matrix <- feature_matrix %>%\n select_if(is.numeric) %>%\n select(where(~ sum(. > 0, na.rm = TRUE) >= 10)) %>%\n bind_cols(\n feature_matrix %>% select(sample_id),\n .\n )\n\nNow we can provide the binary matrix to the prettyForestPlot and regenerate the Supplemmental Figure 12C from the Thomas et al study!\n\ncomparison_column <- \"genetic_subgroup\"\ncomparison_values <- c(\"DGG-BL\", \"Q53-BL\")\nfp <- prettyForestPlot(\n metadata = metadata,\n mutmat = feature_matrix,\n genes = genes,\n comparison_column = comparison_column,\n comparison_values = comparison_values,\n max_q = max_q\n)\n\nfp$arranged" }, { "objectID": "install.html", @@ -270,7 +298,7 @@ "href": "tutorials/oncoplot.html#tallying-mutation-burden", "title": "Tutorial: The prettiest oncoplot", "section": "Tallying mutation burden", - "text": "Tallying mutation burden\nPreviously, we noted that the maf data we were supplying to the prettyOncoplot was not subset to contain only coding mutations, and also discouraged from pre-filtering maf to a subset of genes if we are insterested only looking at some of them. Here is why this is important: if we want to layer on additional information like total mutation burden per sample, any subsetting or filtering of the maf would generate inaccurate and misleading results. Therefore, prettyOncoplot handles all of this for you! So if we were to go ahead with tallying the total mutation burden, we could just add some additional parameters to the function call:\n\nhideTopBarplot <- FALSE # will display TMB annotations at the top\ntally_all_mutations <- TRUE # will tally all mutations per sample\n\nprettyOncoplot(\n these_samples_metadata = metadata,\n maf_df = maf,\n metadataColumns = metadataColumns,\n metadataBarHeight = metadataBarHeight,\n metadataBarFontsize = metadataBarFontsize,\n fontSizeGene = fontSizeGene,\n legendFontSize = legendFontSize,\n sortByColumns = metadataColumns,\n genes = genes,\n splitGeneGroups = gene_groups,\n splitColumnName = \"pathology\",\n groupNames = c(\"Follicular lymphoma\", \"DLBCL\", \"COMFL\"),\n hideTopBarplot = hideTopBarplot,\n tally_all_mutations = tally_all_mutations\n)\n\n\n\n\n\n\n\n\n\n\nDid you know?\n\n\n\nIf the dynamic range of total mutation burden is too big and there are some extreme outliers, the bar chart at the top of the oncoplot can be capped of at any numeric value by providing tally_all_mutations_max parameter.\n\n\nWhat if we want to additionally force the ordering based on the total number of mutations, so they are nicely arranged in the decreasing order? We can do so by adding the mutation counts as one of the annotation tracks and using it to sort the samples:\n\n# Count all muts to define the order of samples\ntotal_mut_burden <- maf %>%\n count(Tumor_Sample_Barcode)\n\nhead(total_mut_burden)\n\n Tumor_Sample_Barcode n\n1: 01-20260T 71\n2: 02-13135T 98\n3: 02-20170T 67\n4: 02-22991T 53\n5: 03-34157T 26\n6: 04-24937T 146\n\n# Add this info to metadata\nmetadata <- left_join(\n metadata,\n total_mut_burden\n) \n\nprettyOncoplot(\n these_samples_metadata = metadata,\n maf_df = maf,\n metadataColumns = metadataColumns,\n metadataBarHeight = metadataBarHeight,\n metadataBarFontsize = metadataBarFontsize,\n fontSizeGene = fontSizeGene,\n legendFontSize = legendFontSize,\n sortByColumns = c(\"n\", metadataColumns),\n genes = genes,\n splitGeneGroups = gene_groups,\n splitColumnName = \"pathology\",\n groupNames = c(\"Follicular lymphoma\", \"DLBCL\", \"COMFL\"),\n hideTopBarplot = hideTopBarplot,\n tally_all_mutations = tally_all_mutations,\n numericMetadataColumns = \"n\",\n arrange_descending = TRUE\n)\n\n\n\n\n\n\n\n\n\n\nNote\n\n\n\nWe have modified here the sortByColumns parameter, and provided two additional parameters numericMetadataColumns and arrange_descending.\n\n\n\n\n\n\n\n\nDid you know?\n\n\n\nThe top annotation and n annotation at the bottom are the same thing? Remove n from the legend by adding hide_annotations = \"n\" and remove display of annotation track while keeping the ordering by adding hide_annotations_tracks = TRUE." + "text": "Tallying mutation burden\nPreviously, we noted that the maf data we were supplying to the prettyOncoplot was not subset to contain only coding mutations, and also discouraged from pre-filtering maf to a subset of genes if we are insterested only looking at some of them. Here is why this is important: if we want to layer on additional information like total mutation burden per sample, any subsetting or filtering of the maf would generate inaccurate and misleading results. Therefore, prettyOncoplot handles all of this for you! So if we were to go ahead with tallying the total mutation burden, we could just add some additional parameters to the function call:\n\nhideTopBarplot <- FALSE # will display TMB annotations at the top\ntally_all_mutations <- TRUE # will tally all mutations per sample\n\nprettyOncoplot(\n these_samples_metadata = metadata,\n maf_df = maf,\n metadataColumns = metadataColumns,\n metadataBarHeight = metadataBarHeight,\n metadataBarFontsize = metadataBarFontsize,\n fontSizeGene = fontSizeGene,\n legendFontSize = legendFontSize,\n sortByColumns = metadataColumns,\n genes = genes,\n splitGeneGroups = gene_groups,\n splitColumnName = \"pathology\",\n groupNames = c(\"Follicular lymphoma\", \"DLBCL\", \"COMFL\"),\n hideTopBarplot = hideTopBarplot,\n tally_all_mutations = tally_all_mutations\n)\n\n\n\n\n\n\n\n\n\n\nDid you know?\n\n\n\nIf the dynamic range of total mutation burden is too big and there are some extreme outliers, the bar chart at the top of the oncoplot can be capped of at any numeric value by providing tally_all_mutations_max parameter.\n\n\nWhat if we want to additionally force the ordering based on the total number of mutations, so they are nicely arranged in the decreasing order? We can do so by adding the mutation counts as one of the annotation tracks and using it to sort the samples:\n\n# Count all muts to define the order of samples\ntotal_mut_burden <- maf %>%\n count(Tumor_Sample_Barcode)\n\nhead(total_mut_burden)\n\n Tumor_Sample_Barcode n\n1: 01-20260T 71\n2: 02-13135T 98\n3: 02-20170T 67\n4: 02-22991T 53\n5: 03-34157T 26\n6: 04-24937T 146\n\n# Add this info to metadata\nmetadata <- left_join(\n metadata,\n total_mut_burden\n)\n\nprettyOncoplot(\n these_samples_metadata = metadata,\n maf_df = maf,\n metadataColumns = metadataColumns,\n metadataBarHeight = metadataBarHeight,\n metadataBarFontsize = metadataBarFontsize,\n fontSizeGene = fontSizeGene,\n legendFontSize = legendFontSize,\n sortByColumns = c(\"n\", metadataColumns),\n genes = genes,\n splitGeneGroups = gene_groups,\n splitColumnName = \"pathology\",\n groupNames = c(\"Follicular lymphoma\", \"DLBCL\", \"COMFL\"),\n hideTopBarplot = hideTopBarplot,\n tally_all_mutations = tally_all_mutations,\n numericMetadataColumns = \"n\",\n arrange_descending = TRUE\n)\n\n\n\n\n\n\n\n\n\n\nNote\n\n\n\nWe have modified here the sortByColumns parameter, and provided two additional parameters numericMetadataColumns and arrange_descending.\n\n\n\n\n\n\n\n\nDid you know?\n\n\n\nThe top annotation and n annotation at the bottom are the same thing? Remove n from the legend by adding hide_annotations = \"n\" and remove display of annotation track while keeping the ordering by adding hide_annotations_tracks = TRUE." }, { "objectID": "tutorials/oncoplot.html#annotating-significance-of-mutation-frequencies-in-sample-groups", diff --git a/docs/tutorials/forestplot.html b/docs/tutorials/forestplot.html index 2c6249a..c1ce5ce 100644 --- a/docs/tutorials/forestplot.html +++ b/docs/tutorials/forestplot.html @@ -273,6 +273,10 @@
# Load packages
library(GAMBLR.data)
-library(GAMBLR.viz)
-library(dplyr)
Next, we will get some data to display. The metadata is expected to be a data frame with one required column: sample_id
and another column that will contain sample annotations according to the comparison group. In this example, we will use as example the data set and variant calls from the study that identified genetic subgroup of Burkitt lymphoma (BL).
The output of the function is a list containing the following objects: - fisher
: a data frame with detailed statistics of the Fisher’s test for each gene - mutmat
: a binary matrix used for the Fisher’s test - forest
: a ggplot2 object with the forest plot of the ORs from the Fisher’s test for each gene - bar
: a ggplot2 object wiht mutation frequencies for each Gene - arranged
: a display item where both the forest and bar plots are nicely arranged side-by-side
names(fp)
[1] "fisher" "forest" "bar" "arranged" "mutmat"
+By default, all of the genes of interest are reported in the output. After the Fisher’s test is performed, the prettyForestPlot
also calculates FDR and we can use it to only report significant differences by providing a significance cutoff with the parameter max_q
:
<- 0.1 # only those qith Q value <= 0.1 will be reported
+ max_q <- prettyForestPlot(
+ fp metadata = metadata,
+ maf = maf,
+ genes = genes,
+ comparison_column = comparison_column,
+ max_q = max_q
+ )
We now can take a look at what genes are passing the significance cutoff:
+$arranged fp
As the prettyForestPlot
construcst the 2x2 contingency tables to run Fisher’s test to find significant differences, it can only operate on comparing 2 groups between themselves - but what if you have more than that and want to see the difference between some of them? To handle this scenario, we can take advantage of the comparison_values
parameter, which will be used to subset the metadata to only requested groups and only perform testing and plotting on this subset. Let’s see it in action:
<- "genetic_subgroup" # change the comparison column
+ comparison_column <- c("IC-BL", "Q53-BL")
+ comparison_values <- prettyForestPlot(
+ fp metadata = metadata,
+ maf = maf,
+ genes = genes,
+ comparison_column = comparison_column,
+ comparison_values = comparison_values,
+ max_q = max_q
+
+ )
+$arranged fp
This plot is exactly reproducing the Supplemmental Figure 12D from the Thomas et al study!
+We can additionally separate hotspots from the other mutations and compare those separately. First, we need to annotate the maf data, for which we will use the annotate_hotspots
from GAMBLR family. This function will add a new column to the maf named hot_spot
indicating whether or not the specific mutation is in the hotspot region.
# Annotate hotspots
+<- annotate_hotspots(maf)
+ maf
+# What are the hotspots?
+%>%
+ maf filter(hot_spot) %>%
+ select(Hugo_Symbol, hot_spot) %>%
+ table()
< table of extent 0 x 0 >
+The GAMBLR.data version of the annotate_hotspots
only handles very specific genes and does not have functionality to annotate all hotspots.
Oh no! Looks like there is no hotspots in this maf data. This does not make sense, so what happened? Aha, the hotspot annotation in GAMBLR.data works only on the data in grch37 projection. But our maf is in hg38, so what should we do? One way is to lift the maf data to another projection using the UCSC’s liftOver, and GAMBLR family has exactly the function that serves this purpose:
+<- liftover(
+ maf_grch37
+ maf,mode = "maf",
+ target_build = "grch37"
+ %>%
+ ) mutate(Chromosome = gsub("chr", "", Chromosome)) %>%
+select(-hot_spot) # since it is empty we can just drop it
Can we annotate the hotspots now?
+<- annotate_hotspots(maf_grch37)
+ maf_grch37
+# What are the hotspots?
+%>%
+ maf_grch37 filter(hot_spot) %>%
+ select(Hugo_Symbol, hot_spot) %>%
+ table()
hot_spot
+Hugo_Symbol TRUE
+ CREBBP 1
+ EZH2 1
+ FOXO1 60
+ MYD88 2
+ STAT6 4
+Indeed, the hotspots are properly annotated once we have maf in correct projection. Now, we can simply toggle the separate_hotspots
parameter to perform separate comparisons within hotspots:
<- "EBV_status_inf"
+ comparison_column <- prettyForestPlot(
+ fp metadata = metadata,
+ maf = maf_grch37,
+ genes = genes,
+ comparison_column = comparison_column,
+ max_q = max_q,
+ separate_hotspots = TRUE
+
+ )
+$arranged fp
Sometimes it might be useful to have different input format instead of maf - for example, binary matrix of features. Can we use the prettyForestPlot
in this case? Yes, sure we can!
First, let’s construct the binary matrix. We will supplement our maf with the non-coding mutations to look at the aSHM regions in addition to coding mutations, and this will already give us the data in correct projection:
+<- get_ssm_by_samples(
+ maf these_samples_metadata = metadata
+
+ )$Variant_Classification %>% table maf
.
+ 3'Flank 3'UTR 5'Flank
+ 1457 513 2957
+ 5'UTR Frame_Shift_Del Frame_Shift_Ins
+ 1102 124 97
+ IGR In_Frame_Del In_Frame_Ins
+ 457 40 14
+ Intron Missense_Mutation Nonsense_Mutation
+ 44397 1859 286
+ Nonstop_Mutation RNA Silent
+ 6 74 481
+ Splice_Region Splice_Site Translation_Start_Site
+ 148 114 22
+Now we convert this maf into binary matrix:
+# Generate binary matrix
+<- get_coding_ssm_status(
+ coding_matrix these_samples_metadata = metadata,
+ maf_data = maf,
+ gene_symbols = genes,
+ include_hotspots = TRUE,
+ review_hotspots = TRUE
+ )
Next, supplement this with the matrix of non-coding mutation across aSHM regions
+# Use aSHM regions from GAMBLR.data
+<- somatic_hypermutation_locations_GRCh37_v0.2
+ regions_bed
+# Add convenient name column
+<- regions_bed %>%
+ regions_bed mutate(
+ name = paste(gene, region, sep = "-")
+
+ )
+# Generate matrix of mutations per each site
+<- get_ashm_count_matrix(
+ ashm_matrix regions_bed = regions_bed,
+ maf_data = maf,
+ these_samples_metadata = metadata
+
+ )
+# Binarize matrix using arbitrary 3 muts/region cutoff
+<= 3] = 0
+ ashm_matrix[ashm_matrix > 3] = 1
+ ashm_matrix[ashm_matrix <- ashm_matrix %>%
+ ashm_matrix rownames_to_column("sample_id")
We can now combine both coding and non-coding features into single matrix:
+<- left_join(
+ feature_matrix
+ coding_matrix,
+ ashm_matrix
+ )
+# Drop any fearures absent across at least 10 samples to clean any noise
+<- feature_matrix %>%
+ feature_matrix select_if(is.numeric) %>%
+ select(where(~ sum(. > 0, na.rm = TRUE) >= 10)) %>%
+ bind_cols(
+ %>% select(sample_id),
+ feature_matrix
+ . )
Now we can provide the binary matrix to the prettyForestPlot
and regenerate the Supplemmental Figure 12C from the Thomas et al study!
<- "genetic_subgroup"
+ comparison_column <- c("DGG-BL", "Q53-BL")
+ comparison_values <- prettyForestPlot(
+ fp metadata = metadata,
+ mutmat = feature_matrix,
+ genes = genes,
+ comparison_column = comparison_column,
+ comparison_values = comparison_values,
+ max_q = max_q
+
+ )
+$arranged fp