diff --git a/DESCRIPTION b/DESCRIPTION index be6b45f..cbb8702 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -10,15 +10,20 @@ Encoding: UTF-8 Roxygen: list(markdown = TRUE) RoxygenNote: 7.3.2 Imports: + bayesplot, cli, cmdstanr, dplyr, ggplot2, + gridExtra, loo, magrittr, + patchwork, ppclust, rlang, + scales, stats, + stringr, TailRank, tibble, tidyr, diff --git a/NAMESPACE b/NAMESPACE index bebb216..8428f8d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -9,6 +9,10 @@ export(get_clonal_peaks) export(get_initialization) export(karyo_to_int) export(model_selection_h) +export(plot_elbo_h) +export(plot_model_selection_h) +export(plot_posterior_clocks_h) +export(plot_posterior_weights_h) export(plot_timing) export(plot_timing_h) export(prepare_input_data) diff --git a/R/model_selection_plot_h.R b/R/model_selection_plot_h.R new file mode 100755 index 0000000..d031093 --- /dev/null +++ b/R/model_selection_plot_h.R @@ -0,0 +1,45 @@ +#' Plot the behavior of the model selection scores vs the number of clusters +#' +#' @description Plot the BIC, Log Likelihood, ICL scores obtatined after the fit for each number of cluster. +#' +#' @param model_selection_tibble a tibble with 3 scores and k_max values, one for each inference +#' @param best_K the integer corresponding to the best number of components +#' +#' @return model_selection_plot +#' @export + +plot_model_selection_h <- function(model_selection_tibble, best_K) { + create_plot <- function(data, x, y, best_K, y_label, score_name) { + ggplot2::ggplot(data, ggplot2::aes(x = !!ggplot2::sym(x), y = !!ggplot2::sym(y))) + + ggplot2::geom_line(color = "steelblue", linewidth = 1) + + ggplot2::geom_point(color = "steelblue", size = 3) + + ggplot2::geom_point(data = data[data[[x]] == best_K, ], + ggplot2::aes(x = !!ggplot2::sym(x), y = !!ggplot2::sym(y)), color = "firebrick", size = 4) + + ggplot2::labs(y = y_label, x = "Number of Clusters (K)", title = score_name) + + ggplot2::theme_minimal(base_size = 14) + + ggplot2::theme( + legend.position = "none", + plot.title = ggplot2::element_text(face = "bold", size = 14, hjust = 0.5), + axis.title.x = ggplot2::element_text(margin = ggplot2::margin(t = 10)), + axis.title.y = ggplot2::element_text(margin = ggplot2::margin(r = 10)), + panel.grid.minor = ggplot2::element_blank() + ) + } + + bic_plot <- create_plot(model_selection_tibble, "K", "BIC", best_K, "BIC", "BIC vs K") + log_lik_plot <- create_plot(model_selection_tibble, "K", "Log_lik", best_K, "Log-Likelihood", "Log-Likelihood vs K") + icl_plot <- create_plot(model_selection_tibble, "K", "ICL", best_K, "ICL", "ICL vs K") + + model_selection_plot <- (bic_plot) / icl_plot / (log_lik_plot) + + patchwork::plot_annotation( + title = "Model Selection Graphs: Scores vs Number of Clusters", + caption = "Source: Your Data", + theme = ggplot2::theme_minimal(base_size = 12) + + ggplot2::theme( + plot.title = ggplot2::element_text(size = 16, hjust = 0.5), + plot.caption = ggplot2::element_text(size = 10, hjust = 0.5, margin = ggplot2::margin(t = 10)) + ) + ) + + return(model_selection_plot) +} diff --git a/R/plot_elbo_h.R b/R/plot_elbo_h.R new file mode 100755 index 0000000..8c1d88c --- /dev/null +++ b/R/plot_elbo_h.R @@ -0,0 +1,34 @@ +#' Plot elbo Function +#' +#' @description Plot the ELBO behaviour for each fit fixing the number of cluster. +#' @param elbo_iteration data.frame': #iterations until convergence of elbo obs. of 2 variables: iteration: int, elbo : num +#' one of the element list elbo_iteration obtained from fit_h "results" +#' +#' @return p +#' @export +plot_elbo_h <- function(elbo_iteration){ + + data <- elbo_iteration + + p <- ggplot2::ggplot(data, ggplot2::aes(x = .data$iteration, y = .data$elbo)) + + ggplot2::geom_line(color = "blue", size = 0.2) + + # geom_point(color = "black", size = 1.5) + + ggplot2::labs( + title = paste0("ELBO Convergence for K = ", nrow(data) - 1), + x = "Iteration", + y = "ELBO" + ) + + # theme_classic(base_size = 12) + + ggplot2::theme( + axis.title = ggplot2::element_text(size = 14), + axis.text = ggplot2::element_text(size = 12), + panel.grid = ggplot2::element_line(color = "gray100"), + ) + + ggplot2::scale_x_continuous( + breaks = seq(min(data$iteration), max(data$iteration), by = 1) # Ensure only integers appear + ) + + ggplot2::scale_y_continuous(labels = scales::comma) + + + return(p) +} diff --git a/R/plot_posterior_clocks_h.R b/R/plot_posterior_clocks_h.R new file mode 100755 index 0000000..bf22206 --- /dev/null +++ b/R/plot_posterior_clocks_h.R @@ -0,0 +1,30 @@ +#' Plot posterior clocks distributions obtained from the hierarchical model fit +#' +#' @param results list of 4: $data, $draws_and_summary, $log_lik_matrix_list and $elbo_iterations +#' @param K index of inference whose results want to be plotted +#' +#' @return areas_tau +#' @export +#' +plot_posterior_clocks_h <- function(results, K){ + draws <- results$draws_and_summary[[K]]$draws + + names <- paste("tau[", 1:K, "]", sep = "") + + areas_tau <- bayesplot::mcmc_areas( + draws, + pars = names, + prob = 0.8, # 80% intervals + prob_outer = 0.95, # 99% + point_est = "median" + )+ + ggplot2::labs( + title = "Approximate Posterior distributions", + subtitle = "With median and 80% and 95% intervals" + )+ + ggplot2::xlim(0, 1) + + return(areas_tau) +} + + diff --git a/R/plot_posterior_weights_h.R b/R/plot_posterior_weights_h.R new file mode 100755 index 0000000..dff4e34 --- /dev/null +++ b/R/plot_posterior_weights_h.R @@ -0,0 +1,29 @@ +#' Plot posterior weights distributions obtained from the hierarchical model fit +#' +#' @param results list of 4: $data, $draws_and_summary, $log_lik_matrix_list and $elbo_iterations +#' @param K index of inference whose results want to be plotted +#' +#' @return areas_tau +#' @export + +plot_posterior_weights_h <- function(results, K){ + + draws <- results$draws_and_summary[[K]]$draws + S = nrow(results$data$accepted_cna) + + intervals_weigths_per_tau <- list() + for (k in 1:K){ + names_weights <- paste("w[",1:S,",", k, "]", sep = "") + p <- bayesplot::mcmc_intervals(draws, pars = names_weights, point_est = "median", prob = 0.8, prob_outer = 0.95)+ + ggplot2::labs( + title = stringr::str_wrap( paste0("Posterior distributions of the weigths for tau ",k), width = 30 + K + sqrt(S)), + subtitle = "With median and 80% and 95% intervals" + ) + intervals_weigths_per_tau[[k]] <- ggplot2::ggplotGrob(p) + } + p <- gridExtra::grid.arrange(grobs = intervals_weigths_per_tau, ncol = K) #add global title + + return(p) +} + + diff --git a/man/plot_elbo_h.Rd b/man/plot_elbo_h.Rd new file mode 100644 index 0000000..e4bc0bb --- /dev/null +++ b/man/plot_elbo_h.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plot_elbo_h.R +\name{plot_elbo_h} +\alias{plot_elbo_h} +\title{Plot elbo Function} +\usage{ +plot_elbo_h(elbo_iteration) +} +\arguments{ +\item{elbo_iteration}{data.frame': #iterations until convergence of elbo obs. of 2 variables: iteration: int, elbo : num +one of the element list elbo_iteration obtained from fit_h "results"} +} +\value{ +p +} +\description{ +Plot the ELBO behaviour for each fit fixing the number of cluster. +} diff --git a/man/plot_model_selection_h.Rd b/man/plot_model_selection_h.Rd new file mode 100644 index 0000000..9e609ed --- /dev/null +++ b/man/plot_model_selection_h.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/model_selection_plot_h.R +\name{plot_model_selection_h} +\alias{plot_model_selection_h} +\title{Plot the behavior of the model selection scores vs the number of clusters} +\usage{ +plot_model_selection_h(model_selection_tibble, best_K) +} +\arguments{ +\item{model_selection_tibble}{a tibble with 3 scores and k_max values, one for each inference} + +\item{best_K}{the integer corresponding to the best number of components} +} +\value{ +model_selection_plot +} +\description{ +Plot the BIC, Log Likelihood, ICL scores obtatined after the fit for each number of cluster. +} diff --git a/man/plot_posterior_clocks_h.Rd b/man/plot_posterior_clocks_h.Rd new file mode 100644 index 0000000..972786d --- /dev/null +++ b/man/plot_posterior_clocks_h.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plot_posterior_clocks_h.R +\name{plot_posterior_clocks_h} +\alias{plot_posterior_clocks_h} +\title{Plot posterior clocks distributions obtained from the hierarchical model fit} +\usage{ +plot_posterior_clocks_h(results, K) +} +\arguments{ +\item{results}{list of 4: $data, $draws_and_summary, $log_lik_matrix_list and $elbo_iterations} + +\item{K}{index of inference whose results want to be plotted} +} +\value{ +areas_tau +} +\description{ +Plot posterior clocks distributions obtained from the hierarchical model fit +} diff --git a/man/plot_posterior_weights_h.Rd b/man/plot_posterior_weights_h.Rd new file mode 100644 index 0000000..7ecad91 --- /dev/null +++ b/man/plot_posterior_weights_h.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plot_posterior_weights_h.R +\name{plot_posterior_weights_h} +\alias{plot_posterior_weights_h} +\title{Plot posterior weights distributions obtained from the hierarchical model fit} +\usage{ +plot_posterior_weights_h(results, K) +} +\arguments{ +\item{results}{list of 4: $data, $draws_and_summary, $log_lik_matrix_list and $elbo_iterations} + +\item{K}{index of inference whose results want to be plotted} +} +\value{ +areas_tau +} +\description{ +Plot posterior weights distributions obtained from the hierarchical model fit +} diff --git a/vignettes/a4_Multiple_segment_timing_files/figure-html/unnamed-chunk-6-1.png b/vignettes/a4_Multiple_segment_timing_files/figure-html/unnamed-chunk-6-1.png new file mode 100644 index 0000000..de23c4f Binary files /dev/null and b/vignettes/a4_Multiple_segment_timing_files/figure-html/unnamed-chunk-6-1.png differ diff --git a/vignettes/tickTack_files/a4_Multiple_segment_timing.Rmd b/vignettes/tickTack_files/a4_Multiple_segment_timing.Rmd new file mode 100755 index 0000000..2fdb59f --- /dev/null +++ b/vignettes/tickTack_files/a4_Multiple_segment_timing.Rmd @@ -0,0 +1,183 @@ +--- +title: "4. Timing Clonal Peaks in a hierarchical fashioin" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Timing Clonal Peaks in a hierarchical fashioin} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +The `_fit_h` function in the `tickTack` package estimates the timing of `K` clonal peaks in cancer genome sequencing data according to which a clustering can be performed. This vignette describes the functionality of the `fit_h` function, including input requirements, output, and an example analysis using the `tickTack::pcawg_example_2` dataset. + +## Overview of the `fit_h` Function + +The `fit_h` function uses a hierarchical model to fit clonal peaks in sequencing data considering the grouping tructure of the chromosomes segments. It identifies segments of the genome with specific karyotypes and mutations that meet the input criteria, then estimates the timing of the groups of events and assign each segment to a clock. + +### Key Parameters + +- **`x`**: a CNAqc object with mutations, cna and metadata +- **`max_attempts`**: Number of times the variational inference is repeated to avoid local minima. +- **`INIT`**: Logical flag to pass some initialization values tothe variational inference, default is `TRURE`. +- **`tolerance`**: tolerance between two value of subsequent iterations of gradient ascent pn elbo, default is `0.01`. +- **`possible_k`**: A character vector of possible karyotypes, defaulting to `c("2:1", "2:2", "2:0")`. +- **`alpha`**: Significance level, defaulting to `0.05`. +- **`min_mutations_number`**: Minimum number of mutations required for analysis, defaulting to `2`. +- **`n_components`**: If `0`, then the strategy to choose the #components follows the default procedure, otherwise the inference is repeated for K equal up to a maximun of n_components. + + +### Output + +The function returns a list containing: +1. **`data`**: The data used to perform the inference after selecting the ones that respect the assumptions to be used in the model. +2. **`draws_and_summary`**: List of 3 for each K the inference is performed with. draws are available both for the clocks and for the weights Summary statistics for the estimated timing of clonal peaks. +3. **`log_lik_matrix_list`**: Summary statistics for the estimated timing of clonal peaks. +4. **`elbo_iterations`**: Summary statistics for the estimated timing of clonal peaks. + +If no segments meet the criteria, the function returns `NULL`. + +## Analyzing `tickTack::pcawg_example_2` + +We will use the `tickTack::pcawg_example_2` dataset to demonstrate how to use the `fit_h` function. + +### Input Data + +The `tickTack::pcawg_example_2` dataset contains three components: + +- **`mutations`**: Mutation data. +- **`cna`**: Copy number alterations (CNA). +- **`metadata`**: Sample metadata, including tumor purity. + +Preview the data: + +```{r echo=TRUE} +library(tickTack) + +# View example dataset components +mutations <- tickTack::pcawg_example_2$mutations +cna <- tickTack::pcawg_example_2$cna +metadata <- tickTack::pcawg_example_2$metadata + +head(mutations) +head(cna) +metadata +``` + +### Running the `fit_h` function + +We can run the `fit_h` function on the `tickTack::pcawg_example_2` data to infer the timing of clonal peaks + +```{r echo=FALSE} +# Extract input data +data <- tickTack::pcawg_example_2 +tolerance = 0.1 + +# Run the fit function +data <- fit_h( + x = data, + max_attempts = 2, + INIT = TRUE, + tolerance = tolerance +) +``` + +### Results + +The `results` object that is returned together with the CNAqc input object contains four components: `data`, `draws_and_summary`, `log_lik_matrix_list` and `elbo_iterations`. + +```{r echo=TRUE} +# View summary for a specific K, here K = 2 +results <- data$results +``` + +### Interpreting the output + +We can inspect the main output of interest to understand the timing of clonal peaks. +`results$draws_and_summary` contains: +- **`draws`** the draws from the approximate posterior distribution of the taus and weights; +- **`summary`** a summary with the main statistics of the approximate posterior distributions; +- **`summarized_results`** represents the clock assignment, a tibble with the estimate of taus for each segment with a copy number event that has been included in the hierarchical inference + +```{r echo=TRUE} +# View summary for a specific K, here K = 2 +results$draws_and_summary[[2]]$summary + +# View detailed summarized results for a specific K, here K = 2 +results$draws_and_summary[[2]]$summarized_results +``` + + + +### Obtain the best K with model_selection_h +W e can run the `model_selection_h` function to obtain the scores for each inference performed with a different K and take the one with best ICL score if the BIC score prefer 2 components instead of 1, otherwise choose 1 as best K. The function takes as input the `results` and `n_components` and outputs the `best_K` and the corresponding `best_fit` together with the `model_selection_tibble` and the `entropy_list` used to evaluate the ICL score. + +```{r} +results_model_selection <- tickTack::model_selection_h(results, n_components = 0) + +best_K <- results_model_selection$best_K +model_selection_tibble <- results_model_selection$model_selection_tibble +entropy <- results_model_selection$entropy_list + +``` + + +## Visulizing the output + +The results can be viewed is genome-wise perspective using the `tickTack::plot_timing_h` function. + +```{r} +tickTack::plot_timing_h(results, best_K) +``` + + + +## Visualize distributions of draws from the approximate posterior +The approximate posterior distributions can be viewed using the `tickTack::plot_posterior_clocks_h` and `tickTack::plot_posterior_weights_h` functions, that internally use functions from Bayesplot. + + +```{r} +posterior_clocks <- tickTack::plot_posterior_clocks_h(results, best_K) +posterior_weights <- tickTack::plot_posterior_weights_h(results, best_K) + +``` + + +## Visualize the behavior of the ELBO during the inference + + +```{r} + +K = nrow(results_model_selection$model_selection_tibble) + +p_elbo <- list() +for (i in 1:K){ + p_elbo[[i]] <- tickTack::plot_elbo_h(elbo_iterations, i) + ggtitle(paste0("K = ", i)) +} +p_elbo <- gridExtra::grid.arrange(grobs = p_elbo, nrow=K) #add global title +p_elbo + +``` + + +## Visualize model selection scores + +```{r} + +p_model_selection <- plot_model_selection_h(model_selection_tibble, best_K) +p_model_selection + +cat("Best K =",best_K) +``` + + +## Visualize all the inference results for each K + +```{r} + +plot_model_selection_inference <- list() +for (i in 1:K){ + plot_model_selection_inference[[i]] <- tickTack::plot_timing_h(results, i) + ggtitle(paste0("K = ", i)) +} +plot_model_selection_inference <- gridExtra::grid.arrange(grobs = plot_model_selection_inference, nrow = K) #add global title +plot_model_selection_inference + +``` \ No newline at end of file