diff --git a/R/model_selection_h.R b/R/model_selection_h.R index 0f66d30..a6a6ff3 100755 --- a/R/model_selection_h.R +++ b/R/model_selection_h.R @@ -1,161 +1,165 @@ -#' model_selection Function -#' -#' Perform model selection among the models fit with varying number of mixture components (number of clocks). -#' @param data list of k_max of lists: $input_data:list: List of 7: $S: int, $N: int, $karyotype: num (0 or 1), $seg_assignment: num, $peaks:List of N of num (1:2), $NV: num, $DP: num -#' @param draws_and_summary list of lenght k_max of draws form the variational method with the summary statistics of the draws from the approximate posterior -#' @param log_lik_matrix_list list of lenght k_max -#' @param elbo_iterations list of lenght k_max -#' @param n_components number of components specified from user -#' -#' @keywords fit -#' -#' @return result_model_selection: list $best_fit, $best_K, $model_selection_tibble, $entropy_list) -#' -#' @export -model_selection_h = function(data, draws_and_summary, log_lik_matrix_list, elbo_iterations, n_components = 0) { - karyo <- data$karyotype - - if (n_components != 0){ - k_max = n_components - } else if (length(karyo) <= 2){ - k_max = (length(karyo)) - } else if (length(karyo) <= 7){ - k_max = (length(karyo)-1) - } else if (length(karyo) <= 15) { - k_max = ((floor(length(karyo)/2))-1) - } else{ - k_max = ceiling(sqrt(length(karyo))) - } - - model_selection_tibble <- dplyr::tibble() - S <- length(length(karyo)) - - entropy_per_segment_matrix = matrix(0, k_max, S) # k_max rows and S columns - entropy_per_segment_matrix_norm = matrix(0, k_max, S) - - for (K in 1:k_max) { - - # select it from results - # elbo_df <- elbo_iterations[[K]] - - ############################################ - # SELECT THE BEST MODEL - draws = 1000 - - total_number_params <- K+((K-1)*S)+2 # tau = K, w = K*S, phi, kappa (dirichlet reparametrization) - N <- length(data$NV) - - log_lik_matrix <- log_lik_matrix_list[[K]] - log_lik_total_per_sample <- rowSums(log_lik_matrix) - L <- stats::median(log_lik_total_per_sample) - - BIC <- ((total_number_params * log(N)) - 2 * L) # %>% unname() - AIC <- 2 * total_number_params - 2 * L - - loo_result <- loo::loo(log_lik_matrix) - loo_value <- loo_result$estimates[3, "Estimate"] # LOO-CV estimate #getter? - - # Calculate ICL --> move outside the main function - # Generate names for w[sim_params_num, K] format - - - # extract w from results instead that from stanfit - names_weights <- outer(1:S, 1:K, - FUN = function(i, j) paste0("w[", i, ",", j, "]")) - names_weights <- as.vector(names_weights) - tau_and_w_draws <- draws_and_summary[[K]]$draws - w_ICL <- tau_and_w_draws[, colnames(tau_and_w_draws) %in% names_weights] - - dim(w_ICL) = c(draws,S*K) - w_ICL <- apply(w_ICL, 2, stats::median) # median check over draws - dim(w_ICL) = c(S,K) # check by row - w_ICL = t(w_ICL) - - num_mutations_all <- c() - for (i in seq_along(unique(data$seg_assignment))) { - segment <- unique(data$seg_assignment)[i] - num_mutations_single <- length( data$seg_assignment[(data$seg_assignment == segment)]) - num_mutations_all <- c(num_mutations_all, num_mutations_single) - } - mut_per_seg = num_mutations_all - - res_entropy = 0 - post = w_ICL - for (k in 1:K ){ - post_k = post[k] - log_post_k = log(post_k + 0.000001) - post_k_entr = post_k * log_post_k * mut_per_seg - post_k_entr = sum(post_k_entr) - post_k_entr = -1 * (post_k_entr) - res_entropy = res_entropy + post_k_entr - } - entropy = res_entropy - ICL = BIC + entropy - - - model_selection_tibble <- dplyr::bind_rows(model_selection_tibble, dplyr::tibble(K = K, BIC = BIC, Log_lik = L, ICL = ICL)) # AIC = AIC, LOO = loo_value, - - # ICL PER SEGMENT to see its behaviour with increasing K - post_Segments = t(w_ICL) - entropy_per_segment = c() - entropy_per_segment_norm = c() - for (s in 1:S ){ - post_s = post_Segments[s] - log_post_s = log(post_s + 0.000001) - post_s_entr = post_s * log_post_s * mut_per_seg - post_s_entr = sum(post_s_entr) - post_s_entr = -1 * (post_s_entr) - entropy_per_segment = c(entropy_per_segment, post_s_entr) - - post_s_entr_norm = post_s * log_post_s - post_s_entr_norm = sum(post_s_entr_norm) - post_s_entr_norm = -1 * (post_s_entr_norm) - entropy_per_segment_norm = c(entropy_per_segment_norm, post_s_entr_norm) - - } - - print("entropy per segment: ") - print(entropy_per_segment) - - print("entropy per segment normalized: ") - print(entropy_per_segment_norm) - - entropy_per_segment_matrix_norm[K,] = entropy_per_segment_norm - entropy_per_segment_matrix[K,] = entropy_per_segment - - } - - entropy_list <- list(entropy_per_segment_matrix = entropy_per_segment_matrix, entropy_per_segment_matrix_norm = entropy_per_segment_matrix_norm) - - model_selection_tibble_temp <- model_selection_tibble[1:2, bycol= TRUE] - best_K_temp <- model_selection_tibble_temp %>% dplyr::filter(BIC == min(BIC)) %>% dplyr::pull(K) - - if (best_K_temp!=1){ - if (k_max==2){ - best_K <- 2 - cli::cli_alert_info("The algorithm should be run with more Components ") - }else{ - - while(mean(entropy_per_segment_matrix_norm[best_K_temp+1,]) - mean(entropy_per_segment_matrix_norm[best_K_temp,]) < 0 & best_K_temp < k_max ){ - best_K_temp = best_K_temp + 1 - if ( best_K_temp == k_max ){ - break - }}} - } else { - best_K <- 1 - } - best_K <- best_K_temp - # model_selection_tibble_temp <- model_selection_tibble[2:k_max, bycol= TRUE] - # best_K <- model_selection_tibble_temp %>% dplyr::filter(ICL == min(ICL)) %>% dplyr::pull(K) - # } - # }else { - # best_K <- 1} - - if(best_K==k_max){ - cli::cli_alert_info("The algorithm should be run with more Components ") - } - - result_model_selection = list(best_fit = draws_and_summary[[best_K]], best_K = best_K, model_selection_tibble = model_selection_tibble, entropy_list = entropy_list) - return (result_model_selection) -} - +#' model_selection Function +#' +#' Perform model selection among the models fit with varying number of mixture components (number of clocks). +#' @param results list of 4: $data, $draws_and_summary, $log_lik_matrix_list and $elbo_iterations +#' @param n_components number of components specified from user +#' +#' @keywords fit +#' +#' @return result_model_selection: list $best_fit, $best_K, $model_selection_tibble, $entropy_list) +#' +#' @export +model_selection_h = function(results, n_components = 0) { + + data <- results$data$input_data + draws_and_summary <- results$draws_and_summary + log_lik_matrix_list <- results$log_lik_matrix_list + elbo_iterations <- results$elbo_iterations + + + karyo <- data$karyotype + + if (n_components != 0){ + k_max = n_components + } else if (length(karyo) <= 2){ + k_max = (length(karyo)) + } else if (length(karyo) <= 7){ + k_max = (length(karyo)-1) + } else if (length(karyo) <= 15) { + k_max = ((floor(length(karyo)/2))-1) + } else{ + k_max = ceiling(sqrt(length(karyo))) + } + + model_selection_tibble <- dplyr::tibble() + S <- length(length(karyo)) + + entropy_per_segment_matrix = matrix(0, k_max, S) # k_max rows and S columns + entropy_per_segment_matrix_norm = matrix(0, k_max, S) + + for (K in 1:k_max) { + + # select it from results + # elbo_df <- elbo_iterations[[K]] + + ############################################ + # SELECT THE BEST MODEL + draws = 1000 + + total_number_params <- K+((K-1)*S)+2 # tau = K, w = K*S, phi, kappa (dirichlet reparametrization) + N <- length(data$NV) + + log_lik_matrix <- log_lik_matrix_list[[K]] + log_lik_total_per_sample <- rowSums(log_lik_matrix) + L <- stats::median(log_lik_total_per_sample) + + BIC <- ((total_number_params * log(N)) - 2 * L) # %>% unname() + AIC <- 2 * total_number_params - 2 * L + + loo_result <- loo::loo(log_lik_matrix) + loo_value <- loo_result$estimates[3, "Estimate"] # LOO-CV estimate #getter? + + # Calculate ICL --> move outside the main function + # Generate names for w[sim_params_num, K] format + + + # extract w from results instead that from stanfit + names_weights <- outer(1:S, 1:K, + FUN = function(i, j) paste0("w[", i, ",", j, "]")) + names_weights <- as.vector(names_weights) + tau_and_w_draws <- draws_and_summary[[K]]$draws + w_ICL <- tau_and_w_draws[, colnames(tau_and_w_draws) %in% names_weights] + + dim(w_ICL) = c(draws,S*K) + w_ICL <- apply(w_ICL, 2, stats::median) # median check over draws + dim(w_ICL) = c(S,K) # check by row + w_ICL = t(w_ICL) + + num_mutations_all <- c() + for (i in seq_along(unique(data$seg_assignment))) { + segment <- unique(data$seg_assignment)[i] + num_mutations_single <- length( data$seg_assignment[(data$seg_assignment == segment)]) + num_mutations_all <- c(num_mutations_all, num_mutations_single) + } + mut_per_seg = num_mutations_all + + res_entropy = 0 + post = w_ICL + for (k in 1:K ){ + post_k = post[k] + log_post_k = log(post_k + 0.000001) + post_k_entr = post_k * log_post_k * mut_per_seg + post_k_entr = sum(post_k_entr) + post_k_entr = -1 * (post_k_entr) + res_entropy = res_entropy + post_k_entr + } + entropy = res_entropy + ICL = BIC + entropy + + + model_selection_tibble <- dplyr::bind_rows(model_selection_tibble, dplyr::tibble(K = K, BIC = BIC, Log_lik = L, ICL = ICL)) # AIC = AIC, LOO = loo_value, + + # ICL PER SEGMENT to see its behaviour with increasing K + post_Segments = t(w_ICL) + entropy_per_segment = c() + entropy_per_segment_norm = c() + for (s in 1:S ){ + post_s = post_Segments[s] + log_post_s = log(post_s + 0.000001) + post_s_entr = post_s * log_post_s * mut_per_seg + post_s_entr = sum(post_s_entr) + post_s_entr = -1 * (post_s_entr) + entropy_per_segment = c(entropy_per_segment, post_s_entr) + + post_s_entr_norm = post_s * log_post_s + post_s_entr_norm = sum(post_s_entr_norm) + post_s_entr_norm = -1 * (post_s_entr_norm) + entropy_per_segment_norm = c(entropy_per_segment_norm, post_s_entr_norm) + + } + + print("entropy per segment: ") + print(entropy_per_segment) + + print("entropy per segment normalized: ") + print(entropy_per_segment_norm) + + entropy_per_segment_matrix_norm[K,] = entropy_per_segment_norm + entropy_per_segment_matrix[K,] = entropy_per_segment + + } + + entropy_list <- list(entropy_per_segment_matrix = entropy_per_segment_matrix, entropy_per_segment_matrix_norm = entropy_per_segment_matrix_norm) + + model_selection_tibble_temp <- model_selection_tibble[1:2, bycol= TRUE] + best_K_temp <- model_selection_tibble_temp %>% dplyr::filter(BIC == min(BIC)) %>% dplyr::pull(K) + + if (best_K_temp!=1){ + if (k_max==2){ + best_K <- 2 + cli::cli_alert_info("The algorithm should be run with more Components ") + }else{ + + while(mean(entropy_per_segment_matrix_norm[best_K_temp+1,]) - mean(entropy_per_segment_matrix_norm[best_K_temp,]) < 0 & best_K_temp < k_max ){ + best_K_temp = best_K_temp + 1 + if ( best_K_temp == k_max ){ + break + }}} + } else { + best_K <- 1 + } + best_K <- best_K_temp + # model_selection_tibble_temp <- model_selection_tibble[2:k_max, bycol= TRUE] + # best_K <- model_selection_tibble_temp %>% dplyr::filter(ICL == min(ICL)) %>% dplyr::pull(K) + # } + # }else { + # best_K <- 1} + + if(best_K==k_max){ + cli::cli_alert_info("The algorithm should be run with more Components ") + } + + result_model_selection = list(best_fit = draws_and_summary[[best_K]], best_K = best_K, model_selection_tibble = model_selection_tibble, entropy_list = entropy_list) + return (result_model_selection) +} + diff --git a/R/plot_inference_h.R b/R/plot_timing_h.R similarity index 94% rename from R/plot_inference_h.R rename to R/plot_timing_h.R index 4df570e..d476c6c 100644 --- a/R/plot_inference_h.R +++ b/R/plot_timing_h.R @@ -1,66 +1,66 @@ -#' plot_inference_h Function -#' -#' This function obtains the list of input data to be used in the stan model. -#' @param results list(data = input_data_list, results = results, output_files_list = output_files_list) -#' @param colour_by chr: default = "karyotype" -#' @param K mun: number of clocks -#' @param split_contiguous_segments option to plot segments' setalarion lines -#' -#' @return p : plot of inference results with credibility intervals in the chromosome absolute positions -#' -#' @keywords plot -#' @export - -plot_inference_h = function(results, K, colour_by = "karyotype", split_contiguous_segments = TRUE) { - - #segments <- x_segments[ x_segments$chr %in% results$data$accepted_cna$chr, ] - - segments <- results$data$accepted_cna - segments$from = lapply(segments$segment_name, function(s) {unlist(strsplit(s, "_"))[2]}) %>% unlist() %>% as.numeric() - segments$to = lapply(segments$segment_name, function(s) {unlist(strsplit(s, "_"))[3]}) %>% unlist() %>% as.numeric() - - reference_genome <- tickTack::chr_coordinates_GRCh38 - - vfrom = reference_genome$from - names(vfrom) = reference_genome$chr - - absoulte_segments <- segments %>% - dplyr::mutate(from = .data$from + vfrom[.data$chr], - to = .data$to + vfrom[.data$chr]) - - summarized_results <- results$draws_and_summary[[K]]$summarized_results %>% - dplyr::mutate(from = absoulte_segments[.data$segment_id,]$from) %>% - dplyr::mutate(to = absoulte_segments[.data$segment_id,]$to) %>% - dplyr::mutate(tau_mean = ifelse(.data$clock_mean < 1, .data$clock_mean , 1)) %>% - dplyr::mutate(tau_high = ifelse(.data$clock_high < 1, .data$clock_high, 1)) %>% - dplyr::mutate(tau_low = .data$clock_low) - - p <- ggplot2::ggplot() + - ggplot2::geom_rect(data = summarized_results, ggplot2::aes(xmin=.data$from, xmax=.data$to, ymin=.data$tau_low, ymax=.data$tau_high, fill = as.factor(.data[[colour_by]])), alpha = .5) + - ggplot2::geom_segment(data = summarized_results, ggplot2::aes(y = .data$tau_mean, yend = .data$tau_mean, x = .data$from, xend = .data$to)) + - ggplot2::scale_x_continuous(breaks = reference_genome$to, labels = gsub("chr", "", reference_genome$chr)) + - ggplot2::theme_bw() + - ggplot2::theme( - legend.position = "bottom", - axis.text.x = ggplot2::element_text(angle = 90)) + - ggplot2::lims(y = c(0,1)) + - ggplot2::labs(x = "Chromosome", y = bquote("Pseudotime"~tau), fill=colour_by) - # scale_fill_manual(values = c("forestgreen", "indianred3", "steelblue"), name = "") - - if (split_contiguous_segments) { - p <- p + - ggplot2::geom_segment(data = summarized_results, ggplot2::aes(y = .data$tau_low, yend = .data$tau_high, x = .data$from, xend = .data$from), linetype = "dashed", color = "darkslategray") + - ggplot2::geom_segment(data = summarized_results, ggplot2::aes(y = .data$tau_low, yend = .data$tau_high, x = .data$to, xend = .data$to), linetype = "dashed", color = "darkslategray") - } - - return(p) -} - - - - - - - - - +#' plot_inference_h Function +#' +#' This function obtains the list of input data to be used in the stan model. +#' @param results list(data = input_data_list, results = results, output_files_list = output_files_list) +#' @param colour_by chr: default = "karyotype" +#' @param K mun: number of clocks +#' @param split_contiguous_segments option to plot segments' setalarion lines +#' +#' @return p : plot of inference results with credibility intervals in the chromosome absolute positions +#' +#' @keywords plot +#' @export + +plot_timing_h = function(results, K, colour_by = "karyotype", split_contiguous_segments = TRUE) { + + #segments <- x_segments[ x_segments$chr %in% results$data$accepted_cna$chr, ] + + segments <- results$data$accepted_cna + segments$from = lapply(segments$segment_name, function(s) {unlist(strsplit(s, "_"))[2]}) %>% unlist() %>% as.numeric() + segments$to = lapply(segments$segment_name, function(s) {unlist(strsplit(s, "_"))[3]}) %>% unlist() %>% as.numeric() + + reference_genome <- tickTack::chr_coordinates_GRCh38 + + vfrom = reference_genome$from + names(vfrom) = reference_genome$chr + + absoulte_segments <- segments %>% + dplyr::mutate(from = .data$from + vfrom[.data$chr], + to = .data$to + vfrom[.data$chr]) + + summarized_results <- results$draws_and_summary[[K]]$summarized_results %>% + dplyr::mutate(from = absoulte_segments[.data$segment_id,]$from) %>% + dplyr::mutate(to = absoulte_segments[.data$segment_id,]$to) %>% + dplyr::mutate(tau_mean = ifelse(.data$clock_mean < 1, .data$clock_mean , 1)) %>% + dplyr::mutate(tau_high = ifelse(.data$clock_high < 1, .data$clock_high, 1)) %>% + dplyr::mutate(tau_low = .data$clock_low) + + p <- ggplot2::ggplot() + + ggplot2::geom_rect(data = summarized_results, ggplot2::aes(xmin=.data$from, xmax=.data$to, ymin=.data$tau_low, ymax=.data$tau_high, fill = as.factor(.data[[colour_by]])), alpha = .5) + + ggplot2::geom_segment(data = summarized_results, ggplot2::aes(y = .data$tau_mean, yend = .data$tau_mean, x = .data$from, xend = .data$to)) + + ggplot2::scale_x_continuous(breaks = reference_genome$to, labels = gsub("chr", "", reference_genome$chr)) + + ggplot2::theme_bw() + + ggplot2::theme( + legend.position = "bottom", + axis.text.x = ggplot2::element_text(angle = 90)) + + ggplot2::lims(y = c(0,1)) + + ggplot2::labs(x = "Chromosome", y = bquote("Pseudotime"~tau), fill=colour_by) + # scale_fill_manual(values = c("forestgreen", "indianred3", "steelblue"), name = "") + + if (split_contiguous_segments) { + p <- p + + ggplot2::geom_segment(data = summarized_results, ggplot2::aes(y = .data$tau_low, yend = .data$tau_high, x = .data$from, xend = .data$from), linetype = "dashed", color = "darkslategray") + + ggplot2::geom_segment(data = summarized_results, ggplot2::aes(y = .data$tau_low, yend = .data$tau_high, x = .data$to, xend = .data$to), linetype = "dashed", color = "darkslategray") + } + + return(p) +} + + + + + + + + + diff --git a/vignettes/tickTack.Rmd b/vignettes/tickTack.Rmd index b0b6343..4bb15e8 100644 --- a/vignettes/tickTack.Rmd +++ b/vignettes/tickTack.Rmd @@ -1,287 +1,287 @@ ---- -title: "tickTack" -output: rmarkdown::html_vignette -vignette: > - %\VignetteIndexEntry{tickTack} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r, include = FALSE} -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>" -) - -options(crayon.enabled=F) -``` - -## Overview - -`tickTack` requires as input a `CNAqc` object with attributes `cna`, `mutations` and `metadata`. -The main input for the tool are: - -- the read counts from somatic mutations such as single-nucleotide variants (SNVs) in the mutation attribute; -- allele-specific copy number segments (CNAs) for clonal segments must be encoded in the cna attribute; -- a tumor purity estimate in the metadata. - -The tool uses chromosome coordinates to map mutations to segments. The conversion of relative to absolute genome coordinates requires to fix a reference genome build; supported reference is GRCh38/hg17 that is also supported in CNAqc. - -`tickTack` can be used to: - -- time the genomic segments affected by a Copy Number event, obtaining one clock per segment -- time multiple CNAs in a hierarchical fashion, identifying $K$ clocks that cluster some segemnts together. - -The following concepts are used to infer copy number timing. - -### VAF peaks - -The point mutations that are present on the duplicated region are duplicated in copy with the segment. Therefore we can use the proportion of mutations happede before and after the Copy Number event distinguishing between mutations in single copy and double copies. - -Overview timing problem
- - -Therefore, for a single segment the value of the clock associated with the Copy Number event is obtained as a transformation from the proportions of mutations in single and double copy. -The following quantities need to be considered: - - -\begin{itemize} - \item $N_m$ : number of mutations with multiplicity $m$; - \item $\rho$ : mutation rate, indicates how many mutations occur per unit of time; - \item $\tau$ : pseudo-time; -\end{itemize} - - -Overview timing problem
- - -\subsubsection{2:1} - -In the case of a trisomy without LOH, we can consider the fact that, before $\tau$, 1 chromosome will accumulate mutations that will duplicate, while the other will accumulate mutations that will remain in single copy. On the other hand, after $\tau$, both chromosomes will accumulate mutations which will remain in single copy. Therefore one can write the system: - -\begin{align} - \begin{cases} - N_2 = \rho \tau - N_1 = \rho \tau + 3\rho(1 - \tau) \nonumber - \end{cases} -\end{align} - -Using the first one to obtain $\rho$ and inserting into the second one, the solution for $\tau$ becomes: - -\begin{equation} - N_1 = N_2 + \frac{3N_2}{\tau}(1-\tau) \hspace{2mm} \rightarrow \hspace{2mm} N_1 + 2N_2 = \frac{3N_2}{\tau} \hspace{2mm} \rightarrow \hspace{2mm} \tau = \frac{3N_2}{N_1 + 2N_2} -\end{equation} - -\subsubsection{2:0 and 2:2} - -The case of the CNLOH and of the segment doubling can be treated together. In fact, in the first case, before $\tau$ the mutations that will duplicate accumulate on a single chromosome and after $\tau$ the mutations that will remain in a single copy accumulate on two chromosomes. The system therefore becomes: - -\begin{align} - \begin{cases} - N_2 = \rho \tau - N_1 = 2\rho(1 - \tau) \nonumber - \end{cases} -\end{align} - -A very similar things happens in the case of the 2:2, with the only difference that the number of chromosomes accumulating a certain type of mutation will be double, both after and before $\tau$. Hence, the system becomes: - -\begin{align} - \begin{cases} - N_2 = 2\rho \tau - N_1 = 4\rho(1 - \tau) \nonumber - \end{cases} -\end{align} - -Therefore, the two system can be solved similarly (you can simply drop a factor of 2 in the second case). The solution for $\tau$ easily becomes: - -\begin{equation} - N_1 = \frac{2N_2(1-\tau)}{\tau} \hspace{2mm} \rightarrow \hspace{2mm} \tau(N_1 + 2N_2) = 2N_2 \hspace{2mm} \rightarrow \hspace{2mm} \tau = \frac{2N_2}{2N_2 + N_1} -\end{equation} - - - -### Clonal CNAs - -Consider: - -- mutations sitting on a segment $nA:nB$; -- tumour purity $\pi$; -- a healthy diploid normal; - -Since the proportion of all reads from the tumour is $\pi(n_A+n_B)$, and from the normal is $2(1-\pi)$. Then, muations present in $m$ copies of the tumour genome should peak at VAF value - -\[ -v_m(c) = \dfrac{m \pi c}{ -2 (1 - \pi) + \pi (n_A+n_B) -} \, . -\] - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - --> - --> - --> - - - - - --> - - --> - --> - --> - --> - --> - --> - --> - --> - - --> - - - - --> - - --> - - --> - --> - - --> - - --> - - - --> - - --> - --> - --> - --> - --> - --> - - --> - - +--- +title: "tickTack" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{tickTack} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) + +options(crayon.enabled=F) +``` + +## Overview + +`tickTack` requires as input a `CNAqc` object with attributes `cna`, `mutations` and `metadata`. +The main input for the tool are: + +- the read counts from somatic mutations such as single-nucleotide variants (SNVs) in the mutation attribute; +- allele-specific copy number segments (CNAs) for clonal segments must be encoded in the cna attribute; +- a tumor purity estimate in the metadata. + +The tool uses chromosome coordinates to map mutations to segments. The conversion of relative to absolute genome coordinates requires to fix a reference genome build; supported reference is GRCh38/hg17 that is also supported in CNAqc. + +`tickTack` can be used to: + +- time the genomic segments affected by a Copy Number event, obtaining one clock per segment +- time multiple CNAs in a hierarchical fashion, identifying $K$ clocks that cluster some segemnts together. + +The following concepts are used to infer copy number timing. + +### VAF peaks + +The point mutations that are present on the duplicated region are duplicated in copy with the segment. Therefore we can use the proportion of mutations happede before and after the Copy Number event distinguishing between mutations in single copy and double copies. + +Overview timing problem
+ + +Therefore, for a single segment the value of the clock associated with the Copy Number event is obtained as a transformation from the proportions of mutations in single and double copy. +The following quantities need to be considered: + + +\begin{itemize} + \item $N_m$ : number of mutations with multiplicity $m$; + \item $\rho$ : mutation rate, indicates how many mutations occur per unit of time; + \item $\tau$ : pseudo-time; +\end{itemize} + + +Overview timing problem
+ + +\subsubsection{2:1} + +In the case of a trisomy without LOH, we can consider the fact that, before $\tau$, 1 chromosome will accumulate mutations that will duplicate, while the other will accumulate mutations that will remain in single copy. On the other hand, after $\tau$, both chromosomes will accumulate mutations which will remain in single copy. Therefore one can write the system: + +\begin{align} + \begin{cases} + N_2 = \rho \tau + N_1 = \rho \tau + 3\rho(1 - \tau) \nonumber + \end{cases} +\end{align} + +Using the first one to obtain $\rho$ and inserting into the second one, the solution for $\tau$ becomes: + +\begin{equation} + N_1 = N_2 + \frac{3N_2}{\tau}(1-\tau) \hspace{2mm} \rightarrow \hspace{2mm} N_1 + 2N_2 = \frac{3N_2}{\tau} \hspace{2mm} \rightarrow \hspace{2mm} \tau = \frac{3N_2}{N_1 + 2N_2} +\end{equation} + +\subsubsection{2:0 and 2:2} + +The case of the CNLOH and of the segment doubling can be treated together. In fact, in the first case, before $\tau$ the mutations that will duplicate accumulate on a single chromosome and after $\tau$ the mutations that will remain in a single copy accumulate on two chromosomes. The system therefore becomes: + +\begin{align} + \begin{cases} + N_2 = \rho \tau + N_1 = 2\rho(1 - \tau) \nonumber + \end{cases} +\end{align} + +A very similar things happens in the case of the 2:2, with the only difference that the number of chromosomes accumulating a certain type of mutation will be double, both after and before $\tau$. Hence, the system becomes: + +\begin{align} + \begin{cases} + N_2 = 2\rho \tau + N_1 = 4\rho(1 - \tau) \nonumber + \end{cases} +\end{align} + +Therefore, the two system can be solved similarly (you can simply drop a factor of 2 in the second case). The solution for $\tau$ easily becomes: + +\begin{equation} + N_1 = \frac{2N_2(1-\tau)}{\tau} \hspace{2mm} \rightarrow \hspace{2mm} \tau(N_1 + 2N_2) = 2N_2 \hspace{2mm} \rightarrow \hspace{2mm} \tau = \frac{2N_2}{2N_2 + N_1} +\end{equation} + + + +### Clonal CNAs + +Consider: + +- mutations sitting on a segment $nA:nB$; +- tumour purity $\pi$; +- a healthy diploid normal; + +Since the proportion of all reads from the tumour is $\pi(n_A+n_B)$, and from the normal is $2(1-\pi)$. Then, muations present in $m$ copies of the tumour genome should peak at VAF value + +\[ +v_m(c) = \dfrac{m \pi c}{ +2 (1 - \pi) + \pi (n_A+n_B) +} \, . +\] + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +