Merge pull request #42 from CostaLab/devel

Added turorials and updated vignettes.
CostaLab · Oct 7, 2024 · 73adc91 · 73adc91
2 parents b2d3bad + 04221be
commit 73adc91
Show file tree

Hide file tree

Showing 284 changed files with 15,276 additions and 5,333 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: sigurd
 Type: Package
 Title: Single cell Genotyping Using RNA Data
-Version: 0.3.0
+Version: 0.3.1
 Authors@R: c(
   person(given = "Martin",
          family = "Grasshoff",

diff --git a/NAMESPACE b/NAMESPACE
@@ -21,6 +21,7 @@ export(GetCellInfoPerVariant)
 export(GetVariantInfo)
 export(HeatmapVoi)
 export(LoadingMAEGATK_typewise)
+export(LoadingRawMatrix_typewise)
 export(LoadingVCF_typewise)
 export(LoadingVarTrix_typewise)
 export(Merging_SE_list)

diff --git a/R/ClonalDefinition.R b/R/ClonalDefinition.R
@@ -11,11 +11,13 @@
 #'@param se SummarizedExperiment object.
 #'@param variants_ls List of variants for clonal definition
 #'@param grouping The meta data	column used to split the cells into groups. Default = NULL
-#'@param n_cores Number of cores you want to use. Numeric.
 #'@param identities Vector of groups, like samples.
+#'@param explicit Do you want to specify the variants forming a clone? Then the variant_ls needs to be a list of variants that each define a clone. One variant per group.
+#'@param explicit_not Do you want to specify a set of variants that the cell may not have? One set per group.
+#'@param explicit_min_vaf Minimum VAF for a cell to be considered positive.
 #'@param verbose Should the function be verbose? Default = TRUE
 #'@export
-ClonalDefinition <- function(se, variants_ls, grouping = NULL, identities = NULL, n_cores = 1, verbose = TRUE){
+ClonalDefinition <- function(se, variants_ls, grouping = NULL, identities = NULL, explicit = FALSE, explicit_not = NULL, explicit_min_vaf = 0.01, verbose = TRUE){
   # Checking if the group variable is in the column data.
   if(!is.null(grouping)){
     # We check if the grouping variable is in the column data.
@@ -44,12 +46,71 @@ ClonalDefinition <- function(se, variants_ls, grouping = NULL, identities = NULL
   }
   # Checking if any of the variants is not in the SE object.
   for(i in 1:length(variants_ls)){
-    check_variants <- all(variants_ls[[i]] %in% rownames(se))
+    check_variants <- all(unlist(variants_ls[[i]]) %in% rownames(se))
     if(!check_variants){
       stop(paste0("The set of variants number ", i, " has variants not in the object. Please check."))
     }
   }
 
+
+  if(explicit){
+    if(verbose) print("Explicit clone definition.")
+    # We prepare a new meta data column.
+    new_meta_data <- rep("OtherLineage", ncol(se))
+    names(new_meta_data) <- colnames(se)
+    for(i in 1:length(variants_ls)){
+      variants_ls_group <- variants_ls[[i]]
+
+      # If the grouping variables is not NULL, we get the relevant identity.
+      if(!is.null(grouping)){
+        identity_use <- identities[i]
+        cells_use <- SummarizedExperiment::colData(se)
+        cells_use <- cells_use[cells_use[,grouping] == identity_use, ]
+        cells_use <- rownames(cells_use)
+        se_use <- se[,cells_use]
+      } else{
+        se_use <- se
+      }
+
+      # We prepare the new meta data for the subset of the cells.
+      new_meta_data_subset <- rep("OtherLineage", ncol(se_use))
+      names(new_meta_data_subset) <- colnames(se_use)
+
+      # For each set of variants, we now assign a cell to be part of this clone.
+      for(j in 1:length(variants_ls_group)){
+        variants_use <- variants_ls_group[[j]]
+	# We check if a cell is mutated for the set of variants.
+	check_use <- as.matrix(SummarizedExperiment::assays(se_use)[["consensus"]][variants_use, , drop = FALSE])	
+	if(explicit_min_vaf > 0){
+	  frac_check <- as.matrix(SummarizedExperiment::assays(se_use)[["fraction"]][variants_use, , drop = FALSE])
+	  check_use[frac_check < explicit_min_vaf] <- 1
+	}
+	check_use <- matrix(check_use %in% 2:3, ncol = ncol(check_use), nrow = nrow(check_use), dimnames = list(rownames(check_use), colnames(check_use)))
+        check_use <- colSums(check_use) == length(variants_use)
+	if(!is.null(explicit_not)){
+	  variants_not <- explicit_not[[i]][[j]]
+	  check_not <- as.matrix(SummarizedExperiment::assays(se_use)[["consensus"]][variants_not, , drop = FALSE])
+  	  if(explicit_min_vaf > 0){
+            frac_not <- as.matrix(SummarizedExperiment::assays(se_use)[["fraction"]][variants_not, , drop = FALSE])
+            check_not[frac_not < explicit_min_vaf] <- 1
+          }
+	  check_not <- matrix(check_not %in% 2:3, ncol = ncol(check_not), nrow = nrow(check_not), dimnames = list(rownames(check_not), colnames(check_not)))
+	  check_not <- colSums(check_not) > 0
+	  check_not <- check_not[check_not]
+	  check_use[names(check_not)] <- FALSE
+	}
+	check_use <- check_use[check_use]
+	# We check if a cell is mutated for any of the other variants.
+	new_meta_data_subset[names(check_use)] <- paste0("C", j)
+	# We add the new meta data to the combined meta data.
+        new_meta_data[names(new_meta_data_subset)] <- new_meta_data_subset
+      }
+    }
+    SummarizedExperiment::colData(se)$Clones <- new_meta_data
+    return(se)
+  }
+
+
   # For each set of variants, we get all possible combinations.
   combinations_ls <- list()
   for(i in 1:length(variants_ls)){

diff --git a/R/LoadingRawMatrix_typewise.R b/R/LoadingRawMatrix_typewise.R
@@ -0,0 +1,100 @@
+#'LoadingRawMatrix_typewise
+#'@description
+#'A function to load a dense matrix of genotyping information and reformat it. Each matrix should have a column with the cell barcodes, one with the reference reads and one with the alternative reads.
+#'The type in the design matrix is used as the variant name. 
+#'@importFrom utils read.table
+#'@importFrom SummarizedExperiment SummarizedExperiment
+#'@importFrom Matrix colMeans rowMeans
+#'@param samples_path Path to the input folder. Must include a barcodes file.
+#'@param samples_file Path to the csv file with the samples to be loaded.
+#'@param patient The patient you want to load.
+#'@param patient_column The column that contains the patient information.
+#'@param variant_use The variant of the respective matrices.
+#'@param cells_include A vector of cell barcodes. Only these cells will be retained. 
+#'@param cells_exclude A vector of cell barcodes. These cells will be removed from the output.
+#'@param cellbarcode_length The length of the cell barcode. This should be the length of the actual barcode plus two for the suffix (-1). Default = 18
+#'@param verbose Should the function be verbose? Default = TRUE
+#'@export
+LoadingRawMatrix_typewise <- function(samples_file, patient, patient_column = "patient", variant_use = NULL, cells_include = NULL, cells_exclude = NULL, matrix_column_separator = ",", matrix_header = TRUE, cellbarcode_index = 1, ref_index = 2, alt_index = 3, cellbarcode_length = 18, verbose = TRUE){
+  if(is.null(variant_use)) stop ("You have to supply a variant.")
+  if(cellbarcode_index == 0) stop("Your cellbarcode_index cannot be 0.")
+  if(ref_index == 0) stop("Your ref_index cannot be 0.")
+  if(alt_index == 0) stop("Your alt_index cannot be 0.")
+
+  if(verbose) print("We load the design matrix.")
+  samples_file <- read.table(samples_file, sep = ",", header = TRUE)
+  if(!patient_column %in% colnames(samples_file)) stop("Your patient_column is not in the design matrix.")
+  if(!patient %in% samples_file$patient) stop("Your patient is not in the design matrix.")
+  samples_file <- subset(samples_file, patient == patient)
+  samples_file <- samples_file[grep("matrix", samples_file$source, ignore.case = TRUE),]
+  if(any(!variant_use %in% samples_file$type)){
+    stop("Your variant in not in the design matrix.")
+  }
+  samples_file <- samples_file[samples_file$type == variant_use, , drop = FALSE]
+  samples <- samples_file$sample
+  variant_use <- samples_file$type
+
+
+  if(verbose) print("We load the data.")
+  loaded_matrix_ls <- c()
+  cellbarcodes_total <- c()
+  for(i in 1:nrow(samples_file)){
+    loaded_matrix <- read.table(samples_file$input_path[i], sep = matrix_column_separator, header = matrix_header)
+    if(cellbarcode_index > ncol(loaded_matrix)) stop(paste0("Your cellbarcode_index is outside the matrix dimensions for the ", i , " sample."))
+    if(ref_index > ncol(loaded_matrix)) stop(paste0("Your ref_index is outside the matrix dimensions for the ", i , " sample."))
+    if(alt_index > ncol(loaded_matrix)) stop(paste0("Your alt_index is outside the matrix dimensions for the ", i , " sample."))
+    loaded_matrix[, cellbarcode_index] <- paste0(samples[i], "_", loaded_matrix[, cellbarcode_index])
+    if(!is.null(cells_include)){
+      if(verbose) paste0("We remove all cells not in the white list for sample ", i, ".")
+      # Check if any cells would be left.
+      check_leftover <- any(loaded_matrix[, cellbarcode_index] %in% cells_include)
+      if(!check_leftover) stop("No cells are in your supplied list for the ", i, " sample.")
+      loaded_matrix <- loaded_matrix[colnames(coverage_matrix_total) %in% cells_include, , drop = FALSE]
+    }
+    if(!is.null(cells_exclude)){
+      if(verbose) paste0("We remove all cells in the exclusion list for sample ", i, ".")
+      # Check if any cells would be left.
+      check_leftover <- all(loaded_matrix[, cellbarcode_index] %in% cells_exclude)
+      if(check_leftover) stop(paste0("All cells are in your exclusion list for sample ", i, "."))
+      loaded_matrix <- loaded_matrix[!loaded_matrix[, cellbarcode_index] %in% cells_exclude, , drop = FALSE]
+    }
+    cellbarcodes_total <- unique(c(cellbarcodes_total, loaded_matrix[, cellbarcode_index]))
+    loaded_matrix_ls[[paste0(samples[i], "_", samples_file$type[i])]] <- loaded_matrix
+  }
+
+
+  alts <- refs <- matrix(0, ncol = length(cellbarcodes_total), nrow = length(variant_use), dimnames = list(variant_use, cellbarcodes_total))
+  for(i in 1:length(loaded_matrix_ls)){
+    alts[i, loaded_matrix_ls[[i]][, cellbarcode_index]] <- loaded_matrix_ls[[i]][, alt_index]
+    refs[i, loaded_matrix_ls[[i]][, cellbarcode_index]] <- loaded_matrix_ls[[i]][, ref_index]
+  }
+  coverage <- alts + refs
+
+
+  ref_construction <- refs > 0
+  alt_construction <- alts > 0
+  alt_construction[alt_construction] <- 2
+  consensus <- ref_construction + alt_construction
+  fraction <- alts / coverage
+
+
+  coverage <- as(coverage, "CsparseMatrix")
+  alts <- as(alts, "CsparseMatrix")
+  refs <- as(refs, "CsparseMatrix")
+  consensus <- as(consensus, "CsparseMatrix")
+  fraction <- as(fraction, "CsparseMatrix")
+
+
+  if(verbose) print("We generate a SummarizedExperiment object containing the fraction and the consensus matrices.")
+  # We want an assay for the Consensus information and for the fraction.
+  # As meta data we add a data frame showing the cell id, the associated patient and the sample.
+  average_depth_per_cell <- Matrix::colMeans(coverage)
+  coverage_depth_per_variant <- Matrix::rowMeans(coverage)
+  meta_data <- data.frame(Cell = colnames(consensus), Patient = patient, Sample = substr(x = colnames(consensus), start = 1, stop = nchar(colnames(consensus))-(cellbarcode_length+1)), AverageCoverage = average_depth_per_cell)
+  rownames(meta_data) <- meta_data$Cell
+  meta_row <- data.frame(VariantName = rownames(consensus), Depth = coverage_depth_per_variant)
+  rownames(meta_row) <- meta_row$VariantName
+  se <- SummarizedExperiment::SummarizedExperiment(assays = list(consensus = consensus, fraction = fraction, coverage = coverage, alts = alts, refs = refs), colData = meta_data, rowData = meta_row)
+  return(se)
+}
+
diff --git a/R/RowWiseSplit.R b/R/RowWiseSplit.R
@@ -9,7 +9,7 @@
 #'@param n_cores Number of cores to use.
 #'@param remove_nocalls Do you want to remove NoCall cells?
 #'@export
-RowWiseSplit <- function(se, n_cores = 1, remove_nocalls = TRUE){
+RowWiseSplit <- function(se, n_cores = 1, minimum_reads, remove_nocalls = TRUE){
   consensus <- SummarizedExperiment::assays(se)$consensus
   consensus_list <- parallel::mclapply(rownames(se), SeparatingMatrixToList, total_matrix = consensus, remove_nocalls = remove_nocalls, mc.cores = n_cores)
   names(consensus_list) <- rownames(se)

diff --git a/R/VariantWiseFisherTest.R b/R/VariantWiseFisherTest.R
@@ -34,9 +34,6 @@ VariantWiseFisherTest <- function(variants_list, n_cores = 1, p_value_adjustment
   if(verbose) print("We remove the NA P values.")
   results_total <- results_total[!is.na(results_total$P),]
 
-  if(verbose) print("We remove the SNPs with a odds ratio lower than 1.")
-  results_total <- subset(results_total, OddsRatio > 1)
-
   if(verbose) print(paste0("Adjusting P values using ", p_value_adjustment, "."))
   results_total$P_adj <- stats::p.adjust(results_total$P, method = p_value_adjustment)
   rownames(results_total) <- NULL

diff --git a/README.md b/README.md
@@ -33,7 +33,7 @@ The mutation data was obtained from the Sanger Institute Catalogue Of Somatic Mu
 
 ```
 
-# Current Features v0.3.0
+# Current Features v0.3.1
 
 - Loading data from VarTrix and MAEGATK.
 - Transforming the data to be compatible for joint analysis.