Merge pull request #29 from CostaLab/devel

Fixes for the loading of VCF files.
CostaLab · Mar 15, 2024 · c16f8d6 · c16f8d6
2 parents 3db0be2 + f61cb5b
commit c16f8d6
Show file tree

Hide file tree

Showing 4 changed files with 17 additions and 12 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: sigurd
 Type: Package
 Title: Single cell Genotyping Using RNA Data
-Version: 0.2.45
+Version: 0.2.46
 Authors@R: c(
   person(given = "Martin",
          family = "Grasshoff",

diff --git a/R/LoadingVCF_typewise.R b/R/LoadingVCF_typewise.R
@@ -22,30 +22,32 @@
 #'@param samples_file Path to the csv file with the samples to be loaded.
 #'@param vcf_path Path to the VCF file with the variants.
 #'@param patient The patient you want to load.
+#'@param patient_column The column that contains the patient information. Use "merge" if all samples should be merged.
 #'@param type_use The type of input. Only rows that have the specified type will be loaded.
 #'@param min_reads The minimum number of reads we want. Otherwise we treat this as a NoCall. Default = NULL.
 #'@param min_cells The minimum number of cells for a variant. Otherwise, we will remove a variant. Default = 2.
 #'@param remove_N_alternative Remove all variants that have N as an alternative, see Description. Default = TRUE
 #'@param cellbarcode_length The length of the cell barcode. This should be the length of the actual barcode plus two for the suffix (-1). Default = 18
 #'@param verbose Should the function be verbose? Default = TRUE
 #'@export
-LoadingVCF_typewise <- function(samples_file, samples_path = NULL, vcf_path, patient, type_use = "scRNAseq_Somatic", min_reads = NULL, min_cells = 2, remove_N_alternative = TRUE, cellbarcode_length = 18, verbose = TRUE){
+LoadingVCF_typewise <- function(samples_file, samples_path = NULL, vcf_path, patient, patient_column = "patient", type_use = "scRNAseq_Somatic", min_reads = NULL, min_cells = 2, remove_N_alternative = TRUE, cellbarcode_length = 18, verbose = TRUE){
   if(!is.null(samples_path)){
     if(verbose) print(paste0("Loading the data for sample ", patient, "."))
     samples_file <- data.frame(patient = patient, sample = patient, input_path = samples_path)
     samples <- samples_file$sample
   } else{
     if(verbose) print(paste0("Loading the data for patient ", patient, "."))
-    if(verbose) print("We read in the samples file.")
+    if(verbose) print("We read in the central input file.")
     samples_file <- utils::read.csv(samples_file, stringsAsFactors = FALSE)
+    if(!patient_column %in% colnames(samples_file) & patient_column != "merge"){
+      stop(paste0("Error: the column ", patient_column, " is not in your central input file."))
+    }
 
-
-    if(verbose) print("We subset to the patient of interest.")
+    if(verbose) print("We subset to the relevant files.")
     samples_file <- samples_file[grep("vcf", samples_file$source, ignore.case = TRUE),]
-    samples_file <- samples_file[samples_file$patient == patient,]
+    if(patient_column != "merge") samples_file <- samples_file[samples_file[,patient_column] == patient,]
     samples_file <- samples_file[samples_file$type == type_use,]
 
-
     if(verbose) print("We get the different samples.")
     samples <- samples_file$sample
   }
@@ -102,10 +104,10 @@ LoadingVCF_typewise <- function(samples_file, samples_path = NULL, vcf_path, pat
   if(remove_N_alternative){
     ref_matrix_total_n     <- substr(rownames(ref_matrix_total), start = nchar(rownames(ref_matrix_total)), stop = nchar(rownames(ref_matrix_total)))
     ref_matrix_total_n     <- ref_matrix_total_n != "N"
-    ref_matrix_total       <- ref_matrix_total[ref_matrix_total_n,]
-    reads_matrix_total     <- reads_matrix_total[ref_matrix_total_n,]
-    coverage_matrix_total  <- coverage_matrix_total[ref_matrix_total_n,]
-    consensus_matrix_total <- consensus_matrix_total[ref_matrix_total_n,]
+    ref_matrix_total       <- ref_matrix_total[ref_matrix_total_n, , drop = FALSE]
+    reads_matrix_total     <- reads_matrix_total[ref_matrix_total_n, , drop = FALSE]
+    coverage_matrix_total  <- coverage_matrix_total[ref_matrix_total_n, , drop = FALSE]
+    consensus_matrix_total <- consensus_matrix_total[ref_matrix_total_n, , drop = FALSE]
     rm(ref_matrix_total_n)
   } else{
     print("We keep all variants with an N as alternative allele. Please ensure that these variants are in your variant VCF file.")

diff --git a/README.md b/README.md
@@ -32,7 +32,7 @@ The mutation data was obtained from the Sanger Institute Catalogue Of Somatic Mu
 
 ```
 
-# Current Features v0.2.45
+# Current Features v0.2.46
 
 - Loading data from VarTrix and MAEGATK.
 - Transforming the data to be compatible for joint analysis.

diff --git a/man/LoadingVCF_typewise.Rd b/man/LoadingVCF_typewise.Rd