Merge pull request #53 from bartongroup/development

0.6.3
bartongroup · Mar 14, 2018 · 8d32c68 · 8d32c68
2 parents 20018b6 + 1c4ff4d
commit 8d32c68
Show file tree

Hide file tree

Showing 32 changed files with 561 additions and 755 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: rats
-Version: 0.6.2
-Date: 2018-02-16
+Version: 0.6.3
+Date: 2018-03-13
 Title: Relative Abundance of Transcripts
 Encoding: UTF-8
 Authors: c(person("Kimon Froussios", role=c("aut"), email="k.froussios@dundee.ac.uk"),

diff --git a/NAMESPACE b/NAMESPACE
@@ -20,7 +20,6 @@ export(plot_overview)
 export(plot_shiny_volcano)
 export(sim_boot_data)
 export(sim_count_data)
-export(sim_sleuth_data)
 export(tidy_annot)
 import(data.table)
 import(ggplot2)

diff --git a/R/data_simulators.R b/R/data_simulators.R
diff --git a/R/func.R b/R/func.R
@@ -21,6 +21,7 @@
 #' @param threads Number of threads.
 #' @param seed Seed for random engine.
 #' @param scaling Abundance scaling factor.
+#' @param reckless whether to ignore detected annotation discrepancies
 #'
 #' @return List: \itemize{
 #'  \item{"error"}{logical}
@@ -36,7 +37,7 @@ parameters_are_good <- function(annot, count_data_A, count_data_B, boot_data_A,
                                 TARGET_COL, PARENT_COL,
                                 correction, testmode, scaling, threads, seed,
                                 p_thresh, abund_thresh, dprop_thresh, 
-                                qboot, qbootnum, qrep_thresh, rboot, rrep_thresh) {
+                                qboot, qbootnum, qrep_thresh, rboot, rrep_thresh, reckless) {
   warnmsg <- list()
 
   # Input format.
@@ -48,14 +49,16 @@ parameters_are_good <- function(annot, count_data_A, count_data_B, boot_data_A,
     return(list("error"=TRUE, "message"="You must specify (the same type of) data for both conditions!"))
 
   # Annotation.
+  if (!is.logical(reckless) || is.na(reckless))
+    return(list("error"=TRUE, "message"="Invalid value to reckless!"))
   if (!is.data.frame(annot))
     return(list("error"=TRUE, "message"="The provided annot is not a data.frame!"))
   if (any(!c(TARGET_COL, PARENT_COL) %in% names(annot)))
     return(list("error"=TRUE, "message"="The specified target and/or parent IDs field names do not exist in annot!"))
-  if (length(annot$target_id) != length(unique(annot$target_id)))
+  if (length(annot[[TARGET_COL]]) != length(unique(annot[[TARGET_COL]])))
     return(list("error"=TRUE, "message"="Some transcript identifiers are not unique!"))
-
-  # Parameters.
+  
+  # Simple parameters.
   if (!correction %in% p.adjust.methods)
     return(list("error"=TRUE, "message"="Invalid p-value correction method name. Refer to stats::p.adjust.methods ."))
   if (!testmode %in% c("genes", "transc", "both"))
@@ -65,7 +68,7 @@ parameters_are_good <- function(annot, count_data_A, count_data_B, boot_data_A,
   if ((!is.numeric(dprop_thresh)) || dprop_thresh < 0 || dprop_thresh > 1)
     return(list("error"=TRUE, "message"="Invalid proportion difference threshold! Must be between 0 and 1."))
   if ((!is.numeric(abund_thresh)) || abund_thresh < 0)
-    return(list("error"=TRUE, "message"="Invalid abundance threshold! Must be between 0 and 1."))
+    return(list("error"=TRUE, "message"="Invalid abundance threshold! Must be a count >= 0."))
   if ((!is.numeric(qbootnum)) || qbootnum < 0)
     return(list("error"=TRUE, "message"="Invalid number of bootstraps! Must be a positive integer number."))
   if (!is.logical(qboot))
@@ -80,6 +83,8 @@ parameters_are_good <- function(annot, count_data_A, count_data_B, boot_data_A,
     return(list("error"=TRUE, "message"="Invalid number of threads! Must be positive integer."))
   if (threads > parallel::detectCores(logical= TRUE))
     return(list("error"=TRUE, "message"="Number of threads exceeds system's reported capacity."))
+
+  # Scaling
   nsmpl <- NULL
   if (!is.null(count_data_A)){
     nsmpl <- dim(count_data_A)[2] + dim(count_data_B)[2]
@@ -103,8 +108,20 @@ parameters_are_good <- function(annot, count_data_A, count_data_B, boot_data_A,
     if(is.null(boot_data_A)) {
       if (!is.data.table(count_data_A) | !is.data.table(count_data_B))
         return(list("error"=TRUE, "message"="The counts data are not data.tables!"))
-      if (!all( count_data_A[[1]][order(count_data_A[[1]])] == count_data_B[[1]][order(count_data_B[[1]])] ))
-        return(list("error"=TRUE, "message"="Inconsistent set of transcript IDs between conditions!"))
+      if ( !all(count_data_A[[1]] %in% annot[[TARGET_COL]]) && !all(annot[[TARGET_COL]] %in% count_data_A[[1]]) ) {
+        if(reckless){
+          warnmsg["annotation-mismatch"] <- "The transcript IDs in the quantifications and the annotation do not match completely! Did you use different annotations? Reckless mode enabled, continuing at your own risk..."
+        } else {
+          return(list("error"=TRUE, "message"="The transcript IDs in the quantifications and the annotation do not match completely! Did you use different annotations?"))
+        }
+      }
+      if (!all( count_data_A[[1]][order(count_data_A[[1]])] == count_data_B[[1]][order(count_data_B[[1]])] )) {
+        if (reckless) {
+          warnmsg["countIDs"] <- "Transcript IDs do not match completely between conditions! Did you use different annotations? Reckless mode enabled, continuing at your own risk..."
+        } else {
+          return(list("error"=TRUE, "message"="Transcript IDs do not match completely between conditions! Did you use different annotations?"))
+        }
+      }
     } else {
       warnmsg["cnts&boots"] <- "Received multiple input formats! Only the bootstrapped data will be used."
     }
@@ -119,17 +136,33 @@ parameters_are_good <- function(annot, count_data_A, count_data_B, boot_data_A,
   }
   maxmatrix <- 2^31 - 1
   if (qboot) {
-    # Consistency,
     if (!is.null(boot_data_A)) {
-      # Direct data.
+      # Compared to annotation.
+      if ( !all(boot_data_A[[1]][[1]] %in% annot[[TARGET_COL]]) && !all(annot[[TARGET_COL]] %in% boot_data_A[[1]][[1]]) ) {
+        if(reckless){
+          warnmsg["annotation-mismatch"] <- "The transcript IDs in the quantifications and the annotation do not match completely! Did you use different annotations? Reckless mode enabled, continuing at your own risk..."
+        } else {
+          return(list("error"=TRUE, "message"="The transcript IDs in the quantifications and the annotation do not match completely! Did you use different annotations?"))
+        }
+      }
+      # Among samples
       tx <- boot_data_A[[1]][[1]][order(boot_data_A[[1]][[1]])]
       for (k in 2:length(boot_data_A)){
-        if (!all( tx == boot_data_A[[k]][[1]][order(boot_data_A[[k]][[1]])] ))
-          return(list("error"=TRUE, "message"="Inconsistent set of transcript IDs across samples!"))
+        if (!all( tx == boot_data_A[[k]][[1]][order(boot_data_A[[k]][[1]])] )) {
+          if (reckless) {
+            warnmsg[paste0("bootIDs-A", k)] <- paste0("Inconsistent set of transcript IDs across samples (A", k, ")! Did you use different annotations? Reckless mode enabled, continuing at your own risk...")
+          } else {
+            return(list("error"=TRUE, "message"="Inconsistent set of transcript IDs across samples! Did you use different annotations?"))
+          }
+        }
       }
       for (k in 1:length(boot_data_B)){
         if (!all( tx == boot_data_B[[k]][[1]][order(boot_data_B[[k]][[1]])] ))
-          return(list("error"=TRUE, "message"="Inconsistent set of transcript IDs across samples!"))
+          if (reckless) {
+            warnmsg[paste0("bootIDs-B", k)] <- paste0("Inconsistent set of transcript IDs across samples (B", k, ")! Did you use different annotations? Reckless mode enabled, continuing at your own risk...")
+          } else {
+            return(list("error"=TRUE, "message"="Inconsistent set of transcript IDs across samples! Did you use different annotations?"))
+          }
       }
 
       # Number of iterations.
@@ -156,7 +189,7 @@ parameters_are_good <- function(annot, count_data_A, count_data_B, boot_data_A,
   if (rboot & any( length(samples_by_condition[[1]]) * length(samples_by_condition[[2]]) > maxmatrix/dim(annot)[1],
   				         length(boot_data_A)               * length(boot_data_B)               > maxmatrix/dim(annot)[1] ) )
     warnmsg["toomanyreplicates"] <- "The number of replicates is too high. Exhaustive 1vs1 would exceed maximum capacity of an R matrix."
-
+  
   return(list("error"=FALSE, "message"="All good!", "maxboots"=minboots, "warn"=(length(warnmsg) > 0), "warnings"= warnmsg))
 }
 
@@ -205,7 +238,7 @@ alloc_out <- function(annot, full, n=1){
                        "abund_scaling"=numeric(length=n),
                        "quant_boot"=NA,"quant_reprod_thresh"=NA_real_,  "quant_bootnum"=NA_integer_,
                        "rep_boot"=NA, "rep_reprod_thresh"=NA_real_, "rep_bootnum"=NA_integer_, 
-                       "seed"=NA_integer_)
+                       "seed"=NA_integer_, "reckless"=NA)
     Genes <- data.table("parent_id"=as.vector(unique(annot$parent_id)),
                         "elig"=NA, "sig"=NA, "elig_fx"=NA, "quant_reprod"=NA, "rep_reprod"=NA, "DTU"=NA, "transc_DTU"=NA,
                         "known_transc"=NA_integer_, "detect_transc"=NA_integer_, "elig_transc"=NA_integer_, "maxDprop"=NA_real_,
@@ -256,7 +289,7 @@ alloc_out <- function(annot, full, n=1){
 #' @param test_transc Whether to do transcript-level test.
 #' @param test_genes Whether to do gene-level test.
 #' @param full Either "full" (for complete output structure) or "short" (for bootstrapping).
-#' @param count_thresh Minimum number of counts per replicate.
+#' @param count_thresh Minimum average count across replicates.
 #' @param p_thresh The p-value threshold.
 #' @param dprop_thresh Minimum difference in proportions.
 #' @param correction Multiple testing correction type.
@@ -303,8 +336,11 @@ calculate_DTU <- function(counts_A, counts_B, tx_filter, test_transc, test_genes
     ctA <- count_thresh * resobj$Parameters[["num_replic_A"]]  # Adjust count threshold for number of replicates.
     ctB <- count_thresh * resobj$Parameters[["num_replic_B"]]
     Transcripts[, elig_xp := (sumA >= ctA | sumB >= ctB)]
-    Transcripts[, elig := (elig_xp & totalA != 0 & totalB != 0 & (sumA != totalA | sumB != totalB))]  # If the entire gene is shut off in one condition, changes in proportion cannot be defined.
-                                                                                                      # If sum and total are equal in both conditions the gene has only one expressed isoform, or one isoform altogether.
+    Transcripts[, elig := (elig_xp &                          # isoform expressed above the noise threshold in EITHER condition
+                             totalA >= ctA & totalB >= ctB &  # at least one eligibly expressed isoform exists in EACH condition (prevent gene-on/off from creating wild proportions from low counts)
+                             totalA != 0 & totalB != 0 &      # gene expressed in BOTH conditions (failsafe, in case user sets noise threshold to 0)
+                             (propA != 1 | propB != 1) )]     # not the only isoform expressed.
+
     # If sum and total are equal in both conditions, it has no detected siblings and thus cannot change in proportion.
     Genes[, elig_transc := Transcripts[, as.integer(sum(elig, na.rm=TRUE)), by=parent_id][, V1] ]
     Genes[, elig := elig_transc >= 2]
@@ -494,3 +530,4 @@ group_samples <- function(covariates) {
 }
 
 
+#EOF
diff --git a/R/rats.R b/R/rats.R
@@ -19,8 +19,8 @@
 #' @param name_B The name for the other condition. (Default "Condition-B")
 #' @param varname The name of the covariate to which the two conditions belong. (Default \code{"condition"}).
 #' @param p_thresh The p-value threshold. (Default 0.05)
-#' @param abund_thresh Noise threshold. Minimum mean abundance for transcripts to be eligible for testing. (Default 5)
 #' @param dprop_thresh Effect size threshold. Minimum change in proportion of a transcript for it to be considered meaningful. (Default 0.20)
+#' @param abund_thresh Noise threshold. Minimum mean (across replicates) abundance for transcripts (and genes) to be eligible for testing. (Default 5)
 #' @param correction The p-value correction to apply, as defined in  \code{\link[stats]{p.adjust.methods}}. (Default \code{"BH"})
 #' @param scaling A scaling factor or vector of scaling factors, to be applied to the abundances *prior* to any thresholding and testing. Useful for scaling TPMs (transcripts per 1 million reads) to the actual library sizes of the samples. If a vector is supplied, the order should correspond to the samples in group A followed by the samples in group B. WARNING: Improper use of the scaling factor will artificially inflate/deflate the significances obtained.
 #' @param testmode One of \itemize{\item{"genes"}, \item{"transc"}, \item{"both" (default)}}.
@@ -33,7 +33,8 @@
 #' @param verbose Display progress updates and warnings. (Default \code{TRUE})
 #' @param threads Number of threads to use. (Default 1) Multi-threading will be ignored on non-POSIX systems.
 #' @param seed A numeric integer used to initialise the random number engine. Use this only if reproducible bootstrap selections are required. (Default NA)
-#' @param dbg Debugging mode. Interrupt execution at the specified flag-point. Used to speed up code-tests by avoiding irrelevant downstream processing. (Default 0: do not interrupt)
+#' @param reckless RATs normally aborts if any inconsistency is detected among the transcript IDs found in the annotation and the quantifications. Enabling reckless mode will downgrade this error to a warning and allow RATs to continue the run. Not recommended unless you know why the inconsistency exists and how it will affect the results. (Default FALSE)
+#' @param dbg Debugging mode only. Interrupt execution at the specified flag-point. Used to speed up code-tests by avoiding irrelevant downstream processing. (Default 0: do not interrupt)
 #' @return List of mixed types. Contains a list of runtime settings, a table of gene-level results, a table of transcript-level results, and a list of two tables with the transcript abundaces.
 #'
 #' @import utils
@@ -46,7 +47,7 @@ call_DTU <- function(annot= NULL, TARGET_COL= "target_id", PARENT_COL= "parent_i
                      name_A= "Condition-A", name_B= "Condition-B", varname= "condition",
                      p_thresh= 0.05, abund_thresh= 5, dprop_thresh= 0.2, correction= "BH", scaling= 1,
                      testmode= "both", qboot= TRUE, qbootnum= 0L, qrep_thresh= 0.95, rboot=TRUE, rrep_thresh= 0.85,
-                     description= NA_character_, verbose= TRUE, threads= 1L, seed=NA_integer_, dbg= "0")
+                     description= NA_character_, verbose= TRUE, threads= 1L, seed=NA_integer_, reckless=FALSE, dbg= "0")
 {
   #---------- PREP
 
@@ -60,7 +61,7 @@ call_DTU <- function(annot= NULL, TARGET_COL= "target_id", PARENT_COL= "parent_i
                                     TARGET_COL, PARENT_COL,
                                     correction, testmode, scaling, threads, seed,
                                     p_thresh, abund_thresh, dprop_thresh, 
-                                    qboot, qbootnum, qrep_thresh, rboot, rrep_thresh)
+                                    qboot, qbootnum, qrep_thresh, rboot, rrep_thresh, reckless)
   if (paramcheck$error)
     stop(paramcheck$message)
   if (verbose)
@@ -243,6 +244,7 @@ call_DTU <- function(annot= NULL, TARGET_COL= "target_id", PARENT_COL= "parent_i
   resobj$Parameters[["quant_reprod_thresh"]] <- qrep_thresh
   resobj$Parameters[["quant_bootnum"]] <- qbootnum
   resobj$Parameters[["rep_reprod_thresh"]] <- rrep_thresh
+  resobj$Parameters[["reckless"]] <- reckless
 
   if (dbg == "info")
     return(resobj)
@@ -469,6 +471,19 @@ call_DTU <- function(annot= NULL, TARGET_COL= "target_id", PARENT_COL= "parent_i
     Transcripts[is.na(rep_reprod), rep_reprod := c(FALSE)]
     Transcripts[is.na(DTU), DTU := c(FALSE)]
     Transcripts[is.na(gene_DTU), gene_DTU := c(FALSE)]
+    # Eradicate NAs from count columns, as they mess up plotting. These would be caused by inconsistencies between annotation and quantifications, and would only exist in "reckless" mode.
+    for (i in 1:Parameters$num_replic_A){
+      idx <- is.na(Abundances[[1]][[paste0('V',i)]])
+      Abundances[[1]][idx, paste0('V',i) := 0]
+    }
+    for (i in 1:Parameters$num_replic_B){
+      idx <- is.na(Abundances[[2]][[paste0('V',i)]])
+      Abundances[[2]][idx, paste0('V',i) := 0]
+    }
+    Transcripts[is.na(meanA), meanA := c(0)]
+    Transcripts[is.na(meanB), meanB := c(0)]
+    Transcripts[is.na(propA), propA := c(0)]
+    Transcripts[is.na(propB), propB := c(0)]
 
     # Drop the bootstrap columns, if unused.
     if (!qboot || !test_transc)
@@ -495,4 +510,4 @@ call_DTU <- function(annot= NULL, TARGET_COL= "target_id", PARENT_COL= "parent_i
   return(resobj)
 }
 
-
+#EOF
diff --git a/inst/doc/input.R b/inst/doc/input.R
@@ -21,7 +21,7 @@ head(sim_count_data()[[1]])
 
 ## ---- eval=FALSE---------------------------------------------------------
 #  # Simulate some data.
-#  simdat <- sim_count_data()
+#  simdat <- sim_count_data(clean=TRUE)
 #  # For convenience let's assign the contents of the list to separate variables
 #  mycond_A <- simdat[[2]]       # Simulated abundances for one condition.
 #  mycond_B <- simdat[[3]]       # Simulated abundances for other condition.
@@ -39,7 +39,7 @@ head(sim_count_data()[[1]])
 
 ## ------------------------------------------------------------------------
 # Simulate some data. (Notice it is a different function than before.)
-simdat <- sim_boot_data()
+simdat <- sim_boot_data(clean=TRUE)
 
 # For convenience let's assign the contents of the list to separate variables
 mycond_A <- simdat[[2]]   # Simulated bootstrapped data for one condition.
@@ -83,7 +83,7 @@ myannot <- simdat[[1]]    # Transcript and gene IDs for the above data.
 #  # Calling DTU with custom thresholds.
 #  mydtu <- call_DTU(annot= myannot,
 #                    boot_data_A= mycond_A, boot_data_B= mycond_B,
-#                    p_thresh= 0.01, abund_thresh= 10, dprop_thres = 0.25)
+#                    p_thresh= 0.01, dprop_thres = 0.15, abund_thresh= 10)
 
 ## ---- eval=FALSE---------------------------------------------------------
 #  # Bootstrap (default). Do 100 iterations.
@@ -169,3 +169,8 @@ myannot <- simdat[[1]]    # Transcript and gene IDs for the above data.
 #                    boot_data_B= mydata$boot_data_B,
 #                    scaling=c(25, 26.7, 23, 50.0, 45, 48.46, 52.36))
 
+## ---- eval=FALSE---------------------------------------------------------
+#  mydtu <- call_DTU(annot= myannot, boot_data_A= mydata$boot_data_A,
+#                    boot_data_B= mydata$boot_data_B,
+#                    reckless=TRUE, verbose=TRUE)
+