From d425a959197164180047f34742ed2c2e960cc743 Mon Sep 17 00:00:00 2001 From: fruce-ki Date: Thu, 8 Mar 2018 11:46:27 +0000 Subject: [PATCH 1/5] fixes #51 and closes #48 - 51: fixed the bug where accidentally the same isoform must exceed noise in both conditions. - 48: The noise threshold is applied to the total gene abundance as well, preventing genes that are switched off in one condition from creating wild proportions and exaggerated effect sizes. --- DESCRIPTION | 4 +- NAMESPACE | 1 - R/data_simulators.R | 138 ++++----------------------- R/func.R | 13 ++- R/rats.R | 2 +- inst/doc/input.R | 2 +- inst/doc/input.Rmd | 32 ++++--- inst/doc/input.html | 69 +++++++------- inst/doc/output.html | 80 +++++++++------- inst/doc/plots.html | 20 ++-- man/calculate_DTU.Rd | 2 +- man/call_DTU.Rd | 2 +- man/sim_sleuth_data.Rd | 39 -------- tests/testthat/test_2_data-munging.R | 74 -------------- tests/testthat/test_4_steps.R | 2 +- tests/testthat/test_5_output.R | 31 +++--- tests/testthat/test_6_reports.R | 34 +++---- vignettes/input.Rmd | 32 ++++--- 18 files changed, 190 insertions(+), 387 deletions(-) delete mode 100644 man/sim_sleuth_data.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 6b8130e..aeb2231 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: rats -Version: 0.6.2 -Date: 2018-02-16 +Version: 0.6.2-1 +Date: 2018-03-07 Title: Relative Abundance of Transcripts Encoding: UTF-8 Authors: c(person("Kimon Froussios", role=c("aut"), email="k.froussios@dundee.ac.uk"), diff --git a/NAMESPACE b/NAMESPACE index f87589a..a460217 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -20,7 +20,6 @@ export(plot_overview) export(plot_shiny_volcano) export(sim_boot_data) export(sim_count_data) -export(sim_sleuth_data) export(tidy_annot) import(data.table) import(ggplot2) diff --git a/R/data_simulators.R b/R/data_simulators.R index 798ea73..d7d8a8a 100644 --- a/R/data_simulators.R +++ b/R/data_simulators.R @@ -1,109 +1,3 @@ -#============================================================================== -#' Generate an artificial minimal sleuth-like structure, for code-testing or examples. -#' -#' The default values here should match the default values expected by calculate_DTU(). -#' -#' @param varname Name for the variables by which to compare. -#' @param COUNTS_COL Name for the bootdtraps column containing counts. -#' @param TARGET_COL Name for annotation column containing transcript identification. -#' @param PARENT_COL Name for annotation column containing respective gene identification. -#' @param BS_TARGET_COL Name for bootstraps column containing transcript identification. -#' @param cnames A vector of (two) name values for the comparison variable. -#' @param errannot_inconsistent Logical. Introduces an inconsistency in the transcript IDs, for testing of sanity checks. (FALSE) -#' @param cv_dt Logical. Whether covariates table should be a data.table (FALSE). -#' @return A list with \code{slo} a minimal sleuth-like object, \code{annot} a corresponding annotation data.frame, -#' \code{isx} a vector of trancripts common between generated annotation and bootstraps (useful for code testing). -#' -#' The simulated data will have non-uniform number of bootstraps per sample and non-uniform order of -#' transcript id's among samples and bootstraps. These conditions are unlikely to occur in real data, -#' but this allows testing that the code can handle it. 
-#' -#' @import data.table -#' -#' @export -sim_sleuth_data <- function(varname="condition", COUNTS_COL="est_counts", TARGET_COL="target_id", - PARENT_COL="parent_id", BS_TARGET_COL="target_id", cnames=c("A","B"), - errannot_inconsistent=FALSE, cv_dt=FALSE) -{ - # !!! Some of the tests of the package are tightly connected to the specifics of the object returned by this function. - # !!! Entry additions might be tolerated. Changes or removals of entries or structure will certainly cause failures. - - tx <- data.frame(target_id= c("NIB.1", "1A1N-2", "1D1C:one", "1D1C:two", "1B1C.1", "1B1C.2", "CC_a", "CC_b", "1NN", "2NN", "MIX6.c1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.nc", "MIX6.d", "LC1", "LC2", "ALLA1", "ALLB1", "ALLB2"), - parent_id= c("NIB", "1A1N", "1D1C", "1D1C", "1B1C", "1B1C", "CC", "CC", "NN", "NN", "MIX6", "MIX6", "MIX6", "MIX6", "MIX6", "MIX6", "LC", "LC", "ALLA", "ALLB", "ALLB"), - stringsAsFactors=FALSE) - names(tx) <- c(TARGET_COL, PARENT_COL) - - sl <- list() - sl[["sample_to_covariates"]] <- NaN - if (cv_dt) { - sl[["sample_to_covariates"]] <- data.table("foo"=c(cnames[1], cnames[2], cnames[1], cnames[2]), - "bar"=c("ba", "ba", "bb", "bb")) - } else { - sl[["sample_to_covariates"]] <- data.frame("foo"=c(cnames[1], cnames[2], cnames[1], cnames[2]), - "bar"=c("ba", "ba", "bb", "bb")) - } - names(sl[["sample_to_covariates"]]) <- c(varname, "batch") - - sl[["kal"]] <- list() - sl$kal[[1]] <- list() - sl$kal[[1]]["bootstrap"] <- list() - sl$kal[[1]]$bootstrap[[1]] <- data.frame("target"= c("LC1", "NIA1", "NIA2", "1A1N-1", "1A1N-2", "1D1C:one", "1D1C:two", "1B1C.2", "CC_a", "CC_b", "MIX6.c1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.nc", "MIX6.d", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), - "my_counts"= c(3, 333, 666, 10, 20, 0, 76, 52, 20, 50, 103, 321, 0, 100, 90, 0, 10, 30, 4, 50, 0, 0), - stringsAsFactors=FALSE) - sl$kal[[1]]$bootstrap[[2]] <- data.frame("target"= c("LC1", "NIA1", "NIA2", "1A1N-1", "1A1N-2", "1D1C:one", "1D1C:two", "1B1C.2", "CC_a", "CC_b", "MIX6.c1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.nc", "MIX6.d", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), - "my_counts"= c(2, 310, 680, 11, 21, 0, 80, 55, 22, 52, 165, 320, 0, 130, 80, 0, 11, 29, 5, 40, 0, 0), - stringsAsFactors=FALSE) - sl$kal[[1]]$bootstrap[[3]] <- data.frame("target"= c("LC1", "NIA1", "NIA2", "1A1N-1", "1A1N-2", "1D1C:one", "1D1C:two", "1B1C.2", "CC_a", "CC_b", "MIX6.c1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.nc", "MIX6.d", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), - "my_counts"= c(0, 340, 610, 7, 18, 0, 72, 50, 21, 49, 150, 325, 0, 120, 70, 0, 9, 28, 4, 60, 0, 0), - stringsAsFactors=FALSE) - - sl$kal[[2]] <- list() - sl$kal[[2]]["bootstrap"] <- list() - sl$kal[[2]]$bootstrap[[1]] <- data.frame("target"= c("NIA1", "LC1", "NIA2", "1A1N-1", "1A1N-2", "1D1C:one", "1D1C:two", "1B1C.2", "CC_a", "CC_b", "MIX6.c1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.nc", "MIX6.d", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), - "my_counts"= c(333, 6, 666, 12, 23, 0, 25, 150, 15, 80, 325, 105, 40, 0, 200, 0, 15, 45, 12, 0, 80, 200), - stringsAsFactors=FALSE) - sl$kal[[2]]$bootstrap[[2]] <- data.frame("target"= c("NIA1", "LC1", "NIA2", "1A1N-1", "1A1N-2", "1D1C:one", "1D1C:two", "1B1C.2", "CC_a", "CC_b", "MIX6.c1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.nc", "MIX6.d", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), - "my_counts"= c(323, 1, 606, 15, 22, 0, 23, 190, 20, 90, 270, 115, 30, 0, 150, 0, 20, 60, 15, 0, 120, 250), - stringsAsFactors=FALSE) - - sl$kal[[3]] <- list() - 
sl$kal[[3]]$bootstrap[[1]] <- data.frame("target"= c("NIA1", "1A1N-2", "LC1", "NIA2", "1D1C:one", "1D1C:two", "1B1C.2", "MIX6.c1", "1A1N-1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.d", "MIX6.nc", "CC_a", "CC_b", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), - "my_counts"= c(333, 20, 3, 666, 0, 76, 52, 100, 10, 360, 0, 100, 0, 180, 25, 60, 7, 27, 13, 35, 0, 0), - stringsAsFactors=FALSE) - sl$kal[[3]]$bootstrap[[2]] <- data.frame("target"= c("MIX6.c4", "NIA1", "LC1", "NIA2", "1A1N-1", "1A1N-2", "1D1C:one", "1B1C.2", "MIX6.c1", "MIX6.c2", "MIX6.c3", "MIX6.nc", "1D1C:two", "MIX6.d", "CC_b", "CC_a", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), - "my_counts"= c(80, 330, 4, 560, 11, 21, 0, 55, 90, 380, 0, 240, 80, 0, 55, 23, 10, 31, 2, 55, 0, 0), - stringsAsFactors=FALSE) - - sl$kal[[4]] <- list() - sl$kal[[4]]["bootstrap"] <- list() - sl$kal[[4]]$bootstrap[[1]] <- data.frame("target"= c("NIA2", "1A1N-1", "NIA1", "1A1N-2", "1D1C:one", "1D1C:two", "LC1", "1B1C.2", "MIX6.c2", "MIX6.c3", "MIX6.c1", "MIX6.c4", "MIX6.nc", "MIX6.d", "CC_b", "CC_a", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), - "my_counts"= c(666, 12, 333, 23, 0, 25, 0, 150, 155, 40, 300, 0, 33, 0, 93, 22, 22, 61, 11, 0, 110, 210), - stringsAsFactors=FALSE) - sl$kal[[4]]$bootstrap[[2]] <- data.frame("target"= c("NIA1", "MIX6.d", "NIA2", "1A1N-1", "1A1N-2", "1D1C:one", "LC1", "1D1C:two", "MIX6.c1", "1B1C.2", "MIX6.c3", "MIX6.c2", "MIX6.c4", "MIX6.nc", "CC_a", "CC_b", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), - "my_counts"= c(323, 0, 656, 15, 22, 0, 2, 23, 280, 160, 35, 120, 0, 95, 18, 119, 19, 58, 7, 0, 150, 220), - stringsAsFactors=FALSE) - sl$kal[[4]]$bootstrap[[3]] <- data.frame("target"= c("NIA1", "NIA2", "1A1N-1", "1A1N-2", "MIX6.nc", "1B1C.2", "LC1", "MIX6.c1", "MIX6.c2", "MIX6.c3", "1D1C:one", "1D1C:two", "MIX6.c4", "MIX6.d", "CC_a", "CC_b", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), - "my_counts"= c(343, 676, 13, 21, 55, 145, 3, 270, 133, 30, 0, 20, 0, 0, 23, 80, 17, 50, 14, 0, 130, 200), - stringsAsFactors=FALSE) - - names(sl$kal[[1]]$bootstrap[[1]]) <- c(BS_TARGET_COL, COUNTS_COL) - names(sl$kal[[1]]$bootstrap[[2]]) <- c(BS_TARGET_COL, COUNTS_COL) - names(sl$kal[[1]]$bootstrap[[3]]) <- c(BS_TARGET_COL, COUNTS_COL) - names(sl$kal[[2]]$bootstrap[[1]]) <- c(BS_TARGET_COL, COUNTS_COL) - names(sl$kal[[2]]$bootstrap[[2]]) <- c(BS_TARGET_COL, COUNTS_COL) - names(sl$kal[[3]]$bootstrap[[1]]) <- c(BS_TARGET_COL, COUNTS_COL) - names(sl$kal[[3]]$bootstrap[[2]]) <- c(BS_TARGET_COL, COUNTS_COL) - names(sl$kal[[4]]$bootstrap[[1]]) <- c(BS_TARGET_COL, COUNTS_COL) - names(sl$kal[[4]]$bootstrap[[2]]) <- c(BS_TARGET_COL, COUNTS_COL) - names(sl$kal[[4]]$bootstrap[[3]]) <- c(BS_TARGET_COL, COUNTS_COL) - - if (errannot_inconsistent) { - sl$kal[[3]]$bootstrap[[1]][10, BS_TARGET_COL] <- "Unexpected-name" - } - - return(list("annot" = tx, "slo" = sl, "isx" = intersect(tx[[TARGET_COL]], sl$kal[[1]]$bootstrap[[1]][[BS_TARGET_COL]]) )) -} - #============================================================================== #' Generate an artificial dataset of bootstrapped abundance estimates, for code-testing or examples. 
#' @@ -118,33 +12,33 @@ sim_sleuth_data <- function(varname="condition", COUNTS_COL="est_counts", TARGET #' @export #' sim_boot_data <- function(errannot_inconsistent=FALSE, PARENT_COL="parent_id", TARGET_COL="target_id") { - tx <- data.frame(target_id= c("NIB.1", "1A1N-2", "1D1C:one", "1D1C:two", "1B1C.1", "1B1C.2", "CC_a", "CC_b", "1NN", "2NN", "MIX6.c1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.nc", "MIX6.d", "LC1", "LC2", "ALLA1", "ALLB1", "ALLB2"), - parent_id= c("NIB", "1A1N", "1D1C", "1D1C", "1B1C", "1B1C", "CC", "CC", "NN", "NN", "MIX6", "MIX6", "MIX6", "MIX6", "MIX6", "MIX6", "LC", "LC", "ALLA", "ALLB", "ALLB"), + tx <- data.frame(target_id= c("1A1B.a", "1A1B.b", "NIB.1", "1A1N-2", "1D1C:one", "1D1C:two", "1B1C.1", "1B1C.2", "CC_a", "CC_b", "1NN", "2NN", "MIX6.c1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.nc", "MIX6.d", "LC1", "LC2", "ALLA1", "ALLB1", "ALLB2"), + parent_id= c("1A1B", "1A1B", "NIB", "1A1N", "1D1C", "1D1C", "1B1C", "1B1C", "CC", "CC", "NN", "NN", "MIX6", "MIX6", "MIX6", "MIX6", "MIX6", "MIX6", "LC", "LC", "ALLA", "ALLB", "ALLB"), stringsAsFactors=FALSE) names(tx) <- c(TARGET_COL, PARENT_COL) a <- list() - a[[1]] <- data.table("target"= c("LC1", "NIA1", "NIA2", "1A1N-1", "1A1N-2", "1D1C:one", "1D1C:two", "1B1C.2", "CC_a", "CC_b", "MIX6.c1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.nc", "MIX6.d", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), - "V1"= c( 3, 333, 666, 10, 20, 0, 76, 52, 20, 50, 103, 321, 0, 100, 90, 0, 10, 30, 4, 50, 0, 0), - "V2"= c( 2, 310, 680, 11, 21, 0, 80, 55, 22, 52, 165, 320, 0, 130, 80, 0, 11, 29, 5, 40, 0, 0), - "V3"= c( 0, 340, 610, 7, 18, 0, 72, 50, 21, 49, 150, 325, 0, 120, 70, 0, 9, 28, 4, 60, 0, 0), + a[[1]] <- data.table("target"= c("1A1B.a", "1A1B.b", "LC1", "NIA1", "NIA2", "1A1N-1", "1A1N-2", "1D1C:one", "1D1C:two", "1B1C.2", "CC_a", "CC_b", "MIX6.c1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.nc", "MIX6.d", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), + "V1"= c( 69, 0, 3, 333, 666, 10, 20, 0, 76, 52, 20, 50, 103, 321, 0, 100, 90, 0, 10, 30, 4, 50, 0, 0), + "V2"= c( 96, 0, 2, 310, 680, 11, 21, 0, 80, 55, 22, 52, 165, 320, 0, 130, 80, 0, 11, 29, 5, 40, 0, 0), + "V3"= c( 88, 0, 0, 340, 610, 7, 18, 0, 72, 50, 21, 49, 150, 325, 0, 120, 70, 0, 9, 28, 4, 60, 0, 0), stringsAsFactors=FALSE) - a[[2]] <- data.table("target"= c("NIA1", "1A1N-2", "LC1", "NIA2", "1D1C:one", "1D1C:two", "1B1C.2", "MIX6.c1", "1A1N-1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.d", "MIX6.nc", "CC_a", "CC_b", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), - "V1"= c( 333, 20, 3, 666, 0, 76, 52, 100, 10, 360, 0, 100, 0, 180, 25, 60, 7, 27, 13, 35, 0, 0), - "V2"= c( 330, 11, 4, 560, 0, 80, 55, 90, 11, 380, 0, 80, 0, 240, 23, 55, 10, 31, 2, 55, 0, 0), + a[[2]] <- data.table("target"= c("NIA1", "1A1B.a", "1A1B.b", "1A1N-2", "LC1", "NIA2", "1D1C:one", "1D1C:two", "1B1C.2", "MIX6.c1", "1A1N-1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.d", "MIX6.nc", "CC_a", "CC_b", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), + "V1"= c( 333, 121, 0, 20, 3, 666, 0, 76, 52, 100, 10, 360, 0, 100, 0, 180, 25, 60, 7, 27, 13, 35, 0, 0), + "V2"= c( 330, 144, 0, 11, 4, 560, 0, 80, 55, 90, 11, 380, 0, 80, 0, 240, 23, 55, 10, 31, 2, 55, 0, 0), stringsAsFactors=FALSE) b <- list() - b[[1]] <- data.table("target"= c("NIA1", "LC1", "NIA2", "1A1N-1", "1A1N-2", "1D1C:one", "1D1C:two", "1B1C.2", "CC_a", "CC_b", "MIX6.c1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.nc", "MIX6.d", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), - "V1"= c( 333, 6, 666, 12, 23, 0, 25, 150, 15, 80, 325, 105, 40, 0, 200, 0, 15, 45, 12, 
0, 80, 200), - "V2"= c( 323, 1, 606, 15, 22, 0, 23, 190, 20, 90, 270, 115, 30, 0, 150, 0, 20, 60, 15, 0, 120, 250), + b[[1]] <- data.table("target"= c("1A1B.a", "1A1B.b", "NIA1", "LC1", "NIA2", "1A1N-1", "1A1N-2", "1D1C:one", "1D1C:two", "1B1C.2", "CC_a", "CC_b", "MIX6.c1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.nc", "MIX6.d", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), + "V1"= c( 0, 91, 333, 6, 666, 12, 23, 0, 25, 150, 15, 80, 325, 105, 40, 0, 200, 0, 15, 45, 12, 0, 80, 200), + "V2"= c( 0, 100, 323, 1, 606, 15, 22, 0, 23, 190, 20, 90, 270, 115, 30, 0, 150, 0, 20, 60, 15, 0, 120, 250), stringsAsFactors=FALSE) - b[[2]] <- data.table("target"= c("NIA2", "1A1N-1", "NIA1", "1A1N-2", "1D1C:one", "1D1C:two", "LC1", "1B1C.2", "MIX6.c2", "MIX6.c3", "MIX6.c1", "MIX6.c4", "MIX6.nc", "MIX6.d", "CC_b", "CC_a", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), - "V1"= c( 666, 12, 333, 23, 0, 25, 0, 150, 155, 40, 300, 0, 33, 0, 93, 22, 22, 61, 11, 0, 110, 210), - "V2"= c( 656, 15, 323, 22, 0, 23, 2, 160, 120, 35, 280, 0, 95, 0, 119, 18, 19, 58, 7, 0, 150, 220), - "V3"= c( 676, 13, 343, 21, 0, 20, 3, 145, 133, 30, 270, 0, 55, 0, 80, 23, 17, 50, 14, 0, 130, 200), + b[[2]] <- data.table("target"= c("NIA2", "1A1N-1", "NIA1", "1A1N-2", "1D1C:one", "1D1C:two", "LC1", "1B1C.2", "MIX6.c2", "MIX6.c3", "MIX6.c1", "MIX6.c4", "MIX6.nc", "MIX6.d", "CC_b", "CC_a", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2", "1A1B.b", "1A1B.a"), + "V1"= c( 666, 12, 333, 23, 0, 25, 0, 150, 155, 40, 300, 0, 33, 0, 93, 22, 22, 61, 11, 0, 110, 210, 76, 0), + "V2"= c( 656, 15, 323, 22, 0, 23, 2, 160, 120, 35, 280, 0, 95, 0, 119, 18, 19, 58, 7, 0, 150, 220, 69, 0), + "V3"= c( 676, 13, 343, 21, 0, 20, 3, 145, 133, 30, 270, 0, 55, 0, 80, 23, 17, 50, 14, 0, 130, 200, 36, 0), stringsAsFactors=FALSE) if (errannot_inconsistent) diff --git a/R/func.R b/R/func.R index 3ee9031..a674b27 100644 --- a/R/func.R +++ b/R/func.R @@ -64,8 +64,8 @@ parameters_are_good <- function(annot, count_data_A, count_data_B, boot_data_A, return(list("error"=TRUE, "message"="Invalid p-value threshold!")) if ((!is.numeric(dprop_thresh)) || dprop_thresh < 0 || dprop_thresh > 1) return(list("error"=TRUE, "message"="Invalid proportion difference threshold! Must be between 0 and 1.")) - if ((!is.numeric(abund_thresh)) || abund_thresh < 0) - return(list("error"=TRUE, "message"="Invalid abundance threshold! Must be between 0 and 1.")) + if ((!is.numeric(abund_thresh)) || abund_thresh < 1) + return(list("error"=TRUE, "message"="Invalid abundance threshold! Must be a count >= 1.")) if ((!is.numeric(qbootnum)) || qbootnum < 0) return(list("error"=TRUE, "message"="Invalid number of bootstraps! Must be a positive integer number.")) if (!is.logical(qboot)) @@ -256,7 +256,7 @@ alloc_out <- function(annot, full, n=1){ #' @param test_transc Whether to do transcript-level test. #' @param test_genes Whether to do gene-level test. #' @param full Either "full" (for complete output structure) or "short" (for bootstrapping). -#' @param count_thresh Minimum number of counts per replicate. +#' @param count_thresh Minimum average count across replicates. #' @param p_thresh The p-value threshold. #' @param dprop_thresh Minimum difference in proportions. #' @param correction Multiple testing correction type. @@ -303,8 +303,11 @@ calculate_DTU <- function(counts_A, counts_B, tx_filter, test_transc, test_genes ctA <- count_thresh * resobj$Parameters[["num_replic_A"]] # Adjust count threshold for number of replicates. 
ctB <- count_thresh * resobj$Parameters[["num_replic_B"]] Transcripts[, elig_xp := (sumA >= ctA | sumB >= ctB)] - Transcripts[, elig := (elig_xp & totalA != 0 & totalB != 0 & (sumA != totalA | sumB != totalB))] # If the entire gene is shut off in one condition, changes in proportion cannot be defined. - # If sum and total are equal in both conditions the gene has only one expressed isoform, or one isoform altogether. + Transcripts[, elig := (elig_xp & # isoform expressed above the noise threshold in EITHER condition + totalA >= ctA & totalB >= ctB & # at least one eligibly expressed isoform exists in EACH condition (prevent gene-on/off from creating wild proportions from low counts) + totalA != 0 & totalB != 0 & # gene expressed in BOTH conditions (failsafe, in case user sets noise threshold to 0) + (propA != 1 | propB != 1) )] # not the only isoform expressed. + # If sum and total are equal in both conditions, it has no detected siblings and thus cannot change in proportion. Genes[, elig_transc := Transcripts[, as.integer(sum(elig, na.rm=TRUE)), by=parent_id][, V1] ] Genes[, elig := elig_transc >= 2] diff --git a/R/rats.R b/R/rats.R index 3e5d2bc..eb5d327 100644 --- a/R/rats.R +++ b/R/rats.R @@ -19,8 +19,8 @@ #' @param name_B The name for the other condition. (Default "Condition-B") #' @param varname The name of the covariate to which the two conditions belong. (Default \code{"condition"}). #' @param p_thresh The p-value threshold. (Default 0.05) -#' @param abund_thresh Noise threshold. Minimum mean abundance for transcripts to be eligible for testing. (Default 5) #' @param dprop_thresh Effect size threshold. Minimum change in proportion of a transcript for it to be considered meaningful. (Default 0.20) +#' @param abund_thresh Noise threshold. Minimum mean (across replicates) abundance for transcripts (and genes) to be eligible for testing. (Default 5) #' @param correction The p-value correction to apply, as defined in \code{\link[stats]{p.adjust.methods}}. (Default \code{"BH"}) #' @param scaling A scaling factor or vector of scaling factors, to be applied to the abundances *prior* to any thresholding and testing. Useful for scaling TPMs (transcripts per 1 million reads) to the actual library sizes of the samples. If a vector is supplied, the order should correspond to the samples in group A followed by the samples in group B. WARNING: Improper use of the scaling factor will artificially inflate/deflate the significances obtained. #' @param testmode One of \itemize{\item{"genes"}, \item{"transc"}, \item{"both" (default)}}. diff --git a/inst/doc/input.R b/inst/doc/input.R index 1bd5e38..1c840cb 100644 --- a/inst/doc/input.R +++ b/inst/doc/input.R @@ -83,7 +83,7 @@ myannot <- simdat[[1]] # Transcript and gene IDs for the above data. # # Calling DTU with custom thresholds. # mydtu <- call_DTU(annot= myannot, # boot_data_A= mycond_A, boot_data_B= mycond_B, -# p_thresh= 0.01, abund_thresh= 10, dprop_thres = 0.25) +# p_thresh= 0.01, dprop_thres = 0.15, abund_thresh= 10) ## ---- eval=FALSE--------------------------------------------------------- # # Bootstrap (default). Do 100 iterations. diff --git a/inst/doc/input.Rmd b/inst/doc/input.Rmd index 3a383a5..8bbba78 100644 --- a/inst/doc/input.Rmd +++ b/inst/doc/input.Rmd @@ -1,7 +1,7 @@ --- title: 'RATs: Input and Settings' author: "Kimon Froussios" -date: "19 SEP 2017" +date: "08 MAR 2018" output: html_document: keep_md: yes @@ -273,14 +273,16 @@ The following three main thresholds are used in RATs: # Calling DTU with custom thresholds. 
mydtu <- call_DTU(annot= myannot, boot_data_A= mycond_A, boot_data_B= mycond_B, - p_thresh= 0.01, abund_thresh= 10, dprop_thres = 0.25) + p_thresh= 0.01, dprop_thres = 0.15, abund_thresh= 10) ``` -1. `p_thresh` - Statistical significance level. P-values below this will be considered significant. Lower threshold values are stricter. (Default 0.05) -2. `abund_thresh` - Noise threshold. Transcripts with abundance below that value in both conditions are ignored. Higher threshold values are stricter. (Default 5, assumes abundances scaled to library size) -3. `dprop_thresh` - Effect size threshold. Transcripts whose proportion changes between conditions by less than the threshold are considered non-DTU, regardless of their statistical significance. Higher threshold values are stricter. (Default 0.20) +1. `p_thresh` - Statistical significance level. P-values below this will be considered significant. Lower threshold values are stricter, and help reduce low-count low-confidence calls. (Default 0.05, very permissive) +2. `dprop_thresh` - Effect size threshold. Transcripts whose proportion changes between conditions by less than this threshold are considered uninteresting, regardless of their statistical significance. (Default 0.20, quite strict) +3. `abund_thresh` - Noise threshold. Minimum mean (across replicates) abundance for a transcript to be considered expressed. Transcripts with mean abundances below this will be ignored. Mean total gene count must also meet this value in both conditions, a.k.a. at least one expressed isoform must exist in each condition. (Default 5, very permissive) + +The default values for these thresholds have been chosen such that they achieve a *median* FDR <5% for a high quality dataset from *Arabidopsis thaliana*, even with only 3 replicates per condition. +Your mileage may vary and you should give some consideration to selecting appropriate values. -The default values for these thresholds have been chosen such that they achieve a *median* FDR <5% for a high quality dataset from *Arabidopsis thaliana*, even with only 3 replicates per condition. Your mileage may vary and you should give some consideration to selecting appropriate values. Depending on the settings, *additional thresholds* are available and will be discussed in their respective sections below. @@ -378,7 +380,7 @@ mydtu <- call_DTU(annot = myannot, 1. `threads` - The number of threads to use. (Default 1) -Due to core R implementation limitations, the type of multi-threading used in RATs works only in POSIX-compliant systems. +Due to core R implementation limitations, the type of multi-threading used in RATs works only in POSIX-compliant systems (Linux, Mac, not Windows). Refer to the `parallel` package for details. @@ -447,14 +449,14 @@ such as those employed by RATs, and reduces the statistical power of the method. To counter this, RATs provides the option to scale abundaces either equally by a single factor (such as average library size among samples) or by a vector of factors (one per sample). The former maintains any pre-existing library-size normalisation among samples. This is necessary for fold-change based methods, but RATs does not require it. Instead, using the respective actual library sizes of the samples allows the -higher-throughput samples to have a bigger influence than the lower-throughput samples. This is particularly relevant if your samples have dissimilar +higher-throughput samples to have a bigger influence than the lower-throughput samples. 
This is particularly relevant if your samples have very dissimilar library sizes. For flexibility with different types of input, these scaling options can be applied in either of two stages: The data import step by `fish4rodents()`, or the actual testing step by `call_DTU()`. In the example examined previously, `fish4rodents()` was instructed to create TPM abundances, -by normalising to 10000000 reads. Such values are useful with certain other tools that a user may also intend to use. +by normalising to `1000000` reads. Such values are useful with certain other tools that a user may also intend to use. Subsequently, these TPMs were re-scaled to meet the library size of each sample, thus providing RATs with count-like abundance values that retain -the TPM's normalisation by isoform length. However, it is not necessary to scale in two separate steps. +the normalisation by isoform length. However, it is not necessary to scale in two separate steps. Both `fish4rodents()` and `call_DTU()` support scaling by a single value or a vector of values. If you don't need the TPMs, you can scale directly to the desired library size(s), as in the examples below: @@ -495,7 +497,7 @@ mydtu <- call_DTU(annot= myannot, boot_data_A= mydata$boot_data_A, ``` You can mix and match scaling options as per your needs, so take care to ensure that the scaling you apply is appropriate. -It is important to note, that if you simply run both methods with their respective defaults, you'll effectively run RATs +*It is important to note*, that if you simply run both methods with their respective defaults, you'll effectively run RATs on TPM values, which is extremely underpowered and not recommended. Please, provide appropriate scaling factors for your data. @@ -510,10 +512,10 @@ in a meaningful way. All internal operations and the output of RATs are based on the annotation provided: -* Any transcripts present in the data but missing from the annotation will be ignored completely and -will not show up in the output, as there is no reliable way to match them to gene IDs. -* Any transcript/gene present in the annotation but missing from the data will be included in the output as zero expression. -* If the samples appear to use different annotations from one another, RATs will abort. +* Any transcript IDs present in the data but missing from the annotation will be silently ignored and +will not show up in the output, as they cannot be matched to the gene IDs. +* Any transcript/gene ID present in the annotation but missing from the data will be included in the output as zero expression. +* If the samples appear to have been quantified with different annotations from one another, RATs will abort. *** diff --git a/inst/doc/input.html b/inst/doc/input.html index 8078e6c..6d5aa38 100644 --- a/inst/doc/input.html +++ b/inst/doc/input.html @@ -11,7 +11,7 @@ - + RATs: Input and Settings @@ -25,8 +25,8 @@ - - + + @@ -217,7 +219,7 @@

RATs: Input and Settings

Kimon Froussios

-19 SEP 2017
+08 MAR 2018

Data

As of v0.6.0, input from a sleuth object is no longer supported by RATs. This is due to changes in Sleuth v0.29 that made the bootstrap data no longer available. Instead, RATs already supports input directly from the Kallisto/Salmon quantifications.

For option 1, the function fish4rodents() will load the data into tables suitable for option 2. Details in the respective section below.

For options 2 and 3, the format of the tables is as in the example below. The first column contains the transcript identifiers. Subsequent columns contain the abundances.

-
-##      target  V1  V2  V3
-## 1:      LC1   3   2   0
-## 2:     NIA1 333 310 340
-## 3:     NIA2 666 680 610
-## 4:   1A1N-1  10  11   7
-## 5:   1A1N-2  20  21  18
-## 6: 1D1C:one   0   0   0
+##    target  V1  V2  V3
+## 1: 1A1B.a  69  96  88
+## 2: 1A1B.b   0   0   0
+## 3:    LC1   3   2   0
+## 4:   NIA1 333 310 340
+## 5:   NIA2 666 680 610
+## 6: 1A1N-1  10  11   7
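# For quick orientation, a minimal sketch of running the updated thresholds on
# generic bootstrap tables of the format shown above. It reuses the bundled
# sim_boot_data() simulator and the argument values from the vignette example;
# the element order assumed for the simulator's return value is an assumption,
# not a verbatim excerpt from the package.
library(rats)

simdat   <- sim_boot_data()   # exported simulator, as listed in the NAMESPACE above
myannot  <- simdat[[1]]       # transcript and gene IDs (as in the vignette example)
mycond_A <- simdat[[2]]       # assumed: list of replicate tables for condition A
mycond_B <- simdat[[3]]       # assumed: list of replicate tables for condition B

# Each replicate table: first column holds transcript IDs, remaining columns hold
# one bootstrap iteration of abundance estimates each.
mydtu <- call_DTU(annot = myannot,
                  boot_data_A = mycond_A, boot_data_B = mycond_B,
                  p_thresh = 0.01, dprop_thresh = 0.15, abund_thresh = 10)
# Note: abund_thresh is now a count threshold (>= 1) applied to the mean transcript
# abundance and to the mean total gene abundance in each condition (issue #48).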