Skip to content

Commit

Permalink
Merge pull request #53 from bartongroup/development
Browse files Browse the repository at this point in the history
0.6.3
  • Loading branch information
fruce-ki authored Mar 14, 2018
2 parents 20018b6 + 1c4ff4d commit 8d32c68
Show file tree
Hide file tree
Showing 32 changed files with 561 additions and 755 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: rats
Version: 0.6.2
Date: 2018-02-16
Version: 0.6.3
Date: 2018-03-13
Title: Relative Abundance of Transcripts
Encoding: UTF-8
Authors: c(person("Kimon Froussios", role=c("aut"), email="k.froussios@dundee.ac.uk"),
Expand Down
1 change: 0 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ export(plot_overview)
export(plot_shiny_volcano)
export(sim_boot_data)
export(sim_count_data)
export(sim_sleuth_data)
export(tidy_annot)
import(data.table)
import(ggplot2)
Expand Down
306 changes: 53 additions & 253 deletions R/data_simulators.R

Large diffs are not rendered by default.

71 changes: 54 additions & 17 deletions R/func.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#' @param threads Number of threads.
#' @param seed Seed for random engine.
#' @param scaling Abundance scaling factor.
#' @param reckless whether to ignore detected annotation discrepancies
#'
#' @return List: \itemize{
#' \item{"error"}{logical}
Expand All @@ -36,7 +37,7 @@ parameters_are_good <- function(annot, count_data_A, count_data_B, boot_data_A,
TARGET_COL, PARENT_COL,
correction, testmode, scaling, threads, seed,
p_thresh, abund_thresh, dprop_thresh,
qboot, qbootnum, qrep_thresh, rboot, rrep_thresh) {
qboot, qbootnum, qrep_thresh, rboot, rrep_thresh, reckless) {
warnmsg <- list()

# Input format.
Expand All @@ -48,14 +49,16 @@ parameters_are_good <- function(annot, count_data_A, count_data_B, boot_data_A,
return(list("error"=TRUE, "message"="You must specify (the same type of) data for both conditions!"))

# Annotation.
if (!is.logical(reckless) || is.na(reckless))
return(list("error"=TRUE, "message"="Invalid value to reckless!"))
if (!is.data.frame(annot))
return(list("error"=TRUE, "message"="The provided annot is not a data.frame!"))
if (any(!c(TARGET_COL, PARENT_COL) %in% names(annot)))
return(list("error"=TRUE, "message"="The specified target and/or parent IDs field names do not exist in annot!"))
if (length(annot$target_id) != length(unique(annot$target_id)))
if (length(annot[[TARGET_COL]]) != length(unique(annot[[TARGET_COL]])))
return(list("error"=TRUE, "message"="Some transcript identifiers are not unique!"))

# Parameters.
# Simple parameters.
if (!correction %in% p.adjust.methods)
return(list("error"=TRUE, "message"="Invalid p-value correction method name. Refer to stats::p.adjust.methods ."))
if (!testmode %in% c("genes", "transc", "both"))
Expand All @@ -65,7 +68,7 @@ parameters_are_good <- function(annot, count_data_A, count_data_B, boot_data_A,
if ((!is.numeric(dprop_thresh)) || dprop_thresh < 0 || dprop_thresh > 1)
return(list("error"=TRUE, "message"="Invalid proportion difference threshold! Must be between 0 and 1."))
if ((!is.numeric(abund_thresh)) || abund_thresh < 0)
return(list("error"=TRUE, "message"="Invalid abundance threshold! Must be between 0 and 1."))
return(list("error"=TRUE, "message"="Invalid abundance threshold! Must be a count >= 0."))
if ((!is.numeric(qbootnum)) || qbootnum < 0)
return(list("error"=TRUE, "message"="Invalid number of bootstraps! Must be a positive integer number."))
if (!is.logical(qboot))
Expand All @@ -80,6 +83,8 @@ parameters_are_good <- function(annot, count_data_A, count_data_B, boot_data_A,
return(list("error"=TRUE, "message"="Invalid number of threads! Must be positive integer."))
if (threads > parallel::detectCores(logical= TRUE))
return(list("error"=TRUE, "message"="Number of threads exceeds system's reported capacity."))

# Scaling
nsmpl <- NULL
if (!is.null(count_data_A)){
nsmpl <- dim(count_data_A)[2] + dim(count_data_B)[2]
Expand All @@ -103,8 +108,20 @@ parameters_are_good <- function(annot, count_data_A, count_data_B, boot_data_A,
if(is.null(boot_data_A)) {
if (!is.data.table(count_data_A) | !is.data.table(count_data_B))
return(list("error"=TRUE, "message"="The counts data are not data.tables!"))
if (!all( count_data_A[[1]][order(count_data_A[[1]])] == count_data_B[[1]][order(count_data_B[[1]])] ))
return(list("error"=TRUE, "message"="Inconsistent set of transcript IDs between conditions!"))
if ( !all(count_data_A[[1]] %in% annot[[TARGET_COL]]) && !all(annot[[TARGET_COL]] %in% count_data_A[[1]]) ) {
if(reckless){
warnmsg["annotation-mismatch"] <- "The transcript IDs in the quantifications and the annotation do not match completely! Did you use different annotations? Reckless mode enabled, continuing at your own risk..."
} else {
return(list("error"=TRUE, "message"="The transcript IDs in the quantifications and the annotation do not match completely! Did you use different annotations?"))
}
}
if (!all( count_data_A[[1]][order(count_data_A[[1]])] == count_data_B[[1]][order(count_data_B[[1]])] )) {
if (reckless) {
warnmsg["countIDs"] <- "Transcript IDs do not match completely between conditions! Did you use different annotations? Reckless mode enabled, continuing at your own risk..."
} else {
return(list("error"=TRUE, "message"="Transcript IDs do not match completely between conditions! Did you use different annotations?"))
}
}
} else {
warnmsg["cnts&boots"] <- "Received multiple input formats! Only the bootstrapped data will be used."
}
Expand All @@ -119,17 +136,33 @@ parameters_are_good <- function(annot, count_data_A, count_data_B, boot_data_A,
}
maxmatrix <- 2^31 - 1
if (qboot) {
# Consistency,
if (!is.null(boot_data_A)) {
# Direct data.
# Compared to annotation.
if ( !all(boot_data_A[[1]][[1]] %in% annot[[TARGET_COL]]) && !all(annot[[TARGET_COL]] %in% boot_data_A[[1]][[1]]) ) {
if(reckless){
warnmsg["annotation-mismatch"] <- "The transcript IDs in the quantifications and the annotation do not match completely! Did you use different annotations? Reckless mode enabled, continuing at your own risk..."
} else {
return(list("error"=TRUE, "message"="The transcript IDs in the quantifications and the annotation do not match completely! Did you use different annotations?"))
}
}
# Among samples
tx <- boot_data_A[[1]][[1]][order(boot_data_A[[1]][[1]])]
for (k in 2:length(boot_data_A)){
if (!all( tx == boot_data_A[[k]][[1]][order(boot_data_A[[k]][[1]])] ))
return(list("error"=TRUE, "message"="Inconsistent set of transcript IDs across samples!"))
if (!all( tx == boot_data_A[[k]][[1]][order(boot_data_A[[k]][[1]])] )) {
if (reckless) {
warnmsg[paste0("bootIDs-A", k)] <- paste0("Inconsistent set of transcript IDs across samples (A", k, ")! Did you use different annotations? Reckless mode enabled, continuing at your own risk...")
} else {
return(list("error"=TRUE, "message"="Inconsistent set of transcript IDs across samples! Did you use different annotations?"))
}
}
}
for (k in 1:length(boot_data_B)){
if (!all( tx == boot_data_B[[k]][[1]][order(boot_data_B[[k]][[1]])] ))
return(list("error"=TRUE, "message"="Inconsistent set of transcript IDs across samples!"))
if (reckless) {
warnmsg[paste0("bootIDs-B", k)] <- paste0("Inconsistent set of transcript IDs across samples (B", k, ")! Did you use different annotations? Reckless mode enabled, continuing at your own risk...")
} else {
return(list("error"=TRUE, "message"="Inconsistent set of transcript IDs across samples! Did you use different annotations?"))
}
}

# Number of iterations.
Expand All @@ -156,7 +189,7 @@ parameters_are_good <- function(annot, count_data_A, count_data_B, boot_data_A,
if (rboot & any( length(samples_by_condition[[1]]) * length(samples_by_condition[[2]]) > maxmatrix/dim(annot)[1],
length(boot_data_A) * length(boot_data_B) > maxmatrix/dim(annot)[1] ) )
warnmsg["toomanyreplicates"] <- "The number of replicates is too high. Exhaustive 1vs1 would exceed maximum capacity of an R matrix."

return(list("error"=FALSE, "message"="All good!", "maxboots"=minboots, "warn"=(length(warnmsg) > 0), "warnings"= warnmsg))
}

Expand Down Expand Up @@ -205,7 +238,7 @@ alloc_out <- function(annot, full, n=1){
"abund_scaling"=numeric(length=n),
"quant_boot"=NA,"quant_reprod_thresh"=NA_real_, "quant_bootnum"=NA_integer_,
"rep_boot"=NA, "rep_reprod_thresh"=NA_real_, "rep_bootnum"=NA_integer_,
"seed"=NA_integer_)
"seed"=NA_integer_, "reckless"=NA)
Genes <- data.table("parent_id"=as.vector(unique(annot$parent_id)),
"elig"=NA, "sig"=NA, "elig_fx"=NA, "quant_reprod"=NA, "rep_reprod"=NA, "DTU"=NA, "transc_DTU"=NA,
"known_transc"=NA_integer_, "detect_transc"=NA_integer_, "elig_transc"=NA_integer_, "maxDprop"=NA_real_,
Expand Down Expand Up @@ -256,7 +289,7 @@ alloc_out <- function(annot, full, n=1){
#' @param test_transc Whether to do transcript-level test.
#' @param test_genes Whether to do gene-level test.
#' @param full Either "full" (for complete output structure) or "short" (for bootstrapping).
#' @param count_thresh Minimum number of counts per replicate.
#' @param count_thresh Minimum average count across replicates.
#' @param p_thresh The p-value threshold.
#' @param dprop_thresh Minimum difference in proportions.
#' @param correction Multiple testing correction type.
Expand Down Expand Up @@ -303,8 +336,11 @@ calculate_DTU <- function(counts_A, counts_B, tx_filter, test_transc, test_genes
ctA <- count_thresh * resobj$Parameters[["num_replic_A"]] # Adjust count threshold for number of replicates.
ctB <- count_thresh * resobj$Parameters[["num_replic_B"]]
Transcripts[, elig_xp := (sumA >= ctA | sumB >= ctB)]
Transcripts[, elig := (elig_xp & totalA != 0 & totalB != 0 & (sumA != totalA | sumB != totalB))] # If the entire gene is shut off in one condition, changes in proportion cannot be defined.
# If sum and total are equal in both conditions the gene has only one expressed isoform, or one isoform altogether.
Transcripts[, elig := (elig_xp & # isoform expressed above the noise threshold in EITHER condition
totalA >= ctA & totalB >= ctB & # at least one eligibly expressed isoform exists in EACH condition (prevent gene-on/off from creating wild proportions from low counts)
totalA != 0 & totalB != 0 & # gene expressed in BOTH conditions (failsafe, in case user sets noise threshold to 0)
(propA != 1 | propB != 1) )] # not the only isoform expressed.

# If sum and total are equal in both conditions, it has no detected siblings and thus cannot change in proportion.
Genes[, elig_transc := Transcripts[, as.integer(sum(elig, na.rm=TRUE)), by=parent_id][, V1] ]
Genes[, elig := elig_transc >= 2]
Expand Down Expand Up @@ -494,3 +530,4 @@ group_samples <- function(covariates) {
}


#EOF
25 changes: 20 additions & 5 deletions R/rats.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@
#' @param name_B The name for the other condition. (Default "Condition-B")
#' @param varname The name of the covariate to which the two conditions belong. (Default \code{"condition"}).
#' @param p_thresh The p-value threshold. (Default 0.05)
#' @param abund_thresh Noise threshold. Minimum mean abundance for transcripts to be eligible for testing. (Default 5)
#' @param dprop_thresh Effect size threshold. Minimum change in proportion of a transcript for it to be considered meaningful. (Default 0.20)
#' @param abund_thresh Noise threshold. Minimum mean (across replicates) abundance for transcripts (and genes) to be eligible for testing. (Default 5)
#' @param correction The p-value correction to apply, as defined in \code{\link[stats]{p.adjust.methods}}. (Default \code{"BH"})
#' @param scaling A scaling factor or vector of scaling factors, to be applied to the abundances *prior* to any thresholding and testing. Useful for scaling TPMs (transcripts per 1 million reads) to the actual library sizes of the samples. If a vector is supplied, the order should correspond to the samples in group A followed by the samples in group B. WARNING: Improper use of the scaling factor will artificially inflate/deflate the significances obtained.
#' @param testmode One of \itemize{\item{"genes"}, \item{"transc"}, \item{"both" (default)}}.
Expand All @@ -33,7 +33,8 @@
#' @param verbose Display progress updates and warnings. (Default \code{TRUE})
#' @param threads Number of threads to use. (Default 1) Multi-threading will be ignored on non-POSIX systems.
#' @param seed A numeric integer used to initialise the random number engine. Use this only if reproducible bootstrap selections are required. (Default NA)
#' @param dbg Debugging mode. Interrupt execution at the specified flag-point. Used to speed up code-tests by avoiding irrelevant downstream processing. (Default 0: do not interrupt)
#' @param reckless RATs normally aborts if any inconsistency is detected among the transcript IDs found in the annotation and the quantifications. Enabling reckless mode will downgrade this error to a warning and allow RATs to continue the run. Not recommended unless you know why the inconsistency exists and how it will affect the results. (Default FALSE)
#' @param dbg Debugging mode only. Interrupt execution at the specified flag-point. Used to speed up code-tests by avoiding irrelevant downstream processing. (Default 0: do not interrupt)
#' @return List of mixed types. Contains a list of runtime settings, a table of gene-level results, a table of transcript-level results, and a list of two tables with the transcript abundaces.
#'
#' @import utils
Expand All @@ -46,7 +47,7 @@ call_DTU <- function(annot= NULL, TARGET_COL= "target_id", PARENT_COL= "parent_i
name_A= "Condition-A", name_B= "Condition-B", varname= "condition",
p_thresh= 0.05, abund_thresh= 5, dprop_thresh= 0.2, correction= "BH", scaling= 1,
testmode= "both", qboot= TRUE, qbootnum= 0L, qrep_thresh= 0.95, rboot=TRUE, rrep_thresh= 0.85,
description= NA_character_, verbose= TRUE, threads= 1L, seed=NA_integer_, dbg= "0")
description= NA_character_, verbose= TRUE, threads= 1L, seed=NA_integer_, reckless=FALSE, dbg= "0")
{
#---------- PREP

Expand All @@ -60,7 +61,7 @@ call_DTU <- function(annot= NULL, TARGET_COL= "target_id", PARENT_COL= "parent_i
TARGET_COL, PARENT_COL,
correction, testmode, scaling, threads, seed,
p_thresh, abund_thresh, dprop_thresh,
qboot, qbootnum, qrep_thresh, rboot, rrep_thresh)
qboot, qbootnum, qrep_thresh, rboot, rrep_thresh, reckless)
if (paramcheck$error)
stop(paramcheck$message)
if (verbose)
Expand Down Expand Up @@ -243,6 +244,7 @@ call_DTU <- function(annot= NULL, TARGET_COL= "target_id", PARENT_COL= "parent_i
resobj$Parameters[["quant_reprod_thresh"]] <- qrep_thresh
resobj$Parameters[["quant_bootnum"]] <- qbootnum
resobj$Parameters[["rep_reprod_thresh"]] <- rrep_thresh
resobj$Parameters[["reckless"]] <- reckless

if (dbg == "info")
return(resobj)
Expand Down Expand Up @@ -469,6 +471,19 @@ call_DTU <- function(annot= NULL, TARGET_COL= "target_id", PARENT_COL= "parent_i
Transcripts[is.na(rep_reprod), rep_reprod := c(FALSE)]
Transcripts[is.na(DTU), DTU := c(FALSE)]
Transcripts[is.na(gene_DTU), gene_DTU := c(FALSE)]
# Eradicate NAs from count columns, as they mess up plotting. These would be caused by inconsistencies between annotation and quantifications, and would only exist in "reckless" mode.
for (i in 1:Parameters$num_replic_A){
idx <- is.na(Abundances[[1]][[paste0('V',i)]])
Abundances[[1]][idx, paste0('V',i) := 0]
}
for (i in 1:Parameters$num_replic_B){
idx <- is.na(Abundances[[2]][[paste0('V',i)]])
Abundances[[2]][idx, paste0('V',i) := 0]
}
Transcripts[is.na(meanA), meanA := c(0)]
Transcripts[is.na(meanB), meanB := c(0)]
Transcripts[is.na(propA), propA := c(0)]
Transcripts[is.na(propB), propB := c(0)]

# Drop the bootstrap columns, if unused.
if (!qboot || !test_transc)
Expand All @@ -495,4 +510,4 @@ call_DTU <- function(annot= NULL, TARGET_COL= "target_id", PARENT_COL= "parent_i
return(resobj)
}


#EOF
11 changes: 8 additions & 3 deletions inst/doc/input.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ head(sim_count_data()[[1]])

## ---- eval=FALSE---------------------------------------------------------
# # Simulate some data.
# simdat <- sim_count_data()
# simdat <- sim_count_data(clean=TRUE)
# # For convenience let's assign the contents of the list to separate variables
# mycond_A <- simdat[[2]] # Simulated abundances for one condition.
# mycond_B <- simdat[[3]] # Simulated abundances for other condition.
Expand All @@ -39,7 +39,7 @@ head(sim_count_data()[[1]])

## ------------------------------------------------------------------------
# Simulate some data. (Notice it is a different function than before.)
simdat <- sim_boot_data()
simdat <- sim_boot_data(clean=TRUE)

# For convenience let's assign the contents of the list to separate variables
mycond_A <- simdat[[2]] # Simulated bootstrapped data for one condition.
Expand Down Expand Up @@ -83,7 +83,7 @@ myannot <- simdat[[1]] # Transcript and gene IDs for the above data.
# # Calling DTU with custom thresholds.
# mydtu <- call_DTU(annot= myannot,
# boot_data_A= mycond_A, boot_data_B= mycond_B,
# p_thresh= 0.01, abund_thresh= 10, dprop_thres = 0.25)
# p_thresh= 0.01, dprop_thres = 0.15, abund_thresh= 10)

## ---- eval=FALSE---------------------------------------------------------
# # Bootstrap (default). Do 100 iterations.
Expand Down Expand Up @@ -169,3 +169,8 @@ myannot <- simdat[[1]] # Transcript and gene IDs for the above data.
# boot_data_B= mydata$boot_data_B,
# scaling=c(25, 26.7, 23, 50.0, 45, 48.46, 52.36))

## ---- eval=FALSE---------------------------------------------------------
# mydtu <- call_DTU(annot= myannot, boot_data_A= mydata$boot_data_A,
# boot_data_B= mydata$boot_data_B,
# reckless=TRUE, verbose=TRUE)

Loading

0 comments on commit 8d32c68

Please sign in to comment.