v1.1.8 updates

jtlovell · Mar 28, 2023 · 3864eaa · 3864eaa
1 parent d39df14
commit 3864eaa
Show file tree

Hide file tree

Showing 9 changed files with 152 additions and 31 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: GENESPACE
 Type: Package
 Title: Synteny- and orthology-constrained comparative genomics
-Version: 1.1.7
+Version: 1.1.8
 Author: John T. Lovell
 Maintainer: John T. Lovell <jlovell@hudsonalpha.org>
 Description: Construct pan-gene sets and other comparative genomics

diff --git a/NAMESPACE b/NAMESPACE
@@ -53,6 +53,7 @@ export(parse_phytozome)
 export(phase_blks)
 export(plot_hits)
 export(plot_riparian)
+export(query_cnv)
 export(query_hits)
 export(query_pangenes)
 export(read_aaFasta)
@@ -92,6 +93,7 @@ import(parallel)
 importFrom(Biostrings,AAStringSet)
 importFrom(Biostrings,DNA_ALPHABET)
 importFrom(Biostrings,readAAStringSet)
+importFrom(Biostrings,vcountPattern)
 importFrom(Biostrings,width)
 importFrom(Biostrings,writeXStringSet)
 importFrom(dbscan,dbscan)

diff --git a/R/parse_annotations.R b/R/parse_annotations.R
@@ -205,7 +205,7 @@ parse_annotations <- function(rawGenomeRepo,
 #' files
 #' @rdname parse_annotations
 #' @import data.table
-#' @importFrom Biostrings writeXStringSet width readAAStringSet AAStringSet DNA_ALPHABET
+#' @importFrom Biostrings writeXStringSet width readAAStringSet AAStringSet DNA_ALPHABET vcountPattern
 #' @importFrom utils head
 #' @export
 match_fasta2gff <- function(path2fasta,

diff --git a/R/plot_riparian.R b/R/plot_riparian.R
@@ -146,7 +146,8 @@ plot_riparian <- function(gsParam,
                           refChrOrdFun = function(x)
                             frank(list(as.numeric(gsub('\\D+','', x)), x), ties.method = "random")){
 
-  color <- blkID <- pass <- genome <- chr <- start <- end <- NULL
+  color <- blkID <- pass <- genome <- chr <- start <- end <- index <- hasAll <-
+    genome1 <- genome2 <- NULL
   # -- there are two methods of plotting.
 
   # 1. Default phase the blocks by a reference genome chromosomes

diff --git a/R/query_genespace.R b/R/query_genespace.R
@@ -25,6 +25,10 @@
 #' not provided
 #' @param showUnPlacedPgs logical, when queried should un-located orthogroups be
 #' shown?
+#' @param onlyArrayReps logical, should only array representatives be considered
+#' for CNV counts?
+#' @param maxCopyNumber numeric, maximum copyNumber to tabulate. Orthogroups
+#' with more than this number are combined into the largest CNV category
 
 #' @return a list of data.tables named $genome, $chr: $start-$end. One
 #' data.table for each line in the bed file.
@@ -200,3 +204,91 @@ query_pangenes <- function(gsParam,
     return(out)
   }
 }
+
+
+#' @title reformat and subset pan-gene sets
+#' @description
+#' \code{query_pangenes} the primary engine to explore genespace output, this
+#' lets you reformat the pan-genes to wide (entry - by - genome) matrix and
+#' only look at specified positional bounds.
+#' @rdname query_genespace
+#' @import data.table
+#' @export
+query_cnv <- function(gsParam,
+                      bed = NULL,
+                      onlyArrayReps = FALSE,
+                      maxCopyNumber = Inf){
+
+  count_cnv <- function(combBed, ofIDs){
+
+    ofID <- ogType <- genome <- og <- copyNumber <- nGenes <- nOgs <- NULL
+    cbl <- melt(
+      combBed,
+      measure.vars = c("globOG", "globHOG", "og"),
+      id.vars = c("genome", "ofID", "chr", "start", "end"),
+      variable.name = "ogType",
+      value.name = "og")
+
+    if(!is.null(ofIDs)){
+      cbogs <- subset(cbl, ofID %in% ofIDs)[,c("ogType", "og")]
+      cbogs <- subset(cbogs, !duplicated(cbogs))
+      cbl <- merge(cbl, cbogs, by = c("ogType", "og"))
+    }
+
+    cbl[,ogType := ifelse(ogType == "og", "syntenicHOGs",
+                          ifelse(ogType == "globHOG", "globalHOGs", "globalOGs"))]
+
+    cbn <- cbl[,list(copyNumber = .N), by = c("genome", "og", "ogType")]
+    cbn$copyNumber[cbn$copyNumber > maxCopyNumber] <- maxCopyNumber
+    eg <- cbl[,CJ(genome = unique(genome), og = unique(og)), by = "ogType"]
+    cbn <- merge(cbn, eg, by = colnames(eg), all = T)
+    cbn$copyNumber[is.na(cbn$copyNumber)] <- 0
+    cbo <- cbn[,list(nGenes = copyNumber*uniqueN(og), nOgs = uniqueN(og)),
+               by = c("genome", "ogType", "copyNumber")]
+    setkey(cbo, genome, ogType, copyNumber)
+    cbo[,`:=`(percGenes = 100*(nGenes/sum(nGenes)),
+              percOGs = 100*(nOgs/sum(nOgs))),
+        by = c("genome", "ogType")]
+    return(cbo)
+  }
+
+  isArrayRep <- pass <- genome <- chr <- start <- end <- regID <- genome <- NULL
+
+  cb <- read_combBed(file.path(gsParam$paths$results, "combBed.txt"))
+  if(onlyArrayReps){
+    cb <- subset(cb, isArrayRep)
+  }
+
+  if(!is.null(bed)){
+    bed <- data.table(bed)
+    if(nrow(bed) == 0)
+      stop("bed must be a data.table or data.frame with >= 1 rows\n")
+    if(!all(c("chr", "genome") %in% names(bed)))
+      stop("bed must be a data.table or data.frame with columns named chr and genome\n")
+    u <- unique(paste(cb$genome, cb$chr))
+    bed[,pass := paste(genome, chr) %in% u]
+
+    if(!all(bed$pass))
+      stop("the following genome/chr combinations are not in the run:",
+           subset(bed, !pass)[,1:2])
+    bed[,pass := NULL]
+    if(!"start" %in% names(bed))
+      bed[,start := 0]
+    if(!"end" %in% names(bed))
+      bed[,end := Inf]
+    bed[,regID := sprintf("%s, %s: %s-%s", genome, chr, start, end)]
+
+
+    out <- lapply(1:nrow(bed), function(i){
+      x <- bed[i,]
+      bogs <- subset(
+        cb, genome == x$genome & chr == x$chr & end >= x$start & start <= x$end)
+      outi <- count_cnv(combBed = cb, ofIDs = bogs$ofID)
+      return(outi)
+    })
+    names(out) <- bed$regID
+  }else{
+    out <- count_cnv(combBed = cb, ofIDs = NULL)
+  }
+ return(out)
+}
diff --git a/R/run_genespace.R b/R/run_genespace.R
@@ -131,7 +131,7 @@ run_genespace <- function(gsParam,
     tmp <- run_orthofinder(gsParam = gsParam, verbose = TRUE)
 
   gsParam <- set_syntenyParams(gsParam)
-
+  gsParam <<- gsParam
   ##############################################################################
   # -- 1.5 get the files in order if the run is complete
   if(noResults){
@@ -223,6 +223,7 @@ run_genespace <- function(gsParam,
   cat("\t##############\n\tGenerating dotplots for all hits ... ")
   nu <- plot_hits(gsParam = gsParam, type = "raw")
   cat("Done!\n")
+  gsParam <<- gsParam
 
   ##############################################################################
   # 4. Run synteny
@@ -232,6 +233,7 @@ run_genespace <- function(gsParam,
     "4. Flagging synteny for each pair of genomes ...",
     indent = 0, exdent = 8), sep = "\n")
   gsParam <- synteny(gsParam = gsParam)
+  gsParam <<- gsParam
 
   ##############################################################################
   # 5. Build syntenic orthogroups
@@ -254,6 +256,7 @@ run_genespace <- function(gsParam,
   gsParam <- syntenic_orthogroups(
     gsParam, updateArrays = gsParam$params$orthofinderInBlk)
   cat("\tDone!\n")
+  gsParam <<- gsParam
 
   ##############################################################################
   # 6. Make dotplots
@@ -269,6 +272,7 @@ run_genespace <- function(gsParam,
   cat("\t##############\n\tInterpolating syntenic positions of genes ... \n")
   nsynPos <- interp_synPos(gsParam)
   cat("\tDone!\n")
+  gsParam <<- gsParam
 
   ##############################################################################
   # 8. Phase syntenic blocks against reference chromosomes
@@ -288,6 +292,7 @@ run_genespace <- function(gsParam,
 
   bed <- read_combBed(file.path(gsParam$paths$results, "combBed.txt"))
   minChrSize <- gsParam$params$blkSize * 2
+  isOK <- og <- chr <- genome <- NULL
   bed[,isOK := uniqueN(og) >= minChrSize, by = c("genome", "chr")]
   ok4pg <- bed[,list(propOK = (sum(isOK) / .N) > .75), by = "genome"]
   ok4rip <- bed[,list(nGood = (uniqueN(chr[isOK]) < 100)), by = "genome"]
@@ -339,6 +344,8 @@ run_genespace <- function(gsParam,
     }
   }
 
+  gsParam <<- gsParam
+
   ##############################################################################
   # 9. Build pan-genes (aka pan-genome annotations)
   cat("\n############################", strwrap(

diff --git a/R/syntenic_pangenes.R b/R/syntenic_pangenes.R
@@ -187,27 +187,29 @@ syntenic_pangenes <- function(gsParam,
   hasu <- with(pgOut, paste(pgRepID, ofID))
   orthof <- unlist(gsParam$synteny$blast[,c("queryOrthologs", "targetOrthologs")])
   orthof <- orthof[!is.na(orthof)]
-  ogs <- rbindlist(mclapply(orthof, mc.cores = gsParam$params$nCores, function(i){
-    x <- parse_orthologues(i)
-    x[,`:=`(ofID1 = dict[paste(gen1, id1)],
-            ofID2 = dict[paste(gen2, id2)],
-            gen1 = NULL, gen2 = NULL, id1 = NULL, id2 = NULL, orthID = NULL)]
-    x <- subset(x, ofID1 %in% u)
-    x <- subset(x, !paste(ofID1, ofID2) %in% hasu)
-    return(x)
-  }))
-  setnames(ogs, c("pgRepID", "ofID"))
-  bedTmp <- bed[,c("genome", "og", "ofID")]
-  bedTmp <- subset(bedTmp, !duplicated(bedTmp))
-  ogs <- subset(ogs, !duplicated(ogs))
-  ogs <- merge(bedTmp, ogs, by = "ofID", allow.cartesian = T)
-  pgTmp <- pgOut[,c("pgID", "interpChr", "interpOrd", "pgRepID")]
-  pgTmp <- subset(pgTmp, !duplicated(pgTmp))
-  ogs <- subset(ogs, !duplicated(ogs))
-  nsogs <- merge(pgTmp, ogs, by = "pgRepID", allow.cartesian = T)
-  nsogs[,flag := "NSOrtho"]
-
-  pgOut <- rbind(pgOut, nsogs)
+  if(length(orthof) > 0){
+    ogs <- rbindlist(mclapply(orthof, mc.cores = gsParam$params$nCores, function(i){
+      x <- parse_orthologues(i)
+      x[,`:=`(ofID1 = dict[paste(gen1, id1)],
+              ofID2 = dict[paste(gen2, id2)],
+              gen1 = NULL, gen2 = NULL, id1 = NULL, id2 = NULL, orthID = NULL)]
+      x <- subset(x, ofID1 %in% u)
+      x <- subset(x, !paste(ofID1, ofID2) %in% hasu)
+      return(x)
+    }))
+    setnames(ogs, c("pgRepID", "ofID"))
+    bedTmp <- bed[,c("genome", "og", "ofID")]
+    bedTmp <- subset(bedTmp, !duplicated(bedTmp))
+    ogs <- subset(ogs, !duplicated(ogs))
+    ogs <- merge(bedTmp, ogs, by = "ofID", allow.cartesian = T)
+    pgTmp <- pgOut[,c("pgID", "interpChr", "interpOrd", "pgRepID")]
+    pgTmp <- subset(pgTmp, !duplicated(pgTmp))
+    ogs <- subset(ogs, !duplicated(ogs))
+    nsogs <- merge(pgTmp, ogs, by = "pgRepID", allow.cartesian = T)
+    nsogs[,flag := "NSOrtho"]
+    pgOut <- rbind(pgOut, nsogs)
+  }
+
   setcolorder(pgOut, c(
     "pgID", "interpChr", "interpOrd", "pgRepID", "genome", "og", "ofID", "flag"))
 

diff --git a/R/utils.R b/R/utils.R
@@ -46,7 +46,7 @@
 #' @export
 .onAttach <- function(...) {
   packageStartupMessage(paste(strwrap(
-    "GENESPACE v1.1.7: synteny and orthology constrained
+    "GENESPACE v1.1.8: synteny and orthology constrained
     comparative genomics\n",
     indent = 0, exdent = 8), collapse = "\n"))
 }
@@ -505,16 +505,20 @@ parse_ogs <- function(filepath, genomeIDs){
 #' @import data.table
 #' @export
 parse_hogs <- function(filepath){
-  id <- genome <- HOG <- hogID <- OG <- NULL
+  id <- genome <- HOG <- hogID <- OG <- orthofinderInternalData_XXX_OG <-
+    orthofinderInternalData_XXX_hogID <- orthofinderInternalData_XXX_HOG <- NULL
   d <- fread(filepath, showProgress = FALSE)
-  setnames(d, 1:3, sprintf("orthofinderInternalData_XXX_%s",colnames(d)[1:3]))
+  setnames(d, 1:3, sprintf("orthofinderInternalData_XXX_%s", colnames(d)[1:3]))
   sd <- colnames(d)[-(1:3)]
-  d[,orthofinderInternalData_XXX_hogID := paste(orthofinderInternalData_XXX_HOG, orthofinderInternalData_XXX_OG)]
+  d[,orthofinderInternalData_XXX_hogID := paste(
+    orthofinderInternalData_XXX_HOG, orthofinderInternalData_XXX_OG)]
   m <- melt(
-    d, id.vars = "orthofinderInternalData_XXX_hogID", measure.vars = sd,
+    d, id.vars = "orthofinderInternalData_XXX_hogID",
+    measure.vars = sd,
     value.name = "id", variable.name = "genome")
   m <- subset(m, id != "")
-  tmp <- m[,list(id = strsplit(id, ",")[[1]]), by = c("orthofinderInternalData_XXX_hogID", "genome")]
+  tmp <- m[,list(id = strsplit(id, ",")[[1]]),
+           by = c("orthofinderInternalData_XXX_hogID", "genome")]
   tmp[,id := trimws(id)]
   setnames(tmp, "orthofinderInternalData_XXX_hogID", "hogID")
   return(tmp)

diff --git a/man/query_genespace.Rd b/man/query_genespace.Rd