Skip to content

Commit

Permalink
v1.1.8 updates
Browse files Browse the repository at this point in the history
  • Loading branch information
jtlovell committed Mar 28, 2023
1 parent d39df14 commit 3864eaa
Show file tree
Hide file tree
Showing 9 changed files with 152 additions and 31 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: GENESPACE
Type: Package
Title: Synteny- and orthology-constrained comparative genomics
Version: 1.1.7
Version: 1.1.8
Author: John T. Lovell
Maintainer: John T. Lovell <jlovell@hudsonalpha.org>
Description: Construct pan-gene sets and other comparative genomics
Expand Down
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ export(parse_phytozome)
export(phase_blks)
export(plot_hits)
export(plot_riparian)
export(query_cnv)
export(query_hits)
export(query_pangenes)
export(read_aaFasta)
Expand Down Expand Up @@ -92,6 +93,7 @@ import(parallel)
importFrom(Biostrings,AAStringSet)
importFrom(Biostrings,DNA_ALPHABET)
importFrom(Biostrings,readAAStringSet)
importFrom(Biostrings,vcountPattern)
importFrom(Biostrings,width)
importFrom(Biostrings,writeXStringSet)
importFrom(dbscan,dbscan)
Expand Down
2 changes: 1 addition & 1 deletion R/parse_annotations.R
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ parse_annotations <- function(rawGenomeRepo,
#' files
#' @rdname parse_annotations
#' @import data.table
#' @importFrom Biostrings writeXStringSet width readAAStringSet AAStringSet DNA_ALPHABET
#' @importFrom Biostrings writeXStringSet width readAAStringSet AAStringSet DNA_ALPHABET vcountPattern
#' @importFrom utils head
#' @export
match_fasta2gff <- function(path2fasta,
Expand Down
3 changes: 2 additions & 1 deletion R/plot_riparian.R
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,8 @@ plot_riparian <- function(gsParam,
refChrOrdFun = function(x)
frank(list(as.numeric(gsub('\\D+','', x)), x), ties.method = "random")){

color <- blkID <- pass <- genome <- chr <- start <- end <- NULL
color <- blkID <- pass <- genome <- chr <- start <- end <- index <- hasAll <-
genome1 <- genome2 <- NULL
# -- there are two methods of plotting.

# 1. Default phase the blocks by a reference genome chromosomes
Expand Down
92 changes: 92 additions & 0 deletions R/query_genespace.R
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@
#' not provided
#' @param showUnPlacedPgs logical, when queried should un-located orthogroups be
#' shown?
#' @param onlyArrayReps logical, should only array representatives be considered
#' for CNV counts?
#' @param maxCopyNumber numeric, maximum copyNumber to tabulate. Orthogroups
#' with more than this number are combined into the largest CNV category

#' @return a list of data.tables named $genome, $chr: $start-$end. One
#' data.table for each line in the bed file.
Expand Down Expand Up @@ -200,3 +204,91 @@ query_pangenes <- function(gsParam,
return(out)
}
}


#' @title reformat and subset pan-gene sets
#' @description
#' \code{query_pangenes} the primary engine to explore genespace output, this
#' lets you reformat the pan-genes to wide (entry - by - genome) matrix and
#' only look at specified positional bounds.
#' @rdname query_genespace
#' @import data.table
#' @export
query_cnv <- function(gsParam,
bed = NULL,
onlyArrayReps = FALSE,
maxCopyNumber = Inf){

count_cnv <- function(combBed, ofIDs){

ofID <- ogType <- genome <- og <- copyNumber <- nGenes <- nOgs <- NULL
cbl <- melt(
combBed,
measure.vars = c("globOG", "globHOG", "og"),
id.vars = c("genome", "ofID", "chr", "start", "end"),
variable.name = "ogType",
value.name = "og")

if(!is.null(ofIDs)){
cbogs <- subset(cbl, ofID %in% ofIDs)[,c("ogType", "og")]
cbogs <- subset(cbogs, !duplicated(cbogs))
cbl <- merge(cbl, cbogs, by = c("ogType", "og"))
}

cbl[,ogType := ifelse(ogType == "og", "syntenicHOGs",
ifelse(ogType == "globHOG", "globalHOGs", "globalOGs"))]

cbn <- cbl[,list(copyNumber = .N), by = c("genome", "og", "ogType")]
cbn$copyNumber[cbn$copyNumber > maxCopyNumber] <- maxCopyNumber
eg <- cbl[,CJ(genome = unique(genome), og = unique(og)), by = "ogType"]
cbn <- merge(cbn, eg, by = colnames(eg), all = T)
cbn$copyNumber[is.na(cbn$copyNumber)] <- 0
cbo <- cbn[,list(nGenes = copyNumber*uniqueN(og), nOgs = uniqueN(og)),
by = c("genome", "ogType", "copyNumber")]
setkey(cbo, genome, ogType, copyNumber)
cbo[,`:=`(percGenes = 100*(nGenes/sum(nGenes)),
percOGs = 100*(nOgs/sum(nOgs))),
by = c("genome", "ogType")]
return(cbo)
}

isArrayRep <- pass <- genome <- chr <- start <- end <- regID <- genome <- NULL

cb <- read_combBed(file.path(gsParam$paths$results, "combBed.txt"))
if(onlyArrayReps){
cb <- subset(cb, isArrayRep)
}

if(!is.null(bed)){
bed <- data.table(bed)
if(nrow(bed) == 0)
stop("bed must be a data.table or data.frame with >= 1 rows\n")
if(!all(c("chr", "genome") %in% names(bed)))
stop("bed must be a data.table or data.frame with columns named chr and genome\n")
u <- unique(paste(cb$genome, cb$chr))
bed[,pass := paste(genome, chr) %in% u]

if(!all(bed$pass))
stop("the following genome/chr combinations are not in the run:",
subset(bed, !pass)[,1:2])
bed[,pass := NULL]
if(!"start" %in% names(bed))
bed[,start := 0]
if(!"end" %in% names(bed))
bed[,end := Inf]
bed[,regID := sprintf("%s, %s: %s-%s", genome, chr, start, end)]


out <- lapply(1:nrow(bed), function(i){
x <- bed[i,]
bogs <- subset(
cb, genome == x$genome & chr == x$chr & end >= x$start & start <= x$end)
outi <- count_cnv(combBed = cb, ofIDs = bogs$ofID)
return(outi)
})
names(out) <- bed$regID
}else{
out <- count_cnv(combBed = cb, ofIDs = NULL)
}
return(out)
}
9 changes: 8 additions & 1 deletion R/run_genespace.R
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ run_genespace <- function(gsParam,
tmp <- run_orthofinder(gsParam = gsParam, verbose = TRUE)

gsParam <- set_syntenyParams(gsParam)

gsParam <<- gsParam
##############################################################################
# -- 1.5 get the files in order if the run is complete
if(noResults){
Expand Down Expand Up @@ -223,6 +223,7 @@ run_genespace <- function(gsParam,
cat("\t##############\n\tGenerating dotplots for all hits ... ")
nu <- plot_hits(gsParam = gsParam, type = "raw")
cat("Done!\n")
gsParam <<- gsParam

##############################################################################
# 4. Run synteny
Expand All @@ -232,6 +233,7 @@ run_genespace <- function(gsParam,
"4. Flagging synteny for each pair of genomes ...",
indent = 0, exdent = 8), sep = "\n")
gsParam <- synteny(gsParam = gsParam)
gsParam <<- gsParam

##############################################################################
# 5. Build syntenic orthogroups
Expand All @@ -254,6 +256,7 @@ run_genespace <- function(gsParam,
gsParam <- syntenic_orthogroups(
gsParam, updateArrays = gsParam$params$orthofinderInBlk)
cat("\tDone!\n")
gsParam <<- gsParam

##############################################################################
# 6. Make dotplots
Expand All @@ -269,6 +272,7 @@ run_genespace <- function(gsParam,
cat("\t##############\n\tInterpolating syntenic positions of genes ... \n")
nsynPos <- interp_synPos(gsParam)
cat("\tDone!\n")
gsParam <<- gsParam

##############################################################################
# 8. Phase syntenic blocks against reference chromosomes
Expand All @@ -288,6 +292,7 @@ run_genespace <- function(gsParam,

bed <- read_combBed(file.path(gsParam$paths$results, "combBed.txt"))
minChrSize <- gsParam$params$blkSize * 2
isOK <- og <- chr <- genome <- NULL
bed[,isOK := uniqueN(og) >= minChrSize, by = c("genome", "chr")]
ok4pg <- bed[,list(propOK = (sum(isOK) / .N) > .75), by = "genome"]
ok4rip <- bed[,list(nGood = (uniqueN(chr[isOK]) < 100)), by = "genome"]
Expand Down Expand Up @@ -339,6 +344,8 @@ run_genespace <- function(gsParam,
}
}

gsParam <<- gsParam

##############################################################################
# 9. Build pan-genes (aka pan-genome annotations)
cat("\n############################", strwrap(
Expand Down
44 changes: 23 additions & 21 deletions R/syntenic_pangenes.R
Original file line number Diff line number Diff line change
Expand Up @@ -187,27 +187,29 @@ syntenic_pangenes <- function(gsParam,
hasu <- with(pgOut, paste(pgRepID, ofID))
orthof <- unlist(gsParam$synteny$blast[,c("queryOrthologs", "targetOrthologs")])
orthof <- orthof[!is.na(orthof)]
ogs <- rbindlist(mclapply(orthof, mc.cores = gsParam$params$nCores, function(i){
x <- parse_orthologues(i)
x[,`:=`(ofID1 = dict[paste(gen1, id1)],
ofID2 = dict[paste(gen2, id2)],
gen1 = NULL, gen2 = NULL, id1 = NULL, id2 = NULL, orthID = NULL)]
x <- subset(x, ofID1 %in% u)
x <- subset(x, !paste(ofID1, ofID2) %in% hasu)
return(x)
}))
setnames(ogs, c("pgRepID", "ofID"))
bedTmp <- bed[,c("genome", "og", "ofID")]
bedTmp <- subset(bedTmp, !duplicated(bedTmp))
ogs <- subset(ogs, !duplicated(ogs))
ogs <- merge(bedTmp, ogs, by = "ofID", allow.cartesian = T)
pgTmp <- pgOut[,c("pgID", "interpChr", "interpOrd", "pgRepID")]
pgTmp <- subset(pgTmp, !duplicated(pgTmp))
ogs <- subset(ogs, !duplicated(ogs))
nsogs <- merge(pgTmp, ogs, by = "pgRepID", allow.cartesian = T)
nsogs[,flag := "NSOrtho"]

pgOut <- rbind(pgOut, nsogs)
if(length(orthof) > 0){
ogs <- rbindlist(mclapply(orthof, mc.cores = gsParam$params$nCores, function(i){
x <- parse_orthologues(i)
x[,`:=`(ofID1 = dict[paste(gen1, id1)],
ofID2 = dict[paste(gen2, id2)],
gen1 = NULL, gen2 = NULL, id1 = NULL, id2 = NULL, orthID = NULL)]
x <- subset(x, ofID1 %in% u)
x <- subset(x, !paste(ofID1, ofID2) %in% hasu)
return(x)
}))
setnames(ogs, c("pgRepID", "ofID"))
bedTmp <- bed[,c("genome", "og", "ofID")]
bedTmp <- subset(bedTmp, !duplicated(bedTmp))
ogs <- subset(ogs, !duplicated(ogs))
ogs <- merge(bedTmp, ogs, by = "ofID", allow.cartesian = T)
pgTmp <- pgOut[,c("pgID", "interpChr", "interpOrd", "pgRepID")]
pgTmp <- subset(pgTmp, !duplicated(pgTmp))
ogs <- subset(ogs, !duplicated(ogs))
nsogs <- merge(pgTmp, ogs, by = "pgRepID", allow.cartesian = T)
nsogs[,flag := "NSOrtho"]
pgOut <- rbind(pgOut, nsogs)
}

setcolorder(pgOut, c(
"pgID", "interpChr", "interpOrd", "pgRepID", "genome", "og", "ofID", "flag"))

Expand Down
16 changes: 10 additions & 6 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@
#' @export
.onAttach <- function(...) {
packageStartupMessage(paste(strwrap(
"GENESPACE v1.1.7: synteny and orthology constrained
"GENESPACE v1.1.8: synteny and orthology constrained
comparative genomics\n",
indent = 0, exdent = 8), collapse = "\n"))
}
Expand Down Expand Up @@ -505,16 +505,20 @@ parse_ogs <- function(filepath, genomeIDs){
#' @import data.table
#' @export
parse_hogs <- function(filepath){
id <- genome <- HOG <- hogID <- OG <- NULL
id <- genome <- HOG <- hogID <- OG <- orthofinderInternalData_XXX_OG <-
orthofinderInternalData_XXX_hogID <- orthofinderInternalData_XXX_HOG <- NULL
d <- fread(filepath, showProgress = FALSE)
setnames(d, 1:3, sprintf("orthofinderInternalData_XXX_%s",colnames(d)[1:3]))
setnames(d, 1:3, sprintf("orthofinderInternalData_XXX_%s", colnames(d)[1:3]))
sd <- colnames(d)[-(1:3)]
d[,orthofinderInternalData_XXX_hogID := paste(orthofinderInternalData_XXX_HOG, orthofinderInternalData_XXX_OG)]
d[,orthofinderInternalData_XXX_hogID := paste(
orthofinderInternalData_XXX_HOG, orthofinderInternalData_XXX_OG)]
m <- melt(
d, id.vars = "orthofinderInternalData_XXX_hogID", measure.vars = sd,
d, id.vars = "orthofinderInternalData_XXX_hogID",
measure.vars = sd,
value.name = "id", variable.name = "genome")
m <- subset(m, id != "")
tmp <- m[,list(id = strsplit(id, ",")[[1]]), by = c("orthofinderInternalData_XXX_hogID", "genome")]
tmp <- m[,list(id = strsplit(id, ",")[[1]]),
by = c("orthofinderInternalData_XXX_hogID", "genome")]
tmp[,id := trimws(id)]
setnames(tmp, "orthofinderInternalData_XXX_hogID", "hogID")
return(tmp)
Expand Down
13 changes: 13 additions & 0 deletions man/query_genespace.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 3864eaa

Please sign in to comment.