From e5785842589bbb9f047dd58b10c09b48700f42b0 Mon Sep 17 00:00:00 2001 From: Brandon Date: Wed, 1 Nov 2023 09:16:13 -0400 Subject: [PATCH 1/6] Add filter method for site IDs --- R/filter_functions.R | 743 ++++++++++++++++++++++--------------------- 1 file changed, 389 insertions(+), 354 deletions(-) diff --git a/R/filter_functions.R b/R/filter_functions.R index 8f4a908..3872ef4 100644 --- a/R/filter_functions.R +++ b/R/filter_functions.R @@ -1,354 +1,389 @@ -#--------------------------------------------------------------------- -# Script Name: FilterFunctions.R -# Description: Functions for TASSEL filtration -# Author: Brandon Monier & Ed buckler -# Created: 2018-11-26 at 11:14:36 -# Last Modified: 2020-10-26 at 13:14:55 -#-------------------------------------------------------------------- - -#-------------------------------------------------------------------- -# Detailed Purpose: -# The main purpose of this Rscript to host functions necessary for -# filtration methods of TASSEL genotype tables -#-------------------------------------------------------------------- - -#' @title Filter genotype table by sites -#' -#' @description This function will filter R objects of -#' \code{TasselGenotypePhenotype} class containing genotype tables. -#' The parameters for this function are derived from TASSEL's -#' \code{FilterSiteBuilder} plugin. -#' -#' @name filterGenotypeTableSites -#' @rdname filterGenotypeTableSites -#' -#' @param tasObj An object of class \code{TasselGenotypePenotype}. -#' @param siteMinCount Site minimum count of alleles not unknown. Can range -#' from 0 to inf. Defaults to 0. -#' @param siteMinAlleleFreq Site minimum minor allele frequency. Can range -#' from 0 to 1.0. Defaults to 0.0. -#' @param siteMaxAlleleFreq Site maximum minor allele frequency. Can range -#' from 0 to 1.0. Defaults to 1.0. -#' @param minHeterozygous Min heterozygous proportion. Can range from 0 to 1.0. -#' Defaults to 0.0. -#' @param maxHeterozygous Max heterozygous proportion. Can range from 0 to 1.0. -#' Defaults to 1.0. -#' @param removeMinorSNPStates Remove minor SNP states. Defaults to -#' \code{FALSE}. -#' @param removeSitesWithIndels Remove sites containing an indel -#' (\code{+} or \code{-}). Defaults to \code{FALSE}. -#' @param siteRangeFilterType True if filtering by site numbers. False if -#' filtering by chromosome and position. Options are -#' \code{none}, \code{sites}, or \code{position}. Defaults to \code{none}. -#' @param startSite The start site. Defaults to 0. -#' @param endSite The end site. Defaults to 0. -#' @param startChr Start chromosome for site filtration range if \code{position} -#' is chosen from \code{siteRangeFilterType}. Needs end chromosome -#' (\code{endChr}) to work. -#' @param endChr End chromosome for site filtration range if \code{position} -#' is chosen from \code{siteRangeFilterType}. Needs start chromosome -#' (\code{endChr}) to work. -#' @param startPos Physical start position (bp) for filtration range if -#' \code{position} is chosen from \code{siteRangeFilterType}. If -#' \code{NULL}, the first physical position in the data set will be -#' chosen. -#' @param endPos Physical end position (bp) for filtration range if -#' \code{position} is chosen from \code{siteRangeFilterType}. If -#' \code{NULL}, the last physical position in the data set will be -#' chosen. -#' @param gRangesObj Filter genotype table by \code{GenomicRanges} object. -#' If this parameter is selected, you cannot utilize the parameters, -#' \code{chrPosFile} or \code{bedFile}. Defaults to \code{NULL}. -#' @param chrPosFile An optional chromosome position file path of -#' \code{character} class. Defaults to \code{NULL}. \strong{Note:} -#' a chromosome position file must contain correct formatting -#' (e.g. a two column file with the header of -#' \code{c("Chromosome", "Position")}). -#' @param bedFile An optional BED coordinate file path of -#' \code{character} class. Defaults to \code{NULL}. -#' -#' @return Returns an object of \code{TasselGenotypePhenotype} class. -#' -#' @importFrom GenomicRanges end -#' @importFrom GenomicRanges seqnames -#' @importFrom GenomicRanges start -#' @importFrom rJava .jarray -#' @importFrom rJava .jnull -#' @importFrom rJava is.jnull -#' @importFrom rJava new -#' @importFrom rJava J -#' @export -filterGenotypeTableSites <- function(tasObj, - siteMinCount = 0, - siteMinAlleleFreq = 0.0, - siteMaxAlleleFreq = 1.0, - minHeterozygous = 0.0, - maxHeterozygous = 1.0, - removeMinorSNPStates = FALSE, - removeSitesWithIndels = FALSE, - siteRangeFilterType = c("none", "sites", "position"), - startSite = NULL, - endSite = NULL, - startChr = NULL, - startPos = NULL, - endChr = NULL, - endPos = NULL, - gRangesObj = NULL, - chrPosFile = NULL, - bedFile = NULL) { - - if (class(tasObj) != "TasselGenotypePhenotype") { - stop("`tasObj` must be of class `TasselGenotypePhenotype`") - } - - jGenoTable <- getGenotypeTable(tasObj) - if (rJava::is.jnull(jGenoTable)) { - stop("TASSEL genotype object not found") - } - - # Range check - if (siteMinAlleleFreq > 1 || siteMinAlleleFreq < 0) { - stop("siteMinAlleleFreq parameter is out of range") - } - if (siteMaxAlleleFreq > 1 || siteMaxAlleleFreq < 0) { - stop("siteMaxAlleleFreq parameter is out of range") - } - if (minHeterozygous > 1 || minHeterozygous < 0) { - stop("minHeterozygous parameter is out of range") - } - if (maxHeterozygous > 1 || maxHeterozygous < 0) { - stop("maxHeterozygous parameter is out of range") - } - - # Site check - taxa <- getTaxaList(tasObj) - if (siteMinCount > taxa$size()) { - stop("Minimum number of taxa exceeds total number of taxa in genotype table.") - } - - # Range check (chromosomes) - chroms <- jGenoTable$chromosomes() - chroms <- unlist(lapply(chroms, function(x) { rJava::.jstrVal(x) })) - if (!is.null(startChr) || !is.null(endChr)) { - if (!(any(startChr %in% chroms)) || !(any(endChr %in% chroms))) { - stop("Chromosome IDs not found in genotype table.") - } - } - - # Filter type selection - siteRangeFilterType <- match.arg(siteRangeFilterType) - if (missing(siteRangeFilterType) || !siteRangeFilterType %in% c("none", "sites", "position")) { - stop( - paste( - "Please specify analysis type", - "(\"none\", \"sites\", or \"position\")" - ) - ) - } - - # Create filter siter builder plugin - plugin <- rJava::new( - rJava::J("net.maizegenetics.analysis.filter.FilterSiteBuilderPlugin"), - rJava::.jnull(), - FALSE - ) - plugin$setParameter("siteMinCount", toString(siteMinCount)) - plugin$setParameter("siteMinAlleleFreq", toString(siteMinAlleleFreq)) - plugin$setParameter("siteMaxAlleleFreq", toString(siteMaxAlleleFreq)) - plugin$setParameter("minHeterozygous", toString(minHeterozygous)) - plugin$setParameter("maxHeterozygous", toString(maxHeterozygous)) - plugin$setParameter("removeMinorSNPStates", toString(removeMinorSNPStates)) - plugin$setParameter("removeSitesWithIndels", toString(removeSitesWithIndels)) - - # Logic check necessary parameters given range filter type - if (is.null(chrPosFile) && is.null(bedFile)) { - if (siteRangeFilterType == "sites") { - - if (is.null(startSite) || is.null(endSite)) { - stop("Please specify both start and end sites.") - } - - if (endSite > jGenoTable$numberOfSites()) { - stop("End site parameter exceeds total number of sites in genotype table.") - } - - if (startSite > endSite) { - stop("Start site cannot be larger than end site.") - } - - plugin$setParameter("startSite", toString(startSite)) - plugin$setParameter("endSite", toString(endSite)) - - } else if (siteRangeFilterType == "position") { - - if (is.null(startChr) || is.null(endChr)) { - stop("Please specify both start and end chromosomes.") - } - - if (!is.null(startPos)) startPos <- toString(startPos) - if (!is.null(endPos)) endPos <- toString(endPos) - - if (startChr == endChr && startPos > endPos) { - stop("Filtration paramaters outside acceptable range.") - } - - plugin$setParameter("startChr", toString(startChr)) - plugin$setParameter("startPos", startPos) - plugin$setParameter("endChr", toString(endChr)) - plugin$setParameter("endPos", endPos) - } - } else if (is.character(chrPosFile) && is.null(bedFile) && is.null(gRangesObj)) { - tmpChrDF <- utils::read.table(chrPosFile, sep = "\t", header = TRUE) - headCheck <- c("Chromosome", "Position") - - if (!identical(colnames(tmpChrDF), headCheck)) { - stop("Please check chromosome position file for correct formatting") - } - - plugin$setParameter("chrPosFile", chrPosFile) - - } else if (is.null(chrPosFile) && is.character(bedFile) && is.null(gRangesObj)) { - plugin$setParameter("bedFile", bedFile) - } else { - stop("Incorrect parameter usage") - } - - # Filter by GRanges object if present - if (!is.null(gRangesObj)) { - if (class(gRangesObj) != "GRanges") { - stop("gRangesObj must be of class `GRanges`") - } - jRC <- rJava::J("net/maizegenetics/plugindef/GenerateRCode") - jGenoTable <- jRC$filterSitesByGRanges( - jGenoTable, - rJava::.jarray(as.vector(GenomicRanges::seqnames(gRangesObj))), - rJava::.jarray(GenomicRanges::start(gRangesObj)), - rJava::.jarray(GenomicRanges::end(gRangesObj)) - ) - } - - # Try to run plugin - out <- tryCatch( - { - plugin$runPlugin(jGenoTable) - }, - error = function(e) { - return(-1) - } - ) - - # Check if input had phenotype table. If yes, combine genotype with phenotype - if (class(out) != "jobjRef") { - message("No data returned.") - return(NA) - } else { - jPhenoTable <- getPhenotypeTable(tasObj) - if (rJava::is.jnull(jPhenoTable)) { - .tasselObjectConstructor(out) - } else { - .tasselObjectConstructor( - combineTasselGenotypePhenotype( - genotypeTable = out, - phenotype = jPhenoTable - ) - ) - } - } -} - - - -#' @title Filter genotype table by taxa -#' -#' @description This function will filter R objects of -#' \code{TasselGenotypePhenotype} class containing genotype tables. -#' The parameters for this function are derived from TASSEL's -#' \code{FilterTaxaBuilder} plugin. -#' -#' @name filterGenotypeTableTaxa -#' @rdname filterGenotypeTableTaxa -#' -#' @param tasObj An object of class \code{TasselGenotypePenotype}. -#' @param minNotMissing Minimum proportion of sites not unknown to pass this -#' filter. Value can be between 0.0 and 1.0. -#' @param minHeterozygous Minimum proportion of sites that are heterozygous. -#' Value can be between 0.0 and 1.0. -#' @param maxHeterozygous Maximum proportion of sites that are heterozygous. -#' Value can be between 0.0 and 1.0. -#' @param taxa Vector of taxa IDs (as character) to subset. Defaults to -#' \code{NULL}. -#' -#' @return Returns an object of \code{TasselGenotypePhenotype} class. -#' -#' @importFrom rJava is.jnull -#' @importFrom rJava new -#' @importFrom rJava J -#' @importFrom rJava .jnull -#' @export -filterGenotypeTableTaxa <- function(tasObj, - minNotMissing = 0.0, - minHeterozygous = 0.0, - maxHeterozygous = 1.0, - taxa = NULL) { - - if (class(tasObj) != "TasselGenotypePhenotype") { - stop("`tasObj` must be of class `TasselGenotypePhenotype`") - } - - jGenoTable <- getGenotypeTable(tasObj) - if (rJava::is.jnull(jGenoTable)) { - stop("TASSEL genotype object not found") - } - - # Range check - if (minNotMissing > 1 || minNotMissing < 0 ) { - stop("minNotMissing parameter is out of range") - } - if (minHeterozygous > 1 || minHeterozygous < 0 ) { - stop("minHeterozygous parameter is out of range") - } - if (maxHeterozygous > 1 || maxHeterozygous < 0 ) { - stop("maxHeterozygous parameter is out of range") - } - - # Create filter taxa builder plugin - plugin <- rJava::new( - rJava::J("net.maizegenetics.analysis.filter.FilterTaxaBuilderPlugin"), - rJava::.jnull(), - FALSE - ) - plugin$setParameter("minNotMissing", toString(minNotMissing)) - plugin$setParameter("minHeterozygous", toString(minHeterozygous)) - plugin$setParameter("maxHeterozygous", toString(maxHeterozygous)) - - if (!is.null(taxa)) { - if (!is.vector(taxa)) { - stop("Taxa list must be vector.") - } - if (!is.character(taxa)) { - stop("Taxa list must be of type character.") - } - - builder <- .jnew("net.maizegenetics.taxa.TaxaListBuilder") - builder$addAll(.jarray(taxa)) - taxaArray <- builder$build() - - plugin$setParameter("includeTaxa", "true") - plugin$setParameter("taxaList", taxaArray) - } - - resultDataSet <- plugin$runPlugin(jGenoTable) - - # Check if input had phenotype table. If yes, combine genotype with phenotype - jPhenoTable <- getPhenotypeTable(tasObj) - if (rJava::is.jnull(jPhenoTable)) { - .tasselObjectConstructor(resultDataSet) - } else { - .tasselObjectConstructor( - combineTasselGenotypePhenotype( - genotypeTable = resultDataSet, - phenotype = jPhenoTable - ) - ) - } -} +## ---- +#' @title Filter genotype table by sites +#' +#' @description This function will filter R objects of +#' \code{TasselGenotypePhenotype} class containing genotype tables. +#' The parameters for this function are derived from TASSEL's +#' \code{FilterSiteBuilder} plugin. +#' +#' @name filterGenotypeTableSites +#' @rdname filterGenotypeTableSites +#' +#' @param tasObj An object of class \code{TasselGenotypePenotype}. +#' @param siteMinCount Site minimum count of alleles not unknown. Can range +#' from 0 to inf. Defaults to 0. +#' @param siteMinAlleleFreq Site minimum minor allele frequency. Can range +#' from 0 to 1.0. Defaults to 0.0. +#' @param siteMaxAlleleFreq Site maximum minor allele frequency. Can range +#' from 0 to 1.0. Defaults to 1.0. +#' @param minHeterozygous Min heterozygous proportion. Can range from 0 to 1.0. +#' Defaults to 0.0. +#' @param maxHeterozygous Max heterozygous proportion. Can range from 0 to 1.0. +#' Defaults to 1.0. +#' @param removeMinorSNPStates Remove minor SNP states. Defaults to +#' \code{FALSE}. +#' @param removeSitesWithIndels Remove sites containing an indel +#' (\code{+} or \code{-}). Defaults to \code{FALSE}. +#' @param siteRangeFilterType True if filtering by site numbers. False if +#' filtering by chromosome and position. Options are +#' \code{none}, \code{sites}, or \code{position}. Defaults to \code{none}. +#' @param startSite The start site. Defaults to 0. +#' @param endSite The end site. Defaults to 0. +#' @param startChr Start chromosome for site filtration range if \code{position} +#' is chosen from \code{siteRangeFilterType}. Needs end chromosome +#' (\code{endChr}) to work. +#' @param endChr End chromosome for site filtration range if \code{position} +#' is chosen from \code{siteRangeFilterType}. Needs start chromosome +#' (\code{endChr}) to work. +#' @param startPos Physical start position (bp) for filtration range if +#' \code{position} is chosen from \code{siteRangeFilterType}. If +#' \code{NULL}, the first physical position in the data set will be +#' chosen. +#' @param endPos Physical end position (bp) for filtration range if +#' \code{position} is chosen from \code{siteRangeFilterType}. If +#' \code{NULL}, the last physical position in the data set will be +#' chosen. +#' @param gRangesObj Filter genotype table by \code{GenomicRanges} object. +#' If this parameter is selected, you cannot utilize the parameters, +#' \code{chrPosFile} or \code{bedFile}. Defaults to \code{NULL}. +#' @param chrPosFile An optional chromosome position file path of +#' \code{character} class. Defaults to \code{NULL}. \strong{Note:} +#' a chromosome position file must contain correct formatting +#' (e.g. a two column file with the header of +#' \code{c("Chromosome", "Position")}). +#' @param bedFile An optional BED coordinate file path of +#' \code{character} class. Defaults to \code{NULL}. +#' +#' @return Returns an object of \code{TasselGenotypePhenotype} class. +#' +#' @importFrom GenomicRanges end +#' @importFrom GenomicRanges seqnames +#' @importFrom GenomicRanges start +#' @importFrom rJava .jarray +#' @importFrom rJava .jnull +#' @importFrom rJava is.jnull +#' @importFrom rJava new +#' @importFrom rJava J +#' @export +filterGenotypeTableSites <- function( + tasObj, + siteMinCount = 0, + siteMinAlleleFreq = 0.0, + siteMaxAlleleFreq = 1.0, + minHeterozygous = 0.0, + maxHeterozygous = 1.0, + removeMinorSNPStates = FALSE, + removeSitesWithIndels = FALSE, + siteRangeFilterType = c("none", "sites", "position"), + startSite = NULL, + endSite = NULL, + startChr = NULL, + startPos = NULL, + endChr = NULL, + endPos = NULL, + gRangesObj = NULL, + chrPosFile = NULL, + bedFile = NULL +) { + + if (class(tasObj) != "TasselGenotypePhenotype") { + stop("`tasObj` must be of class `TasselGenotypePhenotype`") + } + + jGenoTable <- getGenotypeTable(tasObj) + if (rJava::is.jnull(jGenoTable)) { + stop("TASSEL genotype object not found") + } + + # Range check + if (siteMinAlleleFreq > 1 || siteMinAlleleFreq < 0) { + stop("siteMinAlleleFreq parameter is out of range") + } + if (siteMaxAlleleFreq > 1 || siteMaxAlleleFreq < 0) { + stop("siteMaxAlleleFreq parameter is out of range") + } + if (minHeterozygous > 1 || minHeterozygous < 0) { + stop("minHeterozygous parameter is out of range") + } + if (maxHeterozygous > 1 || maxHeterozygous < 0) { + stop("maxHeterozygous parameter is out of range") + } + + # Site check + taxa <- getTaxaList(tasObj) + if (siteMinCount > taxa$size()) { + stop("Minimum number of taxa exceeds total number of taxa in genotype table.") + } + + # Range check (chromosomes) + chroms <- jGenoTable$chromosomes() + chroms <- unlist(lapply(chroms, function(x) { rJava::.jstrVal(x) })) + if (!is.null(startChr) || !is.null(endChr)) { + if (!(any(startChr %in% chroms)) || !(any(endChr %in% chroms))) { + stop("Chromosome IDs not found in genotype table.") + } + } + + # Filter type selection + siteRangeFilterType <- match.arg(siteRangeFilterType) + if (missing(siteRangeFilterType) || !siteRangeFilterType %in% c("none", "sites", "position")) { + stop( + paste( + "Please specify analysis type", + "(\"none\", \"sites\", or \"position\")" + ) + ) + } + + # Create filter siter builder plugin + plugin <- rJava::new( + rJava::J("net.maizegenetics.analysis.filter.FilterSiteBuilderPlugin"), + rJava::.jnull(), + FALSE + ) + plugin$setParameter("siteMinCount", toString(siteMinCount)) + plugin$setParameter("siteMinAlleleFreq", toString(siteMinAlleleFreq)) + plugin$setParameter("siteMaxAlleleFreq", toString(siteMaxAlleleFreq)) + plugin$setParameter("minHeterozygous", toString(minHeterozygous)) + plugin$setParameter("maxHeterozygous", toString(maxHeterozygous)) + plugin$setParameter("removeMinorSNPStates", toString(removeMinorSNPStates)) + plugin$setParameter("removeSitesWithIndels", toString(removeSitesWithIndels)) + + # Logic check necessary parameters given range filter type + if (is.null(chrPosFile) && is.null(bedFile)) { + if (siteRangeFilterType == "sites") { + + if (is.null(startSite) || is.null(endSite)) { + stop("Please specify both start and end sites.") + } + + if (endSite > jGenoTable$numberOfSites()) { + stop("End site parameter exceeds total number of sites in genotype table.") + } + + if (startSite > endSite) { + stop("Start site cannot be larger than end site.") + } + + plugin$setParameter("startSite", toString(startSite)) + plugin$setParameter("endSite", toString(endSite)) + + } else if (siteRangeFilterType == "position") { + + if (is.null(startChr) || is.null(endChr)) { + stop("Please specify both start and end chromosomes.") + } + + if (!is.null(startPos)) startPos <- toString(startPos) + if (!is.null(endPos)) endPos <- toString(endPos) + + if (startChr == endChr && startPos > endPos) { + stop("Filtration paramaters outside acceptable range.") + } + + plugin$setParameter("startChr", toString(startChr)) + plugin$setParameter("startPos", startPos) + plugin$setParameter("endChr", toString(endChr)) + plugin$setParameter("endPos", endPos) + } + } else if (is.character(chrPosFile) && is.null(bedFile) && is.null(gRangesObj)) { + tmpChrDF <- utils::read.table(chrPosFile, sep = "\t", header = TRUE) + headCheck <- c("Chromosome", "Position") + + if (!identical(colnames(tmpChrDF), headCheck)) { + stop("Please check chromosome position file for correct formatting") + } + + plugin$setParameter("chrPosFile", chrPosFile) + + } else if (is.null(chrPosFile) && is.character(bedFile) && is.null(gRangesObj)) { + plugin$setParameter("bedFile", bedFile) + } else { + stop("Incorrect parameter usage") + } + + # Filter by GRanges object if present + if (!is.null(gRangesObj)) { + if (class(gRangesObj) != "GRanges") { + stop("gRangesObj must be of class `GRanges`") + } + jRC <- rJava::J("net/maizegenetics/plugindef/GenerateRCode") + jGenoTable <- jRC$filterSitesByGRanges( + jGenoTable, + rJava::.jarray(as.vector(GenomicRanges::seqnames(gRangesObj))), + rJava::.jarray(GenomicRanges::start(gRangesObj)), + rJava::.jarray(GenomicRanges::end(gRangesObj)) + ) + } + + # Try to run plugin + out <- tryCatch( + { + plugin$runPlugin(jGenoTable) + }, + error = function(e) { + return(-1) + } + ) + + # Check if input had phenotype table. If yes, combine genotype with phenotype + if (class(out) != "jobjRef") { + message("No data returned.") + return(NA) + } else { + jPhenoTable <- getPhenotypeTable(tasObj) + if (rJava::is.jnull(jPhenoTable)) { + .tasselObjectConstructor(out) + } else { + .tasselObjectConstructor( + combineTasselGenotypePhenotype( + genotypeTable = out, + phenotype = jPhenoTable + ) + ) + } + } +} + + +## ---- +#' @title Filter genotype table by site IDs +#' +#' @description +#' Filter a genotype table object by specifying literal site names (IDs) +#' for variant markers. +#' +#' @name filterGenotypeTableBySiteName +#' @rdname filterGenotypeTableBySiteName +#' +#' @param tasObj An object of class \code{TasselGenotypePenotype}. +#' @param siteNames A character vector of site names to filter on. +#' +#' @return Returns an object of \code{TasselGenotypePhenotype} class. +#' +#' @export +filterGenotypeTableBySiteName <- function(tasObj, siteNames) { + + if (class(tasObj) != "TasselGenotypePhenotype") { + stop("`tasObj` must be of class `TasselGenotypePhenotype`") + } + + gtJava <- getGenotypeTable(tasObj) + + idsToKeep <- rJava::.jnew("java.util.ArrayList") + for (id in siteNames) idsToKeep$add(id) + + plugin <- rJava::new( + rJava::J("net.maizegenetics.analysis.filter.FilterSiteBuilderPlugin"), + rJava::.jnull(), + FALSE + ) + plugin$siteNamesList(siteNames) + + dataSet <- rJava::J("net.maizegenetics.plugindef.DataSet") + + gtFilter <- .tasselObjectConstructor( + plugin$runPlugin(dataSet$getDataSet(gtJava)) + ) + + return(gtFilter) +} + + +## ---- +#' @title Filter genotype table by taxa +#' +#' @description This function will filter R objects of +#' \code{TasselGenotypePhenotype} class containing genotype tables. +#' The parameters for this function are derived from TASSEL's +#' \code{FilterTaxaBuilder} plugin. +#' +#' @name filterGenotypeTableTaxa +#' @rdname filterGenotypeTableTaxa +#' +#' @param tasObj An object of class \code{TasselGenotypePenotype}. +#' @param minNotMissing Minimum proportion of sites not unknown to pass this +#' filter. Value can be between 0.0 and 1.0. +#' @param minHeterozygous Minimum proportion of sites that are heterozygous. +#' Value can be between 0.0 and 1.0. +#' @param maxHeterozygous Maximum proportion of sites that are heterozygous. +#' Value can be between 0.0 and 1.0. +#' @param taxa Vector of taxa IDs (as character) to subset. Defaults to +#' \code{NULL}. +#' +#' @return Returns an object of \code{TasselGenotypePhenotype} class. +#' +#' @importFrom rJava is.jnull +#' @importFrom rJava new +#' @importFrom rJava J +#' @importFrom rJava .jnull +#' @export +filterGenotypeTableTaxa <- function( + tasObj, + minNotMissing = 0.0, + minHeterozygous = 0.0, + maxHeterozygous = 1.0, + taxa = NULL +) { + + if (class(tasObj) != "TasselGenotypePhenotype") { + stop("`tasObj` must be of class `TasselGenotypePhenotype`") + } + + jGenoTable <- getGenotypeTable(tasObj) + if (rJava::is.jnull(jGenoTable)) { + stop("TASSEL genotype object not found") + } + + # Range check + if (minNotMissing > 1 || minNotMissing < 0 ) { + stop("minNotMissing parameter is out of range") + } + if (minHeterozygous > 1 || minHeterozygous < 0 ) { + stop("minHeterozygous parameter is out of range") + } + if (maxHeterozygous > 1 || maxHeterozygous < 0 ) { + stop("maxHeterozygous parameter is out of range") + } + + # Create filter taxa builder plugin + plugin <- rJava::new( + rJava::J("net.maizegenetics.analysis.filter.FilterTaxaBuilderPlugin"), + rJava::.jnull(), + FALSE + ) + plugin$setParameter("minNotMissing", toString(minNotMissing)) + plugin$setParameter("minHeterozygous", toString(minHeterozygous)) + plugin$setParameter("maxHeterozygous", toString(maxHeterozygous)) + + if (!is.null(taxa)) { + if (!is.vector(taxa)) { + stop("Taxa list must be vector.") + } + if (!is.character(taxa)) { + stop("Taxa list must be of type character.") + } + + builder <- .jnew("net.maizegenetics.taxa.TaxaListBuilder") + builder$addAll(.jarray(taxa)) + taxaArray <- builder$build() + + plugin$setParameter("includeTaxa", "true") + plugin$setParameter("taxaList", taxaArray) + } + + resultDataSet <- plugin$runPlugin(jGenoTable) + + # Check if input had phenotype table. If yes, combine genotype with phenotype + jPhenoTable <- getPhenotypeTable(tasObj) + if (rJava::is.jnull(jPhenoTable)) { + .tasselObjectConstructor(resultDataSet) + } else { + .tasselObjectConstructor( + combineTasselGenotypePhenotype( + genotypeTable = resultDataSet, + phenotype = jPhenoTable + ) + ) + } +} From 551759e5efbe8520748d32be05eab741a2324934 Mon Sep 17 00:00:00 2001 From: Brandon Date: Wed, 1 Nov 2023 09:17:43 -0400 Subject: [PATCH 2/6] Version bump --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 332335b..4086c03 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,5 +1,5 @@ Package: rTASSEL -Version: 0.9.32 +Version: 0.9.33 Date: 2018-12-21 Title: R Front-End for TASSEL Authors@R: c( From ac908a9f2ee4aff4123778fb0ea26a1dba3faaf8 Mon Sep 17 00:00:00 2001 From: Brandon Date: Wed, 1 Nov 2023 10:27:23 -0400 Subject: [PATCH 3/6] Add merging methods for multiple genotype tables --- NAMESPACE | 2 ++ NEWS.md | 4 +++ R/filter_functions.R | 2 +- R/join_methods.R | 38 ++++++++++++++++++++++++++++ man/filterGenotypeTableBySiteName.Rd | 20 +++++++++++++++ man/mergeGenotypeTables.Rd | 17 +++++++++++++ 6 files changed, 82 insertions(+), 1 deletion(-) create mode 100644 man/filterGenotypeTableBySiteName.Rd create mode 100644 man/mergeGenotypeTables.Rd diff --git a/NAMESPACE b/NAMESPACE index c91c24a..18f9374 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -9,6 +9,7 @@ export(concatenate) export(createTree) export(distanceMatrix) export(exportGenotypeTable) +export(filterGenotypeTableBySiteName) export(filterGenotypeTableSites) export(filterGenotypeTableTaxa) export(genomicPrediction) @@ -22,6 +23,7 @@ export(ldJavaApp) export(ldPlot) export(linkageDiseq) export(mds) +export(mergeGenotypeTables) export(pca) export(plotManhattan) export(plotManhattanQC) diff --git a/NEWS.md b/NEWS.md index 8e14eb0..dbd3b9f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,9 @@ # rTASSEL 0.9.33 * Fixed typo in `plotPCA()` error message +* Add new function `filterGenotypeTableBySiteName()`: + + Filters genotype tables using literal marker names/IDs +* Add new function `mergeGenotypeTables()`: + + Merges multiple genotype tables by site values # rTASSEL 0.9.32 diff --git a/R/filter_functions.R b/R/filter_functions.R index 3872ef4..627ca8a 100644 --- a/R/filter_functions.R +++ b/R/filter_functions.R @@ -278,7 +278,7 @@ filterGenotypeTableBySiteName <- function(tasObj, siteNames) { rJava::.jnull(), FALSE ) - plugin$siteNamesList(siteNames) + plugin$siteNamesList(idsToKeep) dataSet <- rJava::J("net.maizegenetics.plugindef.DataSet") diff --git a/R/join_methods.R b/R/join_methods.R index c95c3e0..7b6fe39 100644 --- a/R/join_methods.R +++ b/R/join_methods.R @@ -122,3 +122,41 @@ concatenate <- function(x) { } +##---- +#' @title Merge genotype tables +#' +#' @description +#' Merges multiple genotype tables together by site information +#' +#' @return Returns an object of \code{TasselGenotypePhenotype} class. +#' +#' @name mergeGenotypeTables +#' @rdname mergeGenotypeTables +#' +#' @param tasObjL A list of objects of class \code{TasselGenotypePenotype}. +#' +#' @export +mergeGenotypeTables <- function(tasObjL) { + mergeGtClassPath <- "net/maizegenetics/analysis/data/MergeGenotypeTablesPlugin" + gtClassPath <- "net/maizegenetics/dna/snp/GenotypeTable" + frameClassPath <- "java/awt/Frame" + + gtArray <- rJava::.jarray( + x = lapply(tasObjL, rTASSEL:::getGenotypeTable), + contents.class = gtClassPath + ) + + mergeGtPlugin <- rJava::new( + rJava::J(mergeGtClassPath), + rJava::.jnull(frameClassPath), + FALSE + ) + + mergedGt <- rTASSEL:::.tasselObjectConstructor( + mergeGtPlugin$mergeGenotypeTables(gtArray) + ) + + return(mergedGt) +} + + diff --git a/man/filterGenotypeTableBySiteName.Rd b/man/filterGenotypeTableBySiteName.Rd new file mode 100644 index 0000000..f82a0f7 --- /dev/null +++ b/man/filterGenotypeTableBySiteName.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/filter_functions.R +\name{filterGenotypeTableBySiteName} +\alias{filterGenotypeTableBySiteName} +\title{Filter genotype table by site IDs} +\usage{ +filterGenotypeTableBySiteName(tasObj, siteNames) +} +\arguments{ +\item{tasObj}{An object of class \code{TasselGenotypePenotype}.} + +\item{siteNames}{A character vector of site names to filter on.} +} +\value{ +Returns an object of \code{TasselGenotypePhenotype} class. +} +\description{ +Filter a genotype table object by specifying literal site names (IDs) +for variant markers. +} diff --git a/man/mergeGenotypeTables.Rd b/man/mergeGenotypeTables.Rd new file mode 100644 index 0000000..27d4050 --- /dev/null +++ b/man/mergeGenotypeTables.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/join_methods.R +\name{mergeGenotypeTables} +\alias{mergeGenotypeTables} +\title{Merge genotype tables} +\usage{ +mergeGenotypeTables(tasObjL) +} +\arguments{ +\item{tasObjL}{A list of objects of class \code{TasselGenotypePenotype}.} +} +\value{ +Returns an object of \code{TasselGenotypePhenotype} class. +} +\description{ +Merges multiple genotype tables together by site information +} From 03043df05dd4f3863d667605379a4c5b36db58fc Mon Sep 17 00:00:00 2001 From: Brandon Date: Mon, 6 Nov 2023 15:28:29 -0500 Subject: [PATCH 4/6] Add tests --- R/filter_functions.R | 19 +++++++++++++++++-- R/join_methods.R | 8 ++++++++ inst/extdata/rt_sub_chr1.vcf | 22 ++++++++++++++++++++++ inst/extdata/rt_sub_chr5.vcf | 17 +++++++++++++++++ tests/testthat/test_filtration.R | 29 +++++++++++++++++++++++++++++ tests/testthat/test_joining.R | 23 +++++++++++++++++++++++ 6 files changed, 116 insertions(+), 2 deletions(-) create mode 100644 inst/extdata/rt_sub_chr1.vcf create mode 100644 inst/extdata/rt_sub_chr5.vcf diff --git a/R/filter_functions.R b/R/filter_functions.R index 627ca8a..621b4af 100644 --- a/R/filter_functions.R +++ b/R/filter_functions.R @@ -264,15 +264,19 @@ filterGenotypeTableSites <- function( #' @export filterGenotypeTableBySiteName <- function(tasObj, siteNames) { + # Rudimentary type check if (class(tasObj) != "TasselGenotypePhenotype") { stop("`tasObj` must be of class `TasselGenotypePhenotype`") } + # Get Java memory pointer gtJava <- getGenotypeTable(tasObj) + # Instantiate new ArrayList object and populate with site names idsToKeep <- rJava::.jnew("java.util.ArrayList") for (id in siteNames) idsToKeep$add(id) + # Instantiate filter site builder plugin plugin <- rJava::new( rJava::J("net.maizegenetics.analysis.filter.FilterSiteBuilderPlugin"), rJava::.jnull(), @@ -280,10 +284,21 @@ filterGenotypeTableBySiteName <- function(tasObj, siteNames) { ) plugin$siteNamesList(idsToKeep) + # Instantiate DataSet object dataSet <- rJava::J("net.maizegenetics.plugindef.DataSet") - gtFilter <- .tasselObjectConstructor( - plugin$runPlugin(dataSet$getDataSet(gtJava)) + # Try to filter data - if outofbounds exception - no sites returned + gtFilter <- tryCatch( + expr = { + # run plugin and return S4 rTASSEL object (filtered) + .tasselObjectConstructor( + plugin$runPlugin(dataSet$getDataSet(gtJava)) + ) + }, + error = function(e) { + message("Error. No sites found: ") + message(" ", conditionMessage(e), "\n") + } ) return(gtFilter) diff --git a/R/join_methods.R b/R/join_methods.R index 7b6fe39..a94f61a 100644 --- a/R/join_methods.R +++ b/R/join_methods.R @@ -141,6 +141,14 @@ mergeGenotypeTables <- function(tasObjL) { gtClassPath <- "net/maizegenetics/dna/snp/GenotypeTable" frameClassPath <- "java/awt/Frame" + if (!is(tasObjL, "list")) { + stop("`tasObjL` must be a list") + } + + if (!all(unlist(lapply(tasObjL, is, "TasselGenotypePhenotype")))) { + stop("All elements in `tasObjL` must be of type TasselGenotypePhenotype") + } + gtArray <- rJava::.jarray( x = lapply(tasObjL, rTASSEL:::getGenotypeTable), contents.class = gtClassPath diff --git a/inst/extdata/rt_sub_chr1.vcf b/inst/extdata/rt_sub_chr1.vcf new file mode 100644 index 0000000..7339a92 --- /dev/null +++ b/inst/extdata/rt_sub_chr1.vcf @@ -0,0 +1,22 @@ +##fileformat=VCFv4.0 +##Tassel= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 33-16 38-11 4226 4722 A188 +1 157104 PZB00859.1 C A . PASS . GT 0/0 0/0 0/0 0/0 1/1 +1 1947984 PZA01271.1 G C . PASS . GT 1/1 0/0 1/1 0/0 1/1 +1 2914066 PZA03613.2 T G . PASS . GT 1/1 1/1 1/1 1/1 1/1 +1 2914171 PZA03613.1 T A . PASS . GT 0/0 0/0 0/0 0/0 0/0 +1 2915078 PZA03614.2 G A . PASS . GT 0/0 0/0 0/0 0/0 0/0 +1 2915242 PZA03614.1 T A . PASS . GT 0/0 0/0 0/0 0/0 0/0 +1 2973508 PZA00258.3 C G . PASS . GT 1/1 0/0 0/0 1/0 0/0 +1 3205252 PZA02962.13 T A . PASS . GT 0/0 0/0 0/0 0/0 0/0 +1 3205262 PZA02962.14 C G . PASS . GT 0/0 0/0 0/0 0/0 0/0 +1 3206090 PZA00599.25 T C . PASS . GT 1/1 0/0 1/1 0/0 0/0 +1 3706018 PZA02129.1 C T . PASS . GT 1/1 0/0 0/0 0/0 0/0 diff --git a/inst/extdata/rt_sub_chr5.vcf b/inst/extdata/rt_sub_chr5.vcf new file mode 100644 index 0000000..94baad6 --- /dev/null +++ b/inst/extdata/rt_sub_chr5.vcf @@ -0,0 +1,17 @@ +##fileformat=VCFv4.0 +##Tassel= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 33-16 38-11 4226 4722 A188 +5 656148 PZA01887.1 G A . PASS . GT 1/1 1/1 0/0 0/0 0/0 +5 920922 PZA00684.12 C T . PASS . GT 0/0 1/1 0/0 0/0 0/0 +5 945545 PZA02894.1 C G . PASS . GT 0/0 1/1 0/0 ./. 1/1 +5 2123914 PZA00191.5 T C . PASS . GT 1/1 0/0 0/0 1/1 0/0 +5 2123961 PZA00191.4 T C . PASS . GT 1/1 0/0 0/0 1/1 0/0 +5 3142800 PZB01352.6 C T . PASS . GT 0/0 0/0 0/0 0/0 0/0 diff --git a/tests/testthat/test_filtration.R b/tests/testthat/test_filtration.R index 8a4849f..e3cd761 100644 --- a/tests/testthat/test_filtration.R +++ b/tests/testthat/test_filtration.R @@ -726,6 +726,35 @@ test_that("filterGenotypeTableTaxa() returns correct data", { ) }) +test_that("filterGenotypeTableBySiteName() tests", { + + # positive control - known sites + testSitesPos <- head(siteSummary(tasGeno)$Site_Name) + + # Known and not valid sites + testSitesExp <- c( + head(siteSummary(tasGeno)$Site_Name), + "not_valid_a" + ) + + # negative control - not valid sites + testSitesNeg <- c("not_valid_a", "not_valid_b") + + expect_null(filterGenotypeTableBySiteName(tasGeno, testSitesNeg)) + expect_error(object = filterGenotypeTableBySiteName(mtcars, testSitesPos)) + expect_equal( + object = siteSummary( + filterGenotypeTableBySiteName(tasGeno, testSitesPos) + )$Site_Name, + expected = testSitesPos + ) + expect_equal( + object = siteSummary( + filterGenotypeTableBySiteName(tasGeno, testSitesExp) + )$Site_Name, + expected = testSitesPos + ) +}) diff --git a/tests/testthat/test_joining.R b/tests/testthat/test_joining.R index 7da85a8..acb8833 100644 --- a/tests/testthat/test_joining.R +++ b/tests/testthat/test_joining.R @@ -93,3 +93,26 @@ test_that("Joining returns correct values with PCA objects", { }) +test_that("mergeGenotypeTables() tests", { + gtA <- readGenotypeTableFromPath(system.file( + "extdata", + "rt_sub_chr1.vcf", + package = "rTASSEL" + )) + gtB <- readGenotypeTableFromPath(system.file( + "extdata", + "rt_sub_chr5.vcf", + package = "rTASSEL" + )) + + gtMerged <- mergeGenotypeTables(list(gtA, gtB)) + + expect_true(is(gtMerged, "TasselGenotypePhenotype")) + expect_error(mergeGenotypeTables(list(gtA, mtcars))) + expect_error(mergeGenotypeTables(LETTERS)) + expect_equal(gtMerged@jTaxaList$numberOfTaxa(), 5) + expect_equal(gtMerged@jPositionList$numberOfSites(), 17) +}) + + + From 189f9949eb487cbbc78a78a3dd6123d70684c131 Mon Sep 17 00:00:00 2001 From: Brandon Date: Wed, 8 Nov 2023 09:03:15 -0500 Subject: [PATCH 5/6] Add tests --- R/join_methods.R | 4 ++-- tests/testthat/test_joining.R | 10 +++++++++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/R/join_methods.R b/R/join_methods.R index a94f61a..1170bad 100644 --- a/R/join_methods.R +++ b/R/join_methods.R @@ -150,7 +150,7 @@ mergeGenotypeTables <- function(tasObjL) { } gtArray <- rJava::.jarray( - x = lapply(tasObjL, rTASSEL:::getGenotypeTable), + x = lapply(tasObjL, getGenotypeTable), contents.class = gtClassPath ) @@ -160,7 +160,7 @@ mergeGenotypeTables <- function(tasObjL) { FALSE ) - mergedGt <- rTASSEL:::.tasselObjectConstructor( + mergedGt <- .tasselObjectConstructor( mergeGtPlugin$mergeGenotypeTables(gtArray) ) diff --git a/tests/testthat/test_joining.R b/tests/testthat/test_joining.R index acb8833..5030c48 100644 --- a/tests/testthat/test_joining.R +++ b/tests/testthat/test_joining.R @@ -78,7 +78,7 @@ pcaRes <- pca(tasGeno) test_that("Joining returns correct values with PCA objects", { intersectPheno <- intersectJoin(c(pcaRes, tasPheno)) - phenoAttrib <- rTASSEL:::extractPhenotypeAttDf(intersectPheno@jPhenotypeTable) + phenoAttrib <- extractPhenotypeAttDf(intersectPheno@jPhenotypeTable) expect_equal( phenoAttrib$traitName, c("Taxa", "EarHT", "dpoll", "EarDia", "PC1", "PC2", "PC3", "PC4", "PC5") @@ -105,14 +105,22 @@ test_that("mergeGenotypeTables() tests", { package = "rTASSEL" )) + gtBFilter <- filterGenotypeTableTaxa(gtB, taxa = c("33-16", "38-11")) + gtMerged <- mergeGenotypeTables(list(gtA, gtB)) + gtMergedFilter <- mergeGenotypeTables(list(gtA, gtBFilter)) expect_true(is(gtMerged, "TasselGenotypePhenotype")) expect_error(mergeGenotypeTables(list(gtA, mtcars))) expect_error(mergeGenotypeTables(LETTERS)) expect_equal(gtMerged@jTaxaList$numberOfTaxa(), 5) + expect_equal(gtMergedFilter@jTaxaList$numberOfTaxa(), 5) expect_equal(gtMerged@jPositionList$numberOfSites(), 17) }) + + + + From ceffee7fc8a19f7ff7b2547469a0a306f3b04fef Mon Sep 17 00:00:00 2001 From: Brandon Date: Wed, 8 Nov 2023 09:06:45 -0500 Subject: [PATCH 6/6] Add news items --- NEWS | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/NEWS b/NEWS index 2bcb6c7..7631daf 100644 --- a/NEWS +++ b/NEWS @@ -1,5 +1,9 @@ # CHANGES IN VERSION 0.9.33 * Fixed typo in `plotPCA()` error message +* Add new function `filterGenotypeTableBySiteName()`: + + Filters genotype tables using literal marker names/IDs +* Add new function `mergeGenotypeTables()`: + + Merges multiple genotype tables by site values # CHANGES IN VERSION 0.9.32 * Updated `tableReport()` method dispatch for all `AssociationResults`