add vignettes #56

Merged: 24 commits, Jul 26, 2024
1 change: 1 addition & 0 deletions .gitignore
@@ -7,3 +7,4 @@
.RData
*.Rproj*
.Rproj.user
inst/doc
4 changes: 3 additions & 1 deletion DESCRIPTION
@@ -6,7 +6,6 @@ Authors@R: c(
person('Paul', 'Boutros', role = 'cre', email = 'PBoutros@mednet.ucla.edu'),
person('Nicole', 'Zeltser', role = 'aut', comment = c(ORCID = '000-0001-7246-2771')),
person('Rachel', 'Dang', role = 'ctb'))
Maintainer: Nicole Zeltser <nzeltser@mednet.ucla.edu>
Description: This tool is intended to simply and transparently parse
genotype/dosage data from an input VCF, match genotype coordinates
to the component SNPs of an existing polygenic score, and apply
@@ -23,9 +22,12 @@ Imports:
BoutrosLab.plotting.general,
lattice
Suggests:
knitr,
rmarkdown,
testthat (>= 3.0.0)
Config/testthat/edition: 3
License: GPL-2
Encoding: UTF-8
LazyData: true
RoxygenNote: 7.3.1
VignetteBuilder: knitr
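The new `Suggests` and `VignetteBuilder` entries above are what enable knitr-based vignette building, and the `inst/doc` addition to `.gitignore` keeps the built output out of version control. A minimal sketch of how these pieces are typically exercised during development; the vignette name below is hypothetical and not taken from this PR:

```r
# Suggested packages needed to knit vignettes (install if missing):
# install.packages(c('knitr', 'rmarkdown'));

# Scaffold a new vignette skeleton; 'getting-started' is a hypothetical name.
usethis::use_vignette('getting-started');

# Knit all vignettes into inst/doc, which the updated .gitignore now excludes.
devtools::build_vignettes();
```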
4 changes: 0 additions & 4 deletions NEWS

This file was deleted.

3 changes: 3 additions & 0 deletions NEWS.md
@@ -0,0 +1,3 @@
# ApplyPolygenicScore (0.1.0)

* INITIAL FEATURES
30 changes: 22 additions & 8 deletions R/apply-pgs.R
@@ -98,6 +98,7 @@ validate.phenotype.data.input <- function(phenotype.data, phenotype.analysis.col
#' @param n.percentiles An integer indicating the number of percentiles to calculate for the PGS. Default is \code{NULL}.
#' @param analysis.source.pgs A character string indicating the source PGS for percentile calculation and regression analyses. Options are "mean.dosage", "normalize", or "none".
#' When not specified, defaults to \code{missing.genotype.method} choice and if more than one PGS missing genotype method is chosen, calculation defaults to the first selection.
#' @param validate.inputs.only A logical indicating whether to only perform input data validation checks without running PGS application. If no errors are triggered, a message is printed and TRUE is returned. Default is \code{FALSE}.
#' @return A list containing per-sample PGS output and per-phenotype regression output if phenotype analysis columns are provided.
#'
#' \strong{Output Structure}
@@ -150,13 +151,14 @@ validate.phenotype.data.input <- function(phenotype.data, phenotype.analysis.col
#' \code{normalize}: Missing genotypes are excluded from score calculation but the final score is normalized by the number of non-missing alleles.
#' The calculation assumes a diploid genome:
#' \deqn{PGS_i = \dfrac{\sum \left( \beta_m \times dosage_{im} \right)}{P_i * M_{non-missing}}}
#' Where \emph{P} is the ploidy and has the value \code{2} and \emph{M_{non-missing}} is the number of non-missing genotypes.
#' Where \emph{P} is the ploidy and has the value \code{2} and \eqn{M_{non-missing}} is the number of non-missing genotypes.
#'
#' \code{mean.dosage}: Missing genotype dosages are replaced by the mean population dosage of the variant which is calculated as the product of the effect allele frequency and the ploidy of a diploid genome:
#' \deqn{dosage_{im-missing} = EAF_m * P_i}
#' Where \emph{EAF} is the effect allele frequency and \emph{P} is the ploidy and has the value \code{2}.
#' By default, the effect allele frequency is calculated from the provided VCF data. For variants that are missing in all individuals, it is not possible to derive an effect allele frequency
#' and dosage is assumed to be zero (homozygous non-reference) for all individuals.
#' \code{mean.dosage}: Missing genotype dosages are replaced by the mean population dosage of the variant which is calculated as the product of the effect allele frequency \emph{EAF} and the ploidy of a diploid genome:
#' \deqn{\overline{dosage_{k}} = EAF_k * P}
#' where \emph{k} is a PGS component variant that is missing in between 1 and n-1 individuals in the cohort and \emph{P} = ploidy = 2.
#' This dosage calculation holds under assumptions of Hardy-Weinberg equilibrium.
#' By default, the effect allele frequency is calculated from the provided VCF data.
#' For variants that are missing in all individuals, dosage is assumed to be zero (homozygous non-reference) for all individuals.
#' An external allele frequency can be provided in the \code{pgs.weight.data} as a column named \code{allelefrequency_effect} and by setting \code{use.external.effect.allele.frequency} to \code{TRUE}.
#'
#' \strong{Multiallelic Site Handling}
@@ -216,6 +218,13 @@ validate.phenotype.data.input <- function(phenotype.data, phenotype.analysis.col
#' phenotype.data = phenotype.data
#' );
#'
#' # Only run validation checks on input data and report back
#' apply.polygenic.score(
#' vcf.data = vcf.import$dat,
#' pgs.weight.data = pgs.import$pgs.weight.data,
#' validate.inputs.only = TRUE
#' );
#'
#' @export
apply.polygenic.score <- function(
vcf.data,
@@ -227,7 +236,8 @@ apply.polygenic.score <- function(
missing.genotype.method = 'mean.dosage',
use.external.effect.allele.frequency = FALSE,
n.percentiles = NULL,
analysis.source.pgs = NULL
analysis.source.pgs = NULL,
validate.inputs.only = FALSE
) {

### Start Input Validation ###
@@ -236,6 +246,10 @@
validate.pgs.data.input(pgs.weight.data = pgs.weight.data, use.external.effect.allele.frequency = use.external.effect.allele.frequency);
validate.phenotype.data.input(phenotype.data = phenotype.data, phenotype.analysis.columns = phenotype.analysis.columns, vcf.data = vcf.data);

if (validate.inputs.only) {
cat('Input data passed validation\n');
return(TRUE);
}

# check missing genotype method input
if (all(missing.genotype.method %in% c('mean.dosage', 'normalize', 'none'))) {
@@ -316,7 +330,7 @@ apply.polygenic.score <- function(
}
}

### End Misssing Genotype Handling ###
### End Missing Genotype Handling ###

# calculate weighted dosage
if ('mean.dosage' %in% missing.genotype.method) {
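To make the `normalize` and `mean.dosage` formulas documented in the roxygen block above concrete, here is a small base-R sketch with toy numbers; the weights, dosages, and allele frequencies are invented for illustration and are not part of this PR:

```r
# Toy data: three PGS component variants for one sample.
beta <- c(0.2, -0.1, 0.4);     # per-variant effect weights
dosage <- c(2, NA, 1);         # genotype dosages; the second variant is missing
eaf <- c(0.30, 0.25, 0.10);    # effect allele frequencies
ploidy <- 2;

# 'normalize': drop the missing variant, then divide by ploidy * number of non-missing variants.
non.missing <- !is.na(dosage);
pgs.normalize <- sum(beta[non.missing] * dosage[non.missing]) / (ploidy * sum(non.missing));
# (0.2 * 2 + 0.4 * 1) / (2 * 2) = 0.2

# 'mean.dosage': impute each missing dosage with its expected value EAF * ploidy.
imputed.dosage <- ifelse(is.na(dosage), eaf * ploidy, dosage);
pgs.mean.dosage <- sum(beta * imputed.dosage);
# 0.2 * 2 + (-0.1) * (0.25 * 2) + 0.4 * 1 = 0.75
```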
4 changes: 2 additions & 2 deletions README.md
@@ -47,7 +47,7 @@ If you wish to apply a PGS to a cohort, we recommend that genotypes for the whol
### Recommended Workflow


1. Convert PGS weight files to BED coordinate files
1. Convert PGS weight files to BED coordinate files.

We recommend starting by filtering your input VCF for just the variants in your PGS weight files. Several software tools are available to do this, and almost all require a coordinate BED file. A description of BED format can be found [here](https://bedtools.readthedocs.io/en/latest/content/general-usage.html).

@@ -62,7 +62,7 @@ If you wish to apply a PGS to a cohort, we recommend that genotypes for the whol
3. Apply your PGS.

Provide your imported VCF and PGS weight files to `apply.polygenic.score`. It's as simple as that.
Under the hood, this function begins by calling `merge.vcf.with.pgs`. The merge function also outputs a list of variants in your PGS that could not be found in your VCf data, which you can obtain by calling the function independently.
Under the hood, this function begins by calling `merge.vcf.with.pgs`. The merge function also outputs a list of variants in your PGS that could not be found in your VCF data, which you can obtain by calling the function independently.
`apply.polygenic.score` outputs lots of useful information along with the score and provides various customizable options, such as methods for handling missing sites (see [this discussion](https://github.com/uclahs-cds/package-ApplyPolygenicScore/discussions/17) for more) and basic analyses with phenotype data.

4. Create summary plots.
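A condensed R sketch of the recommended workflow described in the README above. Only `apply.polygenic.score()` and `merge.vcf.with.pgs()` are named in this diff; the importer function names, file paths, and the structure of the merge output are assumptions used here for illustration:

```r
library(ApplyPolygenicScore);

# Step 2 (assumed importer names): read the pre-filtered VCF and the PGS weight file.
vcf.import <- import.vcf(vcf.path = 'cohort.pgs-sites.vcf.gz');
pgs.import <- import.pgs.weight.file(pgs.weight.path = 'PGS003378_hmPOS_GRCh38.txt.gz');

# Optional: call the merge step directly to inspect PGS variants absent from the VCF
# (the exact name of the missing-variant element is not shown in this diff).
merged.data <- merge.vcf.with.pgs(
    vcf.data = vcf.import$dat,
    pgs.weight.data = pgs.import$pgs.weight.data
    );

# Step 3: apply the score, imputing missing sites with mean dosage.
pgs.results <- apply.polygenic.score(
    vcf.data = vcf.import$dat,
    pgs.weight.data = pgs.import$pgs.weight.data,
    missing.genotype.method = 'mean.dosage'
    );
```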
Binary file added inst/extdata/PGS003378_hmPOS_GRCh38.txt.gz
Binary file not shown.
Binary file added inst/extdata/phenotype.test.data.Rda
Binary file not shown.
77 changes: 11 additions & 66 deletions man/apply.polygenic.score.Rd

Some generated files are not rendered by default.

16 changes: 15 additions & 1 deletion tests/testthat/test-pgs-application.R
@@ -199,9 +199,23 @@ test_that(
}
);

test_that(
'apply.polygenic.score correctly outputs validation only option', {
load('data/simple.pgs.application.test.data.Rda');
expect_equal(
apply.polygenic.score(
vcf.data = simple.pgs.application.test.data$vcf.data,
pgs.weight.data = simple.pgs.application.test.data$pgs.weight.data,
validate.inputs.only = TRUE
),
TRUE
);
}
);

test_that(
'apply.polygenic.score correctly formats general output', {
load('data/simple.pgs.application.test.data.Rda')
load('data/simple.pgs.application.test.data.Rda');
test.pgs.per.sample <- apply.polygenic.score(
vcf.data = simple.pgs.application.test.data$vcf.data,
pgs.weight.data = simple.pgs.application.test.data$pgs.weight.data
2 changes: 2 additions & 0 deletions vignettes/.gitignore
@@ -0,0 +1,2 @@
*.html
*.R