From c79a47aa44123ac87c8e082c78511a68dd4a5926 Mon Sep 17 00:00:00 2001 From: Al-Murphy Date: Wed, 17 Jul 2024 12:08:15 +0100 Subject: [PATCH] handle log10 p --- DESCRIPTION | 2 +- NEWS.md | 5 +++ R/format_sumstats.R | 4 ++ R/read_log_pval.R | 72 +++++++++++++++++++++++++++++++++++ R/read_vcf.R | 3 +- R/read_vcf_pval.R | 21 ---------- man/read_log_pval.Rd | 32 ++++++++++++++++ man/read_vcf_pval.Rd | 18 --------- tests/testthat/test-log10_p.R | 51 +++++++++++++++++++++++++ 9 files changed, 167 insertions(+), 41 deletions(-) create mode 100644 R/read_log_pval.R delete mode 100644 R/read_vcf_pval.R create mode 100644 man/read_log_pval.Rd delete mode 100644 man/read_vcf_pval.Rd create mode 100644 tests/testthat/test-log10_p.R diff --git a/DESCRIPTION b/DESCRIPTION index 383e7a7..5676796 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: MungeSumstats Type: Package Title: Standardise summary statistics from GWAS -Version: 1.13.1 +Version: 1.13.2 Authors@R: c(person(given = "Alan", family = "Murphy", diff --git a/NEWS.md b/NEWS.md index c249f12..3ee7a90 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,8 @@ +## CHANGES IN VERSION 1.13.2 + +### New features +* Handling of -log10 p-values (outside of VCFs) added. + ## CHANGES IN VERSION 1.13.1 ### New features diff --git a/R/format_sumstats.R b/R/format_sumstats.R index 4a404f4..4a6a619 100644 --- a/R/format_sumstats.R +++ b/R/format_sumstats.R @@ -482,6 +482,10 @@ format_sumstats <- function(path, "Corrected"=rep("ES",14)) mapping_file <- rbind(mapping_file,es_cols) } + + #### Check 40:Check for log10 p instead of p #### + sumstats_return <- + read_log_pval(sumstats_dt = sumstats_return$sumstats_dt) #### Check 2:Check for effect direction #### sumstats_return <- diff --git a/R/read_log_pval.R b/R/read_log_pval.R new file mode 100644 index 0000000..35a1e14 --- /dev/null +++ b/R/read_log_pval.R @@ -0,0 +1,72 @@ +#' Read -log10 p-value column +#' +#' Parse p-value column in VCF file.of other general -loq10 p-values +#' +#' @param sumstats_dt Summary stats data.table. +#' @param return_list Binary, whether to return the dt in a list or not - list +#' is standard for the `format_sumstats()` function. +#' @inheritParams format_sumstats +#' +#' @returns Null output. +#' @keywords internal +#' @importFrom data.table setnames +read_log_pval <- function(sumstats_dt,mapping_file = sumstatsColHeaders, + return_list=TRUE){ + P <- NULL + # Need to convert P-value, currently -log10 + #cgreate a check based on mapping file + column_headers <- names(sumstats_dt) + column_headers_upper <- toupper(names(sumstats_dt)) + #get all p mappings + mapping_file_p <- mapping_file[mapping_file$Corrected=="P",] + #now create all possible -log10 p-value + log_p_col <- c(paste0("-LOG10_",mapping_file_p$Uncorrected), + paste0("LOG10_",mapping_file_p$Uncorrected), + paste0("-LOG10 ",mapping_file_p$Uncorrected), + paste0("LOG10 ",mapping_file_p$Uncorrected), + paste0("-LOG10-",mapping_file_p$Uncorrected), + paste0("LOG10-",mapping_file_p$Uncorrected), + paste0("-LOG10.",mapping_file_p$Uncorrected), + paste0("LOG10.",mapping_file_p$Uncorrected), + paste0("-LOG10",mapping_file_p$Uncorrected), + paste0("LOG10",mapping_file_p$Uncorrected), + paste0("LOG_",mapping_file_p$Uncorrected), + paste0("-L_",mapping_file_p$Uncorrected), + paste0("L_",mapping_file_p$Uncorrected), + paste0("LOG-",mapping_file_p$Uncorrected), + paste0("-L-",mapping_file_p$Uncorrected), + paste0("L-",mapping_file_p$Uncorrected), + paste0("LOG.",mapping_file_p$Uncorrected), + paste0("-L.",mapping_file_p$Uncorrected), + paste0("L.",mapping_file_p$Uncorrected), + paste0("-L",mapping_file_p$Uncorrected), + paste0("L",mapping_file_p$Uncorrected), + "LP") + lp_found <- FALSE + for(col_i in column_headers_upper){ + if(isTRUE(any(col_i==log_p_col))){ + lp_found<-TRUE + lp_col_up<-col_i + } + } + if (isTRUE(lp_found)) { + messager("sumstats has -log10 P-values; these will be", + "converted to unadjusted p-values in the 'P' column.") + lp_col <- column_headers[[which(column_headers_upper %in% lp_col_up)]] + #check if -log10 or log10 + if(max(sumstats_dt[,get(lp_col)])<=0){ + #log10 - less likely so more lenient check + sumstats_dt[, P := 10^(as.numeric(get(lp_col)))] + }else{ + #-log10 - more likely + sumstats_dt[, P := 10^(-1 * as.numeric(get(lp_col)))] + } + + } + #### Return format #### + if(return_list){ + return(list("sumstats_dt" = sumstats_dt)) + }else { + return(sumstats_dt) + } +} \ No newline at end of file diff --git a/R/read_vcf.R b/R/read_vcf.R index c70114d..a6b6cc4 100644 --- a/R/read_vcf.R +++ b/R/read_vcf.R @@ -136,7 +136,8 @@ read_vcf <- function(path, #### Prepare SNP column #### read_vcf_markername(sumstats_dt = vcf_dt) #### Prepare/un-log P col #### - read_vcf_pval(sumstats_dt = vcf_dt) + sumstats_dt <- read_log_pval(sumstats_dt = vcf_dt, + return_list=FALSE) #### Prepare INFO col #### read_vcf_info(sumstats_dt = vcf_dt) #### Rename start col ##### diff --git a/R/read_vcf_pval.R b/R/read_vcf_pval.R deleted file mode 100644 index 5129a78..0000000 --- a/R/read_vcf_pval.R +++ /dev/null @@ -1,21 +0,0 @@ -#' Read VCF: p-value column -#' -#' Parse p-value column in VCF file. -#' -#' @param sumstats_dt Summary stats data.table. -#' -#' @returns Null output. -#' @keywords internal -#' @importFrom data.table setnames -read_vcf_pval <- function(sumstats_dt){ - - P <- LP <- NULL - # Need to convert P-value, currently -log10 - if ("Pval" %in% colnames(sumstats_dt)) { - data.table::setnames(sumstats_dt, "P","Pval") - } else if ("LP" %in% names(sumstats_dt)) { - messager("VCF file has -log10 P-values; these will be", - "converted to unadjusted p-values in the 'P' column.") - sumstats_dt[, P := 10^(-1 * as.numeric(LP))] - } -} \ No newline at end of file diff --git a/man/read_log_pval.Rd b/man/read_log_pval.Rd new file mode 100644 index 0000000..8cee37f --- /dev/null +++ b/man/read_log_pval.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/read_log_pval.R +\name{read_log_pval} +\alias{read_log_pval} +\title{Read -log10 p-value column} +\usage{ +read_log_pval( + sumstats_dt, + mapping_file = sumstatsColHeaders, + return_list = TRUE +) +} +\arguments{ +\item{sumstats_dt}{Summary stats data.table.} + +\item{mapping_file}{MungeSumstats has a pre-defined column-name mapping file +which should cover the most common column headers and their interpretations. +However, if a column header that is in youf file is missing of the mapping we +give is incorrect you can supply your own mapping file. Must be a 2 column +dataframe with column names "Uncorrected" and "Corrected". See +data(sumstatsColHeaders) for default mapping and necessary format.} + +\item{return_list}{Binary, whether to return the dt in a list or not - list +is standard for the \code{format_sumstats()} function.} +} +\value{ +Null output. +} +\description{ +Parse p-value column in VCF file.of other general -loq10 p-values +} +\keyword{internal} diff --git a/man/read_vcf_pval.Rd b/man/read_vcf_pval.Rd deleted file mode 100644 index d41936f..0000000 --- a/man/read_vcf_pval.Rd +++ /dev/null @@ -1,18 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/read_vcf_pval.R -\name{read_vcf_pval} -\alias{read_vcf_pval} -\title{Read VCF: p-value column} -\usage{ -read_vcf_pval(sumstats_dt) -} -\arguments{ -\item{sumstats_dt}{Summary stats data.table.} -} -\value{ -Null output. -} -\description{ -Parse p-value column in VCF file. -} -\keyword{internal} diff --git a/tests/testthat/test-log10_p.R b/tests/testthat/test-log10_p.R new file mode 100644 index 0000000..c6f1cbc --- /dev/null +++ b/tests/testthat/test-log10_p.R @@ -0,0 +1,51 @@ +test_that("log10 p-value", { + ## Call uses reference genome as default with more than 2GB of memory, + ## which is more than what 32-bit Windows can handle so remove tests + is_32bit_windows <- + .Platform$OS.type == "windows" && .Platform$r_arch == "i386" + if (!is_32bit_windows) { + # read it in and make N + sumstats_dt <- data.table::fread(system.file("extdata", "eduAttainOkbay.txt", + package = "MungeSumstats"), + nThread = 1) + #test -log10 + sumstats_dt[,LP:=-1*log(Pval,base=10)] + data.table::setnames(sumstats_dt,"Pval","Pval_org") + reformatted <- MungeSumstats::format_sumstats(sumstats_dt, + ref_genome = "GRCh37", + INFO_filter = 0.9, + on_ref_genome = FALSE, + strand_ambig_filter = FALSE, + bi_allelic_filter = FALSE, + allele_flip_check = FALSE, + log_folder_ind = FALSE, + imputation_ind = FALSE, + dbSNP=144, + ignore_multi_trait=TRUE, + return_data = TRUE) + testthat::expect_equal(reformatted$PVAL_ORG,reformatted$P) + #test log10 + sumstats_dt <- data.table::fread(system.file("extdata", "eduAttainOkbay.txt", + package = "MungeSumstats"), + nThread = 1) + sumstats_dt[,LP:=log(Pval,base=10)] + data.table::setnames(sumstats_dt,"Pval","Pval_org") + reformatted <- MungeSumstats::format_sumstats(sumstats_dt, + ref_genome = "GRCh37", + INFO_filter = 0.9, + on_ref_genome = FALSE, + strand_ambig_filter = FALSE, + bi_allelic_filter = FALSE, + allele_flip_check = FALSE, + log_folder_ind = FALSE, + imputation_ind = FALSE, + dbSNP=144, + ignore_multi_trait=TRUE, + return_data = TRUE) + testthat::expect_equal(reformatted$PVAL_ORG,reformatted$P) + } + else{ + testthat::expect_equal(is_32bit_windows, TRUE) + testthat::expect_equal(is_32bit_windows, TRUE) + } +})