infer effect column A0 & eff_on_minor_alleles param

Al-Murphy · Sep 19, 2024 · 33891b0 · 33891b0
1 parent c8b154a
commit 33891b0
Show file tree

Hide file tree

Showing 14 changed files with 222 additions and 54 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: MungeSumstats
 Type: Package
 Title: Standardise summary statistics from GWAS
-Version: 1.13.6
+Version: 1.13.7
 Authors@R:
     c(person(given = "Alan",
            family = "Murphy",

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,16 @@
+## CHANGES IN VERSION 1.13.7
+
+### Bug fix
+* `infer_eff_direction` now includes A0 as an ambiguous case as well as A1/A2.
+
+### New features
+* `eff_on_minor_alleles` parameter added (off by default) - controls whether 
+MungeSumstats should assume that the effects are majoritively measured on the 
+minor alleles. Default is FALSE as this is an assumption that won't be 
+appropriate in all cases. However, the benefit is that if we know the majority 
+of SNPs have their effects based on the minor alleles, we can catch cases where 
+the allele columns have been mislabelled. 
+
 ## CHANGES IN VERSION 1.13.6
 
 ### New features

diff --git a/R/format_sumstats.R b/R/format_sumstats.R
@@ -134,6 +134,12 @@
 #' the reference genome by SNP ID. Default is TRUE.
 #' @param infer_eff_direction Binary Should a check take place to ensure the 
 #' alleles match the effect direction? Default is TRUE.
+#' @param eff_on_minor_alleles Binary Should MungeSumstats assume that the 
+#' effects are majoritively measured on the minor alleles? Default is FALSE as
+#' this is an assumption that won't be appropriate in all cases. However, the 
+#' benefit is that if we know the majority of SNPs have their effects based on 
+#' the minor alleles, we can catch cases where the allele columns have been 
+#' mislabelled.
 #' @param strand_ambig_filter Binary Should SNPs with strand-ambiguous alleles
 #' be removed. Default is FALSE.
 #' @param allele_flip_check Binary Should the allele columns be checked against
@@ -267,6 +273,7 @@ format_sumstats <- function(path,
                             rmv_chr = c("X", "Y", "MT"),
                             on_ref_genome = TRUE,
                             infer_eff_direction = TRUE,
+                            eff_on_minor_alleles = FALSE,
                             strand_ambig_filter = FALSE,
                             allele_flip_check = TRUE,
                             allele_flip_drop = TRUE,
@@ -359,6 +366,7 @@ format_sumstats <- function(path,
             rmv_chr = rmv_chr,
             on_ref_genome = on_ref_genome,
             infer_eff_direction = infer_eff_direction,
+            eff_on_minor_alleles = eff_on_minor_alleles,
             strand_ambig_filter = strand_ambig_filter,
             allele_flip_check = allele_flip_check,
             allele_flip_drop = allele_flip_drop,
@@ -496,7 +504,8 @@ format_sumstats <- function(path,
             nThread = nThread,
             ref_genome = ref_genome,
             on_ref_genome = on_ref_genome,
-            infer_eff_direction = infer_eff_direction
+            infer_eff_direction = infer_eff_direction,
+            eff_on_minor_alleles = eff_on_minor_alleles
           )
 
         #### Check 3:Standardise headers for all OS ####

diff --git a/R/get_eff_frq_allele_combns.R b/R/get_eff_frq_allele_combns.R
@@ -19,7 +19,7 @@ get_eff_frq_allele_combns <-
       mapping_file[mapping_file$CORRECTED %in% eff_frq_cols,]$UNCORRECTED
     #join with all allele cols
     allele_uncorrc <- 
-      mapping_file[mapping_file$CORRECTED %in% c('A1','A2'),]$UNCORRECTED
+      mapping_file[mapping_file$CORRECTED %in% c('A1','A2','A*'),]$UNCORRECTED
     #get combinations
     eff_frq_allele_dt <- 
       data.table::as.data.table(expand.grid(eff_frq_cols_uncorrc, 
@@ -51,25 +51,35 @@ get_eff_frq_allele_combns <-
     eff_frq_allele_matches <- data.table::rbindlist(all_combns)
     #finally add some custom ones 
     custom_adds <- data.table::data.table("UNCORRECTED" = 
-                                            c("BETA1", "BETA2","AF1","AF2",
+                                            c("BETA1", "BETA2","BETA0",
+                                              "AF1","AF2","AF0",
                                               "FREQ.A1.1000G.EUR",
                                               "FREQ.A2.1000G.EUR",
+                                              "FREQ.A0.1000G.EUR",
                                               "FREQ.A1.ESP.EUR",
                                               "FREQ.A2.ESP.EUR",
+                                              "FREQ.A0.ESP.EUR",
                                               "FREQ.ALLELE1.HAPMAPCEU",
                                               "FREQ.ALLELE2.HAPMAPCEU",
-                                              "FREQ1","FREQ2", 
-                                              "FREQ1.HAPMAP","FREQ2.HAPMAP"),
+                                              "FREQ.ALLELE0.HAPMAPCEU",
+                                              "FREQ1","FREQ2","FREQ0", 
+                                              "FREQ1.HAPMAP","FREQ2.HAPMAP",
+                                              "FREQ0.HAPMAP"),
                                           "CORRECTED" = 
-                                            c("BETA", "BETA","FRQ","FRQ",
+                                            c("BETA", "BETA","BETA",
+                                              "FRQ","FRQ","FRQ",
                                               "FRQ",
                                               "FRQ",
                                               "FRQ",
                                               "FRQ",
                                               "FRQ",
                                               "FRQ",
+                                              "FRQ",
+                                              "FRQ",
+                                              "FRQ",
+                                              "FRQ","FRQ","FRQ",
                                               "FRQ","FRQ",
-                                              "FRQ","FRQ"))
+                                              "FRQ"))
     eff_frq_allele_matches <- data.table::rbindlist(list(
       eff_frq_allele_matches,custom_adds))
 

diff --git a/R/get_genome_build.R b/R/get_genome_build.R
@@ -22,7 +22,8 @@
 #' This should help speed up cases where you have to read in \code{sumstats}
 #' from disk each time.
 #' @param allele_match_ref Instead of returning the genome_build this will 
-#' return the propotion of matches to each genome build for each allele (A1,A2).
+#' return the proportion of matches to each genome build for each allele 
+#' (A1,A2).
 #' @inheritParams format_sumstats
 #' @inheritParams get_genome_builds 
 #' 

diff --git a/R/infer_effect_column.R b/R/infer_effect_column.R
@@ -1,20 +1,28 @@
 #' Infer if effect relates to a1 or A2 if ambiguously named
 #'
 #' Three checks are made to infer which allele the effect/frequency information
-#' relates to if they are ambiguous (named A1 and A2 or equivalent):
-#' 1. Check if ambiguous naming conventions are used (i.e. allele 1 and 2 or 
+#' relates to if they are ambiguous (named A0, A1 and A2 or equivalent):
+#' 1. Check if ambiguous naming conventions are used (i.e. allele 0, 1 and 2 or 
 #' equivalent). If not exit, otherwise continue to next checks. This can be 
 #' checked by using the mapping file and splitting A1/A2 mappings by those that 
-#' contain 1 or 2 (ambiguous) or doesn't contain 1 or 2 e.g. effect, 
+#' contain 0, 1 or 2 (ambiguous) or doesn't contain 0, 1 or 2 e.g. effect, 
 #' tested (unambiguous so fine for MSS to handle as is).
-#' 2. Look for effect column/frequency column where the A1/A2 explicitly 
-#' mentioned, if found then  we know the direction and should update A1/A2 
+#' 2. Look for effect column/frequency column where the A0/A1/A2 explicitly 
+#' mentioned, if found then  we know the direction and should update A0/A1/A2 
 #' naming so A2 is the effect column. We can look for such columns by getting 
-#' every combination of A1/A2 naming and effect/frq naming.
+#' every combination of A0/A1/A2 naming and effect/frq naming.
 #' 3. If not found in 2, a final check should be against the reference genome, 
-#' whichever of A1 and A2 has more of a match with the reference genome should 
-#' be taken as **not** the effect allele. There is an assumption in this but is 
-#' still better than guessing the ambiguous allele naming.
+#' whichever of A0, A1 and A2 has more of a match with the reference genome 
+#' should be taken as **not** the effect allele. There is an assumption in this 
+#' but is still better than guessing the ambiguous allele naming.
+#' 
+#' Also, if eff_on_minor_alleles=TRUE, check 3 will be used in all cases. 
+#' However, This assumes that the effects are majoritively measured on the 
+#' minor alleles and should be used with caution as this is an assumption that 
+#' won't be appropriate in all cases. However, the benefit is that if we know 
+#' the majority of SNPs have their effects based on the minor alleles, we can 
+#' catch cases where the allele columns have been mislabelled. IF 
+#' eff_on_minor_alleles=TRUE, checks 1 and 2 will be skipped.
 #'
 #' @inheritParams format_sumstats 
 #' @inheritParams compute_nsize
@@ -36,8 +44,9 @@ infer_effect_column <-
            ref_genome = NULL,
            on_ref_genome = TRUE,
            infer_eff_direction = TRUE,
+           eff_on_minor_alleles = FALSE,
            return_list=TRUE) {
-    if(isTRUE(infer_eff_direction)){
+    if(isTRUE(infer_eff_direction)||isTRUE(eff_on_minor_alleles)){
       message("Infer Effect Column")
       message("First line of summary statistics file: ")
       msg <- paste0(names(sumstats_dt), split = "\t")
@@ -48,23 +57,29 @@ infer_effect_column <-
       # Identify allele mappings which are ambiguous and problematic
       # vs those that are interpretable
       colnames(mapping_file) <- toupper(colnames(mapping_file))
-      allele_mapping <- mapping_file[mapping_file$CORRECTED %in% c('A1','A2'),]
+      #A* is A0, A* used since usually if A0/A1 used, meaning of A1 
+      #becomes eff allele and A0 is non-eff so need to flip later
+      allele_mapping <- mapping_file[mapping_file$CORRECTED %in% c('A1','A2',
+                                                                   'A*'),]
       ambig_allele_map <- 
         allele_mapping[grepl('1',allele_mapping$UNCORRECTED)|
-                         grepl('2',allele_mapping$UNCORRECTED),]
+                         grepl('2',allele_mapping$UNCORRECTED)|
+                         grepl('0',allele_mapping$UNCORRECTED),]
       unambig_allele_map <- 
         allele_mapping[!(grepl('1',allele_mapping$UNCORRECTED)|
-                           grepl('2',allele_mapping$UNCORRECTED)),]
+                           grepl('2',allele_mapping$UNCORRECTED)|
+                           grepl('0',allele_mapping$UNCORRECTED)),]
       #as long as the sumstats contains 1 unambiguous allele column MSS will
       #work as expected
       unambig_cols <- intersect(unambig_allele_map$UNCORRECTED,
                                 toupper(column_headers))
       ambig_cols <- intersect(ambig_allele_map$UNCORRECTED,
                               toupper(column_headers))
-      #if both ambiguous and unambiguous columns found, rename ambiguous ones so
-      #they aren't used later by MSS
+      #if both ambiguous and unambiguous columns found, rename ambiguous ones
+      #so they aren't used later by MSS
       #example: 'A1','A2','EFFECT_ALLELE' all present
-      if (length(unambig_cols)>0 && length(ambig_cols)>0){
+      if (length(unambig_cols)>0 && length(ambig_cols)>0 && 
+          isFALSE(eff_on_minor_alleles)){
         #find if unambig and ambig relate to the same allele
         #get corrected name for unambig 
         unambig_corrcted <- 
@@ -92,27 +107,53 @@ infer_effect_column <-
                                  paste0(chng_i,"_INPUTTED"))
           }
         }
-      } else if (length(unambig_cols)==0 && length(ambig_cols)>=2){
-        #only continue if no unambiguous columns found but 2 ambig ones are found-
-        #less than 2 in total means allele info is missing which MSS can try fill
-        #in later
-        message("Allele columns are ambiguous, attempting to infer direction")
-        #get names for allele marked eff/frq columns
-        eff_frq_allele_matches <- get_eff_frq_allele_combns()
-        #now look for matches in sumstats
-        fnd_allele_indicator <- 
-          column_headers[toupper(column_headers) %in% 
-                           eff_frq_allele_matches$UNCORRECTED]
+      } else if ((length(unambig_cols)==0 && length(ambig_cols)>=2) || 
+                 isTRUE(eff_on_minor_alleles)){
+        #first case for ambig allelee where user didn't set eff_on_minor_alleles
+        if ((length(unambig_cols)==0 && length(ambig_cols)>=2) && 
+            isFALSE(eff_on_minor_alleles)){
+          #only continue if no unambiguous columns found but 2 ambig ones are 
+          #found- less than 2 in total means allele info is missing which MSS 
+          #can try fill in later
+          message("Allele columns are ambiguous, attempting to infer direction")
+          #get names for allele marked eff/frq columns
+          eff_frq_allele_matches <- get_eff_frq_allele_combns()
+          #now look for matches in sumstats
+          fnd_allele_indicator <- 
+            column_headers[toupper(column_headers) %in% 
+                             eff_frq_allele_matches$UNCORRECTED]
+        } else{
+          #for eff_on_minor_alleles = TRUE - 
+          #force length(fnd_allele_indicator)>0 to return FALSE
+          fnd_allele_indicator<-c()
+        }
         if(length(fnd_allele_indicator)>0){
           message("Found direction from effect/frq column naming")
           #fnd_allele_indicator could be >1 so majority vote
           a1_mtch <- sum(grepl("A1",fnd_allele_indicator))
           a2_mtch <- sum(grepl("A2",fnd_allele_indicator))
-          if(a2_mtch>=a1_mtch){
-            message("Effect/frq column(s) relate to A2 in the sumstats")
+          a0_mtch <- sum(grepl("A0",fnd_allele_indicator))
+          #need to also check if allele 0 & allele 1 found or more normal case
+          #of allele 1 & allele 2, as this flips which is interp as the eff 
+          #allele by MSS
+          samp_dt <- copy(sumstats_dt[1:10])
+          samp_dt <- 
+            standardise_sumstats_column_headers_crossplatform(samp_dt,
+                                                              mapping_file=
+                                                                mapping_file,
+                                                              convert_A0=FALSE,
+                                                              return_list=FALSE)
+          formatted_col_headers <- names(samp_dt)
+          #check if A0,A1 and A2 present
+          a1_found <- "A1" %in% formatted_col_headers
+          a2_found <- "A2" %in% formatted_col_headers
+          a0_found <- "A*" %in% formatted_col_headers
+          #if A2 found at all, it is eff col normally in MSS
+          if(a2_mtch>=a1_mtch && a2_found){
+            message("Effect/frq column(s) relate to A2 in the sumstat")
             #this is what MSS expects so no action required
-          }else{#a2_mtch<a1_mtch
-            message("Effect/frq column(s) relate to A1 in the sumstats")
+          }else if(a2_mtch<a1_mtch && a2_found){
+            message("Effect/frq column(s) relate to A1 in the sumstat")
             #this is the opposite to what MSS expects so switch A1/A2 naming
             #first get corrected names for allele columns then switch
             for (headerI in seq_len(nrow(mapping_file))) {
@@ -127,18 +168,43 @@ infer_effect_column <-
             data.table::setnames(sumstats_dt,"A2","A2_INPUTTED_OLD_")
             data.table::setnames(sumstats_dt,"A1","A2")
             data.table::setnames(sumstats_dt,"A2_INPUTTED_OLD_","A1")
+          }else if(a1_mtch>=a0_mtch&& a0_found){
+            message("Effect/frq column(s) relate to A1 where A0 in the sumstat")
+            #this is what MSS expects so no action required
+          }else if(a1_mtch<a0_mtch&& a0_found){
+            message("Effect/frq column(s) relate to A0 in the sumstat")
+            #this is the opposite to what MSS expects so switch A1/A0 naming
+            #first get corrected names for allele columns then switch
+            for (headerI in seq_len(nrow(mapping_file))) {
+              un <- mapping_file[headerI, "UNCORRECTED"]
+              cr <- mapping_file[headerI, "CORRECTED"]
+              #note ambig_cols here not all col headers
+              if (un %in% ambig_cols & (!cr %in% column_headers)) {
+                data.table::setnames(sumstats_dt, un, cr)
+              }
+            }
+            #now switch
+            data.table::setnames(sumstats_dt,"A*","A0_INPUTTED_OLD_")
+            #don't flip A1 as the next step of the mapping is to flip it anyway
+            data.table::setnames(sumstats_dt,"A0_INPUTTED_OLD_","A2")
+          }else{#unknown
+            print("ERROR: Unknown condition, not changing allele mapping.")
           }
 
         }
         else{
-          #didn't find any allele specific
+          #Either - didn't find any allele specific or set eff_on_minor_alleles
           #last option is to check the reference genome and take the allele with
           #more matches to it as the non-effect allele
           #need to standardise the sumstats for this
           #use get_genome_build function for this
+          if(isFALSE(on_ref_genome) && isTRUE(eff_on_minor_alleles)){
+            stop("on_ref_genome must equal TRUE to use eff_on_minor_alleles")
+          }
           if(isTRUE(on_ref_genome)){
+            #Note this check will also work for when A0 is present
             switch_req <- get_genome_build(
-              sumstats = sumstats_dt,
+              sumstats = copy(sumstats_dt),
               standardise_headers = TRUE,
               sampled_snps = sampled_snps,
               mapping_file = mapping_file,
@@ -163,6 +229,22 @@ infer_effect_column <-
                     data.table::setnames(sumstats_dt, un, cr)
                   }
                 }
+                # Special case!! A0/A1 -> ref/alt so A1 flips meaning,
+                # A0 is A* in mapping
+                # but usually A1/A2 -> ref/alt so if A* found,
+                # swap A1 to A2 and make A* -> A1
+                new_headers <- colnames(sumstats_dt)
+                if ("A*" %in% new_headers) {
+                  # if A1 and A2 also present need to rename A2
+                  if ("A1" %in% new_headers && "A2" %in% new_headers) {
+                    data.table::setnames(sumstats_dt, "A2", "A2_from_input")
+                  }
+                  # if A1 present change to A2, doesn't have to be, 
+                  # can be imputted
+                  data.table::setnames(sumstats_dt, "A1", "A2", 
+                                       skip_absent = TRUE)
+                  data.table::setnames(sumstats_dt, "A*", "A1")
+                }
                 #now switch
                 data.table::setnames(sumstats_dt,"A2","A2_INPUTTED_OLD_")
                 data.table::setnames(sumstats_dt,"A1","A2")