Skip to content

Commit

Permalink
infer effect column A0 & eff_on_minor_alleles param
Browse files Browse the repository at this point in the history
  • Loading branch information
Al-Murphy committed Sep 19, 2024
1 parent c8b154a commit 33891b0
Show file tree
Hide file tree
Showing 14 changed files with 222 additions and 54 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: MungeSumstats
Type: Package
Title: Standardise summary statistics from GWAS
Version: 1.13.6
Version: 1.13.7
Authors@R:
c(person(given = "Alan",
family = "Murphy",
Expand Down
13 changes: 13 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,16 @@
## CHANGES IN VERSION 1.13.7

### Bug fix
* `infer_eff_direction` now includes A0 as an ambiguous case as well as A1/A2.

### New features
* `eff_on_minor_alleles` parameter added (off by default) - controls whether
MungeSumstats should assume that the effects are majoritively measured on the
minor alleles. Default is FALSE as this is an assumption that won't be
appropriate in all cases. However, the benefit is that if we know the majority
of SNPs have their effects based on the minor alleles, we can catch cases where
the allele columns have been mislabelled.

## CHANGES IN VERSION 1.13.6

### New features
Expand Down
11 changes: 10 additions & 1 deletion R/format_sumstats.R
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,12 @@
#' the reference genome by SNP ID. Default is TRUE.
#' @param infer_eff_direction Binary Should a check take place to ensure the
#' alleles match the effect direction? Default is TRUE.
#' @param eff_on_minor_alleles Binary Should MungeSumstats assume that the
#' effects are majoritively measured on the minor alleles? Default is FALSE as
#' this is an assumption that won't be appropriate in all cases. However, the
#' benefit is that if we know the majority of SNPs have their effects based on
#' the minor alleles, we can catch cases where the allele columns have been
#' mislabelled.
#' @param strand_ambig_filter Binary Should SNPs with strand-ambiguous alleles
#' be removed. Default is FALSE.
#' @param allele_flip_check Binary Should the allele columns be checked against
Expand Down Expand Up @@ -267,6 +273,7 @@ format_sumstats <- function(path,
rmv_chr = c("X", "Y", "MT"),
on_ref_genome = TRUE,
infer_eff_direction = TRUE,
eff_on_minor_alleles = FALSE,
strand_ambig_filter = FALSE,
allele_flip_check = TRUE,
allele_flip_drop = TRUE,
Expand Down Expand Up @@ -359,6 +366,7 @@ format_sumstats <- function(path,
rmv_chr = rmv_chr,
on_ref_genome = on_ref_genome,
infer_eff_direction = infer_eff_direction,
eff_on_minor_alleles = eff_on_minor_alleles,
strand_ambig_filter = strand_ambig_filter,
allele_flip_check = allele_flip_check,
allele_flip_drop = allele_flip_drop,
Expand Down Expand Up @@ -496,7 +504,8 @@ format_sumstats <- function(path,
nThread = nThread,
ref_genome = ref_genome,
on_ref_genome = on_ref_genome,
infer_eff_direction = infer_eff_direction
infer_eff_direction = infer_eff_direction,
eff_on_minor_alleles = eff_on_minor_alleles
)

#### Check 3:Standardise headers for all OS ####
Expand Down
22 changes: 16 additions & 6 deletions R/get_eff_frq_allele_combns.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ get_eff_frq_allele_combns <-
mapping_file[mapping_file$CORRECTED %in% eff_frq_cols,]$UNCORRECTED
#join with all allele cols
allele_uncorrc <-
mapping_file[mapping_file$CORRECTED %in% c('A1','A2'),]$UNCORRECTED
mapping_file[mapping_file$CORRECTED %in% c('A1','A2','A*'),]$UNCORRECTED
#get combinations
eff_frq_allele_dt <-
data.table::as.data.table(expand.grid(eff_frq_cols_uncorrc,
Expand Down Expand Up @@ -51,25 +51,35 @@ get_eff_frq_allele_combns <-
eff_frq_allele_matches <- data.table::rbindlist(all_combns)
#finally add some custom ones
custom_adds <- data.table::data.table("UNCORRECTED" =
c("BETA1", "BETA2","AF1","AF2",
c("BETA1", "BETA2","BETA0",
"AF1","AF2","AF0",
"FREQ.A1.1000G.EUR",
"FREQ.A2.1000G.EUR",
"FREQ.A0.1000G.EUR",
"FREQ.A1.ESP.EUR",
"FREQ.A2.ESP.EUR",
"FREQ.A0.ESP.EUR",
"FREQ.ALLELE1.HAPMAPCEU",
"FREQ.ALLELE2.HAPMAPCEU",
"FREQ1","FREQ2",
"FREQ1.HAPMAP","FREQ2.HAPMAP"),
"FREQ.ALLELE0.HAPMAPCEU",
"FREQ1","FREQ2","FREQ0",
"FREQ1.HAPMAP","FREQ2.HAPMAP",
"FREQ0.HAPMAP"),
"CORRECTED" =
c("BETA", "BETA","FRQ","FRQ",
c("BETA", "BETA","BETA",
"FRQ","FRQ","FRQ",
"FRQ",
"FRQ",
"FRQ",
"FRQ",
"FRQ",
"FRQ",
"FRQ",
"FRQ",
"FRQ",
"FRQ","FRQ","FRQ",
"FRQ","FRQ",
"FRQ","FRQ"))
"FRQ"))
eff_frq_allele_matches <- data.table::rbindlist(list(
eff_frq_allele_matches,custom_adds))

Expand Down
3 changes: 2 additions & 1 deletion R/get_genome_build.R
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@
#' This should help speed up cases where you have to read in \code{sumstats}
#' from disk each time.
#' @param allele_match_ref Instead of returning the genome_build this will
#' return the propotion of matches to each genome build for each allele (A1,A2).
#' return the proportion of matches to each genome build for each allele
#' (A1,A2).
#' @inheritParams format_sumstats
#' @inheritParams get_genome_builds
#'
Expand Down
148 changes: 115 additions & 33 deletions R/infer_effect_column.R
Original file line number Diff line number Diff line change
@@ -1,20 +1,28 @@
#' Infer if effect relates to a1 or A2 if ambiguously named
#'
#' Three checks are made to infer which allele the effect/frequency information
#' relates to if they are ambiguous (named A1 and A2 or equivalent):
#' 1. Check if ambiguous naming conventions are used (i.e. allele 1 and 2 or
#' relates to if they are ambiguous (named A0, A1 and A2 or equivalent):
#' 1. Check if ambiguous naming conventions are used (i.e. allele 0, 1 and 2 or
#' equivalent). If not exit, otherwise continue to next checks. This can be
#' checked by using the mapping file and splitting A1/A2 mappings by those that
#' contain 1 or 2 (ambiguous) or doesn't contain 1 or 2 e.g. effect,
#' contain 0, 1 or 2 (ambiguous) or doesn't contain 0, 1 or 2 e.g. effect,
#' tested (unambiguous so fine for MSS to handle as is).
#' 2. Look for effect column/frequency column where the A1/A2 explicitly
#' mentioned, if found then we know the direction and should update A1/A2
#' 2. Look for effect column/frequency column where the A0/A1/A2 explicitly
#' mentioned, if found then we know the direction and should update A0/A1/A2
#' naming so A2 is the effect column. We can look for such columns by getting
#' every combination of A1/A2 naming and effect/frq naming.
#' every combination of A0/A1/A2 naming and effect/frq naming.
#' 3. If not found in 2, a final check should be against the reference genome,
#' whichever of A1 and A2 has more of a match with the reference genome should
#' be taken as **not** the effect allele. There is an assumption in this but is
#' still better than guessing the ambiguous allele naming.
#' whichever of A0, A1 and A2 has more of a match with the reference genome
#' should be taken as **not** the effect allele. There is an assumption in this
#' but is still better than guessing the ambiguous allele naming.
#'
#' Also, if eff_on_minor_alleles=TRUE, check 3 will be used in all cases.
#' However, This assumes that the effects are majoritively measured on the
#' minor alleles and should be used with caution as this is an assumption that
#' won't be appropriate in all cases. However, the benefit is that if we know
#' the majority of SNPs have their effects based on the minor alleles, we can
#' catch cases where the allele columns have been mislabelled. IF
#' eff_on_minor_alleles=TRUE, checks 1 and 2 will be skipped.
#'
#' @inheritParams format_sumstats
#' @inheritParams compute_nsize
Expand All @@ -36,8 +44,9 @@ infer_effect_column <-
ref_genome = NULL,
on_ref_genome = TRUE,
infer_eff_direction = TRUE,
eff_on_minor_alleles = FALSE,
return_list=TRUE) {
if(isTRUE(infer_eff_direction)){
if(isTRUE(infer_eff_direction)||isTRUE(eff_on_minor_alleles)){
message("Infer Effect Column")
message("First line of summary statistics file: ")
msg <- paste0(names(sumstats_dt), split = "\t")
Expand All @@ -48,23 +57,29 @@ infer_effect_column <-
# Identify allele mappings which are ambiguous and problematic
# vs those that are interpretable
colnames(mapping_file) <- toupper(colnames(mapping_file))
allele_mapping <- mapping_file[mapping_file$CORRECTED %in% c('A1','A2'),]
#A* is A0, A* used since usually if A0/A1 used, meaning of A1
#becomes eff allele and A0 is non-eff so need to flip later
allele_mapping <- mapping_file[mapping_file$CORRECTED %in% c('A1','A2',
'A*'),]
ambig_allele_map <-
allele_mapping[grepl('1',allele_mapping$UNCORRECTED)|
grepl('2',allele_mapping$UNCORRECTED),]
grepl('2',allele_mapping$UNCORRECTED)|
grepl('0',allele_mapping$UNCORRECTED),]
unambig_allele_map <-
allele_mapping[!(grepl('1',allele_mapping$UNCORRECTED)|
grepl('2',allele_mapping$UNCORRECTED)),]
grepl('2',allele_mapping$UNCORRECTED)|
grepl('0',allele_mapping$UNCORRECTED)),]
#as long as the sumstats contains 1 unambiguous allele column MSS will
#work as expected
unambig_cols <- intersect(unambig_allele_map$UNCORRECTED,
toupper(column_headers))
ambig_cols <- intersect(ambig_allele_map$UNCORRECTED,
toupper(column_headers))
#if both ambiguous and unambiguous columns found, rename ambiguous ones so
#they aren't used later by MSS
#if both ambiguous and unambiguous columns found, rename ambiguous ones
#so they aren't used later by MSS
#example: 'A1','A2','EFFECT_ALLELE' all present
if (length(unambig_cols)>0 && length(ambig_cols)>0){
if (length(unambig_cols)>0 && length(ambig_cols)>0 &&
isFALSE(eff_on_minor_alleles)){
#find if unambig and ambig relate to the same allele
#get corrected name for unambig
unambig_corrcted <-
Expand Down Expand Up @@ -92,27 +107,53 @@ infer_effect_column <-
paste0(chng_i,"_INPUTTED"))
}
}
} else if (length(unambig_cols)==0 && length(ambig_cols)>=2){
#only continue if no unambiguous columns found but 2 ambig ones are found-
#less than 2 in total means allele info is missing which MSS can try fill
#in later
message("Allele columns are ambiguous, attempting to infer direction")
#get names for allele marked eff/frq columns
eff_frq_allele_matches <- get_eff_frq_allele_combns()
#now look for matches in sumstats
fnd_allele_indicator <-
column_headers[toupper(column_headers) %in%
eff_frq_allele_matches$UNCORRECTED]
} else if ((length(unambig_cols)==0 && length(ambig_cols)>=2) ||
isTRUE(eff_on_minor_alleles)){
#first case for ambig allelee where user didn't set eff_on_minor_alleles
if ((length(unambig_cols)==0 && length(ambig_cols)>=2) &&
isFALSE(eff_on_minor_alleles)){
#only continue if no unambiguous columns found but 2 ambig ones are
#found- less than 2 in total means allele info is missing which MSS
#can try fill in later
message("Allele columns are ambiguous, attempting to infer direction")
#get names for allele marked eff/frq columns
eff_frq_allele_matches <- get_eff_frq_allele_combns()
#now look for matches in sumstats
fnd_allele_indicator <-
column_headers[toupper(column_headers) %in%
eff_frq_allele_matches$UNCORRECTED]
} else{
#for eff_on_minor_alleles = TRUE -
#force length(fnd_allele_indicator)>0 to return FALSE
fnd_allele_indicator<-c()
}
if(length(fnd_allele_indicator)>0){
message("Found direction from effect/frq column naming")
#fnd_allele_indicator could be >1 so majority vote
a1_mtch <- sum(grepl("A1",fnd_allele_indicator))
a2_mtch <- sum(grepl("A2",fnd_allele_indicator))
if(a2_mtch>=a1_mtch){
message("Effect/frq column(s) relate to A2 in the sumstats")
a0_mtch <- sum(grepl("A0",fnd_allele_indicator))
#need to also check if allele 0 & allele 1 found or more normal case
#of allele 1 & allele 2, as this flips which is interp as the eff
#allele by MSS
samp_dt <- copy(sumstats_dt[1:10])
samp_dt <-
standardise_sumstats_column_headers_crossplatform(samp_dt,
mapping_file=
mapping_file,
convert_A0=FALSE,
return_list=FALSE)
formatted_col_headers <- names(samp_dt)
#check if A0,A1 and A2 present
a1_found <- "A1" %in% formatted_col_headers
a2_found <- "A2" %in% formatted_col_headers
a0_found <- "A*" %in% formatted_col_headers
#if A2 found at all, it is eff col normally in MSS
if(a2_mtch>=a1_mtch && a2_found){
message("Effect/frq column(s) relate to A2 in the sumstat")
#this is what MSS expects so no action required
}else{#a2_mtch<a1_mtch
message("Effect/frq column(s) relate to A1 in the sumstats")
}else if(a2_mtch<a1_mtch && a2_found){
message("Effect/frq column(s) relate to A1 in the sumstat")
#this is the opposite to what MSS expects so switch A1/A2 naming
#first get corrected names for allele columns then switch
for (headerI in seq_len(nrow(mapping_file))) {
Expand All @@ -127,18 +168,43 @@ infer_effect_column <-
data.table::setnames(sumstats_dt,"A2","A2_INPUTTED_OLD_")
data.table::setnames(sumstats_dt,"A1","A2")
data.table::setnames(sumstats_dt,"A2_INPUTTED_OLD_","A1")
}else if(a1_mtch>=a0_mtch&& a0_found){
message("Effect/frq column(s) relate to A1 where A0 in the sumstat")
#this is what MSS expects so no action required
}else if(a1_mtch<a0_mtch&& a0_found){
message("Effect/frq column(s) relate to A0 in the sumstat")
#this is the opposite to what MSS expects so switch A1/A0 naming
#first get corrected names for allele columns then switch
for (headerI in seq_len(nrow(mapping_file))) {
un <- mapping_file[headerI, "UNCORRECTED"]
cr <- mapping_file[headerI, "CORRECTED"]
#note ambig_cols here not all col headers
if (un %in% ambig_cols & (!cr %in% column_headers)) {
data.table::setnames(sumstats_dt, un, cr)
}
}
#now switch
data.table::setnames(sumstats_dt,"A*","A0_INPUTTED_OLD_")
#don't flip A1 as the next step of the mapping is to flip it anyway
data.table::setnames(sumstats_dt,"A0_INPUTTED_OLD_","A2")
}else{#unknown
print("ERROR: Unknown condition, not changing allele mapping.")
}

}
else{
#didn't find any allele specific
#Either - didn't find any allele specific or set eff_on_minor_alleles
#last option is to check the reference genome and take the allele with
#more matches to it as the non-effect allele
#need to standardise the sumstats for this
#use get_genome_build function for this
if(isFALSE(on_ref_genome) && isTRUE(eff_on_minor_alleles)){
stop("on_ref_genome must equal TRUE to use eff_on_minor_alleles")
}
if(isTRUE(on_ref_genome)){
#Note this check will also work for when A0 is present
switch_req <- get_genome_build(
sumstats = sumstats_dt,
sumstats = copy(sumstats_dt),
standardise_headers = TRUE,
sampled_snps = sampled_snps,
mapping_file = mapping_file,
Expand All @@ -163,6 +229,22 @@ infer_effect_column <-
data.table::setnames(sumstats_dt, un, cr)
}
}
# Special case!! A0/A1 -> ref/alt so A1 flips meaning,
# A0 is A* in mapping
# but usually A1/A2 -> ref/alt so if A* found,
# swap A1 to A2 and make A* -> A1
new_headers <- colnames(sumstats_dt)
if ("A*" %in% new_headers) {
# if A1 and A2 also present need to rename A2
if ("A1" %in% new_headers && "A2" %in% new_headers) {
data.table::setnames(sumstats_dt, "A2", "A2_from_input")
}
# if A1 present change to A2, doesn't have to be,
# can be imputted
data.table::setnames(sumstats_dt, "A1", "A2",
skip_absent = TRUE)
data.table::setnames(sumstats_dt, "A*", "A1")
}
#now switch
data.table::setnames(sumstats_dt,"A2","A2_INPUTTED_OLD_")
data.table::setnames(sumstats_dt,"A1","A2")
Expand Down
Loading

0 comments on commit 33891b0

Please sign in to comment.