Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

develop bed conversion #4

Merged
merged 19 commits into from
Jan 9, 2024
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
150 changes: 150 additions & 0 deletions R/convert-pgs-to-bed.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
# a utility function for adding slop to BED coordinates
add.slop <- function(bed, slop) {
bed$start <- bed$start - slop;
bed$end <- bed$end + slop;

# check for negative start coordinates, replace with 0, and issue a warning
if (any(bed$start < 0)) {
bed$start[bed$start < 0] <- 0;
warning('Slop caused negative start coordinates; replacing with 0.');
}

return(bed);
}

#' @title Convert PGS data to BED format
#' @description Convert imported and formatted PGS compnent SNP coordinate data to BED format.
#' @param pgs.weight.data A data.frame containing SNP coordinate data with standardized CHROM and POS columns.
#' @param chr.prefix A logical indicating whether the 'chr' prefix should be used when formatting chromosome name.
#' @param numeric.sex.chr A logical indicating whether the sex chromosomes should be formatted numerically, as opposed to alphabetically.
#' @param slop An integer indicating the number of base pairs to add to the BED interval on either side.
#' @return A data.frame containing the PGS component SNP coordinate data in BED format and any other columns provided in pgs.weight.data.
#' @export
convert.pgs.to.bed <- function(pgs.weight.data, chr.prefix = TRUE, numeric.sex.chr = FALSE, slop = 0) {
# check that data is a data.frame
if (!is.data.frame(pgs.weight.data)) {
stop('data must be a data.frame');
}

# check that data has CHROM and POS columns
if (!all(c('CHROM', 'POS') %in% colnames(pgs.weight.data))) {
stop('data must have CHROM and POS columns');
}

# check that slop is a non-negative integer
if (!is.numeric(slop) | slop < 0) {
stop('slop must be a non-negative integer');
}

# convert CHROM to default format (no 'chr' prefix, alphabetic sex chromosomes)
pgs.weight.data$CHROM <- gsub('chr', '', pgs.weight.data$CHROM);
pgs.weight.data$CHROM <- gsub('23', 'X', pgs.weight.data$CHROM);
pgs.weight.data$CHROM <- gsub('24', 'Y', pgs.weight.data$CHROM);

# apply requested CHROM formatting
if (chr.prefix) {
pgs.weight.data$CHROM <- paste0('chr', pgs.weight.data$CHROM);
}

if (numeric.sex.chr) {
pgs.weight.data$CHROM <- gsub('X', '23', pgs.weight.data$CHROM);
pgs.weight.data$CHROM <- gsub('Y', '24', pgs.weight.data$CHROM);
}

## assemble BED file ##
# 0-index coordinates
pgs.bed <- data.frame(
chr = pgs.weight.data$CHROM,
start = pgs.weight.data$POS - 1,
end = pgs.weight.data$POS
);
# check for negative start coordinates, report an error
if (any(pgs.bed$start < 0)) {
stop('0-indexing caused negative start coordinates.');
}

# add slop
if (slop > 0) {
pgs.bed <- add.slop(bed = pgs.bed, slop = slop);
}

# concat with the rest of the prs columns
pgs.bed <- cbind(pgs.bed, subset(pgs.weight.data, select = -c(CHROM, POS)));

return(pgs.bed);
}

#' @title Merge PGS BED files
#' @description Merge overlapping PGS coordinates in multiple BED files.
#' @param pgs.bed.list A named list of data.frames containing PGS coordinates in BED format.
#' @param add.annotation.data A logical indicating whether an additional annotation data column should be added to the annotation column.
#' @param annotation.column.index An integer indicating the index of the column in the data frames in pgs.bed.list that should be added to the annotation column.
#' @param slop An integer indicating the number of base pairs to add to the BED interval on either side.
#' @return A data.frame containing the merged PGS coordinates in BED format with an extra annotation column containing the name of the PGS and data from one additional column optionally selected by the user.
#' @export
merge.pgs.bed <- function(pgs.bed.list, add.annotation.data = FALSE, annotation.column.index = 4, slop = 0) {

# check that pgs.bed.list is a named list
if (!is.list(pgs.bed.list) | is.null(names(pgs.bed.list))) {
stop('pgs.bed.list must be a named list');
}

# check that all elements of pgs.bed.list are data.frames
if (!all(sapply(pgs.bed.list, is.data.frame))) {
stop('all elements of pgs.bed.list must be data.frames');
}

# check that all elements of pgs.bed.list have the same column names
if (!all(sapply(pgs.bed.list, function(x) all(colnames(x) == colnames(pgs.bed.list[[1]]))))) {
stop('all elements of pgs.bed.list must have the same column names');
}

# check that each element of pgs.bed.list is in BED format (chr, start, end)
if (!all(sapply(pgs.bed.list, function(x) all( c('chr', 'start', 'end') %in% colnames(x))))) {
stop('all elements of pgs.bed.list must have columns named chr, start, and end');
}

# check that all intervals specified in pgs.bed.list are one bp in length
if (!all(sapply(pgs.bed.list, function(x) all(x$start == x$end - 1)))) {
stop('all intervals specified in pgs.bed.list must represent one SNP and be one bp in length');
}


if (add.annotation.data) {
# check that annotation.column.index is within the range of the number of columns in the data.frames in pgs.bed.list
if (annotation.column.index > ncol(pgs.bed.list[[1]])) {
stop('annotation.column.index must be within the range of the number of columns in the data.frames in pgs.bed.list');
}
}

# Annotate each PGS BED file with the name of the PGS
for (i in 1:length(pgs.bed.list)) {
pgs.bed.list[[i]]$annotation <- names(pgs.bed.list)[i];
}

# concatenate all BED files in list
concatenated.bed <- do.call(rbind, pgs.bed.list);

# add requested annotation data to annotation column

if (add.annotation.data) {
concatenated.bed$annotation <- paste(concatenated.bed[ , annotation.column.index], concatenated.bed$annotation, sep = '|');
}

# sort by chromosome and position
concatenated.bed <- concatenated.bed[order(concatenated.bed$chr, concatenated.bed$start), ];

# merge overlapping SNP intervals and combine overlapping annotations using aggregate
merged.bed <- aggregate(annotation ~ chr + start + end, data = concatenated.bed, FUN = paste, collapse = ',');

# sort by chromosome and position
merged.bed <- merged.bed[order(merged.bed$chr, merged.bed$start), ];

# add slop
if (slop > 0) {
merged.bed <- add.slop(bed = merged.bed, slop = slop);
}

# return merged BED file
return(merged.bed);
}
85 changes: 85 additions & 0 deletions inst/generate.test.data.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
# create data frame of imported PGS weight file header (for testing)
PGS00662.metadata <- data.frame(
key = c(
'format_version',
'pgs_id',
'pgs_name',
'trait_reported',
'trait_mapped',
'trait_efo',
'genome_build',
'variants_number',
'weight_type',
'pgp_id',
'citation',
'HmPOS_build',
'HmPOS_date',
'HmPOS_match_chr',
'HmPOS_match_pos'
),
value = c(
'2.0',
'PGS000662',
'GRS.PCa.269',
'Prostate Cancer',
'prostate carcinoma',
'EFO_0001663',
'GRCh37',
'269',
'beta',
'PGP000122',
'Conti DV, Darst BF et al. Nat Genet (2020). doi:10.1038/s41588-020-00748-0',
'GRCh38',
'2022-07-29',
'{"True": null, "False": null}',
'{"True": null, "False": null}'
)
);


# save .Rda file of import function test data
import.test.data <- list(
PGS00662.metadata = PGS00662.metadata
);

save(
import.test.data,
file = 'tests/testthat/data/import.test.data.Rda'
);

# create data frame of test PGS coordinates (for testing)
tiny.pgs.test.data <- data.frame(
CHROM = c('1', '2', 'chr3', 'chr4', 'chr23', 'chr24', 'chrX', 'chrY'),
POS = c(1, 10, 100, 1e3, 1e4, 1e5, 1e6, 1e7),
effect_allele = c('A', 'T', 'C', 'G', 'A', 'T', 'C', 'G'),
effect_weight = c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, NA, NA),
beta = c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, NA, NA)
);

# save .Rda file of tiny PGS test data
save(
tiny.pgs.test.data,
file = 'tests/testthat/data/tiny.pgs.test.data.Rda'
);

# create list of data frames of test BED coordinates (for testing)
tiny.bed.test.data <- simple.test.input <- list(
pgs1 = data.frame(
chr = c('chr1', 'chr1', 'chr2', 'chr2'),
start = c(1, 2, 3, 4),
end = c(2, 3, 4, 5),
overlap = c('no overlap with pgs2', 'overlap with pgs2', 'overlap with pgs2', 'no overlap with pgs2')
),
pgs2 = data.frame(
chr = c('chr1', 'chr1', 'chr2', 'chr3'),
start = c(5, 2, 3, 4),
end = c(6, 3, 4, 5),
overlap = c('no overlap with pgs1', 'overlap with pgs1', 'overlap with pgs1', 'no overlap with pgs1')
)
);

# save .Rda file of tiny BED test data
save(
tiny.bed.test.data,
file = 'tests/testthat/data/tiny.bed.test.data.Rda'
);
28 changes: 28 additions & 0 deletions man/convert.pgs.to.bed.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

28 changes: 28 additions & 0 deletions man/merge.pgs.bed.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Binary file added tests/testthat/data/tiny.bed.test.data.Rda
Binary file not shown.
Binary file added tests/testthat/data/tiny.pgs.test.data.Rda
Binary file not shown.
Loading
Loading