uclahs-cds · alkaZeltser · Jan 9, 2024 · Dec 26, 2023 · Dec 27, 2023 · Jan 6, 2024
diff --git a/R/convert-pgs-to-bed.R b/R/convert-pgs-to-bed.R
@@ -0,0 +1,150 @@
+# a utility function for adding slop to BED coordinates
+add.slop <- function(bed, slop) {
+    bed$start <- bed$start - slop;
+    bed$end <- bed$end + slop;
+
+    # check for negative start coordinates, replace with 0, and issue a warning
+    if (any(bed$start < 0)) {
+        bed$start[bed$start < 0] <- 0;
+        warning('Slop caused negative start coordinates; replacing with 0.');
+        }
+
+    return(bed);
+    }
+
+#' @title Convert PGS data to BED format
+#' @description Convert imported and formatted PGS compnent SNP coordinate data to BED format.
+#' @param pgs.weight.data A data.frame containing SNP coordinate data with standardized CHROM and POS columns.
+#' @param chr.prefix A logical indicating whether the 'chr' prefix should be used when formatting chromosome name.
+#' @param numeric.sex.chr A logical indicating whether the sex chromosomes should be formatted numerically, as opposed to alphabetically.
+#' @param slop An integer indicating the number of base pairs to add to the BED interval on either side.
+#' @return A data.frame containing the PGS component SNP coordinate data in BED format and any other columns provided in pgs.weight.data.
+#' @export
+convert.pgs.to.bed <- function(pgs.weight.data, chr.prefix = TRUE, numeric.sex.chr = FALSE, slop = 0) {
+    # check that data is a data.frame
+    if (!is.data.frame(pgs.weight.data)) {
+        stop('data must be a data.frame');
+        }
+
+    # check that data has CHROM and POS columns
+    if (!all(c('CHROM', 'POS') %in% colnames(pgs.weight.data))) {
+        stop('data must have CHROM and POS columns');
+        }
+
+    # check that slop is a non-negative integer
+    if (!is.numeric(slop) | slop < 0) {
+        stop('slop must be a non-negative integer');
+        }
+
+    # convert CHROM to default format (no 'chr' prefix, alphabetic sex chromosomes)
+	pgs.weight.data$CHROM <- gsub('chr', '', pgs.weight.data$CHROM);
+    pgs.weight.data$CHROM <- gsub('23', 'X', pgs.weight.data$CHROM);
+    pgs.weight.data$CHROM <- gsub('24', 'Y', pgs.weight.data$CHROM);
+
+    # apply requested CHROM formatting
+    if (chr.prefix) {
+        pgs.weight.data$CHROM <- paste0('chr', pgs.weight.data$CHROM);
+        }
+
+    if (numeric.sex.chr) {
+        pgs.weight.data$CHROM <- gsub('X', '23', pgs.weight.data$CHROM);
+        pgs.weight.data$CHROM <- gsub('Y', '24', pgs.weight.data$CHROM);
+        }
+
+    ## assemble BED file ##
+	# 0-index coordinates
+	pgs.bed <- data.frame(
+		chr = pgs.weight.data$CHROM,
+		start = pgs.weight.data$POS - 1,
+		end = pgs.weight.data$POS
+		);
+    # check for negative start coordinates, report an error
+    if (any(pgs.bed$start < 0)) {
+        stop('0-indexing caused negative start coordinates.');
+        }
+
+    # add slop
+    if (slop > 0) {
+        pgs.bed <- add.slop(bed = pgs.bed, slop = slop);
+        }
+
+	# concat with the rest of the prs columns
+	pgs.bed <- cbind(pgs.bed, subset(pgs.weight.data, select = -c(CHROM, POS)));
+
+	return(pgs.bed);
+	}
+
+#' @title Merge PGS BED files
+#' @description Merge overlapping PGS coordinates in multiple BED files.
+#' @param pgs.bed.list A named list of data.frames containing PGS coordinates in BED format.
+#' @param add.annotation.data A logical indicating whether an additional annotation data column should be added to the annotation column.
+#' @param annotation.column.index An integer indicating the index of the column in the data frames in pgs.bed.list that should be added to the annotation column.
+#' @param slop An integer indicating the number of base pairs to add to the BED interval on either side.
+#' @return A data.frame containing the merged PGS coordinates in BED format with an extra annotation column containing the name of the PGS and data from one additional column optionally selected by the user.
+#' @export
+merge.pgs.bed <- function(pgs.bed.list, add.annotation.data = FALSE, annotation.column.index = 4, slop = 0) {
+
+    # check that pgs.bed.list is a named list
+    if (!is.list(pgs.bed.list) | is.null(names(pgs.bed.list))) {
+        stop('pgs.bed.list must be a named list');
+        }
+
+    # check that all elements of pgs.bed.list are data.frames
+    if (!all(sapply(pgs.bed.list, is.data.frame))) {
+        stop('all elements of pgs.bed.list must be data.frames');
+        }
+
+    # check that all elements of pgs.bed.list have the same column names
+    if (!all(sapply(pgs.bed.list, function(x) all(colnames(x) == colnames(pgs.bed.list[[1]]))))) {
+        stop('all elements of pgs.bed.list must have the same column names');
+        }
+
+    # check that each element of pgs.bed.list is in BED format (chr, start, end)
+    if (!all(sapply(pgs.bed.list, function(x) all( c('chr', 'start', 'end') %in% colnames(x))))) {
+        stop('all elements of pgs.bed.list must have columns named chr, start, and end');
+        }
+
+    # check that all intervals specified in pgs.bed.list are one bp in length
+    if (!all(sapply(pgs.bed.list, function(x) all(x$start == x$end - 1)))) {
+        stop('all intervals specified in pgs.bed.list must represent one SNP and be one bp in length');
+        }
+
+
+    if (add.annotation.data) {
+        # check that annotation.column.index is within the range of the number of columns in the data.frames in pgs.bed.list
+        if (annotation.column.index > ncol(pgs.bed.list[[1]])) {
+            stop('annotation.column.index must be within the range of the number of columns in the data.frames in pgs.bed.list');
+            }
+    }
+
+    # Annotate each PGS BED file with the name of the PGS
+    for (i in 1:length(pgs.bed.list)) {
+        pgs.bed.list[[i]]$annotation <- names(pgs.bed.list)[i];
+        }
+
+    # concatenate all BED files in list
+    concatenated.bed <- do.call(rbind, pgs.bed.list);
+
+    # add requested annotation data to annotation column
+
+    if (add.annotation.data) {
+        concatenated.bed$annotation <- paste(concatenated.bed[ , annotation.column.index], concatenated.bed$annotation, sep = '|');
+        }
+
+    # sort by chromosome and position
+    concatenated.bed <- concatenated.bed[order(concatenated.bed$chr, concatenated.bed$start), ];
+
+    # merge overlapping SNP intervals and combine overlapping annotations using aggregate
+    merged.bed <- aggregate(annotation ~ chr + start + end, data = concatenated.bed, FUN = paste, collapse = ',');
+
+    # sort by chromosome and position
+    merged.bed <- merged.bed[order(merged.bed$chr, merged.bed$start), ];
+
+    # add slop
+    if (slop > 0) {
+        merged.bed <- add.slop(bed = merged.bed, slop = slop);
+        }
+
+    # return merged BED file
+    return(merged.bed);
+    }
diff --git a/inst/generate.test.data.R b/inst/generate.test.data.R
@@ -0,0 +1,85 @@
+# create data frame of imported PGS weight file header (for testing)
+PGS00662.metadata <- data.frame(
+    key = c(
+        'format_version',
+        'pgs_id',
+        'pgs_name',
+        'trait_reported',
+        'trait_mapped',
+        'trait_efo',
+        'genome_build',
+        'variants_number',
+        'weight_type',
+        'pgp_id',
+        'citation',
+        'HmPOS_build',
+        'HmPOS_date',
+        'HmPOS_match_chr',
+        'HmPOS_match_pos'
+        ),
+    value = c(
+        '2.0',
+        'PGS000662',
+        'GRS.PCa.269',
+        'Prostate Cancer',
+        'prostate carcinoma',
+        'EFO_0001663',
+        'GRCh37',
+        '269',
+        'beta',
+        'PGP000122',
+        'Conti DV, Darst BF et al. Nat Genet (2020). doi:10.1038/s41588-020-00748-0',
+        'GRCh38',
+        '2022-07-29',
+        '{"True": null, "False": null}',
+        '{"True": null, "False": null}'
+        )
+    );
+
+
+# save .Rda file of import function test data
+import.test.data <- list(
+    PGS00662.metadata = PGS00662.metadata
+    );
+
+save(
+    import.test.data,
+    file = 'tests/testthat/data/import.test.data.Rda'
+    );
+
+# create data frame of test PGS coordinates (for testing)
+tiny.pgs.test.data <- data.frame(
+    CHROM = c('1', '2', 'chr3', 'chr4', 'chr23', 'chr24', 'chrX', 'chrY'),
+    POS = c(1, 10, 100, 1e3, 1e4, 1e5, 1e6, 1e7),
+    effect_allele = c('A', 'T', 'C', 'G', 'A', 'T', 'C', 'G'),
+    effect_weight = c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, NA, NA),
+    beta = c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, NA, NA)
+    );
+
+# save .Rda file of tiny PGS test data
+save(
+    tiny.pgs.test.data,
+    file = 'tests/testthat/data/tiny.pgs.test.data.Rda'
+    );
+
+# create list of data frames of test BED coordinates (for testing)
+tiny.bed.test.data <- simple.test.input <- list(
+    pgs1 = data.frame(
+        chr = c('chr1', 'chr1', 'chr2', 'chr2'),
+        start = c(1, 2, 3, 4),
+        end = c(2, 3, 4, 5),
+        overlap = c('no overlap with pgs2', 'overlap with pgs2', 'overlap with pgs2', 'no overlap with pgs2')
+        ),
+    pgs2 = data.frame(
+        chr = c('chr1', 'chr1', 'chr2', 'chr3'),
+        start = c(5, 2, 3, 4),
+        end = c(6, 3, 4, 5),
+        overlap = c('no overlap with pgs1', 'overlap with pgs1', 'overlap with pgs1', 'no overlap with pgs1')
+        )
+    );
+
+# save .Rda file of tiny BED test data
+save(
+    tiny.bed.test.data,
+    file = 'tests/testthat/data/tiny.bed.test.data.Rda'
+    );
diff --git a/man/convert.pgs.to.bed.Rd b/man/convert.pgs.to.bed.Rd
diff --git a/man/merge.pgs.bed.Rd b/man/merge.pgs.bed.Rd
diff --git a/tests/testthat/data/tiny.bed.test.data.Rda b/tests/testthat/data/tiny.bed.test.data.Rda
diff --git a/tests/testthat/data/tiny.pgs.test.data.Rda b/tests/testthat/data/tiny.pgs.test.data.Rda