Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

develop dosage caller #7

Merged
merged 15 commits into from
Jan 22, 2024
3 changes: 2 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@ Description: This tool is intended to simply and transparently parse
Depends:
R (>= 2.10)
Imports:
vcfR
vcfR,
data.table
Suggests:
testthat (>= 3.0.0)
Config/testthat/edition: 3
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
exportPattern("^[[:alpha:]]+")

import('vcfR');
import('data.table');
55 changes: 55 additions & 0 deletions R/calculate-dosage.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#' @title Convert alleles to dosage
#' @description Convert genotype calls in the form of witten out alleles (e.g. 'A/T') to dosages based on provided risk alleles from a PGS (0, 1, or 2).
#' @param called.alleles A vector of genotypes in allelic notation separated by a slash or pipe.
#' @param risk.alleles A vector of risk alleles from a polygenic score corresponding to each genotype (by locus) in called.alleles.
#' @return A vector of dosages corresponding to each genotype in called.alleles.
#' @export
convert.alleles.to.pgs.dosage <- function(called.alleles, risk.alleles) {
# check that risk.alleles is the same length as called.alleles
if (length(called.alleles) != length(risk.alleles)) {
stop('called.alleles and risk.alleles must be the same length.');
}

# check that risk.alleles is a vector of capitalized alphabetic characters
if (!all(grepl('^[A-Z]+$', risk.alleles))) {
stop('unrecognized risk.allele format, must be capitalized letters.');
}

# handle totally missing genotypes
# if the entire vector is NA or the entire vector is '.', return NA
if (all(is.na(called.alleles)) | all(called.alleles == '.')) {
split.alleles <- data.frame(called.alleles, called.alleles);
} else {
# check that called.alleles is a vector of genotypes in allelic notation or '.' separated by a slash or pipe
allowed.pattern <- '^((([A-Z]+|\\.)[/\\|]([A-Z]+|\\.))|\\.)$' # '|' are special chars in regular expressions
if (!all(grepl(allowed.pattern, called.alleles))) {
stop('unrecognized called.alleles format, must be capitalized letters or "." separated by a slash or pipe.');
}
split.alleles <- data.table::tstrsplit(called.alleles, split = c('/|\\|'), keep = c(1,2)); # '|' are special chars in regular expressions
}
names(split.alleles) <- c('called.allele.a', 'called.allele.b');

# replace 'NA' with '.' for easier comparisons
missing.label <- '.';
split.alleles <- lapply(
X = split.alleles,
FUN = function(x) {
x[is.na(x)] <- missing.label;
return(x);
}
);

dosage <- rep(NA, length(called.alleles));
for (i in 1:length(called.alleles)) {
if ((split.alleles$called.allele.a[i] == missing.label) & (split.alleles$called.allele.b[i] == missing.label)) {
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right now this dosage calling algorithm does not explicitly account for a case where one of the alleles is known and the other is missing e.g. A/. or ./A. I don't recall if this is a possible result from variant calling. I would expect that this is not possible, and if it was, I don't really know how I would interpret that result. As it is, the algorithm just counts the number of risk alleles so if A is a risk allele the above examples would result in dosage 1. @tyamaguchi-ucla would you mind chiming in briefly on whether you think a dosage calling algorithm should allow this type of input?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@alkaZeltser There are so many variant callers, so it's possible we might encounter this case, although I'm not sure if I've seen one. I suggest we treat any edge cases as warnings (still skip them) and update the package if necessary.

dosage[i] <- NA; # if no genotype was called, return NA
} else if (split.alleles$called.allele.a[i] == risk.alleles[i] & split.alleles$called.allele.b[i] == risk.alleles[i]) {
dosage[i] <- 2; # if both alleles are the risk allele, the genotype is homozygous for the effect allele and the dosage is 2.
} else if (split.alleles$called.allele.a[i] == risk.alleles[i] | split.alleles$called.allele.b[i] == risk.alleles[i]) {
dosage[i] <- 1; # if only one of the alleles is the risk allele, the genotype is heterozygous and the dosage is 1.
} else {
dosage[i] <- 0; # if neither allele is the risk allele, the genotype is homozygous for the non-effect allele and the dosage is 0.
}
}
return(dosage);
}
163 changes: 163 additions & 0 deletions tests/testthat/test-dosage-calculator.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@

test_that(
'convert.alleles.to.pgs.dosage correctly checks input format', {
# check for correct input lengths
expect_error(
convert.alleles.to.pgs.dosage(
called.alleles = c('A/A', 'A/T', 'T/T', 'A/A', 'A/T', 'T/T'),
risk.alleles = c('A', 'T', 'A', 'T', 'A', 'T', 'A')
),
'called.alleles and risk.alleles must be the same length.'
);

# check for correct risk.alleles format
expect_error(
convert.alleles.to.pgs.dosage(
called.alleles = c('A/A', 'A/T'),
risk.alleles = c('A', 't')
),
'unrecognized risk.allele format, must be capitalized letters.'
);
expect_error(
convert.alleles.to.pgs.dosage(
called.alleles = c('A/A', 'A/T'),
risk.alleles = c('A', '1')
),
'unrecognized risk.allele format, must be capitalized letters.'
);
expect_error(
convert.alleles.to.pgs.dosage(
called.alleles = c('A/A', 'A/T'),
risk.alleles = c('A', NA)
),
'unrecognized risk.allele format, must be capitalized letters.'
);
expect_error(
convert.alleles.to.pgs.dosage(
called.alleles = c('A/A', 'A/T'),
risk.alleles = c('A', '.')
),
'unrecognized risk.allele format, must be capitalized letters.'
);

# check for correct called.alleles format
expect_error(
convert.alleles.to.pgs.dosage(
called.alleles = c('A/A', 'A'),
risk.alleles = c('A', 'T')
),
'unrecognized called.alleles format, must be capitalized letters or "." separated by a slash or pipe.'
);
expect_error(
convert.alleles.to.pgs.dosage(
called.alleles = c('A/A', 'A,'),
risk.alleles = c('A', 'T')
),
'unrecognized called.alleles format, must be capitalized letters or "." separated by a slash or pipe.'
);
expect_error(
convert.alleles.to.pgs.dosage(
called.alleles = c('A/A', 'A-A'),
risk.alleles = c('A', 'T')
),
'unrecognized called.alleles format, must be capitalized letters or "." separated by a slash or pipe.'
);
expect_error(
convert.alleles.to.pgs.dosage(
called.alleles = c('A/A', 'a/A'),
risk.alleles = c('A', 'T')
),
'unrecognized called.alleles format, must be capitalized letters or "." separated by a slash or pipe.'
);
expect_error(
convert.alleles.to.pgs.dosage(
called.alleles = c('A/A', 'A/'),
risk.alleles = c('A', 'T', 'A')
),
'called.alleles and risk.alleles must be the same length.'
);
expect_error(
convert.alleles.to.pgs.dosage(
called.alleles = c('A/A', '/A'),
risk.alleles = c('A', 'T', 'A')
),
'called.alleles and risk.alleles must be the same length.'
);
expect_error(
convert.alleles.to.pgs.dosage(
called.alleles = c('A/A', NA),
risk.alleles = c('A', 'T', 'A')
),
'called.alleles and risk.alleles must be the same length.'
);

# check that correct input is accepted
expect_silent(
convert.alleles.to.pgs.dosage(
called.alleles = c('A/A', 'A|T', 'TA/T', 'A/ATTTT', './.', '.'),
risk.alleles = c('A', 'T', 'A', 'T', 'A', 'T')
)
);
}
)

test_that(
'convert.alleles.to.pgs.dosage calculates dosage correctly from unphased genotypes', {
expect_equal(
convert.alleles.to.pgs.dosage(
called.alleles = c('A/A', 'A/T', 'T/T', 'A/A', 'A/T', 'T/T'),
risk.alleles = c('A', 'T', 'A', 'T', 'A', 'T')
),
c(2, 1, 0, 0, 1, 2)
);
}
);

test_that(
'convert.alleles.to.pgs.dosage calculates dosage correctly from phased genotypes', {
expect_equal(
convert.alleles.to.pgs.dosage(
called.alleles = c('A|A', 'A|T', 'T|T', 'A|A', 'A|T', 'T|T'),
risk.alleles = c('A', 'T', 'A', 'T', 'A', 'T')
),
c(2, 1, 0, 0, 1, 2)
);
}
);

test_that(
'convert.alleles.to.pgs.dosage calculates dosage correctly from short indel alleles', {
expect_equal(
convert.alleles.to.pgs.dosage(
called.alleles = c('A/AT', 'AT/A', 'AT/AT', 'A/AT', 'AT/A', 'AT/AT','A/ATCG', 'ATCG/A', 'AT/ATCG'),
risk.alleles = c('A', 'A', 'A', 'AT', 'AT', 'AT', 'A', 'ATCG', 'A')
),
c(1, 1, 0, 1, 1, 2, 1, 1, 0)
);
}
);


test_that(
'convert.alleles.to.pgs.dosage calculates dosage correctly from missing genotypes', {
expect_equal(
convert.alleles.to.pgs.dosage(
called.alleles = c('./.', './A', 'A/.', '.'),
risk.alleles = c('A', 'A', 'T', 'T')
),
c(NA, 1, 0, NA)
);
}
);

test_that(
'convert.alleles.to.pgs.dosage correctly handles a scenario where no genotypes are provided', {
expect_equal(
convert.alleles.to.pgs.dosage(
called.alleles = c(NA, NA, NA),
risk.alleles = c('A', 'A', 'T')
),
c(NA, NA, NA)
);
}
);
Loading