Skip to content

Commit

Permalink
vigentteCentre
Browse files Browse the repository at this point in the history
  • Loading branch information
hannesbecher committed Jan 6, 2025
1 parent 38882a3 commit c88ab50
Show file tree
Hide file tree
Showing 11 changed files with 338 additions and 27 deletions.
1 change: 1 addition & 0 deletions randPedPCA/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
inst/doc
5 changes: 4 additions & 1 deletion randPedPCA/DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: randPedPCA
Type: Package
Title: Fast PCA for large pedigrees
Version: 0.9.0
Version: 0.9.1
Authors@R: c(
person("Hanbin", "Lee", email = "hblee@umich.edu", role = "aut"),
person("Hannes", "Becher", email = "h.becher@ed.ac.uk", role = "cre"),
Expand All @@ -21,6 +21,9 @@ Depends:
pedigreeTools,
R (>= 3.5)
Suggests:
knitr,
rmarkdown,
testthat (>= 3.0.0)
RoxygenNote: 7.3.2
Config/testthat/edition: 3
VignetteBuilder: knitr
66 changes: 47 additions & 19 deletions randPedPCA/R/rppca.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,25 +8,27 @@
#' @param rank `integer` how many principal components to return
#' @param depth `integer` number of iterations for generating the range matrix
#' @param numVectors `integer` > `rank` to specify the oversampling for the
#' @param cent `logical` whether or not to (implicitly) centre the additive
#' relationship matrix
#'
#' @return The range matrix for `randSVD`
#' @export
#'
#' @importFrom spam backsolve
#' @importFrom spam forwardsolve
#' @importFrom stats rnorm
randRangeFinder <- function(L, rank, depth, numVectors){
randRangeFinder <- function(L, rank, depth, numVectors, cent=F){
dim <- nrow(L)
testVectors <- rnorm(n = dim * numVectors)
testMatrix <- matrix(testVectors, nrow = dim, ncol = numVectors)
Q <- testMatrix
for (i in 1:depth){
qrObject <- base::qr(Q)
Q <- qr.Q(qrObject)
# Q <- apply(Q, 2, function(col) col - mean(col))
if(cent) Q <- apply(Q, 2, function(col) col - mean(col))
Q <- spam::backsolve(t(L),Q)
Q <- spam::forwardsolve(L,Q)
# Q <- apply(Q, 2, function(col) col - mean(col))
if(cent) Q <- apply(Q, 2, function(col) col - mean(col))
}
qrObject <- qr(Q)
Q <- qr.Q(qrObject)
Expand All @@ -42,19 +44,21 @@ randRangeFinder <- function(L, rank, depth, numVectors){
#' @param rank `integer` how many principal components to return
#' @param depth `integer` number of iterations for generating the range matrix
#' @param numVectors `integer` > `rank` to specify the oversampling for the
#' @param cent `logical` whether or not to (implicitly) centre the additive
#' relationship matrix
#'
#' @return A list of three: u (=U), d (=Sigma), and v (=W^T)
#' @export
#'
#' @importFrom spam backsolve
randSVD <- function(L, rank, depth, numVectors){
randSVD <- function(L, rank, depth, numVectors, cent=F){
# L: lower cholesky factor of animal matrix
# rank: number of PCs
# depth: power iteration, higher -> more accurate approximation, ~5 is usually sufficient
# numVectors: usually rank + 5~10, must be larger than rank
dim <- nrow(L)
Q <- randRangeFinder(L, rank, depth, numVectors)
# Q <- apply(Q, 2, function(col) col - mean(col))
Q <- randRangeFinder(L, rank, depth, numVectors, cent=cent)
if(cent) Q <- apply(Q, 2, function(col) col - mean(col))
C <- t(backsolve(t(L), Q))
svdObject <- svd(C)
U <- Q %*% svdObject$u
Expand Down Expand Up @@ -113,6 +117,8 @@ randTraceHutchinson <- function(L, numVectors){
#' range matrix
#' @param totVar `scalar` (optional) the total variance, required for
#' computation of variance proportions when using an L-inverse matrix a input
#' @param center `logical` whether or not to (implicitly) centre the additive
#' relationship matrix
#' @param ... optional arguments passed to methods
#'
#' @details
Expand All @@ -131,7 +137,7 @@ randTraceHutchinson <- function(L, numVectors){
#' * vProp, the estimated variance proportions accounted for by each PC.
#' Only returned if `totVar` is set.
#' * scale, always `FALSE`
#' * center, always `FALSE`
#' * center, `logical` whether or not the implicit input matrix was centred
#' * rotation, the right singular values of the (implicit) relationship matrix.
#' Only returned if `returnRotation == TRUE`
#' * varProps, proportion of the total variance explained by each PC. Only
Expand Down Expand Up @@ -159,12 +165,13 @@ rppca.spam <- function(pdg,
depth=3,
numVectors=15,
totVar=NULL,
center=F,
...){
#check L is the right kind of sparse matrix
returnRotation=TRUE
nn <- dim(pdg)[1]
if(method=="randSVD"){
rsvd = randSVD(pdg, rank=rank, depth=depth, numVectors=numVectors)
rsvd = randSVD(pdg, rank=rank, depth=depth, numVectors=numVectors, cent=center)
scores = rsvd$u %*% diag(rsvd$d)
dimnames(scores) <- list(NULL, paste0("PC", 1:rank))

Expand All @@ -173,7 +180,7 @@ rppca.spam <- function(pdg,

pc <- list(x= scores,
sdev=stdv / sqrt(max(1, nn-1)),
center=FALSE,
center=center,
scale=FALSE
)

Expand Down Expand Up @@ -203,6 +210,8 @@ rppca.pedigree <- function(pdg,
rank=10,
depth=3,
numVectors=15,
totVar=NULL,
center=F,
...){
#check L is the right kind of sparse matrix
returnRotation=TRUE
Expand All @@ -213,25 +222,44 @@ rppca.pedigree <- function(pdg,
# get number of inds
nn <- dim(L)[1]
# total var is sum of (inbreeding coefs + 1)
totVar <- sum(inbreeding(pdg) + 1)
if(center==F) {
if(!missing(totVar)){
warning("Using specified value of ", totVar, " a the total variance
instead of the value computed from the pedigree, which was ",
sum(inbreeding(pdg) + 1))
} else {
totVar <- sum(inbreeding(pdg) + 1)
}
}


# get inbreeding+1 ->total variance

# return var components by default

if(method=="randSVD"){
rsvd = randSVD(L, rank=rank, depth=depth, numVectors=numVectors)
rsvd = randSVD(L, rank=rank, depth=depth, numVectors=numVectors, cent=center)
scores = rsvd$u %*% diag(rsvd$d)
dimnames(scores) <- list(NULL, paste0("PC", 1:rank))
stdv <- sqrt(rsvd$d)
names(stdv) <- paste0("PC", 1:length(stdv))
vp <- stdv^2/totVar
names(vp) <- paste0("PC", 1:length(vp))
pc <- list(x= scores,
sdev=stdv / sqrt(max(1, nn-1)),
varProps=vp,
center=FALSE,
scale=FALSE
)
if(!is.null(totVar)){
vp <- stdv^2/totVar
names(vp) <- paste0("PC", 1:length(vp))
pc <- list(x= scores,
sdev=stdv / sqrt(max(1, nn-1)),
varProps=vp,
center=center,
scale=FALSE
)
} else {
pc <- list(x= scores,
sdev=stdv / sqrt(max(1, nn-1)),
center=center,
scale=FALSE
)
}


# return rotation only if requested
if(returnRotation) pc$rotation <- t(pc$x)
Expand Down
5 changes: 4 additions & 1 deletion randPedPCA/man/randRangeFinder.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 4 additions & 1 deletion randPedPCA/man/randSVD.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

24 changes: 21 additions & 3 deletions randPedPCA/man/rppca.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

13 changes: 11 additions & 2 deletions randPedPCA/stuff/compareSVD.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,17 +18,24 @@ svdLI <- randSVD(pedLInv, 10, 3, 15)
svdAA$d[1:10]
svdLI$d[1:10]

svdAA$u[100:110,1]
svdLI$u[100:110,1]


pcL1 <- prcomp(t(L1), center = F)
pcL1c <- prcomp(t(L1), center = T)
pcLI <- rppca(pedLInv)
pcLIc <- rppca(pedLInv, cent=T)

pcL1$sdev[1:10]
pcL1c$sdev[1:10]
pcLI$sdev[1:10]
pcLIc$sdev[1:10]

plot(pcL1$x[,1:2])
plot(pcL1c$x[,1:2], main="Pedigree, with centered L")
plot(pcLI$x[,1:2], main="Pedigree PCA (default)")
plot(pcLIc$x[,1:2], main="Pedigree PCA (centred)")

pcGcent <- prcomp(pedGeno, center=T)
pcGnocent <- prcomp(pedGeno, center=F)
Expand All @@ -52,15 +59,17 @@ L2 <- getL(ped2)
pcL2 <- prcomp(t(L2), center = F)
pcL2c <- prcomp(t(L2), center = T)
pcLI2 <- rppca(pedLInv2)
pcLI2c <- rppca(pedLInv2, cent=T)


pcG2cent <- prcomp(pedGeno2, center=T)
pcG2nocent <- prcomp(pedGeno2, center=F)
#pcG2cent <- prcomp(pedGeno2, center=T) # not included in the package anymore as too large
#pcG2nocent <- prcomp(pedGeno2, center=F) # not included in the package anymore as too large

# plotting
plot(pcL2c$x[,1:2], main="Pedigree2, with centered L")
plot(pcL2$x[,1:2], main="Pedigree2, with un-centered L")
plot(pcLI2$x[,1:2], main="Pedigree2 PCA (default)")
plot(pcLI2c$x[,1:2], main="Pedigree2 PCA (centred)")

plot(pcG2nocent$x[,1:2], main="SNP2 PCA, not centered")
plot(pcG2cent$x[,1:2], main="SNP2 PCA, centered")
Expand Down
55 changes: 55 additions & 0 deletions randPedPCA/stuff/implicit_centring.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# code to demonstrate the expected outcome of centring L^(-1)

# generate pedigree from the metadata of the 2nd test dataset (two diverged populations)
ped2 <- pedigree(pedMeta2$fid,
pedMeta2$mid,
pedMeta2$id)
# get L
L2 <- getL(ped2)

# run naive PCA
pcL2 <- prcomp(t(L2), center = F, scale.=F)
# run naive PCA, this time centring L (which is equivalent to centring A)
pcL2c <- prcomp(t(L2), center = T, scale.=F)

# now rppca w/o centring
pcLI2 <- rppca(pedLInv2)
# rppca with centring
pcLI2c <- rppca(pedLInv2, center=T)

pcPed2 <- rppca(ped2)
pcPed2c <- rppca(ped2, center=T)

plot(pcL2$x[,1:2], main="Naive, not centred")
plot(pcL2c$x[,1:2], main="Naive, centred")
plot(pcLI2$x[,1:2], main="rppca, not centred") # same as naive, not centred as expected
plot(pcLI2c$x[,1:2], main="rppca, centred") # same pattern as centred naive as expected


plot(pcL2c$x[,1], pcLI2c$x[,1])
plot(pcL2c$x[,2], pcLI2c$x[,2])

summary(lm(pcL2c$x[,1]~pcLI2c$x[,1]))
summary(lm(pcL2c$x[,2]~pcLI2c$x[,2]))
plot(pcL2c$x[,1], pcPed2c$x[,1])
plot(pcL2c$x[,2], pcPed2c$x[,2])

plot(pcLI2c$x[,1], pcPed2c$x[,1])
plot(pcLI2c$x[,2], pcPed2c$x[,2])

pcLI2$sdev
pcLI2$sdev^2

plot(pcL2c$x[,1], pcLI2c$x[,1]/sqrt(2650))
abline(0,1)


plot(pcL2c$x[,2], pcLI2c$x[,2]/sqrt(2650))
abline(0,1)


summary(pcPed2c)
summary(pcPed2)

summary(pcLI2)
summary(pcLI2c)
Loading

0 comments on commit c88ab50

Please sign in to comment.