-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPCAfactomine.R
119 lines (118 loc) · 5.87 KB
/
PCAfactomine.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
library(FactoMineR)
library("RColorBrewer")
library("dplyr")
library(pvclust)
#source("~/genomedk/matovanalysis/umiseq_analysis/R/read_bed.R")
source("~/genomedk/PolyA/faststorage/BACKUP/N140_Targeting/specs/umiseq_paper/R/read_bed.R")
setwd ('~/genomedk/PolyA/faststorage/BACKUP/N140_Targeting/specs/specs_analysis')
source("sw_input_files/duplex_tools.R")
# 45 Subjects of the Control Panel of Normal PON ####################################################################
#pon_obj2 <- readRDS("sw_input_files/201020_hg38-novaseq-xgen-sporacrc-pon.RDS") #
pon_obj2 <- readRDS("~/genomedk/PolyA/faststorage/BACKUP/N140_Targeting/specs/umiseq_paper/reference/201217_hg38-novaseq-xgen-sporacrc-pon.RDS") # 46
no1 <- pon_obj2[["pon"]] # counts
no <- no1[,,1:4] + no1[,,6:9]
no2 = array(0, dim=c(dim(no)[1],dim(no)[2]*dim(no)[3]))
mafsP1 = array(0, dim=c(dim(no)[1],dim(no)[2],dim(no)[3]))
auxMP <- rowSums(no, dims = 2)
for (i in 1:dim(no)[1]) {
mafsP1[i,,] <- no[i,,]/auxMP[i,]
}
mafsP2 = array(0, dim=c(dim(mafsP1)[1],dim(mafsP1)[2]*dim(mafsP1)[3]))
for (i in 1:dim(no)[1]) {
no2[i,] = no[i,,]
mafsP2[i,] = mafsP1[i,,]
}
#res.pca = PCA(no2, scale.unit=TRUE, ncp=5, graph=T)
res.pca = PCA(mafsP2, scale.unit=TRUE, ncp=5, graph=T)
dim(mafsP2) #45 72376
# QIAGEN healthy samples ############################################################################################
pileupsQ <- list.files("~/genomedk/PolyA/faststorage/BACKUP/N140_Targeting/qiagen_kit_test/201019", recursive = T, full.names = T, pattern = "bait.pileup")
pon_hg19 <- readRDS("~/genomedk/PolyA/faststorage/BACKUP/IMPROVE/call/references/200419_novaseq-xgen-sporacrc-pon.RDS")
countsQ0 <- piles_to_counts(files = pileupsQ, regions = pon_hg19$regions)
countsQ1 <- countsQ0[,,1:4] + countsQ0[,,6:9]
mafsQ1 = array(0, dim=c(dim(countsQ1)[1],dim(countsQ1)[2],dim(countsQ1)[3]))
auxMQ <- rowSums(countsQ1, dims = 2)
for (i in 1:dim(countsQ1)[1]) {
mafsQ1[i,,] <- countsQ1[i,,]/auxMQ[i,]
}
noQ = array(0, dim=c(dim(countsQ1)[1],dim(countsQ1)[2]*dim(countsQ1)[3]))
mafsQ2 = array(0, dim=c(dim(mafsQ1)[1],dim(mafsQ1)[2]*dim(mafsQ1)[3]))
for (i in 1:dim(countsQ1)[1]) {
noQ[i,] = countsQ1[i,,]
mafsQ2[i,] = mafsQ1[i,,]
}
# FactoMineR: AnRPackage for Multivariate Analysis
# https://mran.microsoft.com/snapshot/2016-12-29/web/packages/FactoMineR/vignettes/FactoMineR.pdf
#res.pca = PCA(noQ, scale.unit=TRUE, ncp=5, graph=T)
res.pca = PCA(mafsQ2, scale.unit=TRUE, ncp=5, graph=T)
dim(mafsQ2) #24 72376
########################################################
cohort2 <- rbind(mafsP2, mafsQ2)
cohort3 <- rbind(cohort2, mafsI2)
cohort4 <- rbind(cohort3, mafsC2)
dim(cohort2) # 69 72376
res.pca = PCA(cohort2, scale.unit=TRUE, ncp=5, graph=T)
dim(cohort3) # 283 72376
res.pca = PCA(cohort3, scale.unit=TRUE, ncp=5, graph=T)
dim(cohort4) # 373 72376
res.pca = PCA(cohort4, scale.unit=TRUE, ncp=5, graph=T)
### IMPROVE #####
pileupsI <- list.files("~/genomedk/PolyA/faststorage/BACKUP/IMPROVE/sporacrc/N227", recursive = T, full.names = T, pattern = "bait.pileup$")
pon_hg19 <- readRDS("~/genomedk/PolyA/faststorage/BACKUP/IMPROVE/call/references/200419_novaseq-xgen-sporacrc-pon.RDS")
countsI0 <- piles_to_counts(files = pileupsI[1:229], regions = pon_hg19$regions)
countsI00 <- array(0, dim=c(dim(countsI0)[1]-12,dim(countsI0)[2],dim(countsI0)[3]))
countsI00[1:20,,]<-countsI0[1:20,,]
countsI00[21:30,,]<-countsI0[22:31,,] # 21
countsI00[31:45,,]<-countsI0[33:47,,] # 32
countsI00[46:49,,]<-countsI0[49:52,,] # 48
countsI00[50:56,,]<-countsI0[54:60,,] # 53
countsI00[57:113,,]<-countsI0[62:118,,] # 61
#countsI00[115,,]<-countsI0[120,,] # 119 + #120
countsI00[114:118,,]<-countsI0[123:127,,] # 121 + #122
countsI00[119:130,,]<-countsI0[129:140,,] # 128
countsI00[131:160,,]<-countsI0[142:171,,] # 141
countsI00[161:217,,]<-countsI0[173:229,,] # 172
countsI1 <- countsI00[,,1:4] + countsI00[,,6:9]
# REMOVE #48 (N227-1981), #21 (N227-1972), #53 (N227-1985).
mafsI1 = array(0, dim=c(dim(countsI1)[1],dim(countsI1)[2],dim(countsI1)[3]))
auxMI <- rowSums(countsI1, dims = 2)
for (i in 1:dim(countsI1)[1]) {
mafsI1[i,,] <- countsI1[i,,]/auxMI[i,]
}
mafsI2 = array(0, dim=c(dim(mafsI1)[1],dim(mafsI1)[2]*dim(mafsI1)[3]))
for (i in 1:dim(countsI1)[1]) {
mafsI2[i,] = mafsI1[i,,]
}
res.pca = PCA(mafsI2, scale.unit=TRUE, ncp=5, graph=T)
### CRUK #####
pileupsC <- list.files("~/genomedk/PolyA/faststorage/BACKUP/CRUK/plasma/N289", recursive = T, full.names = T, pattern = "bait.pileup")
pon_hg19 <- readRDS("~/genomedk/PolyA/faststorage/BACKUP/IMPROVE/call/references/200419_novaseq-xgen-sporacrc-pon.RDS")
countsC0 <- piles_to_counts(files = pileupsC[1:90], regions = pon_hg19$regions)
countsC00 <- array(0, dim=c(dim(countsC0)[1]-1,dim(countsC0)[2],dim(countsC0)[3]))
countsC00[1:5,,]<-countsC0[1:5,,]
countsC00[6:89,,]<-countsC0[7:90,,]
countsC1 <- countsC00[,,1:4] + countsC00[,,6:9]
# REMOVE #6 (N289-100)
mafsC1 = array(0, dim=c(dim(countsC1)[1],dim(countsC1)[2],dim(countsC1)[3]))
auxMC <- rowSums(countsC1, dims = 2)
for (i in 1:dim(countsC1)[1]) {
mafsC1[i,,] <- countsC1[i,,]/auxMC[i,]
}
mafsC2 = array(0, dim=c(dim(mafsC1)[1],dim(mafsC1)[2]*dim(mafsC1)[3]))
for (i in 1:dim(countsC1)[1]) {
mafsC2[i,] = mafsC1[i,,]
}
res.pca = PCA(mafsC2, scale.unit=TRUE, ncp=5, graph=T)
###########################################################
# Hierarchical cluster analysis on a set of dissimilarities and methods for analyzing it.
aux <- t(mafsQ2[,1:5])
hc <- hclust(aux)
plot(hc)
# A heatmap is another way to visualize hierarchical clustering; in the colored image, data values are transformed to color scale.
df<-scale(mafsP2[,1:5000])
col <- colorRampPalette(brewer.pal(11, "RdYlBu"))(256)
heatmap(df, scale = "none", col = col)
# Hierarchical Clustering with P-Values via Multiscale BootstrapResampling
# https://cran.r-project.org/web/packages/pvclust/pvclust.pdf
result <- pvclust(t(mafsP2[,1:5000]), method.dist="cor", method.hclust="average", nboot=1000, parallel=TRUE)
plot(result)