-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
00bad7f
commit a87ac07
Showing
1 changed file
with
200 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,200 @@ | ||
--- | ||
Title: "MicroArray gene expression network analysis of Mycobacterium tuberculosis (Mtb)" | ||
Authers: "Ibrahim ElZahaby & Luka Lmelias" | ||
Team: "Group_5" | ||
--- | ||
# Libraries required | ||
library(impute) | ||
library(dendextend) | ||
library(minet) | ||
library(RColorBrewer) | ||
|
||
# Read in the file | ||
data <- read.csv('set_5.csv', row.names = 1) | ||
|
||
# explore the data | ||
|
||
# count the number of genes and contions | ||
dim(data) | ||
# get the condition names | ||
colnames(data) | ||
|
||
# Check the size of missing data | ||
missing_data = which(is.na(data), arr.ind = TRUE) # get row and col with NA | ||
|
||
# Impute missing data | ||
data_impute = impute.knn(as.matrix(data)) | ||
data = data_impute$data | ||
after_impute = which(is.na(data), arr.ind = TRUE) | ||
|
||
# Check for data normality. | ||
boxplot(data) # turns out to be too large to interpret, we try QQplot | ||
|
||
qqnorm(data) | ||
|
||
## Explore the PCA spaces | ||
pca_gene = prcomp(as.matrix(data), center = TRUE, scale. = TRUE) | ||
|
||
pca_summary = summary(pca_gene) # we need the importance of each PC | ||
|
||
plot(abs(pca_gene$rotation[,1]),abs(pca_gene$rotation[,2])) | ||
|
||
barplot(abs(pca_gene$x)) # check the distribution of PCs | ||
|
||
|
||
# check the % variance explained by each PC | ||
var_expl = (pca_gene$sdev^2)/sum(pca_gene$sdev^2)*100 | ||
barplot(var_expl, main = 'Variance explained', | ||
xlab = 'componet', | ||
ylab = '% variance explained') | ||
|
||
|
||
dim(pca_gene$x) | ||
|
||
scores = pca_gene$x[,1:4] # we added 4 PCs cause each PC explains only little % variance | ||
|
||
# The fisrt and second PCA | ||
plot(scores[,1:2], main = 'PCA scores of the 1st and 2nd PCs', | ||
xlab = paste0("PC1 (",format(100* pca_summary$importance[2,1],digits=2),"%)"), | ||
ylab = paste0("PC2 (",format(100* pca_summary$importance[2,2],digits=2),"%)"), | ||
pch=21 | ||
) | ||
|
||
# The third and fourth PCA | ||
plot(scores[,3:4], main = 'PCA scores of The 3rd and 4th PCs', | ||
xlab = paste0("PC3 (",format(100* pca_summary$importance[2,3],digits=2),"%)"), | ||
ylab = paste0("PC4 (",format(100* pca_summary$importance[2,4],digits=2),"%)"), | ||
pch=21 | ||
) | ||
|
||
# Heatmap of gene correlation | ||
colors<-brewer.pal(7,"RdBu") | ||
|
||
heatmap(cor(t(data)),symm=TRUE, col=colors, | ||
main = "Heatmap of gene correlation") | ||
legend(x = "topright", legend = c("negative", "medium", "high"), fill = colors, | ||
cex = 0.7) # we could not exactly get the legend colors to show blue when there is positive correlation | ||
|
||
# Hierarchical clustering | ||
distance_gene = dist(data, method = 'euclidean') | ||
hclust_gene = hclust(as.dist(distance_gene), method = 'average') # We could also compare 'average' vs 'complete' if time allows | ||
|
||
# Plot the dendrogram | ||
plot(hclust_gene, main = 'clustering in data space', | ||
xlab = '', sub = 'average hclust distance') # data space | ||
|
||
# Network Inference | ||
data = t(as.matrix(data)) | ||
nbins <- sqrt(nrow(data)) # compute mutual information | ||
|
||
# method 1 clr | ||
clr.net <- minet(data, method="clr", | ||
estimator="mi.empirical", | ||
disc="equalfreq", nbins=nbins) | ||
|
||
clr.net[1:5,1:5] | ||
|
||
# creating network table | ||
clr.th <- clr.net | ||
|
||
diag(clr.th) <- 0 | ||
|
||
clr.th[lower.tri(clr.th )] <- 0 | ||
|
||
clr.th[which(clr.th>0)] <- 1 | ||
xx <- which(clr.th==1, arr.ind=TRUE) | ||
write.table(cbind(rownames(clr.th)[xx[,1]], | ||
rownames(clr.th)[xx[,2]]), | ||
file="clr.csv", col.names=FALSE, | ||
row.names=FALSE, sep=",", quote=FALSE) | ||
length(xx) | ||
|
||
# set fixed threshold | ||
thrsh <- 0.25 | ||
clr.th <- clr.net | ||
clr.th[which(clr.net<= thrsh)] <- 0 | ||
clr.th[which(clr.net> thrsh)] <- 1 | ||
diag(clr.th) <- 0 | ||
length(clr.th) | ||
|
||
#to write to a file (that can be imported in cytoscape) | ||
clr.th[lower.tri(clr.th )] <- 0 | ||
xx <- which(clr.th==1, arr.ind=TRUE) | ||
write.table(cbind(rownames(clr.th)[xx[,1]], | ||
rownames(clr.th)[xx[,2]]), | ||
file="clr_th.csv", col.names=FALSE, | ||
row.names=FALSE, sep="\t", quote=FALSE) | ||
|
||
length(xx) | ||
|
||
# method 2 mrnet | ||
mrnet.net <- minet(data, method="mrnet", | ||
estimator="mi.empirical", | ||
disc="equalfreq", nbins=nbins) | ||
mrnet.net[1:5,1:5]#have a look at the adjacency matrix | ||
|
||
# create the table | ||
mrnet.th <- mrnet.net | ||
diag(mrnet.th) <- 0 | ||
mrnet.th[lower.tri(mrnet.th )] <- 0 | ||
mrnet.th[which(mrnet.th>0)] <- 1 | ||
xx <- which(mrnet.th==1, arr.ind=TRUE) | ||
write.table(cbind(rownames(mrnet.th)[xx[,1]], | ||
rownames(mrnet.th)[xx[,2]]), | ||
file="mrnet.csv", col.names=FALSE, | ||
row.names=FALSE, sep=",", quote=FALSE) | ||
length(xx) | ||
|
||
# fixing the the threshold | ||
thrsh <- 0.1 | ||
mrnet.th <- mrnet.net | ||
mrnet.th[which(mrnet.net<= thrsh)] <- 0 | ||
mrnet.th[which(mrnet.net> thrsh)] <- 1 | ||
diag(mrnet.th) <- 0 | ||
length(mrnet.th) | ||
|
||
#to write to a file (that can be imported in cytoscape) | ||
mrnet.th[lower.tri(mrnet.th )] <- 0 | ||
xx <- which(mrnet.th==1, arr.ind=TRUE) | ||
write.table(cbind(rownames(mrnet.th)[xx[,1]], | ||
rownames(mrnet.th)[xx[,2]]), | ||
file="mrnet_th.csv", col.names=FALSE, | ||
row.names=FALSE, sep=",", quote=FALSE) | ||
length(xx) | ||
|
||
# method 3 Aracne | ||
aracne.net <- minet(data, method="aracne", | ||
estimator="mi.empirical", | ||
disc="equalfreq", nbins=nbins) | ||
aracne.net[1:5,1:5] | ||
|
||
# write table without threshold | ||
aracne.th <- aracne.net | ||
diag(aracne.th) <- 0 | ||
aracne.th[lower.tri(aracne.th )] <- 0 | ||
aracne.th[which(aracne.th>0)] <- 1 | ||
xx <- which(aracne.th==1, arr.ind=TRUE) | ||
write.table(cbind(rownames(aracne.th)[xx[,1]], | ||
rownames(aracne.th)[xx[,2]]), | ||
file="aracne.csv", col.names=FALSE, | ||
row.names=FALSE, sep=",", quote=FALSE) | ||
length(xx) | ||
|
||
# fix threshold | ||
thrsh <- 0.1 | ||
aracne.th <- aracne.net | ||
aracne.th[which(aracne.net<= thrsh)] <- 0 | ||
aracne.th[which(aracne.net> thrsh)] <- 1 | ||
diag(aracne.th) <- 0 | ||
length(aracne.th) | ||
|
||
#to write to a file (that can be imported in cytoscape) | ||
aracne.th[lower.tri(aracne.th )] <- 0 | ||
xx <- which(aracne.th==1, arr.ind=TRUE) | ||
write.table(cbind(rownames(aracne.th)[xx[,1]], | ||
rownames(aracne.th)[xx[,2]]), | ||
file="aracne_th.csv", col.names=FALSE, | ||
row.names=FALSE, sep=",", quote=FALSE) | ||
length(xx) | ||
|
||
|