-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathCSHL_Cereals.R
338 lines (283 loc) · 15.6 KB
/
CSHL_Cereals.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
# Author: Mon-Ray Shao
# Institute: Donald Danforth Plant Science Center
# Date: October 17, 2019
# Course: CSHL Cereal Genomics (Topp session)
# Description:
# Example R code to perform analyis of root traits from GIA Roots output
# NAM GIA ROOTS -----------------------------------------------------------
# Notes:
# A LemnaTec trial was used to grow seedlings from the Maize NAM founders
# Seedlings were watered under 25%, 50%, 75%, or 100% soil water capacity
# After 24 days, the seedlings were removed from their pots and roots washed
# Root systems were then imaged using a digital camera and processed with GIA roots
# The resulting data was exported and used for this demonstration
# Import Data and Inspect -------------------------------------------------
gia <- read.csv("https://raw.githubusercontent.com/monrayshao/CSHL/master/GIA_CSHL_Cereal.csv", stringsAsFactors = F)
head(gia) # Preview data frame
dim(gia) # Print dimensions of data frame
colnames(gia) # Print column names of data frame
gia$Network.Area # Extract a single column
mean(gia$Network.Area) # Taking the mean
mean(subset(gia, treatment == 50)$Network.Area) # One way to subset data
mean(gia[which(gia$treatment == 50), ]$Network.Area) # Another way to subset data
aggregate(Network.Area ~ genotype + treatment, gia, FUN = mean) # Calculate means according to multiple variables
class(gia$treatment) # R assumes the Treatment column is numeric
gia$treatment <- as.factor(gia$treatment) # To avoid confusion in regression vs classification, convert to factor
plot(gia[, 7:12]) # R easily makes scatterplots
plot(subset(gia, treatment == 100)[, 7:12]) # Combine with subsetting data
# Convert to long data format using tidyr, for ggplot2 plotting
install.packages("tidyr")
library(tidyr)
dataCols <- 7:44 # Save trait columns #'s for convenience
gatherData <- gather(gia, key = "Traits", value = "Value", dataCols) # Gather turns wide to long
head(gatherData); tail(gatherData)
class(gatherData$Value) # Check that all phenotypes converted were indeed numeric, avoid data class mixing
# Make boxplot of every trait with faceted windows
install.packages("ggplot2")
library(ggplot2)
ggplot(gatherData, aes(x = treatment, y = Value)) +
geom_boxplot(aes(color = treatment)) +
facet_wrap(~Traits, scale = "free_y")
# Or jitter plot
ggplot(gatherData, aes(x = treatment, y = Value)) +
geom_jitter(width = 0.1, height = 0, alpha = 0.25, aes(color = treatment)) +
facet_wrap(~Traits, scale = "free_y")
# Data Quality ------------------------------------------------------------
# Histograms
gia25 <- na.omit(subset(gia, treatment == 25)[ ,"Network.Area"])
hist(gia25, main = "Network Area") # Try for one trait
# When plotting multiple distributions in same plot, density curves are easier to see
plot(density(gia25), main = "Network Area, 25%")
# Use a loop to make a density plot of every trait
par(mfrow = c(3,3)) # Make 3x3 plots per page
for (i in dataCols){
gia25 <- na.omit(subset(gia, treatment == 25)[,i]) # Subset the treatments for each trait
gia50 <- na.omit(subset(gia, treatment == 50)[,i])
gia75 <- na.omit(subset(gia, treatment == 75)[,i])
gia100 <- na.omit(subset(gia, treatment == 100)[,i])
ymax <- max(density(gia25)$y, density(gia50)$y, density(gia75)$y, density(gia100)$y) # Establish y-axis for each trait
plot(density(na.omit(gia[,i])), main = colnames(gia)[i], col = NA, ylim=c(0, ymax)) # Plot empty frame of right size
polygon(density(gia25), col = rgb(1, 0, 0, 0.25)) # Now plot each density curve
polygon(density(gia50), col = rgb(1, 1, 0, 0.25)) # With rgb() you can set transparency
polygon(density(gia75), col = rgb(0, 1, 0, 0.25))
polygon(density(gia100), col = rgb(0, 0, 1, 0.25))
}
par(mfrow = c(1,1)) # Go back to 1 plot per page
# QQ Plots
par(mfrow = c(3,3))
for (i in dataCols){
gia25 <- na.omit(subset(gia, treatment == 25)[,i]) # Subset the treatments for each trait
gia50 <- na.omit(subset(gia, treatment == 50)[,i])
gia75 <- na.omit(subset(gia, treatment == 75)[,i])
gia100 <- na.omit(subset(gia, treatment == 100)[,i])
ymin <- min(qqnorm(gia25, plot.it = FALSE)$y,
qqnorm(gia50, plot.it = FALSE)$y,
qqnorm(gia50, plot.it = FALSE)$y,
qqnorm(gia100, plot.it = FALSE)$y)
ymax <- max(qqnorm(gia25, plot.it = FALSE)$y,
qqnorm(gia50, plot.it = FALSE)$y,
qqnorm(gia50, plot.it = FALSE)$y,
qqnorm(gia100, plot.it = FALSE)$y) # Establish y-axis for each trait
qqnorm(na.omit(gia[,i]), main = colnames(gia)[i], col = NA, ylim = c(ymin, ymax)) # Plot empty frame of right size
points(qqnorm(gia25, plot.it = FALSE), col = "red") # Now make each QQ plot
points(qqnorm(gia50, plot.it = FALSE), col = "orange")
points(qqnorm(gia75, plot.it = FALSE), col = "green")
points(qqnorm(gia100, plot.it = FALSE), col = "blue")
qqline(gia25, col = "red")
qqline(gia50, col = "orange")
qqline(gia75, col = "green")
qqline(gia100, col = "blue")
}
par(mfrow = c(1,1))
# Outliers, univariate example
subgia <- subset(gia, treatment == 25) # Just subset this as example
iqr <- IQR(subgia[, "Network.Area"]) # Just take Network.Area as example
perc25 <- quantile(subgia[,"Network.Area"])['25%'] # Calculate 1st quantile
perc75 <- quantile(subgia[,"Network.Area"])['75%'] # Calculate 3rd quantile
# Find outliers using the 1.5x IQR method
outlier <- which(subgia[,"Network.Area"] > perc75 + 1.5*iqr | subgia[,"Network.Area"] < perc25 - 1.5*iqr)
print(subgia[outlier,])
# Multivariate approach, one example
# Subset each treatment without missing data (function won't accept)
gia25 <- na.omit(subset(gia, treatment == 25))
gia50 <- na.omit(subset(gia, treatment == 50))
gia75 <- na.omit(subset(gia, treatment == 75))
gia100 <- na.omit(subset(gia, treatment == 100))
# Calculate Mahalanobis distance for each treatment
dist25 <- mahalanobis(gia25[ ,dataCols], center = colMeans(gia25[ ,dataCols]), cov = cov(gia25[ ,dataCols]), tol = 1e-20)
dist50 <- mahalanobis(gia50[ ,dataCols], center = colMeans(gia50[ ,dataCols]), cov = cov(gia50[ ,dataCols]), tol = 1e-20)
dist75 <- mahalanobis(gia75[ ,dataCols], center = colMeans(gia75[ ,dataCols]), cov = cov(gia75[ ,dataCols]), tol = 1e-20)
dist100 <- mahalanobis(gia100[ ,dataCols], center = colMeans(gia100[ ,dataCols]), cov = cov(gia100[ ,dataCols]), tol = 1e-20)
# Find which data points are beyond a given threshold
outliers25 <- which(dist25 > qchisq(0.999, df = length(dataCols)))
outliers50 <- which(dist50 > qchisq(0.999, df = length(dataCols)))
outliers75 <- which(dist75 > qchisq(0.999, df = length(dataCols)))
outliers100 <- which(dist100 > qchisq(0.999, df = length(dataCols)))
# Show these outliers, are they of the same genotype or not?
print(gia25[outliers25,]$genotype)
print(gia50[outliers50,]$genotype)
print(gia75[outliers75,]$genotype)
print(gia100[outliers100,]$genotype)
# Make list of outlier barcodes for removal
bad <- c(gia25[outliers25, ]$plantbarcode,
gia50[outliers50, ]$plantbarcode,
gia75[outliers75, ]$plantbarcode,
gia100[outliers100, ]$plantbarcode)
# Remove outliers
giaFiltered <- gia[-which(gia$plantbarcode %in% bad), ]
dim(giaFiltered)
# Trait Significance ------------------------------------------------------
# Correlations
install.packages("corrplot")
library(corrplot)
gia2 <- na.omit(giaFiltered) # Use no-missing data set for simplicity
corrmatrix <- cor(gia2[, dataCols], method = "spearman") # Calculate Spearman correlations
corrplot(corrmatrix, tl.cex = 0.5, tl.col = "black", cl.cex = 0.5) # Plot correlation matrix
corrplot(corrmatrix, tl.cex = 0.5, tl.col = "black", cl.cex = 0.5, order = "hclust", hclust.method = "complete")
# Heritability
install.packages("plyr")
install.packages("lme4")
library(plyr)
library(lme4)
# Make empty data frame to hold results
VarComp <- data.frame(Trait = colnames(giaFiltered)[dataCols],
G = NA, E = NA, GxE = NA, error = NA)
for (i in dataCols){
form <- paste0( colnames(giaFiltered)[i], "~ 1 + (1|genotype) + (1|treatment) + (1|genotype:treatment)" )
mod <- lmer( as.formula(form) , data = giaFiltered) # Run random effect model to extract variances
# Error catcher below, we will remove traits whose models did not converge
tryCatch( {lmer( as.formula(form) , data = giaFiltered)}, warning = function(w) print(colnames(giaFiltered)[i]))
result <- as.data.frame(VarCorr(mod)) # Extract variance components
varTotal <- sum(result$vcov)
G <- result[ which(result$grp == "genotype"), ]$vcov
E <- result[ which(result$grp == "treatment"), ]$vcov
GxE <- result[ which(result$grp == "genotype:treatment"), ]$vcov
error <- result[ which(result$grp == "Residual"), ]$vcov
VarComp[i-6,]$G <- G / varTotal # Parse variance components into proper place in data frame
VarComp[i-6,]$E <- E / varTotal
VarComp[i-6,]$GxE <- GxE / varTotal
VarComp[i-6,]$error <- error / varTotal
}
questionable <- c("Maximum.Number.of.Roots", "Minor.Ellipse.Axis", "Network.Perimeter", "Specific.Root.Length_Excised")
VarComp <- VarComp[-which(VarComp$Trait %in% questionable), ] # Remove these traits whose models did not run properly
environments <- 4
replicates <- plyr::count(giaFiltered$genotype)
avgReplicates <- mean(replicates$freq)
# Calculate broad-sense heritability
VarComp$heritability <- VarComp$G / (VarComp$G + VarComp$GxE/environments + VarComp$error/avgReplicates)
# Re-ordering factor levels; can be a pain!
VarComp <- VarComp[rev(order(VarComp$heritability)),]
VarComp$Trait <- as.factor(VarComp$Trait)
ord <- match( VarComp[rev(order(VarComp$heritability)),]$Trait, levels(VarComp$Trait))
VarComp$Trait <- factor(VarComp$Trait, levels(VarComp$Trait)[ord])
install.packages("viridis")
library(viridis)
# Plot heritability estimates
ggplot(VarComp, aes(x = Trait, y = heritability)) +
geom_bar(stat = "identity", color = "black", aes(fill = heritability)) + ylim(0,1) +
labs(x = "", y = "Broad-Sense Heritability\n") + guides(fill = FALSE) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
scale_fill_viridis(option = "magma")
# Reverse levels to make nice horizontal plot
VarComp$Trait <- factor(VarComp$Trait, rev(levels(VarComp$Trait)))
ggplot(VarComp, aes(x = Trait, y = heritability)) +
geom_bar(stat = "identity", color = "black", aes(fill = heritability)) + ylim(0,1) +
labs(x = "", y = "\nBroad-Sense Heritability") + guides(fill = FALSE) +
scale_fill_viridis(option = "magma") + coord_flip() + theme_classic()
# Analysis of Variance for factor significance
# Make empty data frame
anovaRes <- data.frame(Trait = colnames(giaFiltered)[dataCols],
G = NA, E = NA, GxE = NA)
for (i in dataCols){
form <- paste0( colnames(giaFiltered)[i], "~ genotype + treatment + genotype:treatment" )
mod <- lm( as.formula(form) , data = giaFiltered) # Run linear model
result <- as.data.frame(anova(mod))
G <- result[ which(rownames(result) == "genotype"), ]$'Pr(>F)' # Extract ANOVA results for each factor
E <- result[ which(rownames(result) == "treatment"), ]$'Pr(>F)'
GxE <- result[ which(rownames(result) == "genotype:treatment"), ]$'Pr(>F)'
anovaRes[i-6,]$G <- G # Parse results into correct place in data frame
anovaRes[i-6,]$E <- E
anovaRes[i-6,]$GxE <- GxE
}
anovaRes$G_adj <- p.adjust(anovaRes$G, method = "fdr") # Adjust p-values!
anovaRes$E_adj <- p.adjust(anovaRes$E, method = "fdr")
anovaRes$GxE_adj <- p.adjust(anovaRes$GxE, method = "fdr")
gatherAnova <- gather(anovaRes, key = "Factor", value = "p.value", 5:7) # Convert to long table for ggplot
gatherAnova$log_p.value <- -log10(gatherAnova$p.value) # Plot p-values using -log10
# Plot heatmap of ANOVA results
ggplot(gatherAnova, aes(x = Trait, y = Factor)) +
geom_tile(aes(fill = log_p.value)) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
legend.position = "top") +
scale_fill_viridis(name = "-log10(p-value)")
# Principal Component Analysis --------------------------------------------
# Using prcomp()
gia.scaled <- scale(gia2[, dataCols], center = T, scale = T) # Since we have different types of features, scale the data
gia.pca <- prcomp(gia.scaled) # Run PCA function
screeplot(gia.pca, type = "lines") # Shows how much variance each PC explains
summary(gia.pca)$importance
# Parse the results for plotting
gia.pci <- data.frame(gia.pca$x)
gia.pci$genotype <- gia2$genotype
gia.pci$treatment <- gia2$treatment
gia.pci$treatment <- as.factor(gia.pci$treatment)
# Plot subset of PCA results
ggplot(subset(gia.pci, treatment == 25 | treatment == 100),
aes(x = PC1, y = PC2)) +
geom_point(aes(shape = treatment, color = treatment, fill = treatment)) +
theme_classic()
# Plot full PCA results
ggplot(gia.pci, aes(x = PC1, y = PC2)) +
geom_point(aes(shape = treatment, color = treatment, fill = treatment)) +
stat_ellipse(type = "t", geom = "polygon", alpha = 0.25, aes(fill = treatment), color = NA) +
labs(x = paste0("\nPC1 (", 100*summary(gia.pca)$importance[2,1], "% Variance)"),
y = paste0("PC2 (", 100*summary(gia.pca)$importance[2,2], "% Variance)\n")) +
scale_shape_manual(values = c(21 ,22, 23, 24)) +
theme_classic()
# PC1 seems to separate treatments
# PC2 separates samples within each treatment -- possibly genotype?
# What traits contribute to each axis?
sort(gia.pca$rotation[,1]) # size-related traits
sort(gia.pca$rotation[,2]) # ratios, excised roots
biplot(gia.pca, cex = 0.75)
# Classification ----------------------------------------------------------
# Choose 80% of rows for training data
set.seed(100) # Set a seed so you get the same results each time you run on your computer
train <- sample(nrow(gia2), size = 0.8 * nrow(gia2), replace = FALSE) # Random observations for sample training set
# Random forest
install.packages("randomForest")
install.packages("rminer")
library(randomForest)
library(rminer)
rfData <- gia2[, c(5, dataCols)] # Only want treatment ID's and trait data
rfData$treatment <- as.factor(rfData$treatment) # Make sure treatment is data type "factor", not numeric!
trainData <- rfData[train,] # Separate training data
testData <- rfData[-train,] # Separate test data
set.seed(999) # Set seed again
rfModel <- randomForest(treatment ~ ., data = trainData, ntree = 5000, proximity = TRUE) # Run random forest
rfPredict <- predict(rfModel, testData) # Predict test data using random forest model
mmetric(testData$treatment, rfPredict, "ACC") # Check accuracy on test data
varImpPlot(rfModel, n.var = length(dataCols), sort = T) # Check variable importance
# Plot "visualization" of random forest
gia.mds <- data.frame(cmdscale(d = (1 - rfModel$proximity), k = 2)) # Calculate distances from proximity matrix
gia.mds$treatment <- as.factor(gia2[train, ]$treatment) # Parse treatment info
ggplot(gia.mds, aes(x = X1, y = X2)) +
geom_point(aes(color = treatment, shape = treatment)) +
stat_ellipse(type = "t", geom = "polygon", alpha = 0.25, aes(fill = treatment), color = NA) +
theme_classic()
# Clustering --------------------------------------------------------------
# Hierarchical clustering example for Network.Area
area <- aggregate(Network.Area ~ genotype + treatment, gia2, FUN = mean) # Need to aggregate results for this
areaTable <- spread(area, key = "treatment", value = "Network.Area") # Convert to wide data for plotting
areaTable <- na.omit(areaTable) # Remove missing data for now
rownames(areaTable) <- areaTable$genotype # Add labels to rownames
areaTable <- as.matrix(areaTable[, -1]) # Don't want to plot this
d <- dist(areaTable, method = "euclidean") # Calculate distances
hc <- hclust(d, method = "complete") # Run agglomeration
plot(hc) # Plot clustering results
install.packages("gplots")
library(gplots)
# Plot clustering heatmap (separates the treatments so we can see)
heatmap.2(t(areaTable), Rowv = FALSE, dendrogram = "column",
margins = c(5, 7), trace = "none", col = "cm.colors",
main = "Network Area")