-
Notifications
You must be signed in to change notification settings - Fork 0
/
05_DGE_with_limma_voom.R
314 lines (239 loc) · 9.38 KB
/
05_DGE_with_limma_voom.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
## ----download_data_DGE, warning=FALSE, message=FALSE----------------------------------
## Load the container package for RSE
library("SummarizedExperiment")
## Connect to ExperimentHub
library("ExperimentHub")
eh <- ExperimentHub::ExperimentHub()
## Load package datasets
myfiles <- query(eh, "smokingMouse")
## Download the mouse gene data
rse_gene <- myfiles[["EH8313"]]
## Samples from the nicotine experiment and from pups only
rse_gene_nic <- rse_gene[, which(rse_gene$Expt == "Nicotine" & rse_gene$Age == "Pup")]
## Retain only expressed genes (passed the filtering step)
rse_gene_filt <- rse_gene_nic[
rowData(rse_gene_nic)$retained_after_feature_filtering,
]
## ----explore_data---------------------------------------------------------------------
## Data dimensions: number of genes and samples
dim(rse_gene_filt)
## Raw counts for first 3 genes in the first 5 samples
assays(rse_gene_filt)$counts[1:3, 1:5]
## Log-normalized counts for first 3 genes in the first 5 samples
assays(rse_gene_filt)$logcounts[1:3, 1:5]
## Data for the first 2 samples
head(colData(rse_gene_filt), 2)
## ----model.matrix()-------------------------------------------------------------------
## Define formula
formula <- ~ Group + Sex + flowcell + mitoRate + overallMapRate + totalAssignedGene + detected + ERCCsumLogErr
## Model matrix
model <- model.matrix(formula, data = colData(rse_gene_filt))
head(model)
## ----coeff----------------------------------------------------------------------------
## Comparison of interest: Group
coef <- "GroupExperimental"
## ----voom-----------------------------------------------------------------------------
library("limma")
## voom():
# 1. Transform counts to log2(cpm)
# ----------------------------------------------------------------------------
# . | Note we passed voom() raw counts as input, not the lognorm counts!!! |
# ----------------------------------------------------------------------------
# 2. Estimate mean-variance relationship for each gene
# 3. Compute observation weights for limma (next step)
vGene <- voom(assay(rse_gene_filt), design = model, plot = TRUE)
## ----voom_outs------------------------------------------------------------------------
## Returned data
names(vGene)
## E: contains the computed log(cpm)
dim(vGene$E)
vGene$E[1:5, 1:5]
## weights: contains the computed variance weight for each observation
dim(vGene$weights)
vGene$weights[1:5, 1:5]
## design: is the provided design matrix
head(vGene$design)
## targets: the sample library sizes used to compute log(cpm) in the first step
dim(vGene$targets)
head(vGene$targets)
identical(vGene$targets$lib.size, colSums(assay(rse_gene_filt)))
## ----lmFit----------------------------------------------------------------------------
## lmFit():
# 1. Fit linear model for each gene to estimate logFCs
fitGene <- lmFit(vGene)
## Corroborate "ls" method was applied
fitGene$method
## Explore outputs: estimated coefficients (logFCs)
head(fitGene$coefficients)
## ----lm-------------------------------------------------------------------------------
## Lognorm expression of first gene
rse_gene_one_gene <- rse_gene_filt[1, ]
colData(rse_gene_one_gene) <- cbind(colData(rse_gene_one_gene),
"lognorm_expr" = assays(rse_gene_one_gene)$logcounts[1, ]
)
## Fit simple linear model
formula <- lognorm_expr ~ Group
lm <- lm(formula, data = colData(rse_gene_one_gene))
summary(lm)
## Two sample t-test
t.test(formula, data = colData(rse_gene_one_gene), var.equal = TRUE)
## ----eBayes---------------------------------------------------------------------------
## eBayes()
## 1. Compute the empirical Bayes statistics for DE
eBGene <- eBayes(fitGene)
## Outputs of interest:
## s2.prior -> prior residual variance (prior mean 1/s0^2)
## in prior distribution of residual variances
eBGene$s2.prior
## df.prior -> degrees of freedom d0 in prior distribution
## of residual variances
eBGene$df.prior
## s2.post -> posterior residual sample variances of the genes (~sg^2)
length(eBGene$s2.post)
head(eBGene$s2.post)
## t -> moderated t-stats of the genes for each contrast
dim(eBGene$t)
eBGene$t[1:5, 1:5]
## p.value: corresponding unadjusted p-values of moderated t-stats
dim(eBGene$p.value)
eBGene$p.value[1:5, 1:5]
## ----topTable-------------------------------------------------------------------------
## topTable()
## 1. Obtain gene-wise DE stats for Group (Nicotine vs Ctrl)
top_genes <- topTable(eBGene, coef = coef, p.value = 1, number = nrow(rse_gene_filt), sort.by = "none")
## Outputs for each gene and for the coeff selected (Group):
## logFC: log2-fold-changes
head(top_genes$logFC)
## ----betas_logFCs---------------------------------------------------------------------
setdiff(top_genes$logFC, eBGene$coefficients[, "GroupExperimental"])
## ----topTable2------------------------------------------------------------------------
## t: moderated t-stats
head(top_genes$t)
## . P.value: unadjusted p-values of t-stats
head(top_genes$P.Value)
## adj.P.Val: p-values adjusted to control the FDR
head(top_genes$adj.P.Val)
## ----hist_p---------------------------------------------------------------------------
## Histogram of unadjusted p-values
hist(top_genes$P.Value, xlab = "p-values", main = "")
## ----quantify_DEGs--------------------------------------------------------------------
## DEGs for FDR<0.05
de_genes <- top_genes[which(top_genes$adj.P.Val < 0.05), ]
## Number of DEGs
dim(de_genes)
## ----volcano plot, warning=FALSE, message=FALSE---------------------------------------
library("ggplot2")
## Define up- and down-regulated DEGs, and non-DEGs
FDR <- 0.05
DE <- vector()
for (i in 1:dim(top_genes)[1]) {
if (top_genes$adj.P.Val[i] > FDR) {
DE <- append(DE, "n.s.")
} else {
if (top_genes$logFC[i] > 0) {
DE <- append(DE, "Up")
} else {
DE <- append(DE, "Down")
}
}
}
top_genes$DE <- DE
## Colors, sizes and transparencies for up & down DEGs and non-DEGs
cols <- c("Up" = "indianred2", "Down" = "steelblue2", "n.s." = "grey")
sizes <- c("Up" = 1.3, "Down" = 1.3, "n.s." = 0.8)
alphas <- c("Up" = 0.4, "Down" = 0.6, "n.s." = 0.5)
## Plot volcano plot
ggplot(
data = top_genes,
aes(
x = logFC, y = -log10(adj.P.Val),
color = DE,
fill = DE,
size = DE,
alpha = DE
)
) +
geom_point(shape = 21) +
geom_hline(
yintercept = -log10(FDR),
linetype = "dashed", color = "gray35", linewidth = 0.5
) +
geom_vline(
xintercept = c(-1, 1),
linetype = "dashed", color = "gray35", linewidth = 0.5
) +
labs(y = "-log10(FDR)", x = "logFC(Nicotine vs Control)") +
theme_bw() +
scale_color_manual(values = cols, name = "Differential expression") +
scale_fill_manual(values = cols, name = "Differential expression") +
scale_size_manual(values = sizes, name = "Differential expression") +
scale_alpha_manual(values = alphas, name = "Differential expression") +
theme(
plot.margin = unit(c(1, 1, 1, 1), "cm"),
legend.key.height = unit(0.15, "cm"),
axis.title = element_text(size = (13)),
legend.title = element_text(size = 13),
legend.text = element_text(size = 12)
)
## ----complexHeatmap, warning=FALSE,message=FALSE--------------------------------------
library("ComplexHeatmap")
## We plot lognorm counts
lognorm_data <- assays(rse_gene_filt)$logcounts
## Subset to DEGs only
lognorm_data <- lognorm_data[rownames(de_genes), ]
## Define column (sample) names to display
colnames(lognorm_data) <- paste0("Pup_", 1:dim(lognorm_data)[2])
## ----scale_data-----------------------------------------------------------------------
## Center and scale the data to make differences more evident
lognorm_data <- (lognorm_data - rowMeans(lognorm_data)) / rowSds(lognorm_data)
## Sample annotation: Sex, Group, and library size
col_anno <- HeatmapAnnotation(
df = as.data.frame(colData(rse_gene_filt)[, c("Sex", "Group")]),
library_size = anno_barplot(colData(rse_gene_filt)$sum, gp = gpar(fill = "lightyellow2")),
col = list(
"Sex" = c("F" = "hotpink1", "M" = "dodgerblue"),
"Group" = c("Control" = "gray68", "Experimental" = "gold2")
)
)
## Gene annotation: logFC and biotype
de_genes$logFC_binary <- sapply(de_genes$logFC, function(x) {
if (x > 0) {
">0"
} else {
"<0"
}
})
de_genes$protein_coding_gene <- sapply(rowData(rse_gene_filt[rownames(de_genes), ])$gene_type, function(x) {
if (x == "protein_coding") {
"TRUE"
} else {
"FALSE"
}
})
gene_anno <- rowAnnotation(
df = as.data.frame(cbind(
"logFC" = de_genes$logFC_binary,
"protein_coding_gene" = de_genes$protein_coding_gene
)),
col = list(
"logFC" = c("<0" = "deepskyblue3", ">0" = "brown2"),
"protein_coding_gene" = c("TRUE" = "darkseagreen3", "FALSE" = "magenta")
)
)
## ----heat map, warning=FALSE, message=FALSE, fig.width=10,fig.height=6.9--------------
library("circlize")
## Plot
Heatmap(lognorm_data,
name = "lognorm counts",
show_row_names = FALSE,
top_annotation = col_anno,
left_annotation = gene_anno,
row_km = 2,
column_km = 2,
col = colorRamp2(c(-4, -0.0001, 00001, 4), c("darkblue", "lightblue", "lightsalmon", "darkred")),
row_title = "DEGs",
column_title = "Samples",
column_names_gp = gpar(fontsize = 7),
heatmap_width = unit(12.5, "cm"),
heatmap_height = unit(12.5, "cm")
)