-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathStep_3_MM20221122_boxplot_median.R
442 lines (321 loc) · 16.1 KB
/
Step_3_MM20221122_boxplot_median.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
# load libraries
library(ggplot2)
library(dplyr)
library(plotly)
library("DescTools")
library("openxlsx")
library(multcompView)
# clean environment
rm(list=ls())
# define wd
setwd("C:/Users/Mende012/Documents/Bioinformatics/ROS assays/DR_final_pipeline")
# import file
df <- read.csv("./Step_2_MM20230106_2_auc_data.csv")
# Output Prefix
# generate prefix string including date/ Initials
current_date <- gsub("-", "", as.character(Sys.Date()))# Get the current date as character
pre <- paste("MM", current_date, sep = "")
# Accidentally named HopN1 as AvrN1 - change to correct name
#df$treatment_id[df$treatment_id == 'D36E+AvrN1_Bacterial PAMPs'] <- 'D36E+HopN1_Bacterial PAMPs'
# select one assays per treatment combination
df <- df[!(df$assay_id == "plate029"),]
df <- df[!(df$assay_id == "plate030"),]
df <- df[!(df$assay_id == "plate032"),]
df <- df[!(df$assay_id == "plate033"),]
df <- df[!(df$assay_id == "plate034"),]
df <- df[!(df$assay_id == "plate037"),]
# df <- df[!((df$assay_id == "plate028") & (grepl(("AvrPto"), df$treatment_id))),]
#################################################################################
#################################################################################
# ANOVA after normalisation
# 1. ANOVA to analyse wheather there are differences between the groups
# H0 All medians are the same
# H1 at least one median is different
#anova_ROS <- aov(area_under_curve ~ treatment_id, data = df)
#{summary(anova_ROS)
# summary.lm(anova_ROS)}
# Dunnett's test - summarised analysis
#Dunnet <- DunnettTest(area_under_curve ~ treatment_id, data = df, control = "D36E_Bacterial PAMPs", conf.level = 0.95)
#print(Dunnet)
################################################################################
############################### Statistics ###############################
################################################################################
# Dunnett's test - per plate --> makes more sense due to inter-assay variation
# tbl_df <- df %>% as_tibble() %>% group_by(assay_id)
dunn_res <- lapply(unique(df$assay_id), function(x) as.data.frame(DunnettTest(area_under_curve ~ treatment_id,
data = df[df$assay_id==x,],
control = "D36E_Bacterial PAMPs",
conf.level = 0.95)$`D36E_Bacterial PAMPs`)) %>% bind_rows()
# add column with asterix for significant results
dunn_res$ast <- ''
dunn_res$ast[dunn_res$pval<0.1] <- '.'
dunn_res$ast[dunn_res$pval<0.05] <- '*'
dunn_res$ast[dunn_res$pval<0.01] <- '**'
dunn_res$ast[dunn_res$pval<0.001] <- '***'
dunn_res_1 <- dunn_res
dunn_res$sample <- row.names(dunn_res_1)
rm(dunn_res_1)
# remove comparisons present in every assay
remove <- c("D36E_MQ-D36E_Bacterial PAMPs","DC3000_Bacterial PAMPs-D36E_Bacterial PAMPs", "DC3000_MQ-D36E_Bacterial PAMPs")
dunn_res <- dunn_res[!grepl(paste(remove, collapse = "|"), dunn_res$sample),]
# sub string sample name to match names of treatments
dunn_res$sample <- sub("-D36E.*", "", dunn_res$sample)
# simple table
simple <- data.frame(dunn_res$sample, dunn_res$ast, row.names = NULL)
# # safe res
sink(paste(pre, sep = "","ROS_supression_DC3000_library_Dunnet_prescale.txt"))
print(dunn_res)
sink()
###################################################################################
######################## scale by positive control - ##############################
###################################################################################
### STEP1: write function to show ratio between median/median positive control #####
### and sample measurement ### c
## write a function to identify the median/average of the positive control D36+PAMPs
# identify unique assayIDs and sort them to ensure order
assayIDs <- unique(df$assay_id)
assayIDs <- sort(assayIDs)
# creat/test function to find median/mean
med_pos <- function(df, assayID, positive_control){
a <- as.data.frame(df[df$assay_id == assayID,])
b <- as.data.frame(a[a$treatment_id == positive_control,])
contr_median <- median(b$area_under_curve) # change here from median to median
return(contr_median)
}
# apply the function over all the assays
contr_median <- lapply(assayIDs, function(x){
a <- as.data.frame(df[df$assay_id == x,])
b <- as.data.frame(a[a$treatment_id == "D36E_Bacterial PAMPs",])
contr_median <- median(b$area_under_curve)
return(contr_median)
})
## make small df to store median/median results per assay
df_ctrmed <- data.frame(assayIDs, unlist(contr_median))
colnames(df_ctrmed) <- c("assay_id", "median_positive_control") #change median/median
#rm(ls = contr_median)
## write function (function works with single assay)
mult <- function(assayID){
rows <- nrow(df[df$assay_id == assayID,]) # count rows for of each assay
a <- df_ctrmed[df_ctrmed$assay_id == assayID,] # access the median/median of a respective control
re_assay_id <- unlist(rep(a$assay_id, times = rows)) # times assayID as many times as there are rows for the assayID
re_median <- unlist(rep(a$median_positive_control, times = rows)) #update median/median, times control/median as many times as there are rows for the assayID
b <- data.frame(unlist(re_assay_id), unlist(re_median)) # make a df of length = assay rows
return(b)
}
# use the function with lapply to repeat the same for all plates
rows <- lapply(assayIDs, function(x){
rows <- nrow(df[df$assay_id == x,])
a <- df_ctrmed[df_ctrmed$assay_id == x,]
re_assay_id <- unlist(rep(a$assay_id, times = rows))
re_median <- unlist(rep(a$median_positive_control, times = rows))
b <- data.frame(unlist(re_assay_id), unlist(re_median))
return(b)
}
) %>% bind_rows()
## reorganise and merge df
# rename columns in a medianingful way
colnames(rows) <- c("assay_id", "median_positive_control") #change name median/median
# order df so that it is also ordered by plateID
df <- df[order(df$assay_id, decreasing = FALSE),]
# add additional column to original df
df["median_positive_control"] <- rows$median_positive_control
# clean up
rm(ls=df_ctrmed)
rm(ls=rows)
### use the median/median to scale the measurements
## write a function to calculate how the measurments compare to the positive control
## calculate this as a % of the positive_control_median
prc_pos_avg <- (df$area_under_curve/df$median_positive_control)*100
df["prc_pos_avg"] <- prc_pos_avg
#################################################################################
######################### export for summary heatmap ###########################
#################################################################################
e_line <- unique(df$treatment_id)
#e_line <- lapply(e_line,sort,decreasing=TRUE)
# apply contr_median function to find the median of each effector line percentage
sample_median <- lapply(e_line, function(x){
a <- as.data.frame(df[df$treatment_id == x,])
med <- median(a$prc_pos_avg)
return(med)
})
e_line <- as.character(e_line)
d2 <- cbind.data.frame("Effector" = e_line, "Median" = unlist(sample_median))
d2$Effector <- sub("*_Bacterial PAMPs", "", d2$Effector)
write.csv(d2, "./summary_median_ROS_percentage.xlsx")
head(df)
#################################################################################
########################## Statistics post-scaling ###########################
#################################################################################
# Dunnett's test - per plate with scaled data ----
dunn_scal <- lapply(unique(df$assay_id), function(x) as.data.frame(DunnettTest(prc_pos_avg ~ treatment_id,
data = df[df$assay_id==x,],
control = "D36E_Bacterial PAMPs",
conf.level = 0.95)$`D36E_Bacterial PAMPs`)) %>% bind_rows()
# add column with asterix for significant results
dunn_scal$ast <- ''
dunn_scal$ast[dunn_scal$pval<0.1] <- '.'
dunn_scal$ast[dunn_scal$pval<0.05] <- '*'
dunn_scal$ast[dunn_scal$pval<0.01] <- '**'
dunn_scal$ast[dunn_scal$pval<0.001] <- '***'
dunn_scal_1 <- dunn_scal
dunn_scal$sample <- row.names(dunn_scal_1)
rm(dunn_scal_1)
# remove comparisons present in every assay
remove <- c("D36E_MQ-D36E_Bacterial PAMPs","DC3000_Bacterial PAMPs-D36E_Bacterial PAMPs", "DC3000_MQ-D36E_Bacterial PAMPs")
dunn_scal <- dunn_scal[!grepl(paste(remove, collapse = "|"), dunn_scal$sample),]
# sub string sample name to match names of treatments
dunn_scal$sample <- sub("-D36E.*", "", dunn_scal$sample)
# simple table
simple_scal <- data.frame(dunn_scal$sample, dunn_scal$ast, row.names = NULL)
# safe res
sink(paste(pre, sep = "","ROS_supression_DC3000_library_Dunnet_scaled.txt"))
print(dunn_scal)
sink()
# ANOVA + TUKEY - per plate with scaled data ----
# The following analysis requires that ther is no "-" present in the sample name
# Replace "-" with "_"
df$treatment_id <- gsub("-", "_", df$treatment_id)
# 4. Statistical analysis across samples using ANOVA+TUKEY ----
# 4.1. ANOVA to analyse whether there are differences between the groups
# H0 All means are the same
# H1 at least one mean is different
anova_prc <- aov(prc_pos_avg ~ treatment_id, data = df)
{summary(anova_prc)
summary.lm(anova_prc)}
# 4.2 TUKEY Posthoc to see which groups are significantly different
TUKEY <- TukeyHSD(anova_prc)
# 4.3 Summarize TUKEY test results for saving & plotting
# 4.3.1 write a funktion to generate labels for the test results
generate_label_df <- function(TUKEY, variable){
# Extract labels and factor levels from Tukey post-hoc results
Tukey.levels <- TUKEY[[variable]][,4]
Tukey.labels <- data.frame(multcompLetters(Tukey.levels)['Letters'])
# order labels according to boxplot:
Tukey.labels$Pst_strain = rownames(Tukey.labels)
Tukey.labels=Tukey.labels[order(Tukey.labels$Pst_strain) , ]
return(Tukey.labels)
}
# 4.3.2 Apply the function on the df
LABELS <- generate_label_df(TUKEY , "treatment_id")
names(LABELS) <- c("Letters","treatment_id")
# 4.3.3 safe TUKEY test results
sink(paste(pre, sep = "","ROS_supression_DC3000_library_TUKEY_scaled.txt"))
print(LABELS)
sink()
# 4.3.4 obtain letter position for y axis using means
yvalue <- aggregate(prc_pos_avg ~ treatment_id, data = df, mean)
LABELS$treatment_id <- (unlist(lapply(strsplit(LABELS$treatment_id,'[_]'),function(x){x[[1]]})))
# 4.3.5 generate df with results TUKEY test
final <- merge(yvalue,LABELS)
# 4.3.6 For ploting extract label vector
stat_labels <- as.vector(LABELS$Letters)
# 5. Box Plots comparing Pst strains, 2 dpi, TUKEY results, save as pdf (default) ----
g_x <- ggplot(df, aes(x=as.factor(treatment_id), y=prc_pos_avg)) +
geom_boxplot(fill="white", # box colour
outlier.colour = "white", # Outliers color,
alpha=0.9) + # Box color transparency
theme_bw() + # make the bg white
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.border = element_blank(), # remove background, frame
axis.line = element_line(colour = "black")) +
xlab("Pseudomonas strain") + # axis label
ylab("log10 (CFU/cm2)") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) + # turn the tick marks on the x axis 45 degree +
#expand_limits( y = c(0, 10)) +
geom_jitter(shape=16, position=position_jitter(0.1),
aes(colour = factor(treatment_id))) +
stat_boxplot(geom = "errorbar", # Error bars
width = 0.2) +
geom_text(data = final, aes(y= treatment_id, label = Letters),
vjust= -7 ,hjust= 0.5)
g_x
#################################################################################
########################### boxplot pre-scaling ###########################
################################################################################# ----
# simplify names in df
#df$treatment_id <- sub("D36E", "", df$treatment_id)
df$treatment_id <- sub("*_Bacterial PAMPs", "", df$treatment_id)
#df$treatment_id <- sub("_MQ", "mock", df$treatment_id)
# colour scheme
cbp1 <- c("#999999", "#44AA99", "#E69F00", "#661100", "#56B4E9", "#000000", "#117733", "#332288", "#F0E442", "#0072B2", "#D55E00", "#999933", "#CC79A7", "#E69F00")
#based on: https://stackoverflow.com/questions/57153428/r-plot-color-combinations-that-are-colorblind-accessible
# basic boxplot
g2 <- ggplot(
#input data
df, aes(x=as.factor(treatment_id), y=area_under_curve)) +
# generate basic boxplot
geom_boxplot(fill="white", # box colour
outlier.colour = "white", # Outliers color,
alpha=0) + # Box color transparency
# overlay with jitter
geom_jitter(shape=16, position=position_jitter(0.1),
aes(colour = factor(assay_id))) +
scale_colour_manual(values = cbp1) +
# define the theme of the boxplot
theme_bw() + # make the bg white
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.border = element_blank(), # remove background, frame
axis.line = element_line(colour = "black")) +
# label the axises
xlab("Pseudomonas strain") +
ylab("log2 (normalised area under the curve)") +
theme(axis.text.x = element_text(angle = 70, hjust = 1)) + # turn the tick marks on the x axis 45 degree
# define axis limits if needed
expand_limits( y = c(0, 5)) +
# add statistical information: error bars, statistical results
stat_boxplot(geom = "errorbar", # Error bars
width = 0.2)
ggsave("MM20230202_ROS_boxpl_single_median.svg",width = 10, height = 6)
g2
dev.off()
g2
ggplotly(g2)
ggplotly(g2)
###############################################################################
################################### plot graph ################################
###############################################################################
length(unique(df$assay_id))
cbp1 <- rep("#E69F00", each = 20)
#cbp1 <- c("#999999", "#44AA99", "#E69F00", "#661100", "#56B4E9", "#000000", "#117733", "#332288", "#F0E442", "#0072B2", "#D55E00", "#999933", "#CC79A7", "#E69F00")
# https://stackoverflow.com/questions/57153428/r-plot-color-combinations-that-are-colorblind-accessible
# scaled boxplot
g3 <- ggplot(
#input data
df, aes(x=as.factor(treatment_id), y=prc_pos_avg)) +
# generate basic boxplot
geom_boxplot(fill="white", # box colour
outlier.colour = "white", # Outliers color,
alpha=0) + # Box color transparency
# overlay with jitter
geom_jitter(shape=16, position=position_jitter(0.1),
aes(colour = factor(assay_id))) +
scale_colour_manual(values = cbp1) +
# define the theme of the boxplot
theme_bw() + # make the bg white
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.border = element_blank(), # remove background, frame
axis.line = element_line(colour = "black")) +
# label the axises
xlab("Pseudomonas strain") +
ylab("log2 (normalised area under the curve)") +
theme(axis.text.x = element_text(angle = 70, hjust = 1)) + # turn the tick marks on the x axis 45 degree
# define axis limits if needed
expand_limits( y = c(0, 5)) +
# add statistical information: error bars, statistical results
stat_boxplot(geom = "errorbar", # Error bars
width = 0.2)
ggsave("MM20230202_ROS_boxpl_single_median.svg",width = 10, height = 6)
g3
dev.off()
ggplotly(g3)
# In progress LAST: Safe stat results for better comparison in one df ----
# Sort df
LABELS_sorted <- LABELS[order(LABELS$treatment_id), ]
simple_scal_sorted <- simple_scal[order(simple_scal$dunn_scal.sample), ]
stat_comp <- data.frame( "Treatment" = LABELS_sorted$treatment_id ,
"Not_scaled_Dunnet's" = ,
"Scaled_Dunnet's" = simple_scal$dunn_scal.ast ,
"Scaled_TUKEY" = LABELS$Letters)
sink(paste(pre, sep = "","CFU_count_DC3000_library_summary_stats.txt"))
print(stat_comp)
sink()