-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfragments_analysis.R
executable file
·299 lines (235 loc) · 11.7 KB
/
fragments_analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
"Usage: fragments_analysis.R (--out1 <O1>) (--out2 <O2>) (--out3 <O3>) <input1> <input2>
-h --help show this
--out1 name1 bed file with the fragments generated by double digestion
--out2 name2 descriptive statistics of the in silico fragments
--out3 name3 plot with the distribution of the sizes of the fragments
input1 input1 bed file with the position of the restriction sites
input2 input2 bed file with the position of the in silico fragments
fragments_analysis.R -h | --help show this message
" -> doc
# loads the docopt library
require(docopt)
# retrieves the command-line arguments
opts <- docopt(doc)
require(tidyverse)
require(ggplot2)
require(gridExtra)
require(foreach)
require(Matching)
require(sjstats)
save.image("fragment_analysis.rda")
# Function to calculate some descriptive statistics.
descriptive_stats <- function(y){
data.frame(n = length(y),
mean = round(mean(y), 0),
IC95inf = t.test(y)$conf.int[1],
IC95sup = t.test(y)$conf.int[2],
var = var(y), std.dev. = sd(y),
std.err. = sd(y) / sqrt(length(y)),
CV = 100 * sd(y) / mean(y),
min = min(y),
max = max(y),
Q1 = quantile(y, 0.25),
median = median(y),
Q3 = quantile(y, 0.75))
}
###################################################################
##### Determines the fragments generated by double digestion ######
###################################################################
###################
### Plus strand ###
###################
# Reads the bed file containing the restriction sites positions.
frags_genome_plus <- as.data.frame(read_tsv(opts$`<input1>`,
col_names = FALSE,
col_types = "ciiccc",
progress = FALSE))
# Selects only the end position for each restriction site
frags_genome_plus <- frags_genome_plus[, c(1, 3, 4, 6)]
# Help function - change the end position according with the cutting position of the restriction enzymes
change_end_plus_strand <- function(i){
new_end <- numeric()
if (i[3] == "PstI"){
new_end <- c(new_end, as.numeric(i[2]) - 1)
}else if (i[3] == "MspI"){
new_end <- c(new_end, as.numeric(i[2]) - 3)
}
}
# Applies the help function
new_end <- apply(frags_genome_plus, 1, change_end_plus_strand)
# Adds a column containing the exact position where the enzyme cuts
frags_genome_plus$new_end <- new_end
# This function calculates each fragment generated in plus strand, considering the cut position.
fragments_determination <- function(i){
# Help function
int_function <- function(f){
frag_data_chr <- data.frame(chr = fragments_size_set[f, 1],
start = fragments_size_set[f, 5],
end = fragments_size_set[f + 1, 5],
frag_name = paste(fragments_size_set[f, 1], paste("frag", f, fragments_size_set[f, 4], sep = "_"), sep = "_"),
score = 0,
strand = fragments_size_set[f, 4],
type = paste(fragments_size_set[f, 3],
fragments_size_set[f + 1, 3],
sep = "-"))
return(frag_data_chr)
}
fragments_size_set <- frags_genome_plus[frags_genome_plus[, 1] == paste(unique(frags_genome_plus[, 1])[i ]), ]
# Removes chr where only one restriction site was found
if (nrow(fragments_size_set) <= 1){
}else if (nrow(fragments_size_set) > 1){
fragments_size <- diff(fragments_size_set$new_end)
frag_data <- lapply(1:(nrow(fragments_size_set) - 1), int_function)
frag_data_df <- as.data.frame(do.call(rbind, frag_data))
frag_data_df <- cbind(frag_data_df, fragments_size)
all_frag_data_chr <- rbind(all_frag_data_chr, frag_data_df)
}
frag_chr_bed <- rbind(frag_chr_bed, all_frag_data_chr)
return(frag_chr_bed)
}
# Sets the objects that will recive the results of fragments_determination()
frags_chr_plus <- data.frame()
frag_chr_bed <- data.frame()
all_frag_data_chr <- data.frame()
fragments_size <- data.frame()
# Applies the function fragments_determination function
all_frags <- lapply(1:length(unique(frags_genome_plus[, 1])), fragments_determination)
all_frags_df <- as.data.frame(do.call(rbind, all_frags))
frags_genome_plus <- all_frags_df
####################
### Minus strand ###
####################
# Using the results of plus strand and considering the cutting patterns of each enzime, is possible to determine the fragments that were generated in the minus strand.
# minus strand - PstI-PstI frags
frags_minus_pst_pst <- frags_genome_plus[frags_genome_plus$type == "PstI-PstI", ]
frags_minus_pst_pst[, 2] <- frags_minus_pst_pst[, 2] - 4
frags_minus_pst_pst[, 3] <- frags_minus_pst_pst[, 3] - 4
# minus strand - PstI-MspI frags
frags_minus_pst_msp <- frags_genome_plus[frags_genome_plus$type == "PstI-MspI", ]
frags_minus_pst_msp[, 2] <- frags_minus_pst_msp[, 2] - 4
frags_minus_pst_msp[, 3] <- frags_minus_pst_msp[, 3] - 1
frags_minus_pst_msp$type <- "MspI-PstI"
# minus strand - MspI-PstI frags
frags_minus_msp_pst <- frags_genome_plus[frags_genome_plus$type == "MspI-PstI", ]
frags_minus_msp_pst[, 2] <- frags_minus_msp_pst[, 2] - 1
frags_minus_msp_pst[, 3] <- frags_minus_msp_pst[, 3] - 4
frags_minus_msp_pst$type <- "PstI-MspI"
# minus strand - MspI-MspI frags
frags_minus_msp_msp <- frags_genome_plus[frags_genome_plus$type == "MspI-MspI", ]
frags_minus_msp_msp[, 2] <- frags_minus_msp_msp[, 2] - 1
frags_minus_msp_msp[, 3] <- frags_minus_msp_msp[, 3] - 1
frags_genome_minus <- rbind(frags_minus_pst_pst,
frags_minus_pst_msp,
frags_minus_msp_pst,
frags_minus_msp_msp)
# Changes "+" to "-"
frags_genome_minus[, 6] <- rep("-", nrow(frags_genome_minus))
frags_genome_minus$frag_name <- gsub("\\+", "\\-", frags_genome_minus$frag_name)
all_frags_genome <- rbind(frags_genome_plus, frags_genome_minus)
# Generates the descriptive statistics for the set of fragments generated by the double (PstI-MspI) digestion.
all_frags_statistics <- descriptive_stats(all_frags_genome$fragments_size)
#it's necessary because bedtools do not works with scientific numbers
all_frags_genome$start <- as.integer(all_frags_genome$start)
all_frags_genome$end <- as.integer(all_frags_genome$end)
# Sorts the file
all_frags_genome <- all_frags_genome[with(all_frags_genome, order(chr, start)), ]
# Writes a bed file containing the positions of all fragments
write.table(all_frags_genome,
paste(opts$O1),
sep = "\t",
col.names = F,
row.names = F,
quote = F)
#######################################
###### DArTseq sampled fragments ######
#######################################
# Reads the bed file containing the fragments of sampled marks
sampled <- read.table(opts$`<input2>`)
# Fragments size of sampled marks
sampled_frag_sizes <- sampled$V3 - (sampled$V2 + 1)
# Generates the descritive statistics for the size of fragments selected by DArTseq.
sampled_frags_statistics <- descriptive_stats(sampled_frag_sizes)
#################################
###### PstI-MspI fragments ######
#################################
# Generates the descriptive statistics for the size of fragments selected by DArTseq.
PstI_MspI <- all_frags_genome[grep("PstI-MspI", all_frags_genome$type), ]
frags_msp_pst <- rbind(PstI_MspI)
# Generates the descriptive statistics for the size of fragments avaliable to sequecing by DArTseq.
msp_pst_frags_statistics <- descriptive_stats(frags_msp_pst$fragments_size)
# Writes a file with the fragments descriptive statistics
descriptive_statistics <- cbind(data.frame("Fragments Description" = c("all fragments",
"MspI-PstI or PstI-MspI fragments",
"Sequenced fragments")),
rbind(all_frags_statistics,
msp_pst_frags_statistics,
sampled_frags_statistics))
write.table(descriptive_statistics,
paste(opts$O2),
sep = "\t",
row.names = F,
col.names = T,
quote = F)
## Visualization of the fragments distribution according with size.
all_frags <- data.frame(sample = rep("All fragments", nrow(all_frags_genome)),
fragments_size = all_frags_genome$fragments_size)
possible_to_select_frags <- data.frame(sample = rep("PstI-MspI fragments",
nrow(frags_msp_pst)),
fragments_size = frags_msp_pst$fragments_size)
selected_frags <- data.frame(sample = rep("Sequenced fragments",
length(sampled_frag_sizes)),
fragments_size = sampled_frag_sizes)
union_frags <- rbind(all_frags, possible_to_select_frags, selected_frags)
## plot
labels_to_ggplot <- c(expression("PstI-MspI"~or~"MspI-PstI"~italic("\'in silico\'")~"fragments"), "Sequenced fragments")
union_frags_plot <- union_frags[!union_frags$sample == "All fragments", ]
a <- ggplot(union_frags_plot,
aes(x = log2(fragments_size), fill = sample)) +
geom_histogram(alpha = 0.8) +
ylab("Number of fragments") +
xlab("Length of the fragments") +
scale_x_continuous(breaks = seq(0, 16, 2),
labels = 2^seq(0, 16, 2)) +
scale_fill_manual(values = c("#E69F00", "#0072B2"),
breaks = c("PstI-MspI fragments",
"Sequenced fragments"),
labels = labels_to_ggplot) +
theme_bw() +
geom_vline(xintercept = 5.555, linetype = "dotted",
color = "red", size = 1.5) +
geom_vline(xintercept = 9.15989, linetype = "dotted",
color = "red", size = 1.5) +
theme(axis.text.x = element_text(size = 16, colour = "black"),
axis.text.y = element_text(size = 16, colour = "black"),
axis.title = element_text(size = 20, colour = "black"),
legend.text = element_text(size = 16, colour = "black"),
legend.title.align = 0.5,
legend.title = element_blank())
svg(filename = paste(opts$O3),
width = 14,
height = 6,
pointsize = 12)
print(a)
dev.off()
#################################################
#### Analisis of the fragments analysis size ####
#################################################
print("Deciles of the sizes of sequenced fragments:")
print(quantile(sampled_frag_sizes, prob = seq(0, 1, length = 11), type = 5))
print("Mean of size of sequenced fragments:")
print(mean(sampled_frag_sizes))
print("Standard desviation on the size of sequenced fragments:")
print(sd(sampled_frag_sizes))
print("Coefficient of variation of size of sequenced fragments:")
print(cv(sampled_frag_sizes))
# All fragments
all_frags_genome[, 6] <- as.character(all_frags_genome[, 6])
all_frags_genome[, 7] <- as.character(all_frags_genome[, 7])
all_frags_genome[, 8] <- as.numeric(all_frags_genome[, 8])
# Selects only PstI-MspI frags
all_frags_pstI_mspI <- all_frags_genome[all_frags_genome[, 7] == "PstI-MspI" | all_frags_genome[, 7] == "MspI-PstI", ]
# Selects the ones in the same size range
all_frags_pstI_mspI <- all_frags_pstI_mspI[all_frags_pstI_mspI[, 8] >= 48 & all_frags_pstI_mspI[, 8] <= 574, ]
# Fraction of fragments possible of sequence that were actually sequenced
print("Fraction of the in silico fragments that were sequenced by MS-DArT-seq:")
print((nrow(sampled) / nrow(all_frags_pstI_mspI)) * 100)