fragments_analysis.R

"Usage: fragments_analysis.R (--out1 <O1>) (--out2 <O2>) (--out3 <O3>) <input1> <input2>
-h --help    show this
--out1  name1   bed file with the fragments generated by double digestion
--out2  name2   descriptive statistics of the in silico fragments
--out3  name3   plot with the distribution of the sizes of the fragments
input1  input1  bed file with the position of the restriction sites
input2  input2  bed file with the position of the in silico fragments

fragments_analysis.R -h | --help  show this message
" -> doc

# loads the docopt library
require(docopt)
# retrieves the command-line arguments
opts <- docopt(doc)

require(tidyverse)
require(ggplot2)
require(gridExtra)
require(foreach)
require(Matching)
require(sjstats)

save.image("fragment_analysis.rda")

# Function to calculate some descriptive statistics.
descriptive_stats <- function(y){

    data.frame(n = length(y),
               mean = round(mean(y), 0),
               IC95inf = t.test(y)$conf.int[1],
               IC95sup = t.test(y)$conf.int[2],
               var = var(y), std.dev. = sd(y),
               std.err. = sd(y) / sqrt(length(y)),
               CV = 100 * sd(y) / mean(y),
               min = min(y),
               max = max(y),
               Q1 = quantile(y, 0.25),
               median = median(y),
               Q3 = quantile(y, 0.75))
}

###################################################################
##### Determines the fragments generated by double digestion ######
###################################################################

###################
### Plus strand ###
###################

# Reads the bed file containing the restriction sites positions.
frags_genome_plus <- as.data.frame(read_tsv(opts$`<input1>`,
                                            col_names = FALSE,
                                            col_types = "ciiccc",
                                            progress = FALSE))

# Selects only the end position for each restriction site
frags_genome_plus <- frags_genome_plus[, c(1, 3, 4, 6)]

# Help function - change the end position according with the cutting position of the restriction enzymes
change_end_plus_strand <- function(i){
    new_end <- numeric()
    if (i[3] == "PstI"){
        new_end <- c(new_end, as.numeric(i[2]) - 1)
    }else if (i[3] == "MspI"){
        new_end <- c(new_end, as.numeric(i[2]) - 3)
    }
}

# Applies the help function
new_end <- apply(frags_genome_plus, 1, change_end_plus_strand)

# Adds a column containing the exact position where the enzyme cuts
frags_genome_plus$new_end <- new_end

# This function calculates each fragment generated in plus strand, considering the cut position.
fragments_determination <- function(i){
    # Help function
    int_function <- function(f){

        frag_data_chr <- data.frame(chr = fragments_size_set[f, 1],
                                    start = fragments_size_set[f, 5],
                                    end = fragments_size_set[f + 1, 5],
                                    frag_name = paste(fragments_size_set[f, 1], paste("frag", f, fragments_size_set[f, 4], sep = "_"), sep = "_"),
                                    score = 0,
                                    strand = fragments_size_set[f, 4],
                                    type = paste(fragments_size_set[f, 3],
                                                 fragments_size_set[f + 1, 3],
                                                 sep = "-"))
        return(frag_data_chr)
    }

    fragments_size_set <- frags_genome_plus[frags_genome_plus[, 1] == paste(unique(frags_genome_plus[, 1])[i ]), ]

    # Removes chr where only one restriction site was found
    if (nrow(fragments_size_set) <= 1){
    }else if (nrow(fragments_size_set) > 1){

        fragments_size <- diff(fragments_size_set$new_end)

        frag_data <- lapply(1:(nrow(fragments_size_set) - 1), int_function)
        frag_data_df <- as.data.frame(do.call(rbind, frag_data))
        frag_data_df <- cbind(frag_data_df, fragments_size)

        all_frag_data_chr <- rbind(all_frag_data_chr, frag_data_df)
    }
    frag_chr_bed <- rbind(frag_chr_bed, all_frag_data_chr)
    return(frag_chr_bed)
}

# Sets the objects that will recive the results of fragments_determination()
frags_chr_plus <- data.frame()
frag_chr_bed <- data.frame()
all_frag_data_chr <- data.frame()
fragments_size <- data.frame()

# Applies the function fragments_determination function
all_frags <- lapply(1:length(unique(frags_genome_plus[, 1])), fragments_determination)

all_frags_df <- as.data.frame(do.call(rbind, all_frags))

frags_genome_plus <- all_frags_df

####################
### Minus strand ###
####################

# Using the results of plus strand and considering the cutting patterns of each enzime, is possible to determine the fragments that were generated in the minus strand.

# minus strand - PstI-PstI frags
frags_minus_pst_pst <- frags_genome_plus[frags_genome_plus$type == "PstI-PstI", ]
frags_minus_pst_pst[, 2] <- frags_minus_pst_pst[, 2] - 4
frags_minus_pst_pst[, 3] <- frags_minus_pst_pst[, 3] - 4

# minus strand - PstI-MspI frags
frags_minus_pst_msp <- frags_genome_plus[frags_genome_plus$type == "PstI-MspI", ]
frags_minus_pst_msp[, 2] <- frags_minus_pst_msp[, 2] - 4
frags_minus_pst_msp[, 3] <- frags_minus_pst_msp[, 3] - 1
frags_minus_pst_msp$type <- "MspI-PstI"

# minus strand - MspI-PstI frags
frags_minus_msp_pst <- frags_genome_plus[frags_genome_plus$type == "MspI-PstI", ]
frags_minus_msp_pst[, 2] <- frags_minus_msp_pst[, 2] - 1
frags_minus_msp_pst[, 3] <- frags_minus_msp_pst[, 3] - 4
frags_minus_msp_pst$type <- "PstI-MspI"

# minus strand - MspI-MspI frags
frags_minus_msp_msp <- frags_genome_plus[frags_genome_plus$type == "MspI-MspI", ]
frags_minus_msp_msp[, 2] <- frags_minus_msp_msp[, 2] - 1
frags_minus_msp_msp[, 3] <- frags_minus_msp_msp[, 3] - 1

frags_genome_minus <- rbind(frags_minus_pst_pst,
                            frags_minus_pst_msp,
                            frags_minus_msp_pst,
                            frags_minus_msp_msp)

# Changes "+" to "-"
frags_genome_minus[, 6] <- rep("-", nrow(frags_genome_minus))
frags_genome_minus$frag_name <- gsub("\\+", "\\-", frags_genome_minus$frag_name)

all_frags_genome <- rbind(frags_genome_plus, frags_genome_minus)

# Generates the descriptive statistics for the set of fragments generated by the double (PstI-MspI) digestion.
all_frags_statistics <- descriptive_stats(all_frags_genome$fragments_size)

#it's necessary because bedtools do not works with scientific numbers
all_frags_genome$start <- as.integer(all_frags_genome$start)
all_frags_genome$end <- as.integer(all_frags_genome$end)

# Sorts the file
all_frags_genome <- all_frags_genome[with(all_frags_genome, order(chr, start)), ]

# Writes a bed file containing the positions of all fragments
write.table(all_frags_genome,
            paste(opts$O1),
            sep = "\t",
            col.names = F,
            row.names = F,
            quote = F)

#######################################
###### DArTseq sampled fragments ######
#######################################

# Reads the bed file containing the fragments of sampled marks
sampled <- read.table(opts$`<input2>`)

# Fragments size of sampled marks
sampled_frag_sizes <- sampled$V3 - (sampled$V2 + 1)

# Generates the descritive statistics for the size of fragments selected by DArTseq.
sampled_frags_statistics <- descriptive_stats(sampled_frag_sizes)

#################################
###### PstI-MspI fragments ######
#################################

# Generates the descriptive statistics for the size of fragments selected by DArTseq.
PstI_MspI <- all_frags_genome[grep("PstI-MspI", all_frags_genome$type), ]
frags_msp_pst <- rbind(PstI_MspI)

# Generates the descriptive statistics for the size of fragments avaliable to sequecing by DArTseq.
msp_pst_frags_statistics <- descriptive_stats(frags_msp_pst$fragments_size)

# Writes a file with the fragments descriptive statistics
descriptive_statistics <- cbind(data.frame("Fragments Description" = c("all fragments",
                                                                       "MspI-PstI or PstI-MspI fragments",
                                                                       "Sequenced fragments")),
                                rbind(all_frags_statistics,
                                      msp_pst_frags_statistics,
                                      sampled_frags_statistics))

write.table(descriptive_statistics,
            paste(opts$O2),
            sep = "\t",
            row.names = F,
            col.names = T,
            quote = F)

## Visualization of the fragments distribution according with size.
all_frags <- data.frame(sample = rep("All fragments", nrow(all_frags_genome)),
                        fragments_size = all_frags_genome$fragments_size)

possible_to_select_frags <- data.frame(sample = rep("PstI-MspI fragments",
                                                    nrow(frags_msp_pst)),
                                       fragments_size = frags_msp_pst$fragments_size)

selected_frags <- data.frame(sample = rep("Sequenced fragments",
                                          length(sampled_frag_sizes)),
                             fragments_size = sampled_frag_sizes)

union_frags <- rbind(all_frags, possible_to_select_frags, selected_frags)

## plot

labels_to_ggplot <- c(expression("PstI-MspI"~or~"MspI-PstI"~italic("\'in silico\'")~"fragments"), "Sequenced fragments")

union_frags_plot <- union_frags[!union_frags$sample == "All fragments", ]

a <- ggplot(union_frags_plot,
       aes(x = log2(fragments_size), fill = sample)) +
    geom_histogram(alpha = 0.8) +
    ylab("Number of fragments") +
    xlab("Length of the fragments") +
    scale_x_continuous(breaks = seq(0, 16, 2),
                       labels = 2^seq(0, 16, 2)) +
    scale_fill_manual(values = c("#E69F00", "#0072B2"),
                      breaks = c("PstI-MspI fragments",
                                 "Sequenced fragments"),
                      labels = labels_to_ggplot) +
    theme_bw() +
    geom_vline(xintercept = 5.555, linetype = "dotted",
               color = "red", size = 1.5) +
    geom_vline(xintercept = 9.15989, linetype = "dotted",
               color = "red", size = 1.5) +
    theme(axis.text.x = element_text(size = 16, colour = "black"),
          axis.text.y = element_text(size = 16, colour = "black"),
          axis.title = element_text(size = 20, colour = "black"),
          legend.text = element_text(size = 16, colour = "black"),
          legend.title.align = 0.5,
          legend.title = element_blank())

svg(filename = paste(opts$O3),
    width = 14,
    height = 6,
    pointsize = 12)
print(a)
dev.off()

#################################################
#### Analisis of the fragments analysis size ####
#################################################

print("Deciles of the sizes of sequenced fragments:")
print(quantile(sampled_frag_sizes, prob = seq(0, 1, length = 11), type = 5))

print("Mean of size of sequenced fragments:")
print(mean(sampled_frag_sizes))

print("Standard desviation on the size of sequenced fragments:")
print(sd(sampled_frag_sizes))

print("Coefficient of variation of size of sequenced fragments:")
print(cv(sampled_frag_sizes))

# All fragments
all_frags_genome[, 6] <- as.character(all_frags_genome[, 6])
all_frags_genome[, 7] <- as.character(all_frags_genome[, 7])
all_frags_genome[, 8] <- as.numeric(all_frags_genome[, 8])

# Selects only PstI-MspI frags
all_frags_pstI_mspI <- all_frags_genome[all_frags_genome[, 7] == "PstI-MspI" | all_frags_genome[, 7] == "MspI-PstI", ]

# Selects the ones in the same size range
all_frags_pstI_mspI <- all_frags_pstI_mspI[all_frags_pstI_mspI[, 8] >= 48 & all_frags_pstI_mspI[, 8] <= 574, ]

# Fraction of fragments possible of sequence that were actually sequenced
print("Fraction of the in silico fragments that were sequenced by MS-DArT-seq:")
print((nrow(sampled) / nrow(all_frags_pstI_mspI)) * 100)