-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMinimal_Chimera_Entropy_Script.R
executable file
·150 lines (127 loc) · 4.96 KB
/
Minimal_Chimera_Entropy_Script.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#!/usr/bin/env Rscript
## ---------------------------
##
## Script name: Minimal_Chimera_Entropy_Script.R
##
## Purpose of script: Output chimera entropy attribute file for a protein in a given alignment as a command line script.
##
## Author: Daniil Prigozhin
##
## Date Created: 2021-02-23
##
## Copyright (c) Daniil Prigozhin, 2021
## Email: daniilprigozhin@lbl.gov
##
## ---------------------------
##
## Notes: script will attempt to install several packages if not already available
##
##
## ---------------------------
#install.packages("optparse")
## load packages --------------------
package_list<-c("optparse","entropy","dplyr","msa","tidyverse")
load_pack <- function(x){
for( i in x ){
if( ! require( i , character.only = TRUE ) ){
install.packages( i , dependencies = TRUE )
require( i , character.only = TRUE , quietly = T)
}
}
}
load_pack(package_list)
#library(odseq)
cat("=======================================\nLoaded packages\n=======================================\n")
## get input file and options--------------------
option_list = list(
make_option( c("-f", "--file") , type = "character" , default=NULL,
help="dataset file name", metavar="character"),
make_option( c("-n", "--name") , type = "character" , default=NULL,
help="protein name, has to be in the alignment name space", metavar="character"),
make_option(c("-c", "--common"), type="character", default=NULL,
help="protein common name (optional)", metavar="character"),
make_option(c("-d", "--directory"), type="character", default="Test",
help="directory name (optional)", metavar="character")
);
opt_parser = OptionParser(option_list=option_list);
opt = parse_args(opt_parser);
if (is.null(opt$file)){
print_help(opt_parser)
stop("Provide protein alignment with -f option", call.=FALSE)
}
if (is.null(opt$name)){
print_help(opt_parser)
stop("Provide reference protein name with -n option", call.=FALSE)
}
gene <- opt$n
CN <- opt$c
file <- opt$f
maa <- readAAMultipleAlignment(file)
### Check that the protein name matches one alignment key ------------------
lm <- length(grep(pattern = gene, x=rownames(maa)))
if (lm == 1) {
cat ("Found Reference Sequence\n")}else
if (lm ==0) {
stop("Protein name not found in alignment", call.=FALSE)}else
if (lm > 1) {
stop("More than one protein matched the name provided", call.=FALSE)}else{stop("Error at protein name", call.=FALSE)}
## Masking columns by reference gene ----------------
Alph_21 <- c("A","R","N","D","C","Q","E","G","H","I","L","K","M","F","P","S","T","W","Y","V","-")
Alph_20 <- c("A","R","N","D","C","Q","E","G","H","I","L","K","M","F","P","S","T","W","Y","V")
RefGene <- gene
RefSeq <- as(maa, "AAStringSet")[grep(pattern = gene, x=rownames(maa))]
GapMask<- NULL
for (i in 1:width(RefSeq)){
c<-as.vector(RefSeq[[1]][i]) %in% c("-")
GapMask<-append(GapMask,c,length(GapMask))
}
colmask(maa) <- IRanges(GapMask)
## Masking reference genes ---------------------------
# RefIDs <- grep(pattern = "Athaliana", x=names(unmasked(maa)))
# rowmask(maa) <- IRanges(start = RefIDs, end = RefIDs)
#Retrieving the non-masked subset ------------------
RefAli <- as(maa, "AAStringSet")
RefLen <- width(RefAli[1])
## Calculating Consensus Matrix -------------------
Tidy_CM<-as_tibble(t(consensusMatrix(RefAli, baseOnly = T)))
## Compensating for consensus matrix not keeping full alphabet in output
for (a in setdiff(Alph_21,colnames(Tidy_CM))){
vec <- as_tibble(0*(1:nrow(Tidy_CM)))
colnames(vec) <- paste(a)
Tidy_CM <- as_tibble(cbind(Tidy_CM,vec))
}
##Selecting relevant columns
Tidy_CM_NoGaps <- select(Tidy_CM,all_of(Alph_20))
##Entropy Calculation Ignoring Gaps ----------------------
entNG <- apply(Tidy_CM_NoGaps, 1, entropy,unit="log2") %>% as_tibble()
colnames(entNG)<-paste0("EntropyNoGaps_",gene)
## Prepare output directory ------------------------------
if (!dir.exists(opt$d)){dir.create(opt$d)}
OutputDirectory <- paste0(opt$d,"/",gene,"_",CN,"/")
if (!dir.exists(OutputDirectory)){dir.create(OutputDirectory)}
#Output entropy results to file --------------------------
sink(file = paste0(OutputDirectory,gene,"_",CN,".ChimeraEntropy.txt"),append = F)
cat("attribute: shannonEntropy\n")
cat("match mode: 1-to-1\n")
cat("recipient: residues\n")
for (ii in seq_along(entNG[[1]])){
cat("\t")
cat(paste0(":",ii))
cat("\t")
cat(sprintf("%.5f", entNG[[1]][ii]))
cat("\n")
}
sink()
Ent <- as_tibble(cbind(1:nrow(entNG),entNG))
colnames(Ent)<-c("Position","Entropy")
ggplot(Ent, aes(x = Position))+
geom_line(aes(y = Entropy), color = "red")+
ylim(0,3)+
ggtitle(paste(gene)) +
xlab("Position") +
ylab("Shannon Entropy")+
theme_classic()
ggsave(paste0(OutputDirectory,gene,"_","Entropy_MaskedNG",".pdf"))
## Report result location --------------------------------
cat("=======================================\nFinished analysis of alignment: ",opt$file,
"\nResults in:",OutputDirectory,"\n=======================================\n")