-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathevolinc-part-II.sh
308 lines (270 loc) · 11.5 KB
/
evolinc-part-II.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
#!/bin/bash
#author: Andrew Nelson; andrew.d.l.nelson@gmail.com
# Script to perform comparative genomic analysis of lincRNAs
set -x
set -e
usage() {
echo ""
echo "Usage : sh $0 -b BLASTing_list -l Query_lincRNA -q Query_species -i Input_folder -s Species_list -v evalue -n threads -t species_tree -o Output_folder"
echo ""
cat <<'EOF'
-b </path/to/BLASTing_list in tab-delimited format>
-l </path/to/query lincRNA>
-q <query species>
-i </path/to/input folder>
-s </path/to/species list>
-v <e-value>
-n <number of threads>
-t </path/to/species/tree>
-o </path/to/output folder>
-h Show this usage information
EOF
exit 0
}
while getopts ":hb:q:l:i:s:o:v:n:t:" opt; do
case $opt in
b)
Blasting_list=$OPTARG #This is a five-column tab-delimited list in the following order:
# subject_genome lincRNA_fasta query_species (four letter Genus-species designation ie., Gspe) subject_species (same four letter abbreviation as subject_species) subject_gff (in fasta_format) # All of these files should be in the current working folder
;;
l)
query_lincRNA=$OPTARG # Query lincRNA file here
;;
q)
query_species=$OPTARG # This should be the four letter format for the query species - same as used in the BLASTing_list
;;
i)
input_folder=$OPTARG # Input folder
;;
s)
sp_list=$OPTARG # Species list
;;
n)
threads=$OPTARG # threads
;;
v)
value=$OPTARG # e-value
;;
o)
output=$OPTARG # Output file
;;
t)
species_tree=$OPTARG # Species tree
;;
h)
usage
exit 1
;;
\?)
echo "Invalid option: -$OPTARG" >&2
exit 1
;;
:)
echo "Option -$OPTARG requires an argument." >&2
exit 1
;;
esac
done
echo "BEGIN!"
echo `date`
START_TIME=$SECONDS
# Make all necessary folders
mkdir $output
mkdir Homology_Search
mkdir Reciprocal_BLAST
mkdir Orthologs
# Make lincRNA list from the user provide lincRNA fasta file
grep ">" $query_lincRNA | sed 's/>//' > Orthologs/lincRNA.list
# Initiate search for putative orthologs
echo "***Starting lincRNA to Genome Comparisons***"
python3 /startup_script.py -b $Blasting_list -i $input_folder -v $value -n $threads
echo "***Finished with lincRNA to Genome Comparisons***"
#Create a list of all genomes, lincRNA ortholog, and gff files to set up reciprocal BLAST
cd Reciprocal_BLAST
find . -maxdepth 1 -name "*genome*" >Reciprocal_chrom_list.txt
find . -maxdepth 1 -name "*orthologs*" >Reciprocal_lincRNAs_list.txt
find . -maxdepth 1 -name "*coords*" >Reciprocal_lincRNAs_coord_list.txt
sed -i 's~./~~g' Reciprocal_chrom_list.txt
sed -i 's~./~~g' Reciprocal_lincRNAs_list.txt
sed -i 's~./~~g' Reciprocal_lincRNAs_coord_list.txt
sort -n Reciprocal_chrom_list.txt -o Reciprocal_chrom_list.txt
sort -n Reciprocal_lincRNAs_list.txt -o Reciprocal_lincRNAs_list.txt
sort -n Reciprocal_lincRNAs_coord_list.txt -o Reciprocal_lincRNAs_coord_list.txt
:|paste Reciprocal_chrom_list.txt - Reciprocal_lincRNAs_list.txt - Reciprocal_lincRNAs_coord_list.txt >Reciprocal_list.txt
sed -i 's~\t\t~\t~g' Reciprocal_list.txt
sed -i "s/$/\t$query_species.$query_species.coords.gff/" Reciprocal_list.txt
sed -i "s/$/\t$query_species/" Reciprocal_list.txt
sed -i "s/$/\t$query_species.genome.fasta/" Reciprocal_list.txt
sed -i "s~\.genome~_genome~g" Reciprocal_list.txt
rm Reciprocal_chrom_list.txt
rm Reciprocal_lincRNAs_list.txt
rm Reciprocal_lincRNAs_coord_list.txt
# Confirm the reciprocity of the putative orthologs
echo "***Starting Reciprocal Search***"
python3 /Reciprocal_BLAST_startup_script.py -b Reciprocal_list.txt -v $value -n $threads
echo "***Finished with Reciprocal Search***"
#Create a CoGe viewable bed file of TBH
cat *reciprocal_sorted.gff $query_species.$query_species.coords.gff >All_Orthologs.gff
awk -F '\t' '{print $1 "\t" $4 "\t" $5 "\t" $6 "\t" $8 "\t" $7 "\t" $2 "\t" $3 "\t" $6 "\t" $9}' All_Orthologs.gff >All_orthologs.bed
sort -k 1,1 -k 2,2n All_orthologs.bed > ../$output/All_orthologs_for_viewing.bed
#Starting building families
echo "***Creating Families of similar sequences***"
cd ../Orthologs
cat *.fasta > All_orthologs.fasta
perl /find_from_list.pl lincRNA.list All_orthologs.fasta
cd lincRNA_families
for i in *FASTA; do mv $i "`basename $i _.FASTA`.fasta"; done
echo "Finished Creating Families of similar sequences"
### Starting Structural and Phylogenetic steps ###
if [ ! -z $species_tree ];
then
echo "Creating structural alignments"
ls * > structure_list.txt
mkdir Structures_from_MSA
mv structure_list.txt Structures_from_MSA/structure_list.txt
sed -i '/structure_list.txt/d' Structures_from_MSA/structure_list.txt
sed -i 's~.fasta~~g' Structures_from_MSA/structure_list.txt
perl /Batch_mlocarna.pl Structures_from_MSA/structure_list.txt $threads
mv Structures_from_MSA ../../$output
echo "Finished with structural alignments, files in Output/Structures_from_MSA folder"
# Starting alignments
echo "Starting sequence alignments for RAxML"
ls * > alignment_list.txt
sed -i '/alignment_list.txt/d' alignment_list.txt
mkdir -p Final_results
perl /Batch_MAFFT.pl alignment_list.txt $threads
echo "Finished with alignments, preparing files for RAxML if that option was selected"
cd Final_results
#If the number of fasta files in the current folder is equal to one proceed
filenum=$(ls *.fasta | wc -l )
if [ $filenum -lt 2 ]; then
#The below code counts how many > are found in the single fasta file in the current directory
speciescount=$(grep -c ">" *.fasta)
#If the number of sequences in the file are less than 4, then do not create an aligned_list file
if [ $speciescount -lt 4 ]; then
echo "not enough taxa represented to perform phylogenetics"
else
ls *.fasta >aligned_list.txt
echo "Sufficient taxa represended for a single query lncRNA to perform phylogenetics"
fi
else
#If there are multiple files in the directory, the below script asks how many > each of the files has, only printing out the file names of those with more than 3 >s
grep -c ">" *.fasta | grep -v ':0$' | grep -v ':1$' | grep -v ':2$' | grep -v ':3$' | awk -F':' '{print $1}' >aligned_list.txt
#The below script is checking to make sure that the files have sufficient taxa
speciescount_2=$(grep -c ">" *.fasta | grep -v ':0$' | grep -v ':1$' | grep -v ':2$' | grep -v ':3$' | awk -F':' '{print $1}'|grep -c ".fasta")
if [ $speciescount_2 -gt 0 ]; then
echo "Sufficient taxa represented for multiple query lncRNAs to perform phylogenetics"
else
echo "Multiple files, however none with enough taxa present for phylogenetics"
fi
fi
#RAxML step
echo "starting tree building"
mkdir -p ../RAxML_families
perl /Batch_RAxML.pl aligned_list.txt $threads
mv aligned_list.txt ../RAxML_families
cd ../RAxML_families
mkdir Reconciled_trees
reduc=$(ls ../Final_results/*.reduced | wc -l )
if [ $reduc -gt 0 ]; then
mv ../Final_results/*.reduced Reconciled_trees
fi
perl /Batch_NOTUNG.pl aligned_list.txt ../../../$species_tree
mv *png Reconciled_trees
mv Reconciled_trees ../../../$output
cd ../Final_results
# Generating summary of aligned lincRNA
echo "Generating summary of aligned linRNA"
python3 /Family_division_and_summary.py ../../../$sp_list
grep -v "aligned_list.txt" summary.txt > final_summary.txt
Rscript /upset_plot_pdf.R
mv Per_species_identification_plot.pdf ../../../$output/Per_species_identification_plot.pdf
Rscript /upset_plot_png.R
mv Per_species_identification_plot.png ../../../$output/Per_species_identification_plot.png
Rscript /final_summary_png.R -s ../../../$sp_list -q $query_species
Rscript /final_summary_pdf.R -s ../../../$sp_list -q $query_species
rm summary.txt
mv lincRNA_barplot.png ../../../$output
mv lincRNA_barplot.pdf ../../../$output
cd ../../
Rscript /final_summary_table_gen.R -s ../$sp_list -q $query_species
echo "Finished summary"
### End of phylogenetic step, the below code is if there is no phylogenetic step picked
else
echo "Generating summary of aligned lincRNAs"
python3 /Family_division_and_summary.py ../../$sp_list
grep -v "aligned_list.txt" summary.txt > final_summary.txt
Rscript /upset_plot_pdf.R
mv Per_species_identification_plot.pdf ../../$output/Per_species_identification_plot.pdf
Rscript /upset_plot_png.R
mv Per_species_identification_plot.png ../../$output/Per_species_identification_plot.png
cp final_summary.txt ../../$output/Instance_count.txt
Rscript /final_summary_png.R -s ../../$sp_list -q $query_species
Rscript /final_summary_pdf.R -s ../../$sp_list -q $query_species
cp -r */*.fasta .
rm summary.txt
mv lincRNA_barplot.png ../../$output
mv lincRNA_barplot.pdf ../../$output
cd ../
Rscript /final_summary_table_gen.R -s ../$sp_list -q $query_species
echo "Finished summary"
fi
# Modifying the final summary table to add ID's of knonw lincRNA and ID's of knonw sense and known antisense ID's
if compgen -G "../Homology_Search/*tested.out" > /dev/null && compgen -G "../Homology_Search/*mod.annotation.*sense.gff" > /dev/null;
then
for i in ../Homology_Search/*tested.out; do
if [[ -s $i ]]; then
python3 /filter_lincRNA_sequences_annotation2.py "$i" "$i".mod
sed 's/_TBH_1//g' "$i".mod > temp && mv temp "$i".mod
python3 /filter_lincRNA_sequences_annotation3.py "$i".mod final_summary_table.csv "$i".mod.sp.csv
fi
done
for i in ../Homology_Search/*mod.annotation.*sense.gff; do
if [[ -s $i ]]; then
sed 's/_TBH_1//g' "$i" > temp && mv temp "$i"
fi
done
Rscript /final_summary_table_all.R
rm final_summary_table.csv final_summary_table.mod.csv
mv final_summary_table.mod2.csv final_summary_table.csv
mv final_summary_table.csv ../$output
elif compgen -G "../Homology_Search/*tested.out" > /dev/null && ! compgen -G "../Homology_Search/*mod.annotation.*sense.gff" > /dev/null
then
for i in ../Homology_Search/*tested.out; do
if [[ -s $i ]]; then
python3 /filter_lincRNA_sequences_annotation2.py "$i" "$i".mod
sed 's/_TBH_1//g' "$i".mod > temp && mv temp "$i".mod
python3 /filter_lincRNA_sequences_annotation3.py "$i".mod final_summary_table.csv "$i".mod.sp.csv
fi
done
Rscript /final_summary_table_all.R
rm final_summary_table.csv
mv final_summary_table.mod.csv final_summary_table.csv
mv final_summary_table.csv ../$output
elif ! compgen -G "../Homology_Search/*tested.out" > /dev/null && compgen -G "../Homology_Search/*mod.annotation.*sense.gff" > /dev/null
then
for i in ../Homology_Search/*mod.annotation.*sense.gff; do
if [[ -s $i ]]; then
sed 's/_TBH_1//g' "$i" > temp && mv temp "$i"
fi
done
Rscript /final_summary_table_all.R
rm final_summary_table.csv
mv final_summary_table.mod.csv final_summary_table.csv
mv final_summary_table.csv ../$output
elif ! compgen -G "../Homology_Search/*tested.out" > /dev/null && ! compgen -G "../Homology_Search/*mod.annotation.*sense.gff" > /dev/null
then
mv final_summary_table.csv ../$output
fi
# Moving all the files to the output folder
cd ../
mv Homology_Search $output
mv Reciprocal_BLAST $output
mv Orthologs $output
rm sample-instance-report.txt
echo "All necessary files written to" $output
echo "Finished Evolinc-part-II!"
echo `date`
ELAPSED_TIME=$(($SECONDS - $START_TIME))
echo "Total elapsed time is" $ELAPSED_TIME "seconds" >> $output/elapsed_time.txt
# END