-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPhylogenetic_pipeline_PhyML.sh
123 lines (99 loc) · 4.09 KB
/
Phylogenetic_pipeline_PhyML.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/bin/bash
# Additional information:
# =======================
#
#
#
# Just type /bin/bash Phylogenetic_Analysis and the pipeline starts.
#
# Show usage information:
if [ "$1" == "--h" ] || [ "$1" == "--help" ] || [ "$1" == "-h" ] || [ "$1" == "-help" ]
then
echo ""
echo "Dependencies:"
echo ""
echo "nano .bash_profile"
echo "Add export PERL5LIB=\$PERL5LIB:/Users/username/supersmart/lib:/Users/janwillem/bio-phylo/lib without quotes to .bash_profile"
echo "Add export SUPERSMART_HOME=/Users/username/supersmart without quotes to .bash_profile"
echo ""
echo "sudo apt-get install git"
echo ""
echo "git clone https://github.com/naturalis/supersmart.git"
echo ""
echo "sudo apt-get install cpan"
echo ""
echo "When youre into cpan, then install following applications:"
echo "install CJFIELDS/BioPerl-1.6.924.tar.gz"
echo "install Parallel::ForkManager"
echo "install Config::Tiny"
echo "install DBIx::Class"
echo "install Moose"
echo "install List::MoreUtils"
echo "install LWP::UserAgent"
echo "install URI::Escape"
echo "install JSON"
echo ""
echo "Now youre good to go!"
echo ""
echo ""
echo "Way of usage:"
echo ""
echo "The user can use this script for starting the phylogenetic-pipeline."
echo "Read the full manual below:"
echo "http://dx.doi.org/10.5281/zenodo.44533"
echo ""
echo "Example:"
echo ""
echo "/bin/bash Phylogenetic_pipeline_Datamonkey_input.sh"
echo ""
exit
fi
# first argument needs to be a folder containing FASTA files to merge and protalign
#FAMILIES="AP3_PI A_E_AE C_D" #Gene families (Foldernames)
FAMILIES="A_FUL"
JAR=bin/forester_1038.jar #Archeopteryx
SPECIESTREE=data/speciestree/2016-6-1 #species tree
ALIGNMENTS=data/selection/2015-12-15 #fasta and alignment files
GENETREES=data/genetrees/2015-12-11 #gene lineage trees
for FAM in $FAMILIES; do
/bin/bash protaln.sh $ALIGNMENTS/$FAM/
done
# iterate over gene families
for FAM in $FAMILIES; do
# convert fasta file to relaxed phylip file
perl script/fasta2phylip.pl $ALIGNMENTS/$FAM/codon.aln.fasta > $GENETREES/$FAM/codon.aln.phy
# nt = nucleotide
# -f e = estimate base frequencies
# --ts/tv e = estimate transition/transversion ratio
# --pinv e = estimate proportion of invariant sites
# --alpha e = estimate gamma distribution shape parameter
# --search BEST = best topology search result from among NNI and SPR
rm $GENETREES/$FAM/codon.aln.phy_phyml_tree.txt
rm $GENETREES/$FAM/codon.aln.phy_phyml_stats.txt
phyml --input $GENETREES/$FAM/codon.aln.phy --pars --datatype nt --sequential --model GTR -f m --ts/tv e --pinv e --alpha e --search BEST --nclasses 6 --quiet
done
# https://sites.google.com/site/cmzmasek/home/software/forester/gsdi
GSDI="java -Xmx1024m -cp $JAR org.forester.application.gsdi -g"
# script to prepare input data. the $SPECIESTREE arguments are optional and a bit
# redundant because the same species tree is generated each time, but this also
# triggers a check to make sure all species in the gene tree are present in the
# species tree
# iterate over families
for FAM in $FAMILIES; do
PHYLOXML="perl script/make_phyloxml.pl -f newick -s $SPECIESTREE/$FAM/cladogram.dnd -o $SPECIESTREE/$FAM/cladogram.xml"
G=$GENETREES/$FAM/codon.aln.phy_phyml_tree
# convert nexus/figtree dialect to newick,
# including support values as internal node labels
# XXX This step requires that the $FAM folder contains a simple text file that
# contains the accession numbers of the outgroup taxa, one number per line.
perl script/make_newick.pl -o $GENETREES/$FAM/outgroups.txt -f newick -i $G.txt --verbose > $G.dnd
# generate input files
$PHYLOXML -g $G.dnd -a $ALIGNMENTS/$FAM/codon.aln.fasta -v > $G.xml
# run GSDI
rm $G.gsdi_gsdi_log.txt $G.gsdi_species_tree_used.xml $G.gsdi.xml
$GSDI $G.xml $SPECIESTREE/$FAM/cladogram.xml $G.gsdi.xml
done
# Iterate over families and create .nex-files as intput for Datamonkey
for FAM in $FAMILIES; do
perl script/make_nexus.pl --treefile $GENETREES/$FAM/codon.aln.phy_phyml_tree.txt --phyloformat newick --matrixfile $GENETREES/$FAM/codon.aln.phy --dataformat phylip > $GENETREES/$FAM/outfile_$FAM.nex
done