-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdraw_gsdi_bsr.pl
executable file
·269 lines (244 loc) · 7.26 KB
/
draw_gsdi_bsr.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
#!/usr/bin/perl
use strict;
use warnings;
use Getopt::Long;
use Data::Dumper;
use Color::Spectrum 'generate';
use Scalar::Util qw'looks_like_number';
use List::Util qw'min max shuffle';
use Bio::Phylo::Treedrawer;
use Bio::Phylo::Util::Logger qw':levels';
use Bio::Phylo::IO qw'parse parse_tree';
use Bio::Phylo::Util::CONSTANT qw':objecttypes :namespaces';
# process command line arguments
my $phyloxml; # phyloxml output from (G)SDI
my $nexml; # NeXML output from parse_datamonkey.pl
my $genes; # tab-separated spreadsheet of accession => gene mapping
my $verbosity = WARN;
my $attribute = 'hyphy:omega3';
my $colors = 'red,blue';
my ( $width, $height ) = ( 1200, 1200 );
GetOptions(
'phyloxml=s' => \$phyloxml,
'nexml=s' => \$nexml,
'verbose+' => \$verbosity,
'width=i' => \$width,
'height=i' => \$height,
'attribute=s' => \$attribute,
'genes=s' => \$genes,
'colors=s' => \$colors,
);
# instantiate helper objects
my $log = Bio::Phylo::Util::Logger->new(
'-class' => 'main',
'-level' => $verbosity,
);
$log->info("going to read BSR summary tree from $nexml");
my $bsr_tree = parse_tree(
'-format' => 'nexml',
'-file' => $nexml,
);
$log->info("going to read SDI tree from $phyloxml");
my $sdi_tree = parse_tree(
'-format' => 'phyloxml',
'-file' => $phyloxml,
);
my $drawer = Bio::Phylo::Treedrawer->new(
'-height' => scalar( @{ $sdi_tree->get_terminals } ) * 25,
'-format' => 'svg',
'-width' => $width,
'-shape' => 'rect',
'-mode' => 'phylo',
'-node_radius' => 5,
'-branch_width' => 4,
);
# read genes spreadsheet
my %acc;
{
my @header;
open my $fh, '<', $genes or die $!;
LINE: while(<$fh>) {
chomp;
my @record = split /\t/, $_;
if ( not @header ) {
@header = @record;
next LINE;
}
my %record = map { $header[$_] => $record[$_] } ( 0 .. $#header );
my $acc = $record[0];
$acc{$acc} = \%record;
}
}
# link SDI and BSR tips, clean up SDI node labels
$log->info("going to link tips in SDI and BSR trees");
my @fams;
for my $st ( @{ $sdi_tree->get_entities } ) {
# link the tips
if ( $st->is_terminal ) {
# parse accession number out of tip label
my $name = $st->get_name;
my @parts = split /_/, $name;
my $acc = pop @parts;
my $fam = pop @parts;
$st->set_name( join ' ', @parts );
$st->set_generic( 'fam' => $fam );
# lookup accession number
if ( my $anno = $acc{$acc} ) {
if ( my $gene = $anno->{'Target_Gene'} ) {
$log->info("seq $acc is target gene $gene");
$st->set_generic( 'gene' => $gene );
push @fams, $gene;
}
}
# link the equivalent tips
if ( my $bt = $bsr_tree->get_by_name($acc) ) {
$st->set_generic( 'bt' => $bt );
$bt->set_generic( 'st' => $st );
}
else {
$log->warn("couldn't find BSR tip $acc");
}
}
else {
# interior node labels are aLRT support values.
# these need to be rounded to two decimal places.
my $val = $st->get_name;
if ( looks_like_number $val ) {
$val = sprintf '%.2f', $val;
$st->set_name( $val );
$st->set_text_horiz_offset(-20);
}
$st->set_name('') if $val eq 'root';
}
}
# reroot the BSR tree
$log->info("going to re-root the BRS tree");
my @clades = @{ $sdi_tree->get_root->get_children };
CLADE: for my $c ( @clades ) {
my @bt = map { $_->get_generic('bt') } @{ $c->get_terminals };
# only one of the two clades from the root in SDI tree
# is going to be monophyletic in the BSR tree
if ( $bsr_tree->is_clade(\@bt) ) {
$log->info("found equivalent monophyletic clade to root on");
$bsr_tree->deroot;
$bsr_tree->get_mrca(\@bt)->set_root_below;
last CLADE;
}
else {
$log->info("not yet found equivalent monophyletic clade to root on");
}
}
# copy over the BSR annotations
$log->info("going to copy BSR annotations to SDI tree");
for my $sn ( @{ $sdi_tree->get_entities } ) {
my $bn;
if ( $sn->is_internal ) {
$bn = $bsr_tree->get_mrca([map { $_->get_generic('bt') } @{$sn->get_terminals}]);
}
else {
$bn = $sn->get_generic('bt');
}
if ( $bn ) {
$sn->add_meta($_) for @{ $bn->get_meta };
}
else {
$log->warn("couldn't find equivalent node to copy annotations from");
}
}
# get log transformed max value for $attribute
my @values;
$sdi_tree->visit(sub{ push @values, shift->get_meta_object($attribute) || 0 });
my $max = max @values;
my $logmax = log($max)/log(10);
# apply omega3 and duplication colors
$log->info("going to apply BRS and SDI visual elements");
$sdi_tree->visit(sub{
my $node = shift;
# apply styling
$node->set_font_face('Verdana');
$node->set_font_size(12);
$node->set_font_style('Italic') if $node->is_terminal;
# star E. pusilla
my $name = $node->get_name;
$node->set_name("* $name") if $name =~ /Erycina pusilla/;
# omega3
my $val = $node->get_meta_object($attribute) || 0;
my $transformed = ( $val == 0 ? 0 : (log($val)/log(10))/$logmax );
my $red = int( $transformed * 255 );
my $blue = 255 - $red;
$node->set_branch_color("rgb($red,0,$blue)");
$log->debug($node->get_meta_object('hyphy:Corrected_p_value'));
# duplication
if ( $node->is_internal ) {
my $event = $node->get_meta_object('px:events');
if ( $event ) {
$event = $event->[0];
$log->debug($event->get_predicate);
if ( $event->get_predicate eq 'px:duplications' ) {
$node->set_node_color('red');
$node->set_generic( 'dup' => 1 );
}
else {
$node->set_node_color('green');
}
}
else {
$log->warn("no event annotations attached to interior node");
}
}
});
# apply target gene names
$log->info("going to apply target gene names");
for my $tip ( @{ $sdi_tree->get_terminals } ) {
if ( my $gene = $tip->get_generic('gene') ) {
$log->info("going to find members for target gene $gene");
$tip->set_name( $tip->get_name . ' ' . $gene );
$tip->set_generic( 'fam' => undef );
my $anc = $tip->get_ancestors;
unshift @$anc, $tip;
# iterate over all other tips except annotated ones
for my $t ( @{ $sdi_tree->get_terminals } ) {
if ( not $t->get_generic('gene') ) {
# count all duplications between tips and mrca
my $dups;
my $mrca = $tip->get_mrca($t)->get_id;
my $ta = $t->get_ancestors;
unshift @$ta, $t;
for my $array ( $anc, $ta ) {
MRCA: for my $i ( 0 .. $#{ $array } ) {
$dups++ if $array->[$i]->get_generic('dup');
last MRCA if $array->[$i]->get_id == $mrca;
}
}
$t->set_name( $t->get_name . ' ' . $gene ) unless $dups;
$t->set_generic( 'fam' => undef ) unless $dups;
}
}
}
}
# apply family colors
my @colors = shuffle(generate( scalar(@fams), split /,/, $colors ));
my %c = map { $fams[$_] => $colors[$_] } 0 .. $#fams;
# append "default" families
for my $tip ( @{ $sdi_tree->get_terminals } ) {
if ( my $fam = $tip->get_generic('fam') ) {
$fam =~ s/-.*//;
$fam = "($fam)";
$log->info($tip->get_name . ' added to default family ' . $fam);
$tip->set_name( $tip->get_name . ' ' . $fam );
}
else {
my $name = $tip->get_name;
if ( $name =~ /(\S+)$/ ) {
my $fam = $1;
if ( $c{$fam} ) {
$tip->set_font_color( $c{$fam} );
$tip->set_font_weight( 'bold' );
}
}
}
}
# draw the tree
$log->info("going to draw tree");
$drawer->set_tree($sdi_tree->ladderize);
print $drawer->draw;