-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbreak_tree_into_chunks.py
executable file
·83 lines (73 loc) · 2.48 KB
/
break_tree_into_chunks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
#!/usr/bin/env python3
"""
Given a rooted phylogenetic tree in newick format, find sets of taxa
(tips) that are monophyletic and no bigger than a certain number
of taxa.
break_tree_into_chunks.py --output <output prefix>
<tree> <approx. max chunk size>
"""
import argparse
import dendropy
def get_leaf_taxa(node, targets=None):
if node.is_leaf():
genes = {node.taxon.label}
else:
genes = {n.taxon.label for n in node.leaf_nodes()}
if targets:
genes = genes.intersection(targets)
return genes
parser = argparse.ArgumentParser(usage=__doc__)
parser.add_argument('--output')
parser.add_argument('tree')
parser.add_argument('chunk', type=int)
args = parser.parse_args()
tree = dendropy.Tree.get(path=args.tree, schema='newick',
rooting='default-rooted',
preserve_underscores=True)
## Find the largest chunks of the tree that are monophyletic and
## are no more than args.chunk in size
chunks = []
for node in tree.preorder_node_iter():
tips = get_leaf_taxa(node)
if len(chunks) > 0 and len(set.intersection(chunks[-1], tips)):
continue
if len(tips) <= args.chunk:
chunks.append(tips)
## We don't want chunks with only 1 strain, so combine based on
## phylogenetic similarity. There may still be a taxon left over
## at the end.
single_chunks = [chunk for chunk in chunks if len(chunk) == 1]
distance_matrix = tree.phylogenetic_distance_matrix()
combinations = []
done = set()
for sc in single_chunks:
t = list(sc)[0]
if t in done:
continue
closest_distance = None
closest = None
tn = tree.taxon_namespace.findall(t)[0]
for sc2 in single_chunks:
t2 = list(sc2)[0]
if t == t2:
continue
if t2 in done:
continue
tn2 = tree.taxon_namespace.findall(t2)[0]
d = distance_matrix(tn, tn2)
if closest_distance is None or d < closest_distance:
closest_distance = d
closest = t2
if closest is not None:
combinations.append({t, closest})
done.add(t); done.add(closest)
else:
combinations.append({t})
done.add(t)
final_chunks = [chunk for chunk in chunks if len(chunk) > 1]
final_chunks += combinations
## Write files ending in .*.txt where * = chunk number (from zero)
for i, chunk in enumerate(final_chunks):
with open(args.output + '.' + str(i) + '.txt', 'wt') as oh:
for taxon in chunk:
oh.write(taxon + '\n')